[med-svn] [python-pbcore] 01/04: Imported Upstream version 1.2.11+dfsg

Afif Elghraoui afif at moszumanska.debian.org
Thu Jan 26 08:28:50 UTC 2017


This is an automated email from the git hooks/post-receive script.

afif pushed a commit to branch master
in repository python-pbcore.

commit 114e57037464896e7e8bf654d91f7e489f694278
Author: Afif Elghraoui <afif at debian.org>
Date:   Wed Jan 25 22:47:43 2017 -0800

    Imported Upstream version 1.2.11+dfsg
---
 .circleci/installHDF5.sh                           |    2 +-
 CHANGELOG.org                                      |    5 +
 Makefile                                           |    5 +
 pbcore/__init__.py                                 |    2 +-
 pbcore/chemistry/resources/mapping.xml             |   60 +-
 pbcore/data/__init__.py                            |    3 +
 pbcore/data/empty.aligned_subreads.bam             |  Bin 0 -> 728 bytes
 pbcore/data/empty.aligned_subreads.bam.pbi         |  Bin 0 -> 67 bytes
 pbcore/io/GffIO.py                                 |   70 +-
 pbcore/io/align/BamIO.py                           |   26 +-
 pbcore/io/align/CmpH5IO.py                         |    6 +-
 pbcore/io/align/PacBioBamIndex.py                  |   10 +-
 pbcore/io/dataset/DataSetErrors.py                 |   34 +
 pbcore/io/dataset/DataSetIO.py                     |  339 +++---
 pbcore/io/dataset/DataSetMembers.py                |  850 +++++++++------
 pbcore/io/dataset/DataSetMetaTypes.py              |   86 ++
 pbcore/io/dataset/DataSetReader.py                 |   46 +-
 pbcore/{__init__.py => io/dataset/DataSetUtils.py} |   61 +-
 pbcore/io/dataset/DataSetValidator.py              |   41 +-
 pbcore/io/dataset/DataSetWriter.py                 |   43 +-
 pbcore/io/dataset/__init__.py                      |   35 +
 pbcore/io/dataset/utils.py                         |  172 ++-
 requirements-dev.txt                               |    2 +-
 tests/test_pbcore_io_AlnFileReaders.py             |   12 +-
 tests/test_pbcore_io_GffIO.py                      |   11 +
 tests/test_pbdataset.py                            | 1090 ++++----------------
 tests/test_pbdataset_filters.py                    |  373 +++++++
 tests/test_pbdataset_metadata.py                   |  114 ++
 tests/test_pbdataset_split.py                      |  506 +++++++++
 tests/test_pbdataset_subtypes.py                   |  205 ++--
 tests/test_pbdataset_utils.py                      |  183 ++++
 tests/utils.py                                     |   40 +
 32 files changed, 2842 insertions(+), 1590 deletions(-)

diff --git a/.circleci/installHDF5.sh b/.circleci/installHDF5.sh
index 3e7ab39..15b9049 100644
--- a/.circleci/installHDF5.sh
+++ b/.circleci/installHDF5.sh
@@ -1,7 +1,7 @@
 set -x
 set -e
 if [ ! -e prefix/lib/libhdf5.so ]; then
-  wget https://www.hdfgroup.org/ftp/HDF5//releases/hdf5-1.8.12/src/hdf5-1.8.12.tar.gz
+  wget https://support.hdfgroup.org/ftp/HDF5/releases/hdf5-1.8.12/src/hdf5-1.8.12.tar.gz
   tar xzf hdf5-1.8.12.tar.gz
   mkdir -p prefix
   PREFIX=$PWD/prefix
diff --git a/CHANGELOG.org b/CHANGELOG.org
index daccd48..baf6708 100644
--- a/CHANGELOG.org
+++ b/CHANGELOG.org
@@ -1,3 +1,8 @@
+* Version 1.2.11
+  - Support for 3.2 basecaller and chemistry
+  - GFF bug fixes
+  - Misc fixes and enhancements
+
 * Version 1.2.10 (> SMRTanalysis 3.1)
   - Update to pysam 0.9.0
   - Recognize new and prospective future partnumbers for the 3.1 timeframe
diff --git a/Makefile b/Makefile
index 64e10c7..2ea1010 100644
--- a/Makefile
+++ b/Makefile
@@ -50,3 +50,8 @@ pip-install:
       && pip uninstall -y pbcore \
       || echo -n ''
 	@pip install --no-index ./
+
+
+publish-to-pypi:
+	@echo "I'm not going to do this for you, but you can do it by:"
+	@echo "    % python setup.py sdist upload -r pypi"
diff --git a/pbcore/__init__.py b/pbcore/__init__.py
index 7375688..3e5263b 100644
--- a/pbcore/__init__.py
+++ b/pbcore/__init__.py
@@ -28,4 +28,4 @@
 # POSSIBILITY OF SUCH DAMAGE.
 #################################################################################
 
-__VERSION__ = "1.2.10"
+__VERSION__ = "1.2.11"
diff --git a/pbcore/chemistry/resources/mapping.xml b/pbcore/chemistry/resources/mapping.xml
index 0f7e95b..528b7a4 100644
--- a/pbcore/chemistry/resources/mapping.xml
+++ b/pbcore/chemistry/resources/mapping.xml
@@ -175,6 +175,8 @@
     <SequencingKit>100612400</SequencingKit>
     <SoftwareVersion>2.3</SoftwareVersion>
   </Mapping>
+
+  <!-- 3.0 ("Dromedary") chemistry -->
   <Mapping>
     <SequencingChemistry>S/P1-C1/beta</SequencingChemistry>
     <BindingKit>100-619-300</BindingKit>
@@ -188,71 +190,41 @@
     <SoftwareVersion>3.1</SoftwareVersion>
   </Mapping>
 
-  <!-- 1.0/1.1: a 1/1 compabible new SK condition  -->
+  <!-- 3.1 ("Echidna") chemistry -->
   <Mapping>
-    <SequencingChemistry>S/P1-C1</SequencingChemistry>
+    <SequencingChemistry>S/P1-C1.1</SequencingChemistry>
     <BindingKit>100-619-300</BindingKit>
     <SequencingKit>100-867-300</SequencingKit>
     <SoftwareVersion>3.1</SoftwareVersion>
   </Mapping>
 
-  <!-- 1.1/1.1: a 1/1 compatible new BK and SK condition -->
   <Mapping>
-    <SequencingChemistry>S/P1-C1</SequencingChemistry>
-    <BindingKit>100-867-500</BindingKit>
+    <SequencingChemistry>S/P1-C1.1</SequencingChemistry>
+    <BindingKit>100-619-300</BindingKit>
     <SequencingKit>100-867-300</SequencingKit>
-    <SoftwareVersion>3.1</SoftwareVersion>
-  </Mapping>
-
-  <!-- 2.0/2.0: new kits... 1/1 compatible... -->
-  <Mapping>
-    <SequencingChemistry>S/P2-C2/prospective-compatible</SequencingChemistry>
-    <BindingKit>100-864-400</BindingKit>
-    <SequencingKit>100-864-100</SequencingKit>
-    <SoftwareVersion>3.1</SoftwareVersion>
+    <SoftwareVersion>3.2</SoftwareVersion>
   </Mapping>
 
-  <!-- 2.1/2.1, ibid -->
+  <!-- 3.1.1 ("Flea") chemistry -->
   <Mapping>
-    <SequencingChemistry>S/P2-C2/prospective-compatible</SequencingChemistry>
-    <BindingKit>100-864-600</BindingKit>
-    <SequencingKit>100-864-200</SequencingKit>
+    <SequencingChemistry>S/P1-C1.2</SequencingChemistry>
+    <BindingKit>100-619-300</BindingKit>
+    <SequencingKit>100-902-100</SequencingKit>
     <SoftwareVersion>3.1</SoftwareVersion>
   </Mapping>
 
-  <!-- 3.2 basecaller support -->
   <Mapping>
-    <SequencingChemistry>S/P1-C1/beta</SequencingChemistry>
+    <SequencingChemistry>S/P1-C1.2</SequencingChemistry>
     <BindingKit>100-619-300</BindingKit>
-    <SequencingKit>100-620-000</SequencingKit>
+    <SequencingKit>100-902-100</SequencingKit>
     <SoftwareVersion>3.2</SoftwareVersion>
   </Mapping>
 
+  <!-- 3.2 ("Goat") chemistry -->
   <Mapping>
-    <SequencingChemistry>S/P1-C1</SequencingChemistry>
+    <SequencingChemistry>S/P1-C1.3</SequencingChemistry>
     <BindingKit>100-619-300</BindingKit>
-    <SequencingKit>100-867-300</SequencingKit>
-    <SoftwareVersion>3.2</SoftwareVersion>
-  </Mapping>
-
-  <Mapping>
-    <SequencingChemistry>S/P1-C1</SequencingChemistry>
-    <BindingKit>100-867-500</BindingKit>
-    <SequencingKit>100-867-300</SequencingKit>
-    <SoftwareVersion>3.2</SoftwareVersion>
-  </Mapping>
-
-  <Mapping>
-    <SequencingChemistry>S/P2-C2/prospective-compatible</SequencingChemistry>
-    <BindingKit>100-864-400</BindingKit>
-    <SequencingKit>100-864-100</SequencingKit>
-    <SoftwareVersion>3.2</SoftwareVersion>
-  </Mapping>
-
-  <Mapping>
-    <SequencingChemistry>S/P2-C2/prospective-compatible</SequencingChemistry>
-    <BindingKit>100-864-600</BindingKit>
-    <SequencingKit>100-864-200</SequencingKit>
+    <SequencingKit>100-972-200</SequencingKit>
     <SoftwareVersion>3.2</SoftwareVersion>
   </Mapping>
 
diff --git a/pbcore/data/__init__.py b/pbcore/data/__init__.py
index 433b5bf..eec7adf 100644
--- a/pbcore/data/__init__.py
+++ b/pbcore/data/__init__.py
@@ -171,3 +171,6 @@ def getUnalignedBam():
 
 def getEmptyBam():
     return _getAbsPath("empty.ccs.bam")
+
+def getEmptyAlignedBam():
+    return _getAbsPath("empty.aligned_subreads.bam")
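
The new helper pairs with the empty aligned BAM added below. A minimal usage sketch (assuming pbcore is installed; opening an empty aligned BAM is the case the PacBioBamIndex and DataSetIO changes in this commit are meant to handle):

    from pbcore import data
    from pbcore.io import AlignmentSet

    aln = AlignmentSet(data.getEmptyAlignedBam())  # new empty test resource
    print len(aln)                                 # expected to be 0
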
diff --git a/pbcore/data/empty.aligned_subreads.bam b/pbcore/data/empty.aligned_subreads.bam
new file mode 100644
index 0000000..18f0531
Binary files /dev/null and b/pbcore/data/empty.aligned_subreads.bam differ
diff --git a/pbcore/data/empty.aligned_subreads.bam.pbi b/pbcore/data/empty.aligned_subreads.bam.pbi
new file mode 100644
index 0000000..e398d79
Binary files /dev/null and b/pbcore/data/empty.aligned_subreads.bam.pbi differ
diff --git a/pbcore/io/GffIO.py b/pbcore/io/GffIO.py
index a08f0b7..c7decaf 100644
--- a/pbcore/io/GffIO.py
+++ b/pbcore/io/GffIO.py
@@ -42,8 +42,9 @@ __all__ = [ "Gff3Record",
             "GffWriter" ]
 
 from .base import ReaderBase, WriterBase
-from collections import OrderedDict, defaultdict
+from collections import OrderedDict, defaultdict, namedtuple
 from copy import copy as shallow_copy
+import logging
 import tempfile
 import os.path
 
@@ -295,46 +296,37 @@ def _merge_gff_headers(gff_files):
     return headers, header_keys
 
 
-def _merge_gffs_sorted(gff1, gff2, out_file):
-    import logging
-    logging.warn("Merging %s and %s" % (gff1, gff2))
-    with GffWriter(out_file) as out:
-        n_rec = 0
-        headers, header_keys = _merge_gff_headers([gff1, gff2])
-        with GffReader(gff1) as f1:
-            for key in header_keys:
-                for value in headers[key]:
-                    out.writeHeader("##%s %s" % (key, value))
-            with GffReader(gff2) as f2:
-                rec1 = [ rec for rec in f1 ]
-                rec2 = [ rec for rec in f2 ]
-                i = j = 0
-                while i < len(rec1) and j < len(rec2):
-                    if (rec1[i] > rec2[j]):
-                        out.writeRecord(rec2[j])
-                        j += 1
-                    else:
-                        out.writeRecord(rec1[i])
-                        i += 1
-                for rec in rec1[i:]:
-                    out.writeRecord(rec)
-                    i += 1
-                for rec in rec2[j:]:
-                    out.writeRecord(rec)
-                    j += 2
-        return i + j
-
+GffHead = namedtuple("GffHead", ("file_name", "record", "headers"))
 
 def merge_gffs_sorted(gff_files, output_file_name):
     """
     Utility function to combine a set of N (>= 2) GFF files, with records
-    ordered by genomic location.  (Assuming each input file is already sorted.)
+    ordered by genomic location.  This assumes that each input file is already
+    sorted, and forms a contiguous set of records.
     """
-    if len(gff_files) == 1: return gff_files[0]
-    while len(gff_files) > 2:
-        tmpout = tempfile.NamedTemporaryFile(suffix=".gff").name
-        _merge_gffs_sorted(gff_files[0], gff_files[1], tmpout)
-        gff_files = [tmpout] + gff_files[2:]
-    with open(output_file_name, "w") as f:
-        _merge_gffs_sorted(gff_files[0], gff_files[1], f)
-    return output_file_name
+    # collect the very first record of each GFF and use that to sort the files
+    first_records = []
+    empty_files = []
+    for file_name in gff_files:
+        with GffReader(file_name) as f:
+            for rec in f:
+                first_records.append(GffHead(file_name, rec, f.headers))
+                break
+            else:
+                empty_files.append(file_name)
+    first_records.sort(lambda a,b: cmp(a.record, b.record))
+    gff_files = [f.file_name for f in first_records]
+    gff_files.extend(empty_files)
+    headers, header_keys = _merge_gff_headers(gff_files)
+    nrec = 0
+    with GffWriter(output_file_name) as out:
+        for key in header_keys:
+            for value in headers[key]:
+                out.writeHeader("##%s %s" % (key, value))
+        for file_name in gff_files:
+            logging.info("Merging {f}".format(f=file_name))
+            with GffReader(file_name) as gff:
+                for rec in gff:
+                    out.writeRecord(rec)
+                    nrec += 1
+    return nrec
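
For reference, a usage sketch of the rewritten merger (file names are hypothetical; per the docstring, each input must already be sorted and form a contiguous block of records):

    from pbcore.io.GffIO import merge_gffs_sorted

    n = merge_gffs_sorted(["chunk1.gff", "chunk2.gff", "chunk3.gff"],
                          "merged.gff")
    print "%d records written" % n
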
diff --git a/pbcore/io/align/BamIO.py b/pbcore/io/align/BamIO.py
index a9e7fa7..f0d5d99 100644
--- a/pbcore/io/align/BamIO.py
+++ b/pbcore/io/align/BamIO.py
@@ -114,7 +114,6 @@ class _BamReaderBase(ReaderBase):
             rgChem = decodeTriple(*triple)
             rgReadType = ds["READTYPE"]
             rgFrameRate = ds["FRAMERATEHZ"]
-            readGroupTable_.append((rgID, rgName, rgReadType, rgChem, rgFrameRate))
 
             # Look for the features manifest entries within the DS tag,
             # and build an "indirection layer", i.e. to get from
@@ -130,13 +129,17 @@ class _BamReaderBase(ReaderBase):
             self._baseFeatureNameMappings[rgID]  = baseFeatureNameMapping
             self._pulseFeatureNameMappings[rgID] = pulseFeatureNameMapping
 
+            readGroupTable_.append((rgID, rgName, rgReadType, rgChem, rgFrameRate,
+                                    frozenset(baseFeatureNameMapping.iterkeys())))
+
         self._readGroupTable = np.rec.fromrecords(
             readGroupTable_,
             dtype=[("ID"                 , np.int32),
                    ("MovieName"          , "O"),
                    ("ReadType"           , "O"),
                    ("SequencingChemistry", "O"),
-                   ("FrameRate",           float)])
+                   ("FrameRate",           float),
+                   ("BaseFeatures",        "O")])
         assert len(set(self._readGroupTable.ID)) == len(self._readGroupTable), \
             "First 8 chars of read group IDs must be unique!"
 
@@ -176,19 +179,16 @@ class _BamReaderBase(ReaderBase):
     def _checkFileCompatibility(self):
         # Verify that this is a "pacbio" BAM file of version at least
         # 3.0.1
-        try:
-            checkedVersion = self.version
-            if "b" in checkedVersion:
-                raise Exception()
-            else:
-                major, minor, patch = checkedVersion.split('.')
-                assert major >= 3
-                assert minor >= 0
-                assert patch >= 1
-        except:
-            raise IncompatibleFile(
+        badVersionException = IncompatibleFile(
                 "This BAM file is incompatible with this API " +
                 "(only PacBio BAM files version >= 3.0.1 are supported)")
+        checkedVersion = self.version
+        if "b" in checkedVersion:
+            raise badVersionException
+        else:
+            major, minor, patch = checkedVersion.split('.')
+            if not (major, minor, patch) >= (3, 0, 1):
+                raise badVersionException
 
     def __init__(self, fname, referenceFastaFname=None):
         self.filename = fname = abspath(expanduser(fname))
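
Note that split('.') yields strings, and in Python 2 a string compares greater than any integer, so the rewritten tuple test in _checkFileCompatibility always passes; a numeric variant would coerce the components first. A standalone sketch (hypothetical helper, not part of this patch):

    def _versionAtLeast(version, minimum=(3, 0, 1)):
        # beta versions containing 'b' are rejected before this point
        return tuple(int(x) for x in version.split('.')) >= minimum

    assert _versionAtLeast("3.0.1")
    assert not _versionAtLeast("2.9.9")
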
diff --git a/pbcore/io/align/CmpH5IO.py b/pbcore/io/align/CmpH5IO.py
index aa5e83b..2e18108 100755
--- a/pbcore/io/align/CmpH5IO.py
+++ b/pbcore/io/align/CmpH5IO.py
@@ -796,12 +796,14 @@ class CmpH5Reader(ReaderBase, IndexedAlignmentReaderMixin):
                 self._movieInfoTable.Name,
                 [self.readType] * len(self._movieInfoTable.ID),
                 self.sequencingChemistry,
-                self._movieInfoTable.FrameRate),
+                self._movieInfoTable.FrameRate,
+                [frozenset(self.baseFeaturesAvailable())] * len(self._movieInfoTable.ID)),
             dtype=[("ID"                 , np.int32),
                    ("MovieName"          , "O"),
                    ("ReadType"           , "O"),
                    ("SequencingChemistry", "O"),
-                   ("FrameRate"          , float)])
+                   ("FrameRate"          , float),
+                   ("BaseFeatures"       , "O")])
         self._readGroupDict = { rg.ID : rg
                                 for rg in self._readGroupTable }
 
diff --git a/pbcore/io/align/PacBioBamIndex.py b/pbcore/io/align/PacBioBamIndex.py
index 89ae316..c959c4d 100644
--- a/pbcore/io/align/PacBioBamIndex.py
+++ b/pbcore/io/align/PacBioBamIndex.py
@@ -71,7 +71,15 @@ class PacBioBamIndex(object):
 
     @property
     def hasMappingInfo(self):
-        return (self.pbiFlags & PBI_FLAGS_MAPPED)
+        # N.B.: Or'ing in (nReads==0) is HACKish fix for issue with
+        # empty mapped BAM files---the "Mapped" bit doesn't get set
+        # correctly in pbi_flags if the file is empty. So in the empty
+        # file case, assume it may be mapped.  The downside is we
+        # don't get an exception on attempting to access the mapped
+        # columns for empty unmapped BAM pbis.  Not a big deal, I
+        # suppose
+        return ((self.nReads == 0) or
+                (self.pbiFlags & PBI_FLAGS_MAPPED))
 
     @property
     def hasCoordinateSortedInfo(self):
diff --git a/pbcore/io/dataset/DataSetErrors.py b/pbcore/io/dataset/DataSetErrors.py
index 4695df9..1623475 100644
--- a/pbcore/io/dataset/DataSetErrors.py
+++ b/pbcore/io/dataset/DataSetErrors.py
@@ -1,3 +1,37 @@
+###############################################################################
+# Copyright (c) 2011-2016, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+#   notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+#   notice, this list of conditions and the following disclaimer in the
+#   documentation and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+# THIS LICENSE.  THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
+# NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+# OR
+# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+###############################################################################
+
+# Author: Martin D. Smith
+
 
 class InvalidDataSetIOError(Exception):
     """The base class for all DataSetIO related custom exceptions
diff --git a/pbcore/io/dataset/DataSetIO.py b/pbcore/io/dataset/DataSetIO.py
index cbde652..3ad720a 100755
--- a/pbcore/io/dataset/DataSetIO.py
+++ b/pbcore/io/dataset/DataSetIO.py
@@ -1,9 +1,43 @@
+###############################################################################
+# Copyright (c) 2011-2016, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+#   notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+#   notice, this list of conditions and the following disclaimer in the
+#   documentation and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+# THIS LICENSE.  THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
+# NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+# OR
+# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+###############################################################################
+
+# Author: Martin D. Smith
+
+
 """
 Classes representing DataSets of various types.
 """
 
 import hashlib
-import datetime
 import copy
 import os, sys
 import re
@@ -39,39 +73,23 @@ from pbcore.io.dataset.DataSetMembers import (DataSetMetadata,
                                               ExternalResource, Filters)
 from pbcore.io.dataset.utils import (consolidateBams, _infixFname, _pbindexBam,
                                      _indexBam, _indexFasta, _fileCopy,
-                                     _swapPath, which, consolidateXml)
+                                     _swapPath, which, consolidateXml,
+                                     getTimeStampedName)
 from pbcore.io.dataset.DataSetErrors import (InvalidDataSetIOError,
                                              ResourceMismatchError)
+from pbcore.io.dataset.DataSetMetaTypes import (DataSetMetaTypes, toDsId,
+                                                dsIdToSuffix)
+from pbcore.io.dataset.DataSetUtils import fileType
 
 
 log = logging.getLogger(__name__)
 
-def filtered(generator):
-    """Wrap a generator with postfiltering"""
-    @wraps(generator)
-    def wrapper(dset, *args, **kwargs):
-        filter_tests = dset.processFilters()
-        no_filter = dset.noFiltering
-        if no_filter:
-            for read in generator(dset, *args, **kwargs):
-                yield read
-        else:
-            for read in generator(dset, *args, **kwargs):
-                if any(filt(read) for filt in filter_tests):
-                    yield read
-    return wrapper
-
-def _getTimeStampedName(mType):
-    """Generate a timestamped name using the given metatype 'mType' and the
-    current UTC time"""
-    mType = mType.lower()
-    mType = '_'.join(mType.split('.'))
-    time = datetime.datetime.utcnow().strftime("%y%m%d_%H%M%S%f")[:-3]
-    return "{m}-{t}".format(m=mType, t=time)
-
-def _toDsId(name):
-    """Translate a class name into a MetaType/ID"""
-    return "PacBio.DataSet.{x}".format(x=name)
+def openDataSet(*files, **kwargs):
+    """Factory function for DataSet types as suggested by the first file"""
+    if files[0].endswith('xml'):
+        tbrType = _typeDataSet(files[0])
+        return tbrType(*files, **kwargs)
+    return openDataFile(*files, **kwargs)
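
A usage sketch of the relocated factory (file names are hypothetical):

    from pbcore.io import openDataSet

    ds = openDataSet("movie.subreadset.xml")    # dispatched on the XML root tag
    reads = openDataSet("movie.subreads.bam")   # non-XML inputs fall through
                                                # to openDataFile
    print type(ds).__name__, type(reads).__name__
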
 
 def _dsIdToName(dsId):
     """Translate a MetaType/ID into a class name"""
@@ -88,23 +106,10 @@ def _dsIdToType(dsId):
     else:
         raise InvalidDataSetIOError("Invalid DataSet MetaType")
 
-def _dsIdToSuffix(dsId):
-    """Translate a MetaType/ID into a file suffix"""
-    dsIds = DataSetMetaTypes.ALL
-    suffixMap = {dsId: _dsIdToName(dsId) for dsId in dsIds}
-    suffixMap[_toDsId("DataSet")] = 'DataSet'
-    if DataSetMetaTypes.isValid(dsId):
-        suffix = suffixMap[dsId]
-        suffix = suffix.lower()
-        suffix += '.xml'
-        return suffix
-    else:
-        raise InvalidDataSetIOError("Invalid DataSet MetaType")
-
 def _typeDataSet(dset):
     """Determine the type of a dataset from the xml file without opening it"""
     xml_rt = xmlRootType(dset)
-    dsId = _toDsId(xml_rt)
+    dsId = toDsId(xml_rt)
     tbrType = _dsIdToType(dsId)
     return tbrType
 
@@ -116,39 +121,6 @@ def isDataSet(xmlfile):
     except Exception:
         return False
 
-def getDataSetUuid(xmlfile):
-    """
-    Quickly retrieve the uuid from the root element of a dataset XML file,
-    using a streaming parser to avoid loading the entire dataset into memory.
-    Returns None if the parsing fails.
-    """
-    try:
-        import xml.etree.cElementTree as ET
-        for event, element in ET.iterparse(xmlfile, events=("start",)):
-            return element.get("UniqueId")
-    except Exception:
-        return None
-
-
-def getDataSetMetaType(xmlfile):
-    """
-    Quickly retrieve the MetaType from the root element of a dataset XML file,
-    using a streaming parser to avoid loading the entire dataset into memory.
-    Returns None if the parsing fails.
-    """
-    try:
-        import xml.etree.cElementTree as ET
-        for event, element in ET.iterparse(xmlfile, events=("start",)):
-            return element.get("MetaType")
-    except Exception:
-        return None
-
-
-def openDataSet(*files, **kwargs):
-    """Factory function for DataSet types as suggested by the first file"""
-    tbrType = _typeDataSet(files[0])
-    return tbrType(*files, **kwargs)
-
 def openDataFile(*files, **kwargs):
     """Factory function for DataSet types determined by the first data file"""
     possibleTypes = [AlignmentSet, SubreadSet, ConsensusReadSet,
@@ -158,10 +130,10 @@ def openDataFile(*files, **kwargs):
     for dstype in possibleTypes:
         for ftype in dstype._metaTypeMapping():
             fileMap[ftype].append(dstype)
-    ext = _fileType(files[0])
+    ext = fileType(files[0])
     if ext == 'fofn':
         files = openFofnFile(files[0])
-        ext = _fileType(files[0])
+        ext = fileType(files[0])
     if ext == 'xml':
         dsType = _typeDataSet(files[0])
         return dsType(*origFiles, **kwargs)
@@ -191,11 +163,40 @@ def openDataFile(*files, **kwargs):
                 else:
                     return SubreadSet(*origFiles, **kwargs)
 
+def filtered(generator):
+    """Wrap a generator with postfiltering"""
+    @wraps(generator)
+    def wrapper(dset, *args, **kwargs):
+        filter_tests = dset.processFilters()
+        no_filter = dset.noFiltering
+        if no_filter:
+            for read in generator(dset, *args, **kwargs):
+                yield read
+        else:
+            for read in generator(dset, *args, **kwargs):
+                if any(filt(read) for filt in filter_tests):
+                    yield read
+    return wrapper
+
 def _stackRecArrays(recArrays):
     """Stack recarrays into a single larger recarray"""
-    tbr = np.concatenate(recArrays)
-    tbr = tbr.view(np.recarray)
-    return tbr
+
+    empties = []
+    nonempties = []
+    for array in recArrays:
+        if len(array) == 0:
+            empties.append(array)
+        else:
+            nonempties.append(array)
+    if nonempties:
+        tbr = np.concatenate(nonempties)
+        tbr = tbr.view(np.recarray)
+        return tbr
+    else:
+        # this will check dtypes...
+        tbr = np.concatenate(empties)
+        tbr = tbr.view(np.recarray)
+        return tbr
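
The empty branch still routes through np.concatenate so that mismatched dtypes raise; a minimal numpy illustration of the behavior relied on here:

    import numpy as np

    a = np.recarray(0, dtype=[("ID", np.int32), ("MovieName", "O")])
    b = np.recarray(0, dtype=[("ID", np.int32), ("MovieName", "O")])
    merged = np.concatenate([a, b]).view(np.recarray)
    assert len(merged) == 0 and merged.dtype == a.dtype
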
 
 def _uniqueRecords(recArray):
     """Remove duplicate records"""
@@ -256,46 +257,12 @@ def keysToRanges(keys):
     key_ranges = [[min(k), max(k)] for k in keys]
     return key_ranges
 
-class DataSetMetaTypes(object):
-    """
-    This mirrors the PacBioSecondaryDataModel.xsd definitions and be used
-    to reference a specific dataset type.
-    """
-    SUBREAD = _toDsId("SubreadSet")
-    HDF_SUBREAD = _toDsId("HdfSubreadSet")
-    ALIGNMENT = _toDsId("AlignmentSet")
-    BARCODE = _toDsId("BarcodeSet")
-    CCS_ALIGNMENT = _toDsId("ConsensusAlignmentSet")
-    CCS = _toDsId("ConsensusReadSet")
-    REFERENCE = _toDsId("ReferenceSet")
-    CONTIG = _toDsId("ContigSet")
-
-    ALL = (SUBREAD, HDF_SUBREAD, ALIGNMENT,
-           BARCODE, CCS, CCS_ALIGNMENT, REFERENCE, CONTIG)
-
-    @classmethod
-    def isValid(cls, dsId):
-        return dsId in cls.ALL
-
-
-def _fileType(fname):
-    """Get the extension of fname (with h5 type)"""
-    remainder, ftype = os.path.splitext(fname)
-    if ftype == '.h5':
-        _, prefix = os.path.splitext(remainder)
-        ftype = prefix + ftype
-    elif ftype == '.index':
-        _, prefix = os.path.splitext(remainder)
-        if prefix == '.contig':
-            ftype = prefix + ftype
-    ftype = ftype.strip('.')
-    return ftype
 
 def _fileExists(fname):
     """Assert that a file exists with a useful failure mode"""
     if not isinstance(fname, str):
         fname = fname.resourceId
-    if not os.path.isfile(fname):
+    if not os.path.exists(fname):
         raise InvalidDataSetIOError("Resource {f} not found".format(f=fname))
     return True
 
@@ -318,21 +285,17 @@ def _pathChanger(osPathFunc, metaTypeFunc, resource):
         metaTypeFunc(resource)
 
 def _copier(dest, resource, subfolder=None):
-    """Apply these two functions to the resource or ResourceId"""
     if subfolder is None:
         subfolder = [uuid.uuid4()]
     resId = resource.resourceId
     currentPath = urlparse(resId)
     if currentPath.scheme == 'file' or not currentPath.scheme:
         currentPath = currentPath.path
-        try:
+        if resource.KEEP_WITH_PARENT:
+            currentPath = _fileCopy(dest, currentPath, uuid=subfolder[0])
+        else:
             currentPath = _fileCopy(dest, currentPath, uuid=resource.uniqueId)
             subfolder[0] = resource.uniqueId
-        except AttributeError:
-            if subfolder:
-                currentPath = _fileCopy(dest, currentPath, uuid=subfolder[0])
-            else:
-                raise
         resource.resourceId = currentPath
 
 
@@ -450,9 +413,9 @@ class DataSet(object):
             dsType = self.objMetadata.setdefault("MetaType", self.datasetType)
         else:
             dsType = self.objMetadata.setdefault("MetaType",
-                                                 _toDsId('DataSet'))
+                                                 toDsId('DataSet'))
         if not "TimeStampedName" in self.objMetadata:
-            self.objMetadata["TimeStampedName"] = _getTimeStampedName(
+            self.objMetadata["TimeStampedName"] = getTimeStampedName(
                 self.objMetadata["MetaType"])
         self.objMetadata.setdefault("Name",
                                     self.objMetadata["TimeStampedName"])
@@ -514,7 +477,7 @@ class DataSet(object):
         if _induceIndices:
             self.induceIndices()
 
-    def induceIndices(self):
+    def induceIndices(self, force=False):
         """Generate indices for ExternalResources.
 
         Not compatible with DataSet base type"""
@@ -709,7 +672,7 @@ class DataSet(object):
             self.updateCounts()
         return self.numRecords
 
-    def newUuid(self, setter=True):
+    def newUuid(self, setter=True, random=False):
         """Generate and enforce the uniqueness of an ID for a new DataSet.
         While user setable fields are stripped out of the Core DataSet object
         used for comparison, the previous UniqueId is not. That means that
@@ -730,12 +693,15 @@ class DataSet(object):
             >>> old != ds.uuid
             True
         """
-        coreXML = toXml(self, core=True)
-        newId = str(hashlib.md5(coreXML).hexdigest())
+        if random:
+            newId = str(uuid.uuid4())
+        else:
+            coreXML = toXml(self, core=True)
+            newId = str(hashlib.md5(coreXML).hexdigest())
 
-        # Group appropriately
-        newId = '-'.join([newId[:8], newId[8:12], newId[12:16], newId[16:20],
-                          newId[20:]])
+            # Group appropriately
+            newId = '-'.join([newId[:8], newId[8:12], newId[12:16],
+                              newId[16:20], newId[20:]])
 
         if setter:
             self.objMetadata['UniqueId'] = newId
@@ -863,14 +829,14 @@ class DataSet(object):
 
         Args:
             :chunks: the number of chunks to split the DataSet.
-            :ignoreSubDatasets: (True) do not split on subdatasets
-            :contigs: split on contigs instead of external resources etc
+            :ignoreSubDatasets: (True) do not split by subdatasets
+            :contigs: split on contigs instead of external resources
+            :zmws: Split by zmws instead of external resources
+            :barcodes: Split by barcodes instead of external resources
             :maxChunks: The upper limit on the number of chunks.
-            :breakContigs: Whether or not to break contigs when using maxChunks
-            :targetSize: The target number of reads per chunk
-            :zmws: Split by zmws
-            :barcodes: Split by barcodes
+            :breakContigs: Whether or not to break contigs
             :byRecords: Split contigs by mapped records, rather than ref length
+            :targetSize: The target minimum number of reads per chunk
             :updateCounts: Update the count metadata in each chunk
 
         Returns:
@@ -975,6 +941,8 @@ class DataSet(object):
         # whole, therefore we need to regenerate it again here
         log.debug("Generating new UUID")
         for result in results:
+            if updateCounts:
+                result.updateCounts()
             result.newUuid()
 
         # Update the basic metadata for the new DataSets from external
@@ -1084,11 +1052,11 @@ class DataSet(object):
                 validation_errors.append(e)
         fileName = urlparse(outFile).path.strip()
         if self._strict and not isinstance(self.datasetType, tuple):
-            if not fileName.endswith(_dsIdToSuffix(self.datasetType)):
+            if not fileName.endswith(dsIdToSuffix(self.datasetType)):
                 raise IOError(errno.EIO,
                               "Given filename does not meet standards, "
                               "should end with {s}".format(
-                                  s=_dsIdToSuffix(self.datasetType)),
+                                  s=dsIdToSuffix(self.datasetType)),
                               fileName)
         with open(fileName, 'w') as outFile:
             outFile.writelines(xml_string)
@@ -1127,11 +1095,6 @@ class DataSet(object):
         else:
             statsMetadata = filename
         if self.metadata.summaryStats:
-            newSub = self.copy()
-            newSub.metadata.removeChildren('SummaryStats')
-            newSub.loadStats(statsMetadata)
-            self.addDatasets(self.copy())
-            self.addDatasets(newSub)
             self.metadata.summaryStats.merge(statsMetadata)
         else:
             self.metadata.append(statsMetadata)
@@ -1142,15 +1105,22 @@ class DataSet(object):
         return sorted(reads, key=lambda a: a.readStart)
 
     def loadMetadata(self, filename):
-        """Load pipeline metadata from a <moviename>.run.metadata.xml file.
+        """Load pipeline metadata from a <moviename>.metadata.xml file (or
+        other DataSet)
 
         Args:
-            :filename: the filename of a <moviename>.run.metadata.xml file
+            :filename: the filename of a <moviename>.metadata.xml file
 
         """
         if isinstance(filename, basestring):
-            metadata = parseMetadata(str(filename))
+            if isDataSet(filename):
+                # path to DataSet
+                metadata = openDataSet(filename).metadata
+            else:
+                # path to metadata.xml
+                metadata = parseMetadata(str(filename))
         else:
+            # DataSetMetadata object
             metadata = filename
         self.addMetadata(metadata)
         self.updateCounts()
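
A sketch of the widened input handling (paths are hypothetical):

    from pbcore.io import SubreadSet

    sset = SubreadSet("movie.subreadset.xml")
    sset.loadMetadata("movie.metadata.xml")      # plain metadata XML, as before
    sset.loadMetadata("other.subreadset.xml")    # a DataSet path now also works
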
@@ -1250,11 +1220,11 @@ class DataSet(object):
         """Infer and set the metatype of 'resource' if it doesn't already have
         one."""
         if not resource.metaType:
-            file_type = _fileType(resource.resourceId)
+            file_type = fileType(resource.resourceId)
             resource.metaType = self._metaTypeMapping().get(file_type, "")
         if not resource.timeStampedName:
             mtype = resource.metaType
-            tsName = _getTimeStampedName(mtype)
+            tsName = getTimeStampedName(mtype)
             resource.timeStampedName = tsName
 
     @staticmethod
@@ -1633,7 +1603,7 @@ class DataSet(object):
         if isinstance(self.datasetType, tuple):
             return self.datasetType
         else:
-            return (_toDsId('DataSet'), self.datasetType)
+            return (toDsId('DataSet'), self.datasetType)
 
     @classmethod
     def castableTypes(cls):
@@ -1655,6 +1625,7 @@ class DataSet(object):
                 'ConsensusReadSet': ConsensusReadSet,
                 'ConsensusAlignmentSet': ConsensusAlignmentSet,
                 'ReferenceSet': ReferenceSet,
+                'GmapReferenceSet': GmapReferenceSet,
                 'BarcodeSet': BarcodeSet}
     @property
     def metadata(self):
@@ -1739,6 +1710,11 @@ class DataSet(object):
         return self.objMetadata.get('UniqueId')
 
     @property
+    def uniqueId(self):
+        """The UniqueId of this DataSet"""
+        return self.objMetadata.get('UniqueId')
+
+    @property
     def name(self):
         """The name of this DataSet"""
         return self.objMetadata.get('Name', '')
@@ -1900,20 +1876,27 @@ class ReadSet(DataSet):
         super(ReadSet, self).__init__(*files, **kwargs)
         self._metadata = SubreadSetMetadata(self._metadata)
 
-    def induceIndices(self):
+    def induceIndices(self, force=False):
         for res in self.externalResources:
             fname = res.resourceId
             newInds = []
             if not res.pbi:
-                newInds.append(_pbindexBam(fname))
+                iname = fname + '.pbi'
+                if not os.path.isfile(iname) or force:
+                    iname = _pbindexBam(fname)
+                newInds.append(iname)
                 self.close()
             if not res.bai:
-                newInds.append(_indexBam(fname))
+                iname = fname + '.bai'
+                if not os.path.isfile(iname) or force:
+                    iname = _indexBam(fname)
+                newInds.append(iname)
                 self.close()
             if newInds:
                 res.addIndices(newInds)
         self._populateMetaTypes()
         self.updateCounts()
+        return newInds
 
     @property
     def isMapped(self):
@@ -2252,8 +2235,6 @@ class ReadSet(DataSet):
         _indexMap = []
         for rrNum, rr in enumerate(self.resourceReaders()):
             indices = rr.index
-            if len(indices) == 0:
-                continue
 
             self._fixQIds(indices, rr)
 
@@ -2433,11 +2414,11 @@ class HdfSubreadSet(ReadSet):
         # The metatype for this dataset type is inconsistent, plaster over it
         # here:
         self.objMetadata["MetaType"] = "PacBio.DataSet.HdfSubreadSet"
-        self.objMetadata["TimeStampedName"] = _getTimeStampedName(
+        self.objMetadata["TimeStampedName"] = getTimeStampedName(
             self.objMetadata["MetaType"])
 
-    def induceIndices(self):
-        log.debug("Bax files already indexed")
+    def induceIndices(self, force=False):
+        log.debug("Bax files don't have external indices")
 
     @property
     def isIndexed(self):
@@ -2605,11 +2586,11 @@ class AlignmentSet(ReadSet):
         else:
             return super(AlignmentSet, self).consolidate(*args, **kwargs)
 
-    def induceIndices(self):
+    def induceIndices(self, force=False):
         if self.isCmpH5:
             log.debug("Cmp.h5 files already indexed")
         else:
-            return super(AlignmentSet, self).induceIndices()
+            return super(AlignmentSet, self).induceIndices(force=force)
 
     @property
     def isIndexed(self):
@@ -2662,8 +2643,6 @@ class AlignmentSet(ReadSet):
         for rrNum, rr in enumerate(self.resourceReaders()):
             indices = rr.index
             # pbi files lack e.g. mapping cols when bam emtpy, ignore
-            if len(indices) == 0:
-                continue
             # TODO(mdsmith)(2016-01-19) rename the fields instead of branching:
             #if self.isCmpH5:
             #    _renameField(indices, 'MovieID', 'qId')
@@ -3833,10 +3812,13 @@ class ContigSet(DataSet):
             return '_'.join(name.split('_')[:-2]) + suff
         return name
 
-    def induceIndices(self):
+    def induceIndices(self, force=False):
         if not self.isIndexed:
             for extRes in self.externalResources:
-                extRes.addIndices([_indexFasta(extRes.resourceId)])
+                iname = extRes.resourceId + '.fai'
+                if not os.path.isfile(iname) or force:
+                    iname = _indexFasta(extRes.resourceId)
+                extRes.addIndices([iname])
             self.close()
         self._populateMetaTypes()
         self.updateCounts()
@@ -4154,6 +4136,35 @@ class ReferenceSet(ContigSet):
                }
 
 
+class GmapReferenceSet(ReferenceSet):
+    """DataSet type specific to GMAP References"""
+
+    datasetType = DataSetMetaTypes.GMAPREFERENCE
+
+    def __init__(self, *files, **kwargs):
+        super(GmapReferenceSet, self).__init__(*files, **kwargs)
+
+    @property
+    def gmap(self):
+        return self.externalResources[0].gmap
+
+    @gmap.setter
+    def gmap(self, value):
+        self.externalResources[0].gmap = value
+
+    @staticmethod
+    def _metaTypeMapping():
+        return {'fasta':'PacBio.ContigFile.ContigFastaFile',
+                'fa':'PacBio.ContigFile.ContigFastaFile',
+                'fas':'PacBio.ContigFile.ContigFastaFile',
+                'fai':'PacBio.Index.SamIndex',
+                'contig.index':'PacBio.Index.FastaContigIndex',
+                'index':'PacBio.Index.Indexer',
+                'sa':'PacBio.Index.SaWriterIndex',
+                'gmap': 'PacBio.GmapDB.GmapDBPath',
+               }
+
+
 class BarcodeSet(ContigSet):
     """DataSet type specific to Barcodes"""
 
diff --git a/pbcore/io/dataset/DataSetMembers.py b/pbcore/io/dataset/DataSetMembers.py
index 99090ae..6ef9449 100755
--- a/pbcore/io/dataset/DataSetMembers.py
+++ b/pbcore/io/dataset/DataSetMembers.py
@@ -1,3 +1,43 @@
+###############################################################################
+# Copyright (c) 2011-2016, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+#   notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+#   notice, this list of conditions and the following disclaimer in the
+#   documentation and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED
+# BY
+# THIS LICENSE.  THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
+# BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES
+# OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR
+# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+# WHETHER
+# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+# OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE
+# POSSIBILITY OF SUCH DAMAGE.
+###############################################################################
+
+# Author: Martin D. Smith
+
+
 """DataSetMetadata (also the tag of the Element in the DataSet XML
 representation) is somewhat challening to store, access, and (de)serialize
 efficiently. Here, we maintain a bulk representation of all of the dataset
@@ -49,29 +89,29 @@ serve two pruposes:
                               ['WellSample']['BioSamplePointers']\
                               ['BioSamplePointer'].record['text'])
 
+Notes:
+    - If you want temporary children to be retained for a class's children,
+      pass parent=self to the child's constructor.
+        - it helps to add a TAG member...
+
 """
 
-#import hashlib
 import ast
 import uuid
-import datetime
 import copy
+import logging
 import operator as OP
 import numpy as np
-import logging
 from functools import partial as P
-from collections import Counter
-from pbcore.io.dataset.DataSetWriter import namespaces
+from collections import Counter, defaultdict
+from pbcore.io.dataset.utils import getTimeStampedName
+from pbcore.io.dataset.DataSetUtils import getDataSetUuid
 
 log = logging.getLogger(__name__)
 
-DISTLIST = ["ProdDist", "ReadTypeDist", "ReadLenDist", "ReadQualDist",
-            "MedianInsertDist", "InsertReadQualDist",
-            "InsertReadLenDist", "ControlReadQualDist",
-            "ControlReadLenDist"]
-
 def newUuid(record):
     # At some point the uuid may need to be a digest
+    #import hashlib
     #newId = str(hashlib.md5(str(record)).hexdigest())
 
     # Group appropriately
@@ -82,12 +122,6 @@ def newUuid(record):
     # Today is not that day
     return str(uuid.uuid4())
 
-def getTime():
-    return datetime.datetime.utcnow().strftime("%y%m%d_%H%M%S")
-
-def getTimeStampedName(mType):
-    return "{m}-{t}".format(m=mType, t=getTime())
-
 OPMAP = {'==': OP.eq,
          '=': OP.eq,
          'eq': OP.eq,
@@ -105,6 +139,7 @@ OPMAP = {'==': OP.eq,
          '<': OP.lt,
          '<': OP.lt,
          'lt': OP.lt,
+         'in': np.in1d,
          '&': lambda x, y: OP.and_(x, y).view(np.bool_),
          '~': lambda x, y: np.logical_not(OP.and_(x, y).view(np.bool_)),
         }
@@ -112,6 +147,21 @@ OPMAP = {'==': OP.eq,
 def mapOp(op):
     return OPMAP[op]
 
+def setify(value):
+    # This is a string of a python or numpy list, needs to be made into a
+    # python set:
+    value = value.strip('set')
+    value = value.strip('()')
+    value = value.strip('[]')
+    if ',' in value:
+        value = value.split(',')
+    else:
+        value = value.split()
+    value = [v.strip() for v in value]
+    value = [v.strip("'") for v in value]
+    return set(value)
+
+
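
setify is handed the stringified list that an 'in' requirement carries in the filter XML; the elements come back as strings and are coerced later via typeMap. A couple of illustrative calls:

    from pbcore.io.dataset.DataSetMembers import setify

    print setify("['m1/0/0_10', 'm1/1/0_8']")   # set(['m1/0/0_10', 'm1/1/0_8'])
    print setify("(1, 2, 3)")                   # set(['1', '2', '3'])
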
 class PbiFlags(object):
     NO_LOCAL_CONTEXT = 0
     ADAPTER_BEFORE = 1
@@ -128,6 +178,52 @@ class PbiFlags(object):
         return reduce(OP.or_,
                       (getattr(cls, fl.strip()) for fl in flag.split('|')))
 
+def subgetter(key, container='text', default=None, asType=(lambda x: x)):
+    def get(self):
+        return self.getMemberV(key, container=container, default=default,
+                               asType=asType)
+    return property(get)
+
+def subsetter(key, container='text'):
+    def set(self, value):
+        self._runCallbacks()
+        self.setMemberV(key, str(value), container=container)
+    return set
+
+def subaccs(key, container='text', default=None, asType=(lambda x: x)):
+    get = subgetter(key, container=container, default=default, asType=asType)
+    get = get.setter(subsetter(key, container=container))
+    return get
+
+def getter(key, container='attrib', asType=(lambda x: x)):
+    def get(self):
+        return asType(self.getV(container, key))
+    return property(get)
+
+def setter(key, container='attrib'):
+    def set(self, value):
+        self._runCallbacks()
+        self.setV(str(value), container, key)
+    return set
+
+def accs(key, container='attrib', asType=(lambda x: x)):
+    get = getter(key, container, asType)
+    get = get.setter(setter(key, container))
+    return get
+
+def runonce(func):
+    def runner():
+        if not runner.hasrun:
+            try:
+                return func()
+            finally:
+                runner.hasrun = True
+    runner.hasrun = False
+    return runner
+
+def updateTag(ele, tag):
+    if ele.metaname == '':
+        ele.metaname = tag
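
The accs/getter/setter factories replace the hand-written properties removed further down; a standalone sketch of the pattern against a stub class (the stub is hypothetical) showing the getV/setV/_runCallbacks contract they assume:

    from pbcore.io.dataset.DataSetMembers import accs

    class Stub(object):
        def __init__(self):
            self._attrib = {}
        def getV(self, container, key):
            return self._attrib.get(key)
        def setV(self, value, container, key):
            self._attrib[key] = value
        def _runCallbacks(self):
            pass
        name = accs('Name')

    s = Stub()
    s.name = "demo"
    print s.name   # -> demo
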
 
 class RecordWrapper(object):
     """The base functionality of a metadata element.
@@ -136,16 +232,19 @@ class RecordWrapper(object):
     RecordWrapper (e.g. append, extend). Methods in child classes often provide
     similar functionality for more raw inputs (e.g. resourceIds as strings)"""
 
+    # only things that should be kept with their parents (indices) should be
+    # True
+    KEEP_WITH_PARENT = False
 
-    def __init__(self, record=None):
+    def __init__(self, record=None, parent=None):
         """Here, record is any element in the Metadata Element tree and a
-        dictionary with four members: 'tag', 'attrib', 'text', and 'children'.
-
-        Now with a new member! 'namespace'
+        dictionary with five members: 'tag', 'attrib', 'text', 'children', and
+        'namespace'
 
         Do not deepcopy, we rely on side effects for all persistent
         modifications.
         """
+        self._callbacks = []
         if record:
             try:
                 self.record = record.record
@@ -153,8 +252,30 @@ class RecordWrapper(object):
                 self.record = record
         else:
             self.record = _emptyMember()
+            # register a callback to set the XML element 'tag' to the
+            # class's TAG member if it has one, with the class name as a
+            # fallback
+            self.registerCallback(runonce(
+                P(updateTag, self,
+                  getattr(self, 'TAG', self.__class__.__name__))))
+            if not parent is None:
+                # register a callback to append this object to the parent, so
+                # that it will be added to the XML file
+                self.registerCallback(runonce(P(parent.append, self.record)))
         assert 'tag' in self.record.keys()
 
+
+    def registerCallback(self, func):
+        if func not in self._callbacks:
+            self._callbacks.append(func)
+
+    def clearCallbacks(self):
+        self._callbacks = []
+
+    def _runCallbacks(self):
+        for func in self._callbacks:
+            func()
+
     def __len__(self):
         """Return the number of children in this node"""
         return len(self.record['children'])
@@ -277,9 +398,11 @@ class RecordWrapper(object):
     def append(self, newMember):
         """Append to the actual list of child elements"""
         if isinstance(newMember, RecordWrapper):
+            newMember._runCallbacks()
             self.record['children'].append(newMember.record)
         else:
             self.record['children'].append(newMember)
+        self._runCallbacks()
 
     def index(self, tag):
         """Return the index in 'children' list of item with 'tag' member"""
@@ -369,33 +492,12 @@ class RecordWrapper(object):
 
     # Some common attributes (to reduce code duplication):
 
-    @property
-    def name(self):
-        return self.getV('attrib', 'Name')
-
-    @name.setter
-    def name(self, value):
-        self.setV(value, 'attrib', 'Name')
-
-    @property
-    def value(self):
-        return self.getV('attrib', 'Value')
-
-    @value.setter
-    def value(self, value):
-        self.setV(value, 'attrib', 'Value')
-
-    @property
-    def version(self):
-        return self.getV('attrib', 'Version')
-
-    @property
-    def description(self):
-        return self.getV('attrib', 'Description')
-
-    @description.setter
-    def description(self, value):
-        return self.setV(value, 'attrib', 'Description')
+    name = accs('Name')
+    value = accs('Value')
+    version = accs('Version')
+    description = accs('Description')
+    uniqueId = accs('UniqueId')
+    createdAt = accs('CreatedAt')
 
 def filter_read(accessor, operator, value, read):
     return operator(accessor(read), value)
@@ -406,18 +508,6 @@ class Filters(RecordWrapper):
     def __init__(self, record=None):
         super(self.__class__, self).__init__(record)
         self.record['tag'] = self.__class__.__name__
-        self._callbacks = []
-
-    def registerCallback(self, func):
-        if func not in self._callbacks:
-            self._callbacks.append(func)
-
-    def clearCallbacks(self):
-        self._callbacks = []
-
-    def _runCallbacks(self):
-        for func in self._callbacks:
-            func()
 
     def __getitem__(self, index):
         return Filter(self.record['children'][index])
@@ -477,8 +567,8 @@ class Filters(RecordWrapper):
         for i, filt in enumerate(self):
             for req in filt:
                 if req.name == param:
-                    if not mapOp(oper)(testType(req.value),
-                                            testType(value)):
+                    if not mapOp(oper)(testType(value),
+                                            testType(req.value)):
                         options[i] = False
         return any(options)
 
@@ -489,8 +579,8 @@ class Filters(RecordWrapper):
             for req in filt:
                 if req.name == param:
                     tested = True
-                    passes |= mapOp(oper)(testType(req.value),
-                                          values)
+                    passes |= mapOp(oper)(values,
+                                          testType(req.value))
         if not tested:
             return np.ones(len(values), dtype=np.bool_)
         return passes
@@ -499,7 +589,7 @@ class Filters(RecordWrapper):
     def _bamAccMap(self):
         return {'rname': (lambda x: x.referenceName),
                 'length': (lambda x: int(x.readLength)),
-                'qname': (lambda x: x.qNameA),
+                'qname': (lambda x: x.qName),
                 'movie': (lambda x: x.movieName),
                 'zm': (lambda x: int(x.HoleNumber)),
                 # not implemented yet:
@@ -538,7 +628,7 @@ class Filters(RecordWrapper):
                 'tend': (lambda x: x.tEnd),
                 'accuracy': (
                     lambda x: (np.ones(len(x.nMM), dtype='f4') -
-                               (x.nMM + x.nIns + x.nDel).astype(np.float32)/
+                               (x.nMM + x.nIns + x.nDel).astype(np.float)/
                                (x.aEnd - x.aStart + x.tEnd - x.tStart -
                                 x.nM - x.nMM)))
                }
@@ -580,11 +670,11 @@ class Filters(RecordWrapper):
                 'bcq': int,
                 'bq': int,
                 'qs': int,
-                'rq': np.float32,
+                'rq': np.float,
                 'pos': int,
                 'tstart': int,
                 'tend': int,
-                'accuracy': np.float32,
+                'accuracy': np.float,
                 'readstart': int,
                 'cx': PbiFlags.flagMap,
                 'n_subreads': int,
@@ -618,8 +708,7 @@ class Filters(RecordWrapper):
                 value = typeMap[param](req.value)
                 operator = mapOp(req.operator)
                 reqTests.append(P(filter_read, accMap[param], operator, value))
-            tests.append(
-                lambda x, reqTests=reqTests: all([f(x) for f in reqTests]))
+            tests.append(lambda x, rt=reqTests: all([f(x) for f in rt]))
         return tests
 
     def filterIndexRecords(self, indexRecords, nameMap, movieMap,
@@ -637,6 +726,7 @@ class Filters(RecordWrapper):
                 accMap['movie'] = (lambda x: x.MovieID)
                 accMap['qname'] = (lambda x: x.MovieID)
                 accMap['zm'] = (lambda x: x.HoleNumber)
+                accMap['length'] = (lambda x: x.rEnd - x.rStart)
         elif readType == 'fasta':
             accMap = {'id': (lambda x: x.id),
                       'length': (lambda x: x.length.astype(int)),
@@ -651,7 +741,10 @@ class Filters(RecordWrapper):
             for req in filt:
                 param = req.name
                 if param in accMap.keys():
-                    value = typeMap[param](req.value)
+                    if req.operator == 'in':
+                        value = map(typeMap[param], setify(req.value))
+                    else:
+                        value = typeMap[param](req.value)
                     if param == 'rname':
                         value = nameMap[value]
                     if param == 'movie':
@@ -670,27 +763,45 @@ class Filters(RecordWrapper):
                         reqResultsForRecords &= operator(
                             accMap[param](indexRecords), value)
                     elif param == 'qname':
-                        movie, hole, span = value.split('/')
-                        operator = mapOp(req.operator)
-
-                        value = movieMap[movie]
-                        reqResultsForRecords = operator(
-                            accMap[param](indexRecords), value)
-
-                        param = 'zm'
-                        value = typeMap[param](hole)
-                        reqResultsForRecords &= operator(
-                            accMap[param](indexRecords), value)
-
-                        param = 'qstart'
-                        value = typeMap[param](span.split('_')[0])
-                        reqResultsForRecords &= operator(
-                            accMap[param](indexRecords), value)
-
-                        param = 'qend'
-                        value = typeMap[param](span.split('_')[1])
-                        reqResultsForRecords &= operator(
-                            accMap[param](indexRecords), value)
+                        # TODO: optimize "in" version...
+                        if not isinstance(value, list):
+                            values = [value]
+                        else:
+                            values = value
+                        reqResultsForRecords = np.zeros(len(indexRecords), dtype=np.bool_)
+                        #tempRes = np.ones(len(indexRecords), dtype=np.bool_)
+                        for value in values:
+                            movie, hole, span = value.split('/')
+                            operator = mapOp(req.operator)
+
+                            param = 'movie'
+                            value = movieMap[movie]
+                            #reqResultsForRecords = operator(
+                            #    accMap[param](indexRecords), value)
+                            tempRes = operator(
+                                accMap[param](indexRecords), value)
+
+                            param = 'zm'
+                            value = typeMap[param](hole)
+                            #reqResultsForRecords &= operator(
+                            #    accMap[param](indexRecords), value)
+                            tempRes &= operator(
+                                accMap[param](indexRecords), value)
+
+                            param = 'qstart'
+                            value = typeMap[param](span.split('_')[0])
+                            #reqResultsForRecords &= operator(
+                            #    accMap[param](indexRecords), value)
+                            tempRes &= operator(
+                                accMap[param](indexRecords), value)
+
+                            param = 'qend'
+                            value = typeMap[param](span.split('_')[1])
+                            #reqResultsForRecords &= operator(
+                            #    accMap[param](indexRecords), value)
+                            tempRes &= operator(
+                                accMap[param](indexRecords), value)
+                            reqResultsForRecords |= tempRes
 
                     else:
                         operator = mapOp(req.operator)
@@ -731,6 +842,8 @@ class Filters(RecordWrapper):
             for name, options in kwargs.items():
                 for i, (oper, val) in enumerate(options):
                     for filt in newFilts[i]:
+                        if isinstance(val, np.ndarray):
+                            val = list(val)
                         filt.addRequirement(name, oper, val)
             for filtList in newFilts:
                 self.extend(filtList)
@@ -738,6 +851,8 @@ class Filters(RecordWrapper):
             newFilts = [Filter() for _ in kwargs.values()[0]]
             for name, options in kwargs.items():
                 for i, (oper, val) in enumerate(options):
+                    if isinstance(val, np.ndarray):
+                        val = list(val)
                     newFilts[i].addRequirement(name, oper, val)
             self.extend(newFilts)
         #log.debug("Current filters: {s}".format(s=str(self)))
@@ -965,6 +1080,8 @@ class ExternalResources(RecordWrapper):
         Args:
             resourceIds: a list of uris as strings
         """
+        if not isinstance(resourceIds, list):
+            resourceIds = [resourceIds]
         templist = []
         self._resourceIds = []
         for res in resourceIds:
@@ -1015,10 +1132,6 @@ class ExternalResource(RecordWrapper):
             return True
         return False
 
-    @property
-    def uniqueId(self):
-        return self.getV('attrib', 'UniqueId')
-
     def merge(self, other):
         if self.metaType:
             if self.metaType != other.metaType:
@@ -1027,21 +1140,9 @@ class ExternalResource(RecordWrapper):
         if self.tags:
             self.tags = ', '.join([self.tags, other.tags])
 
-    @property
-    def metaType(self):
-        return self.getV('attrib', 'MetaType')
-
-    @metaType.setter
-    def metaType(self, value):
-        return self.setV(value, 'attrib', 'MetaType')
-
-    @property
-    def timeStampedName(self):
-        return self.getV('attrib', 'TimeStampedName')
-
-    @timeStampedName.setter
-    def timeStampedName(self, value):
-        return self.setV(value, 'attrib', 'TimeStampedName')
+    metaType = accs('MetaType')
+    timeStampedName = accs('TimeStampedName')
+    tags = accs('Tags')
 
     @property
     def resourceId(self):
@@ -1050,14 +1151,9 @@ class ExternalResource(RecordWrapper):
     @resourceId.setter
     def resourceId(self, value):
         self.setV(value, 'attrib', 'ResourceId')
-
-    @property
-    def tags(self):
-        return self.getV('attrib', 'Tags')
-
-    @tags.setter
-    def tags(self, value):
-        self.setV(value, 'attrib', 'Tags')
+        dsuuid = getDataSetUuid(value)
+        if dsuuid:
+            self.uniqueId = dsuuid
 
     @property
     def bam(self):
@@ -1070,6 +1166,10 @@ class ExternalResource(RecordWrapper):
             if index.metaType == 'PacBio.Index.PacBioIndex':
                 return index.resourceId
 
+    @pbi.setter
+    def pbi(self, value):
+        self._setIndResByMetaType('PacBio.Index.PacBioIndex', value)
+
     @property
     def bai(self):
         indices = self.indices
@@ -1078,6 +1178,16 @@ class ExternalResource(RecordWrapper):
                 return index.resourceId
 
     @property
+    def gmap(self):
+        """Unusual: returns the gmap external resource instead of the resId"""
+        return self._getSubExtResByMetaType('PacBio.GmapDB.GmapDBPath')
+
+    @gmap.setter
+    def gmap(self, value):
+        """Sets the resourceId"""
+        self._setSubResByMetaType('PacBio.GmapDB.GmapDBPath', value)
+
+    @property
     def sts(self):
         return self._getSubResByMetaType('PacBio.SubreadFile.ChipStatsFile')
 
@@ -1119,11 +1229,70 @@ class ExternalResource(RecordWrapper):
         self._setSubResByMetaType('PacBio.ReferenceFile.ReferenceFastaFile',
                                   value)
 
-    def _getSubResByMetaType(self, mType):
+    def _deleteIndByMetaType(self, mType):
+        rm = []
+        for i, res in enumerate(self.indices):
+            if res.metaType == mType:
+                rm.append(i)
+        for i in sorted(rm, reverse=True):
+            self.indices.pop(i)
+
+    def _getIndByMetaType(self, mType):
+        resources = self.indices
+        for res in resources:
+            if res.metaType == mType:
+                return res
+
+    def _getIndResByMetaType(self, mType):
+        res = self._getIndByMetaType(mType)
+        if not res is None:
+            return res.resourceId
+
+    def _setIndResByMetaType(self, mType, value):
+        if not isinstance(value, FileIndex):
+            tmp = FileIndex()
+            tmp.resourceId = value
+        else:
+            tmp = value
+        extant = self._getIndByMetaType(mType)
+        if extant:
+            if value is None:
+                self._deleteIndByMetaType(mType)
+            else:
+                extant.resourceId = value
+        else:
+            tmp.metaType = mType
+            tmp.timeStampedName = getTimeStampedName(mType)
+            resources = self.indices
+            # externalresources objects have a tag by default, which means their
+            # truthiness is true. Perhaps a truthiness change is in order
+            # TODO: (mdsmith 20160728) this can be updated now that the
+            # retention and tag system has been refactored
+            if len(resources) == 0:
+                resources = FileIndices()
+                resources.append(tmp)
+                self.append(resources)
+            else:
+                resources.append(tmp)
+
+    def _deleteExtResByMetaType(self, mType):
+        rm = []
+        for i, res in enumerate(self.externalResources):
+            if res.metaType == mType:
+                rm.append(i)
+        for i in sorted(rm, reverse=True):
+            self.externalResources.pop(i)
+
+    def _getSubExtResByMetaType(self, mType):
         resources = self.externalResources
         for res in resources:
             if res.metaType == mType:
-                return res.resourceId
+                return res
+
+    def _getSubResByMetaType(self, mType):
+        res = self._getSubExtResByMetaType(mType)
+        if not res is None:
+            return res.resourceId
 
     def _setSubResByMetaType(self, mType, value):
         if not isinstance(value, ExternalResource):
@@ -1131,17 +1300,26 @@ class ExternalResource(RecordWrapper):
             tmp.resourceId = value
         else:
             tmp = value
-        tmp.metaType = mType
-        tmp.timeStampedName = getTimeStampedName(mType)
-        resources = self.externalResources
-        # externalresources objects have a tag by default, which means their
-        # truthiness is true. Perhaps a truthiness change is in order
-        if len(resources) == 0:
-            resources = ExternalResources()
-            resources.append(tmp)
-            self.append(resources)
+        extant = self._getSubExtResByMetaType(mType)
+        if extant:
+            if value is None:
+                self._deleteExtResByMetaType(mType)
+            else:
+                extant.resourceId = value
         else:
-            resources.append(tmp)
+            tmp.metaType = mType
+            tmp.timeStampedName = getTimeStampedName(mType)
+            resources = self.externalResources
+            # externalresources objects have a tag by default, which means their
+            # truthiness is true. Perhaps a truthiness change is in order
+            # TODO: (mdsmith 20160728) this can be updated now that the
+            # retention and tag system has been refactored
+            if len(resources) == 0:
+                resources = ExternalResources()
+                resources.append(tmp)
+                self.append(resources)
+            else:
+                resources.append(tmp)
 
     @property
     def externalResources(self):
@@ -1198,6 +1376,7 @@ class FileIndices(RecordWrapper):
 
 class FileIndex(RecordWrapper):
 
+    KEEP_WITH_PARENT = True
 
     def __init__(self, record=None):
         super(self.__class__, self).__init__(record)
@@ -1205,40 +1384,21 @@ class FileIndex(RecordWrapper):
         self.attrib.setdefault('UniqueId', newUuid(self.record))
         self.attrib.setdefault('TimeStampedName', '')
 
-    @property
-    def resourceId(self):
-        return self.getV('attrib', 'ResourceId')
-
-    @resourceId.setter
-    def resourceId(self, value):
-        self.setV(value, 'attrib', 'ResourceId')
-
-    @property
-    def metaType(self):
-        return self.getV('attrib', 'MetaType')
-
-    @metaType.setter
-    def metaType(self, value):
-        return self.setV(value, 'attrib', 'MetaType')
-
-    @property
-    def timeStampedName(self):
-        return self.getV('attrib', 'TimeStampedName')
-
-    @timeStampedName.setter
-    def timeStampedName(self, value):
-        return self.setV(value, 'attrib', 'TimeStampedName')
+    resourceId = accs('ResourceId')
+    metaType = accs('MetaType')
+    timeStampedName = accs('TimeStampedName')
 
 
 class DataSetMetadata(RecordWrapper):
     """The root of the DataSetMetadata element tree, used as base for subtype
     specific DataSet or for generic "DataSet" records."""
 
+    TAG = 'DataSetMetadata'
 
     def __init__(self, record=None):
         """Here, record is the root element of the Metadata Element tree"""
         super(DataSetMetadata, self).__init__(record)
-        self.record['tag'] = 'DataSetMetadata'
+        self.record['tag'] = self.TAG
 
     def merge(self, other):
         self.numRecords += other.numRecords
@@ -1301,6 +1461,8 @@ class SubreadSetMetadata(DataSetMetadata):
     """The DataSetMetadata subtype specific to SubreadSets. Deals explicitly
     with the merging of Collections and BioSamples metadata hierarchies."""
 
+    TAG = 'DataSetMetadata'
+
     def __init__(self, record=None):
         # This doesn't really need to happen unless there are contextual
         # differences in the meanings of subtypes (e.g. BioSamples mean
@@ -1329,7 +1491,8 @@ class SubreadSetMetadata(DataSetMetadata):
         """Return a list of wrappers around Collections elements of the
         Metadata Record"""
         return CollectionsMetadata(self.getV(tag='Collections',
-                                             container='children'))
+                                             container='children'),
+                                   parent=self)
 
     @collections.setter
     def collections(self, value):
@@ -1348,6 +1511,7 @@ class SubreadSetMetadata(DataSetMetadata):
 class ContigSetMetadata(DataSetMetadata):
     """The DataSetMetadata subtype specific to ContigSets."""
 
+    TAG = 'DataSetMetadata'
 
     def __init__(self, record=None):
         if record:
@@ -1365,21 +1529,8 @@ class ContigSetMetadata(DataSetMetadata):
         else:
             self.contigs = other.contigs
 
-    @property
-    def organism(self):
-        return self.getMemberV('Organism')
-
-    @organism.setter
-    def organism(self, value):
-        self.setMemberV('Organism', value)
-
-    @property
-    def ploidy(self):
-        return self.getMemberV('Ploidy')
-
-    @ploidy.setter
-    def ploidy(self, value):
-        self.setMemberV('Ploidy', value)
+    organism = subaccs('Organism')
+    ploidy = subaccs('Ploidy')
 
     @property
     def contigs(self):
@@ -1410,6 +1561,7 @@ class ContigSetMetadata(DataSetMetadata):
 class BarcodeSetMetadata(DataSetMetadata):
     """The DataSetMetadata subtype specific to BarcodeSets."""
 
+    TAG = 'DataSetMetadata'
 
     def __init__(self, record=None):
         if record:
@@ -1420,20 +1572,16 @@ class BarcodeSetMetadata(DataSetMetadata):
                                 "{t}".format(t=type(record).__name__))
         super(BarcodeSetMetadata, self).__init__(record)
 
-    @property
-    def barcodeConstruction(self):
-        return self.getMemberV('BarcodeConstruction')
-
-    @barcodeConstruction.setter
-    def barcodeConstruction(self, value):
-        self.setMemberV('BarcodeConstruction', value)
+    barcodeConstruction = subaccs('BarcodeConstruction')
 
 
 class ContigsMetadata(RecordWrapper):
 
+    TAG = 'Contigs'
+
     def __init__(self, record=None):
         super(self.__class__, self).__init__(record)
-        self.record['tag'] = 'Contigs'
+        self.record['tag'] = self.TAG
 
     def __getitem__(self, index):
         return ContigMetadata(self.record['children'][index])
@@ -1448,9 +1596,11 @@ class ContigsMetadata(RecordWrapper):
 
 class ContigMetadata(RecordWrapper):
 
+    TAG = 'Contig'
+
     def __init__(self, record=None):
         super(self.__class__, self).__init__(record)
-        self.record['tag'] = 'Contig'
+        self.record['tag'] = self.TAG
 
     @property
     def digest(self):
@@ -1473,6 +1623,7 @@ class CollectionsMetadata(RecordWrapper):
     """The Element should just have children: a list of
     CollectionMetadataTags"""
 
+    TAG = 'Collections'
 
     def __getitem__(self, index):
         return CollectionMetadata(self.record['children'][index])
@@ -1485,65 +1636,14 @@ class CollectionsMetadata(RecordWrapper):
         self.extend([child for child in other])
 
 
-class CollectionMetadata(RecordWrapper):
-    """The metadata for a single collection. It contains Context,
-    InstrumentName etc. as attribs, InstCtrlVer etc. for children"""
-
-
-    @property
-    def context(self):
-        return self.getV('attrib', 'Context')
-
-    @property
-    def instrumentName(self):
-        return self.getV('attrib', 'InstrumentName')
-
-    @property
-    def instrumentId(self):
-        return self.getV('attrib', 'InstrumentId')
-
-    @property
-    def instCtrlVer(self):
-        return self.getMemberV('InstCtrlVer')
-
-    @property
-    def sigProcVer(self):
-        return self.getMemberV('SigProcVer')
-
-    @property
-    def automation(self):
-        return Automation(self.getMemberV('Automation'))
-
-    @property
-    def collectionNumber(self):
-        return self.getMemberV('collectionNumber')
-
-    @property
-    def cellIndex(self):
-        return self.getMemberV('cellIndex')
-
-    @property
-    def cellPac(self):
-        return self.getMemberV('cellPac', 'attrib')
-
-    @property
-    def runDetails(self):
-        return RunDetailsMetadata(self.getV('children', 'RunDetails'))
-
-    @property
-    def wellSample(self):
-        return WellSampleMetadata(self.getV('children', 'WellSample'))
+class AutomationParameter(RecordWrapper):
 
-    @property
-    def primary(self):
-        return PrimaryMetadata(self.getV('children', 'Primary'))
+    def __init__(self, record=None):
+        super(self.__class__, self).__init__(record)
+        self.record['tag'] = self.__class__.__name__
 
-class Automation(RecordWrapper):
+    value = accs('SimpleValue')
 
-    @property
-    def automationParameters(self):
-        return AutomationParameters(self.getV('children',
-                                              'AutomationParameters'))
 
 class AutomationParameters(RecordWrapper):
 
@@ -1551,10 +1651,8 @@ class AutomationParameters(RecordWrapper):
         super(self.__class__, self).__init__(record)
         self.record['tag'] = self.__class__.__name__
 
-    @property
-    def automationParameter(self):
-        return AutomationParameter(self.getV('children',
-                                             'AutomationParameter'))
+    automationParameter = accs('AutomationParameter', container='children',
+                               asType=AutomationParameter)
 
     def addParameter(self, key, value):
         temp = AutomationParameter()
@@ -1564,46 +1662,103 @@ class AutomationParameters(RecordWrapper):
             temp.value = value
         self.append(temp)
 
-class AutomationParameter(RecordWrapper):
+    def __getitem__(self, tag):
+        """Override to use tag as Name instead of strictly tag"""
+        if isinstance(tag, str):
+            for child in self:
+                child = AutomationParameter(child)
+                if child.name == tag:
+                    return child
+            return RecordWrapper(self.getV('children', tag))
+        elif isinstance(tag, int):
+            return RecordWrapper(self.record['children'][tag])
 
-    def __init__(self, record=None):
-        super(self.__class__, self).__init__(record)
-        self.record['tag'] = self.__class__.__name__
+    @property
+    def parameterNames(self):
+        return [c.name for c in self]
 
-class Provenance(RecordWrapper):
-    """The metadata concerning this dataset's provenance"""
 
-    @property
-    def createdBy(self):
-        return self.getV(container='attrib', tag='CreatedBy')
+class Automation(RecordWrapper):
+
+    automationParameters = accs('AutomationParameters', container='children',
+                                asType=AutomationParameters)
 
-    @property
-    def parentTool(self):
-        return ParentTool(self.getV('children', 'ParentTool'))
 
 class ParentTool(RecordWrapper):
     pass
 
+
+class Provenance(RecordWrapper):
+    """The metadata concerning this dataset's provenance"""
+
+    createdBy = accs('CreatedBy')
+    parentTool = accs('ParentTool', container='children', asType=ParentTool)
+
+
 class StatsMetadata(RecordWrapper):
     """The metadata from the machine sts.xml"""
 
+    # merged dists:
+    MERGED_DISTS = ["ProdDist", "ReadTypeDist", "ReadLenDist", "ReadQualDist",
+                    "MedianInsertDist", "InsertReadQualDist",
+                    "InsertReadLenDist", "ControlReadQualDist",
+                    "ControlReadLenDist"]
+
+    # continuous channel dists:
+    CHANNEL_DISTS = ['BaselineLevelDist', 'BaselineStdDist', 'SnrDist',
+                     'HqRegionSnrDist', 'HqBasPkMidDist',
+                     'BaselineLevelSequencingDist']
+
+    # continuous misc. dists:
+    OTHER_DISTS = ['PausinessDist', 'PulseRateDist', 'PulseWidthDist',
+                   'BaseRateDist', 'BaseWidthDist', 'BaseIpdDist',
+                   'LocalBaseRateDist', 'NumUnfilteredBasecallsDist']
+
+    UNMERGED_DISTS = CHANNEL_DISTS + OTHER_DISTS
+
+    @property
+    def channelDists(self):
+        tbr = {}
+        for dist in self.CHANNEL_DISTS:
+            chans = defaultdict(list)
+            for chan in self.findChildren(dist):
+                chans[chan.attrib['Channel']].append(ContinuousDistribution(chan))
+            tbr[dist] = chans
+        return tbr
+
+    @property
+    def otherDists(self):
+        tbr = defaultdict(list)
+        for disttype in self.OTHER_DISTS:
+            for dist in self.findChildren(disttype):
+                tbr[disttype].append(ContinuousDistribution(dist))
+        return tbr
+
     def merge(self, other):
-        if other.shortInsertFraction and other.prodDist:
+        if (other.shortInsertFraction and other.prodDist and
+                self.shortInsertFraction and self.prodDist):
             self.shortInsertFraction = (self.shortInsertFraction *
                                         self.prodDist.bins[1] +
                                         other.shortInsertFraction *
                                         other.prodDist.bins[1])/(
                                             self.prodDist.bins[1]
                                             + other.prodDist.bins[1])
-        if other.adapterDimerFraction and other.prodDist:
+        if (other.adapterDimerFraction and other.prodDist and
+                self.shortInsertFraction and self.prodDist):
             self.adapterDimerFraction = (self.adapterDimerFraction *
                                          self.prodDist.bins[1] +
                                          other.adapterDimerFraction *
                                          other.prodDist.bins[1])/(
                                              self.prodDist.bins[1]
                                              + other.prodDist.bins[1])
+        if other.shortInsertFraction and not self.shortInsertFraction:
+            self.shortInsertFraction = other.shortInsertFraction
+        if other.adapterDimerFraction and not self.adapterDimerFraction:
+            self.adapterDimerFraction = other.adapterDimerFraction
+        if other.prodDist and not self.prodDist:
+            self.append(other.prodDist)
         self.numSequencingZmws += other.numSequencingZmws
-        for dist in DISTLIST:
+        for dist in self.MERGED_DISTS:
             selfDist = getattr(self, dist[0].lower() + dist[1:])
             otherDist = getattr(other, dist[0].lower() + dist[1:])
             if not selfDist:
@@ -1617,6 +1772,11 @@ class StatsMetadata(RecordWrapper):
                     self.append(otherDist)
                 except BinMismatchError:
                     self.append(otherDist)
+        for dist in self.UNMERGED_DISTS:
+            otherDists = other.findChildren(dist)
+            for otherDist in otherDists:
+                if otherDist:
+                    self.append(otherDist)
 
     @property
     def prodDist(self):
@@ -1918,66 +2078,55 @@ class DiscreteDistribution(RecordWrapper):
 
 class RunDetailsMetadata(RecordWrapper):
 
+    TAG = 'RunDetails'
 
-    @property
-    def timeStampedName(self):
-        return self.getMemberV('TimeStampedName')
+    timeStampedName = subgetter('TimeStampedName')
+    name = subaccs('Name')
 
-    @property
-    def name(self):
-        return self.getMemberV('Name')
 
-    @name.setter
-    def name(self, value):
-        return self.setMemberV('Name', value)
+class BioSamplePointersMetadata(RecordWrapper):
+    """The BioSamplePointer members don't seem complex enough to justify
+    class representation, instead rely on base class methods to provide
+    iterators and accessors"""
+
+    TAG = 'BioSamplePointers'
 
 
 class WellSampleMetadata(RecordWrapper):
 
+    TAG = 'WellSample'
 
-    @property
-    def uniqueId(self):
-        return self.getV('attrib', 'UniqueId')
+    wellName = subaccs('WellName')
+    concentration = subaccs('Concentration')
+    sampleReuseEnabled = subgetter('SampleReuseEnabled')
+    stageHotstartEnabled = subgetter('StageHotstartEnabled')
+    sizeSelectionEnabled = subgetter('SizeSelectionEnabled')
+    useCount = subaccs('UseCount')
+    comments = subaccs('Comments')
+    bioSamplePointers = accs('BioSamplePointers', container='children',
+                             asType=BioSamplePointersMetadata)
 
-    @property
-    def wellName(self):
-        return self.getMemberV('WellName')
 
-    @property
-    def concentration(self):
-        return self.getMemberV('Concentration')
+class CopyFilesMetadata(RecordWrapper):
+    """The CopyFile members don't seem complex enough to justify
+    class representation, instead rely on base class methods"""
 
-    @property
-    def sampleReuseEnabled(self):
-        return self.getMemberV('SampleReuseEnabled')
+    TAG = 'CopyFiles'
 
-    @property
-    def stageHotstartEnabled(self):
-        return self.getMemberV('StageHotstartEnabled')
 
-    @property
-    def sizeSelectionEnabled(self):
-        return self.getMemberV('SizeSelectionEnabled')
+class OutputOptions(RecordWrapper):
 
-    @property
-    def useCount(self):
-        return self.getMemberV('UseCount')
+    resultsFolder = subaccs('ResultsFolder')
+    collectionPathUri = subaccs('CollectionPathUri')
+    copyFiles = accs('CopyFiles', container='children',
+                     asType=CopyFilesMetadata)
 
-    @property
-    def comments(self):
-        return self.getMemberV('Comments')
 
-    @property
-    def bioSamplePointers(self):
-        return BioSamplePointersMetadata(
-            self.getV('children', 'BioSamplePointers'))
+class SecondaryMetadata(RecordWrapper):
 
+    TAG = 'Secondary'
 
-class BioSamplePointersMetadata(RecordWrapper):
-    """The BioSamplePointer members don't seem complex enough to justify
-    class representation, instead rely on base class methods to provide
-    iterators and accessors"""
-    pass
+    cellCountInJob = subaccs('CellCountInJob')
 
 
 class PrimaryMetadata(RecordWrapper):
@@ -2002,45 +2151,13 @@ class PrimaryMetadata(RecordWrapper):
         'BetterAnalysis_Results'
     """
 
-    @property
-    def automationName(self):
-        return self.getMemberV('AutomationName')
-
-    @property
-    def configFileName(self):
-        return self.getMemberV('ConfigFileName')
-
-    @property
-    def sequencingCondition(self):
-        return self.getMemberV('SequencingCondition')
-
-    @property
-    def outputOptions(self):
-        return OutputOptions(self.getV('children', 'OutputOptions'))
-
-
-class OutputOptions(RecordWrapper):
-    @property
-    def resultsFolder(self):
-        return self.getMemberV('ResultsFolder')
+    TAG = 'Primary'
 
-    @resultsFolder.setter
-    def resultsFolder(self, value):
-        self.setMemberV('ResultsFolder', value)
-
-    @property
-    def collectionPathUri(self):
-        return self.getMemberV('CollectionPathUri')
-
-    @property
-    def copyFiles(self):
-        return CopyFilesMetadata(self.getV('children', 'CopyFiles'))
-
-
-class CopyFilesMetadata(RecordWrapper):
-    """The CopyFile members don't seem complex enough to justify
-    class representation, instead rely on base class methods"""
-    pass
+    automationName = subaccs('AutomationName')
+    configFileName = subaccs('ConfigFileName')
+    sequencingCondition = subaccs('SequencingCondition')
+    outputOptions = accs('OutputOptions', container='children',
+                         asType=OutputOptions)
 
 
 class BioSamplesMetadata(RecordWrapper):
@@ -2062,6 +2179,7 @@ class BioSamplesMetadata(RecordWrapper):
             'great biosample'
         """
 
+    TAG = 'BioSamples'
 
     def __getitem__(self, index):
         """Get a biosample"""
@@ -2079,28 +2197,74 @@ class BioSamplesMetadata(RecordWrapper):
 class BioSampleMetadata(RecordWrapper):
     """The metadata for a single BioSample"""
 
+    TAG = 'BioSample'
+
+class Kit(RecordWrapper):
+    partNumber = accs('PartNumber')
+    lotNumber = accs('LotNumber')
+    barcode = accs('Barcode')
+    expirationDate = accs('ExpirationDate')
+
+class CellPac(Kit):
+    """CellPac metadata"""
+
+class TemplatePrepKit(Kit):
+    """TemplatePrepKit metadata"""
+
+    rightAdaptorSequence = subaccs('RightAdaptorSequence')
+    leftAdaptorSequence = subaccs('LeftAdaptorSequence')
+
+class BindingKit(Kit):
+    pass
+
+class SequencingKitPlate(Kit):
+    pass
+
+class CollectionMetadata(RecordWrapper):
+    """The metadata for a single collection. It contains Context,
+    InstrumentName etc. as attribs, InstCtrlVer etc. for children"""
+
+    TAG = 'CollectionMetadata'
+
+    context = accs('Context')
+    instrumentName = accs('InstrumentName')
+    instrumentId = accs('InstrumentId')
+    instCtrlVer = subaccs('InstCtrlVer')
+    sigProcVer = subaccs('SigProcVer')
+    collectionNumber = subaccs('CollectionNumber')
+    cellIndex = subaccs('CellIndex')
+    cellPac = accs('CellPac', 'children', CellPac)
+    templatePrepKit = accs('TemplatePrepKit', 'children', TemplatePrepKit)
+    bindingKit = accs('BindingKit', 'children', BindingKit)
+    sequencingKitPlate = accs('SequencingKitPlate', 'children',
+                              SequencingKitPlate)
+    automation = accs('Automation', 'children', Automation)
+    primary = accs('Primary', 'children', PrimaryMetadata)
+    secondary = accs('Secondary', 'children', SecondaryMetadata)
 
     @property
-    def uniqueId(self):
-        return self.getV('attrib', 'UniqueId')
+    def runDetails(self):
+        return RunDetailsMetadata(self.getV('children', 'RunDetails'),
+                                  parent=self)
 
     @property
-    def createdAt(self):
-        return self.getV('attrib', 'CreatedAt')
+    def wellSample(self):
+        return WellSampleMetadata(self.getV('children', 'WellSample'),
+                                  parent=self)
 
 
 def _emptyMember(tag=None, text=None, attrib=None, children=None,
                  namespace=None):
     """Return an empty stock Element representation"""
-    if not tag:
+    if tag is None:
         tag = ''
-    if not namespace:
+    if namespace is None:
         namespace = ''
-    if not text:
+    if text is None:
         text = ''
-    if not attrib:
+    if attrib is None:
         attrib = {}
-    if not children:
+    if children is None:
         children = []
     return {'tag': tag, 'text': text, 'attrib': attrib, 'children': children,
             'namespace': namespace}
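As a quick orientation to the Filters changes above, here is a minimal usage sketch of the addRequirement(**kwargs) pattern that the hunks exercise (the dataset path is a placeholder; the commented 'in' example relies on the list/array coercion added above and is illustrative only):

    from pbcore.io import SubreadSet

    ds = SubreadSet("movie.subreadset.xml")   # placeholder input
    # one Filter carrying two requirements, ANDed per record
    ds.filters.addRequirement(length=[('>=', 500)], rq=[('>', 0.8)])
    # list/array values are now coerced to plain lists (see the hunk above),
    # e.g. selecting specific ZMWs with the 'in' operator:
    #   ds.filters.addRequirement(zm=[('in', [8, 9, 10])])
    ds.updateCounts()
    print len(ds)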
diff --git a/pbcore/io/dataset/DataSetMetaTypes.py b/pbcore/io/dataset/DataSetMetaTypes.py
new file mode 100644
index 0000000..e04305e
--- /dev/null
+++ b/pbcore/io/dataset/DataSetMetaTypes.py
@@ -0,0 +1,86 @@
+###############################################################################
+# Copyright (c) 2011-2016, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+#   notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+#   notice, this list of conditions and the following disclaimer in the
+#   documentation and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+# THIS LICENSE.  THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
+# NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+# OR
+# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+###############################################################################
+
+# Author: Martin D. Smith
+
+
+import logging
+from pbcore.io.dataset.DataSetErrors import (InvalidDataSetIOError,)
+
+log = logging.getLogger(__name__)
+
+def toDsId(name):
+    """Translate a class name into a MetaType/ID"""
+    return "PacBio.DataSet.{x}".format(x=name)
+
+class DataSetMetaTypes(object):
+    """
+    This mirrors the PacBioSecondaryDataModel.xsd definitions and can be used
+    to reference a specific dataset type.
+    """
+    SUBREAD = toDsId("SubreadSet")
+    HDF_SUBREAD = toDsId("HdfSubreadSet")
+    ALIGNMENT = toDsId("AlignmentSet")
+    BARCODE = toDsId("BarcodeSet")
+    CCS_ALIGNMENT = toDsId("ConsensusAlignmentSet")
+    CCS = toDsId("ConsensusReadSet")
+    REFERENCE = toDsId("ReferenceSet")
+    GMAPREFERENCE = toDsId("GmapReferenceSet")
+    CONTIG = toDsId("ContigSet")
+
+    ALL = (SUBREAD, HDF_SUBREAD, ALIGNMENT,
+           BARCODE, CCS, CCS_ALIGNMENT, REFERENCE, CONTIG, GMAPREFERENCE)
+
+    @classmethod
+    def isValid(cls, dsId):
+        return dsId in cls.ALL
+
+def dsIdToName(dsId):
+    """Translate a MetaType/ID into a class name"""
+    if DataSetMetaTypes.isValid(dsId):
+        return dsId.split('.')[-1]
+    else:
+        raise InvalidDataSetIOError("Invalid DataSet MetaType")
+
+def dsIdToSuffix(dsId):
+    """Translate a MetaType/ID into a file suffix"""
+    dsIds = DataSetMetaTypes.ALL
+    suffixMap = {dsId: dsIdToName(dsId) for dsId in dsIds}
+    suffixMap[toDsId("DataSet")] = 'DataSet'
+    if DataSetMetaTypes.isValid(dsId):
+        suffix = suffixMap[dsId]
+        suffix = suffix.lower()
+        suffix += '.xml'
+        return suffix
+    else:
+        raise InvalidDataSetIOError("Invalid DataSet MetaType")
+
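For reference, a short sketch of the new metatype helpers; the values follow directly from the definitions above:

    from pbcore.io.dataset.DataSetMetaTypes import (DataSetMetaTypes, toDsId,
                                                    dsIdToName, dsIdToSuffix)

    dsId = toDsId("SubreadSet")        # 'PacBio.DataSet.SubreadSet'
    assert DataSetMetaTypes.isValid(dsId)
    print dsIdToName(dsId)             # 'SubreadSet'
    print dsIdToSuffix(dsId)           # 'subreadset.xml'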
diff --git a/pbcore/io/dataset/DataSetReader.py b/pbcore/io/dataset/DataSetReader.py
index 1ba4c27..95b277c 100755
--- a/pbcore/io/dataset/DataSetReader.py
+++ b/pbcore/io/dataset/DataSetReader.py
@@ -1,3 +1,38 @@
+###############################################################################
+# Copyright (c) 2011-2016, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+#   notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+#   notice, this list of conditions and the following disclaimer in the
+#   documentation and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+# THIS LICENSE.  THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
+# NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+# OR
+# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+###############################################################################
+
+# Author: Martin D. Smith
+
+
 """ Input and output functions for DataSet XML files"""
 
 import os.path
@@ -9,7 +44,7 @@ from pbcore.io.dataset.DataSetMembers import (ExternalResource,
                                               ExternalResources,
                                               DataSetMetadata,
                                               Filters, AutomationParameters,
-                                              StatsMetadata, DISTLIST)
+                                              StatsMetadata)
 from pbcore.io.dataset.DataSetWriter import _eleFromDictList
 from pbcore.io.dataset.DataSetErrors import InvalidDataSetIOError
 
@@ -97,7 +132,11 @@ def _addUnknownFile(dset, path):
     elif path.endswith('fofn'):
         return _addFofnFile(dset, path)
     else:
-        return _addGenericFile(dset, path)
+        # Give it a shot as an XML file:
+        try:
+            return _addXmlFile(dset, path)
+        except ET.ParseError:
+            return _addGenericFile(dset, path)
 
 SUB_RESOURCES = ['.scraps.bam', '.control.subreads.bam']
 FILE_INDICES = ['.fai', '.pbi', '.bai', '.metadata.xml',
@@ -270,9 +309,6 @@ def parseStats(filename):
     root = _updateStats(root)
     stats = StatsMetadata(_eleToDictList(root))
     stats.record['tag'] = 'SummaryStats'
-    whitelist = ['ShortInsertFraction', 'AdapterDimerFraction',
-                 'NumSequencingZmws'] + DISTLIST
-    stats.pruneChildrenTo(whitelist)
     return stats
 
 def parseMetadata(filename):
diff --git a/pbcore/__init__.py b/pbcore/io/dataset/DataSetUtils.py
similarity index 52%
copy from pbcore/__init__.py
copy to pbcore/io/dataset/DataSetUtils.py
index 7375688..1c6e083 100644
--- a/pbcore/__init__.py
+++ b/pbcore/io/dataset/DataSetUtils.py
@@ -1,5 +1,5 @@
-#################################################################################
-# Copyright (c) 2011-2015, Pacific Biosciences of California, Inc.
+###############################################################################
+# Copyright (c) 2011-2016, Pacific Biosciences of California, Inc.
 #
 # All rights reserved.
 #
@@ -16,16 +16,65 @@
 #
 # NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
 # THIS LICENSE.  THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
-# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
+# NOT
 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
 # PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
 # ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+# OR
 # BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
 # IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 # POSSIBILITY OF SUCH DAMAGE.
-#################################################################################
+###############################################################################
+
+# Author: Martin D. Smith
+
+
+import os
+import logging
+
+log = logging.getLogger(__name__)
+
+def fileType(fname):
+    """Get the extension of fname (with h5 type)"""
+    remainder, ftype = os.path.splitext(fname)
+    if ftype == '.h5':
+        _, prefix = os.path.splitext(remainder)
+        ftype = prefix + ftype
+    elif ftype == '.index':
+        _, prefix = os.path.splitext(remainder)
+        if prefix == '.contig':
+            ftype = prefix + ftype
+    ftype = ftype.strip('.')
+    return ftype
+
+def getDataSetUuid(xmlfile):
+    """
+    Quickly retrieve the uuid from the root element of a dataset XML file,
+    using a streaming parser to avoid loading the entire dataset into memory.
+    Returns None if the parsing fails.
+    """
+    try:
+        import xml.etree.cElementTree as ET
+        for event, element in ET.iterparse(xmlfile, events=("start",)):
+            return element.get("UniqueId")
+    except Exception:
+        return None
+
+
+def getDataSetMetaType(xmlfile):
+    """
+    Quickly retrieve the MetaType from the root element of a dataset XML file,
+    using a streaming parser to avoid loading the entire dataset into memory.
+    Returns None if the parsing fails.
+    """
+    try:
+        import xml.etree.cElementTree as ET
+        for event, element in ET.iterparse(xmlfile, events=("start",)):
+            return element.get("MetaType")
+    except Exception:
+        return None
 
-__VERSION__ = "1.2.10"
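A minimal sketch of the new DataSetUtils helpers (the XML path is a placeholder; both getters return None when parsing fails, as documented above):

    from pbcore.io.dataset.DataSetUtils import (fileType, getDataSetUuid,
                                                getDataSetMetaType)

    print fileType("movie.subreads.bam")   # 'bam'
    print fileType("reads.bas.h5")         # 'bas.h5' (h5 files keep the inner suffix)
    uuid = getDataSetUuid("in.subreadset.xml")       # placeholder path
    mtype = getDataSetMetaType("in.subreadset.xml")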
diff --git a/pbcore/io/dataset/DataSetValidator.py b/pbcore/io/dataset/DataSetValidator.py
index e4fca6a..e2804bb 100755
--- a/pbcore/io/dataset/DataSetValidator.py
+++ b/pbcore/io/dataset/DataSetValidator.py
@@ -1,3 +1,38 @@
+###############################################################################
+# Copyright (c) 2011-2016, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+#   notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+#   notice, this list of conditions and the following disclaimer in the
+#   documentation and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+# THIS LICENSE.  THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
+# NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+# OR
+# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+###############################################################################
+
+# Author: Martin D. Smith
+
+
 """Validate DataSet XML files"""
 
 import os
@@ -27,10 +62,10 @@ def validateResources(xmlroot, relTo='.'):
         if resId:
             parsedId = urlparse(resId)
             rfn = urlparse(resId).path.strip()
-            if not os.path.isfile(rfn):
-                if (not os.path.isfile(os.path.join(relTo,
+            if not os.path.exists(rfn):
+                if (not os.path.exists(os.path.join(relTo,
                                                     rfn)) and
-                        not os.path.isfile(os.path.join('.',
+                        not os.path.exists(os.path.join('.',
                                                         rfn))):
                     raise IOError, "{f} not found".format(f=rfn)
 
diff --git a/pbcore/io/dataset/DataSetWriter.py b/pbcore/io/dataset/DataSetWriter.py
index e8989d4..4ee57e4 100755
--- a/pbcore/io/dataset/DataSetWriter.py
+++ b/pbcore/io/dataset/DataSetWriter.py
@@ -1,3 +1,38 @@
+###############################################################################
+# Copyright (c) 2011-2016, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+#   notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+#   notice, this list of conditions and the following disclaimer in the
+#   documentation and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+# THIS LICENSE.  THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
+# NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+# OR
+# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+###############################################################################
+
+# Author: Martin D. Smith
+
+
 """ Input and output functions for DataSet XML files"""
 
 import copy, time
@@ -226,7 +261,8 @@ def _addDataSetMetadataElement(dataSet, root):
         if dataSet.metadata.summaryStats:
             stats = dataSet.metadata.summaryStats
             dataSet.metadata.summaryStats = None
-        root.append(_eleFromDictList(dataSet.metadata.record))
+        root.append(_eleFromDictList(dataSet.metadata.record,
+                                     defaultNS=namespaces()['pbmeta']))
         if stats:
             dataSet.metadata.summaryStats = stats
         # Metadata order matters....
@@ -241,7 +277,7 @@ def _guessNs(tag):
             return namespaces()[nsprefix]
     return ''
 
-def _eleFromDictList(eleAsDict, core=False):
+def _eleFromDictList(eleAsDict, core=False, defaultNS=None):
     """A last ditch capture method for uknown Elements"""
     if eleAsDict['tag'] == 'DataSets':
         print eleAsDict['namespace']
@@ -252,6 +288,9 @@ def _eleFromDictList(eleAsDict, core=False):
         newNamespace = _guessNs(eleAsDict['tag'])
         if newNamespace != '':
             eleAsDict['namespace'] = newNamespace
+    if eleAsDict['namespace'] == '' and not defaultNS is None:
+            eleAsDict['namespace'] = defaultNS
+
     if core:
         ele = ET.Element("{{{n}}}{t}".format(n=eleAsDict['namespace'],
                                              t=eleAsDict['tag']),
diff --git a/pbcore/io/dataset/__init__.py b/pbcore/io/dataset/__init__.py
index bffd9a8..f9d1346 100755
--- a/pbcore/io/dataset/__init__.py
+++ b/pbcore/io/dataset/__init__.py
@@ -1,4 +1,39 @@
+###############################################################################
+# Copyright (c) 2011-2016, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+#   notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+#   notice, this list of conditions and the following disclaimer in the
+#   documentation and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+# THIS LICENSE.  THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
+# NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+# OR
+# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+###############################################################################
+
+# Author: Martin D. Smith
+
 from DataSetIO import *
+from DataSetUtils import *
 
 import logging
 
diff --git a/pbcore/io/dataset/utils.py b/pbcore/io/dataset/utils.py
index d9b352b..949ac94 100755
--- a/pbcore/io/dataset/utils.py
+++ b/pbcore/io/dataset/utils.py
@@ -1,3 +1,38 @@
+###############################################################################
+# Copyright (c) 2011-2016, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+#   notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+#   notice, this list of conditions and the following disclaimer in the
+#   documentation and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+# THIS LICENSE.  THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
+# NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+# OR
+# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+###############################################################################
+
+# Author: Martin D. Smith
+
+
 """
 Utils that support fringe DataSet features.
 """
@@ -6,11 +41,21 @@ import tempfile
 import logging
 import json
 import shutil
+import datetime
 import pysam
+import numpy as np
 from pbcore.util.Process import backticks
 
 log = logging.getLogger(__name__)
 
+def getTimeStampedName(mType):
+    """Generate a timestamped name using the given metatype 'mType' and the
+    current UTC time"""
+    mType = mType.lower()
+    mType = '_'.join(mType.split('.'))
+    time = datetime.datetime.utcnow().strftime("%y%m%d_%H%M%S%f")[:-3]
+    return "{m}-{t}".format(m=mType, t=time)
+
 def which(exe):
     if os.path.exists(exe) and os.access(exe, os.X_OK):
         return exe
@@ -21,7 +66,7 @@ def which(exe):
             return this_path
     return None
 
-def consolidateXml(indset, outbam, useTmp=True):
+def consolidateXml(indset, outbam, useTmp=True, cleanup=True):
     tmpout = tempfile.mkdtemp(suffix="consolidate-xml")
     tmp_xml = os.path.join(tmpout,
                            "orig.{t}.xml".format(
@@ -51,8 +96,9 @@ def consolidateXml(indset, outbam, useTmp=True):
     _pbmergeXML(tmp_xml, outbam)
     if useTmp:
         shutil.copy(outbam, origOutBam)
-        # cleanup:
-        shutil.rmtree(tmpout)
+        shutil.copy(outbam + ".pbi", origOutBam + ".pbi")
+        if cleanup:
+            shutil.rmtree(tmpout)
     return outbam
 
 def consolidateBams(inFiles, outFile, filterDset=None, useTmp=True):
@@ -273,3 +319,123 @@ def _emitFilterScript(filterDset, filtScriptName):
     with open(filtScriptName, 'w') as scriptFile:
         scriptFile.write(json.dumps(script))
 
+RS = 65536
+
+def xy_to_hn(x, y):
+    return x * RS + y
+
+def hn_to_xy(hn):
+    x = hn/RS
+    y = hn - (x * RS)
+    return x, y
+
+def shift(cx, cy, d):
+    # 0 is up, 1 is right, 2 is down, 3 is left
+    if d == 0:
+        cy += 1
+    elif d == 1:
+        cx += 1
+    elif d == 2:
+        cy -= 1
+    elif d == 3:
+        cx -= 1
+    return cx, cy, d
+
+def change_d(d):
+    d += 1
+    d %= 4
+    return d
+
+def move(cx, cy, x, y, d):
+    if cx == x and cy == y:
+        return cx - 1, y, 0
+    if abs(x - cx) == abs(y - cy):
+        d = change_d(d)
+        # expand the search
+        if d == 0:
+            cx -= 1
+        return shift(cx, cy, d)
+    else:
+        return shift(cx, cy, d)
+
+def find_closest(x, y, pos, limit=81):
+    found = False
+    cx = x
+    cy = y
+    d = None
+    fails = 0
+    while not found:
+        hn = xy_to_hn(cx, cy)
+        if hn in pos:
+            return hn
+        else:
+            fails += 1
+            cx, cy, d = move(cx, cy, x, y, d)
+        if fails >= limit:
+            return None
+
+def quadratic_expand(lol):
+    samples = [[p] for p in lol[0]]
+    for ps in lol[1:]:
+        newsamples = []
+        for p in ps:
+            for s in samples:
+                newsamples.append(s[:] + [p])
+        samples = newsamples
+    return samples
+
+def prodround(values, target):
+    """Round the floats in values (whose product is <target>) to integers in a
+    way that minimizes the absolute change in values
+    Args:
+        values: a list of numbers
+        target: the product of values (perhaps approximate)
+    Returns:
+        The values array, rounded to integers
+    """
+    opts = [[np.floor(v), round(v), np.ceil(v)] for v in values]
+    combos = quadratic_expand(opts)
+    best = combos[0]
+    for combo in combos[1:]:
+        p = np.prod(combo)
+        err = abs(target - p)
+        berr = abs(target - np.prod(best))
+        rnd = np.sum([abs(v-c) for v, c in zip(values, combo)])
+        brnd = np.sum([abs(v-c) for v, c in zip(values, best)])
+        if (err < berr) or ((err == berr) and (rnd < brnd)):
+            best = combo
+    return best
+
+def sampleUniformly(nsamples, dimbounds):
+    """dimbounds is list of tuples of range, inclusive"""
+    volume = 1
+    for dmin, dmax in dimbounds:
+        volume *= dmax - dmin
+    volume_per_sample = np.true_divide(volume, nsamples)
+    sample_side_length = np.power(volume_per_sample,
+                                  np.true_divide(1.0, len(dimbounds)))
+    per_axis = [max(1.0, np.true_divide((dmax - dmin), sample_side_length))
+                for dmin, dmax in dimbounds]
+    per_axis = prodround(per_axis, nsamples)
+    # Shrink the stride to account for end margins
+    strides = [np.true_divide(dmax - dmin, nsam + 1)
+               for (dmin, dmax), nsam in zip(dimbounds, per_axis)]
+    # introduce a margin
+    points = [np.linspace(dmin + dstride,
+                          dmax - dstride,
+                          round(nsamp))
+              for (dmin, dmax), nsamp, dstride in zip(dimbounds, per_axis,
+                                                      strides)]
+    points = [map(round, ps) for ps in points]
+    points = [map(int, ps) for ps in points]
+    samples = quadratic_expand(points)
+    return samples
+
+
+def sampleHolesUniformly(nsamples, samplefrom, faillimit=25, rowstart=64,
+                         colstart=64, nrows=1024, ncols=1144):
+    xys = sampleUniformly(nsamples, [(colstart, ncols), (rowstart, nrows)])
+    hns = [find_closest(x, y, samplefrom, limit=faillimit) for x, y in xys]
+    return [hn for hn in hns if not hn is None]
+
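+# Illustrative usage (a sketch, not part of the original module): pick roughly
+# 100 ZMWs spread evenly across the chip, given the hole numbers actually
+# present in a dataset index (here assumed to be a SubreadSet 'sset'):
+#
+#   present = set(sset.index.holeNumber)
+#   chosen = sampleHolesUniformly(100, present)
+#
+# sampleUniformly() lays down an evenly spaced grid of (x, y) targets and
+# find_closest() snaps each target to a nearby hole in 'present'; targets with
+# no hole within 'faillimit' spiral steps are dropped from the result.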
+
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 266b3e6..3ae70f4 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -1,3 +1,3 @@
 sphinx
 nose
-pyxb
+pyxb==1.2.4
diff --git a/tests/test_pbcore_io_AlnFileReaders.py b/tests/test_pbcore_io_AlnFileReaders.py
index 2a128d7..5d95a25 100644
--- a/tests/test_pbcore_io_AlnFileReaders.py
+++ b/tests/test_pbcore_io_AlnFileReaders.py
@@ -320,7 +320,8 @@ class _BasicAlnFileReaderTests(object):
 
     def testReadGroupTable(self):
         rgFwd = self.fwdAln.readGroupInfo
-        EQ([('ID', '<i4'), ('MovieName', 'O'), ('ReadType', 'O'), ('SequencingChemistry', 'O'), ('FrameRate', '<f8')], rgFwd.dtype)
+        EQ([('ID', '<i4'), ('MovieName', 'O'), ('ReadType', 'O'), ('SequencingChemistry', 'O'), ('FrameRate', '<f8'), ('BaseFeatures', 'O')], rgFwd.dtype)
+        EQ(True, isinstance(rgFwd.BaseFeatures, frozenset))
         EQ("P6-C4", rgFwd.SequencingChemistry)
         EQ("m140905_042212_sidney_c100564852550000001823085912221377_s1_X0", rgFwd.MovieName)
         #EQ("bar", rgFwd.ReadType)
@@ -470,6 +471,15 @@ class TestCCSBam(object):
                 aln.qEnd
 
 
+class TestEmptyBam(object):
+
+    def test_empty_bam_reads_in_range(self):
+        with IndexedBamReader(data.getEmptyAlignedBam()) as bam:
+            reads = bam.readsInRange("lambda_NEB3011", 0, 50000,
+                justIndices=True)
+            EQ(len(reads), 0)
+
+
 class TestMultipleReadGroups(object):
     """
     Verify that BAM files with multiple read groups are handled sensibly - see
diff --git a/tests/test_pbcore_io_GffIO.py b/tests/test_pbcore_io_GffIO.py
index 24c1b5c..1be13b0 100644
--- a/tests/test_pbcore_io_GffIO.py
+++ b/tests/test_pbcore_io_GffIO.py
@@ -1,5 +1,6 @@
 
 from StringIO import StringIO
+import tempfile
 import unittest
 import os.path
 
@@ -191,3 +192,13 @@ chr1\tkinModCall\tmodified_base\t16348\t16348\t42\t-\t.\tcoverage=115;context=CC
             start = [ (rec.seqid, rec.start) for rec in f ]
             self.assertEqual(start, self.sorted_start)
         rm_out(gff_out)
+
+    def test_empty_file(self):
+        gff_tmp = tempfile.NamedTemporaryFile(suffix=".gff").name
+        with open(gff_tmp, "w") as f:
+            f.write("##gff-version 3\n")
+            f.write("##source ipdSummary\n")
+            f.write("##sequence-region lambda_NEB3011 1 48502")
+        gff_out = tempfile.NamedTemporaryFile(suffix=".gff").name
+        merge_gffs(self.files + [gff_tmp], gff_out)
+        rm_out(gff_out)
diff --git a/tests/test_pbdataset.py b/tests/test_pbdataset.py
index dd3577b..8b0d0a5 100644
--- a/tests/test_pbdataset.py
+++ b/tests/test_pbdataset.py
@@ -4,52 +4,28 @@ import re
 import logging
 import itertools
 import tempfile
-
-import numpy as np
 import unittest
-import shutil
-import copy
-from random import shuffle
 from unittest.case import SkipTest
 
+import shutil
+import numpy as np
+
 from pbcore.io import PacBioBamIndex, IndexedBamReader
 from pbcore.io import openIndexedAlignmentFile
-from pbcore.io.dataset.utils import BamtoolsVersion
+from pbcore.io.dataset.utils import consolidateXml
 from pbcore.io import (DataSet, SubreadSet, ReferenceSet, AlignmentSet,
-                       openDataSet, DataSetMetaTypes, HdfSubreadSet,
-                       ConsensusReadSet, ConsensusAlignmentSet, openDataFile,
-                       divideKeys, keysToRanges, getDataSetUuid,
-                       getDataSetMetaType)
-from pbcore.io.dataset.DataSetIO import _dsIdToSuffix, InvalidDataSetIOError
+                       openDataSet, HdfSubreadSet,
+                       ConsensusReadSet, ConsensusAlignmentSet)
+from pbcore.io.dataset.DataSetMetaTypes import InvalidDataSetIOError
 from pbcore.io.dataset.DataSetMembers import ExternalResource, Filters
 from pbcore.io.dataset.DataSetValidator import validateFile
 from pbcore.util.Process import backticks
 import pbcore.data.datasets as data
 import pbcore.data as upstreamdata
 
-log = logging.getLogger(__name__)
-
-def _check_constools():
-    if not BamtoolsVersion().good:
-        log.warn("Bamtools not found or out of date")
-        return False
-
-    cmd = "pbindex"
-    o, r, m = backticks(cmd)
-    if r != 1:
-        return False
-
-    cmd = "samtools"
-    o, r, m = backticks(cmd)
-    if r != 1:
-        return False
-    return True
-
-def _internal_data():
-    if os.path.exists("/pbi/dept/secondary/siv/testdata"):
-        return True
-    return False
+from utils import _pbtestdata, _check_constools, _internal_data
 
+log = logging.getLogger(__name__)
 
 class TestDataSet(unittest.TestCase):
     """Unit and integrationt tests for the DataSet class and \
@@ -237,7 +213,7 @@ class TestDataSet(unittest.TestCase):
             2)
 
 
-        dset = AlignmentSet(upstreamdata.getEmptyBam())
+        dset = AlignmentSet(upstreamdata.getEmptyAlignedBam())
         self.assertEqual(dset.numRecords, 0)
         self.assertEqual(dset.totalLength, 0)
         self.assertEqual(len(list(dset)), 0)
@@ -250,7 +226,7 @@ class TestDataSet(unittest.TestCase):
             1)
 
         # empty and full:
-        dset = AlignmentSet(upstreamdata.getEmptyBam(), data.getBam())
+        dset = AlignmentSet(upstreamdata.getEmptyAlignedBam(), data.getBam())
         self.assertEqual(dset.numRecords, 92)
         self.assertEqual(dset.totalLength, 123588)
         self.assertEqual(len(list(dset)), 92)
@@ -270,7 +246,7 @@ class TestDataSet(unittest.TestCase):
         dset.index
         self.assertEqual(len(dset.resourceReaders()), 1)
 
-        dset = ConsensusAlignmentSet(upstreamdata.getEmptyBam())
+        dset = ConsensusAlignmentSet(upstreamdata.getEmptyAlignedBam())
         self.assertEqual(dset.numRecords, 0)
         self.assertEqual(dset.totalLength, 0)
         self.assertEqual(len(list(dset)), 0)
@@ -283,6 +259,9 @@ class TestDataSet(unittest.TestCase):
         outfile = os.path.split(upstreamdata.getEmptyBam())[1]
         outpath = os.path.join(outdir, outfile)
         shutil.copy(upstreamdata.getEmptyBam(), outpath)
+        alnoutfile = os.path.split(upstreamdata.getEmptyAlignedBam())[1]
+        alnoutpath = os.path.join(outdir, alnoutfile)
+        shutil.copy(upstreamdata.getEmptyAlignedBam(), alnoutpath)
         dset = SubreadSet(outpath)
         self.assertEqual(dset.numRecords, 0)
         self.assertEqual(dset.totalLength, 0)
@@ -309,7 +288,7 @@ class TestDataSet(unittest.TestCase):
             1)
 
 
-        dset = AlignmentSet(outpath)
+        dset = AlignmentSet(alnoutpath)
         self.assertEqual(dset.numRecords, 0)
         self.assertEqual(dset.totalLength, 0)
         self.assertEqual(len(list(dset)), 0)
@@ -320,7 +299,7 @@ class TestDataSet(unittest.TestCase):
             1)
 
         # empty and full:
-        dset = AlignmentSet(outpath, data.getBam())
+        dset = AlignmentSet(alnoutpath, data.getBam())
         # without a pbi, updating counts is broken
         self.assertEqual(dset.numRecords, 0)
         self.assertEqual(dset.totalLength, 0)
@@ -340,27 +319,83 @@ class TestDataSet(unittest.TestCase):
         dset.updateCounts()
         self.assertEqual(len(dset.resourceReaders()), 1)
 
-        dset = ConsensusAlignmentSet(outpath)
+        dset = ConsensusAlignmentSet(alnoutpath)
         self.assertEqual(dset.numRecords, 0)
         self.assertEqual(dset.totalLength, 0)
         self.assertEqual(len(list(dset)), 0)
         dset.updateCounts()
         self.assertEqual(len(dset.resourceReaders()), 1)
         dset.induceIndices()
-        dset = ConsensusAlignmentSet(outpath)
+        dset = ConsensusAlignmentSet(alnoutpath)
         self.assertEqual(dset.numRecords, 0)
         self.assertEqual(dset.totalLength, 0)
         self.assertEqual(len(list(dset)), 0)
         dset.updateCounts()
         self.assertEqual(len(dset.resourceReaders()), 1)
 
+
+    def test_empty_bam_index_dtype(self):
+        # Make sure the BAM and DataSet APIs are consistent
+        empty_bam = upstreamdata.getEmptyBam()
+        sset = SubreadSet(empty_bam)
+        empty = np.array([], dtype=np.int32)
+
+        # The BAM API
+        self.assertTrue(np.array_equal(
+            sset.resourceReaders()[0].index.qId,
+            empty))
+
+        # The DataSet API
+        self.assertTrue(np.array_equal(
+            sset.index.qId,
+            empty))
+
+        # Check to make sure we can stack them:
+        full_bam = upstreamdata.getUnalignedBam()
+        sset = SubreadSet(empty_bam, full_bam)
+
+        # The BAM API
+        self.assertTrue(len(sset.resourceReaders()[1].index.qId) != 0)
+
+        # The DataSet API
+        self.assertTrue(len(sset.index.qId) != 0)
+
+    def test_empty_aligned_bam_index_dtype(self):
+        # Make sure the BAM and DataSet APIs are consistent
+        empty_bam = data.getEmptyAlignedBam()
+        alnFile = AlignmentSet(empty_bam)
+        empty = np.array([], dtype=np.int32)
+
+        # The BAM API
+        self.assertTrue(np.array_equal(
+            alnFile.resourceReaders()[0].tId,
+            empty))
+        self.assertTrue(np.array_equal(
+            alnFile.resourceReaders()[0].index.tId,
+            empty))
+
+        # The DataSet API
+        self.assertTrue(np.array_equal(alnFile.tId, empty))
+        self.assertTrue(np.array_equal(alnFile.index.tId, empty))
+
+        # Check to make sure we can stack them:
+        full_bam = upstreamdata.getBamAndCmpH5()[0]
+        aset = AlignmentSet(empty_bam, full_bam)
+
+        # The BAM API
+        self.assertTrue(len(aset.resourceReaders()[1].index.qId) != 0)
+
+        # The DataSet API
+        self.assertTrue(len(aset.index.qId) != 0)
+
     def test_read_ranges(self):
         # This models the old and new ways by which Genomic Consensus generates
         # lists of paired tStarts and tEnds.
 
         full_bam = upstreamdata.getBamAndCmpH5()[0]
         empty_bam = data.getEmptyAlignedBam()
-        file_lists = [[full_bam, empty_bam],
+        file_lists = [[empty_bam],
+                      [full_bam, empty_bam],
                       [empty_bam, full_bam]]
         refId_list = ['lambda_NEB3011', 0]
         minMapQV = 30
@@ -369,16 +404,6 @@ class TestDataSet(unittest.TestCase):
             for refId in refId_list:
                 alnFile = AlignmentSet(*file_list)
 
-                # old GC:
-                # TODO(mdsmith)(2016-01-27): Remove assertRaises when tId
-                # accession is fixed for empty aligned bam+pbi files
-                with self.assertRaises(AttributeError):
-                    for reader in alnFile.resourceReaders(refId):
-                        reader.index.tId
-                        refLen = reader.referenceInfo(refId).Length
-                        rows = reader.readsInRange(refId, 0, refLen,
-                                                   justIndices=True)
-
                 # new GC (just testing that it doesn't raise exceptions):
                 rows = alnFile.index[
                     ((alnFile.tId == alnFile.referenceInfo(refId).ID) &
@@ -465,14 +490,6 @@ class TestDataSet(unittest.TestCase):
         with self.assertRaises(Exception):
             HdfSubreadSet(fasta, strict=True)
 
-    def test_dsIdToSuffix(self):
-        suffixes = ['subreadset.xml', 'hdfsubreadset.xml', 'alignmentset.xml',
-                    'barcodeset.xml', 'consensusreadset.xml',
-                    'consensusalignmentset.xml',
-                    'referenceset.xml', 'contigset.xml']
-        for dsId, exp in zip(DataSetMetaTypes.ALL, suffixes):
-            self.assertEqual(_dsIdToSuffix(dsId), exp)
-
     def test_updateCounts_without_pbi(self):
         log.info("Testing updateCounts without pbi")
         data_fname = data.getBam(0)
@@ -663,148 +680,64 @@ class TestDataSet(unittest.TestCase):
         _ = ds.newUuid()
         self.assertTrue(old != ds.uuid)
 
-    def test_split(self):
-        ds1 = openDataSet(data.getXml(12))
-        self.assertTrue(ds1.numExternalResources > 1)
-        dss = ds1.split()
-        self.assertTrue(len(dss) == ds1.numExternalResources)
-        dss = ds1.split(chunks=1)
-        self.assertTrue(len(dss) == 1)
-        dss = ds1.split(chunks=2, ignoreSubDatasets=True)
-        self.assertTrue(len(dss) == 2)
-        self.assertFalse(dss[0].uuid == dss[1].uuid)
-        self.assertTrue(dss[0].name == dss[1].name)
-        # Lets try merging and splitting on subdatasets
-        ds1 = openDataSet(data.getXml(8))
-        self.assertEquals(ds1.totalLength, 123588)
-        ds1tl = ds1.totalLength
-        ds2 = openDataSet(data.getXml(11))
-        self.assertEquals(ds2.totalLength, 117086)
-        ds2tl = ds2.totalLength
-        dss = ds1 + ds2
-        self.assertTrue(dss.totalLength == (ds1tl + ds2tl))
-        ds1, ds2 = sorted(dss.split(2, ignoreSubDatasets=False),
-                          key=lambda x: x.totalLength,
-                          reverse=True)
-        self.assertTrue(ds1.totalLength == ds1tl)
-        self.assertTrue(ds2.totalLength == ds2tl)
-
-    def test_split_zmws(self):
-        N_RECORDS = 117
-        test_file = upstreamdata.getUnalignedBam()
-        ds1 = openDataFile(test_file)
-        self.assertEqual(len([r for r in ds1]), N_RECORDS)
-        self.assertEqual(len(ds1), N_RECORDS)
-        dss = ds1.split(chunks=1, zmws=True)
-        self.assertEqual(len(dss), 1)
-        self.assertEqual(sum([len([r for r in ds_]) for ds_ in dss]),
-                         N_RECORDS)
-        self.assertEqual(sum([len(ds_) for ds_ in dss]),
-                         N_RECORDS)
-
-        # We have a lower limit on the number of zmws, now
-        dss = ds1.split(chunks=12, zmws=True)
-        self.assertEqual(len(dss), 2)
-        self.assertEqual(sum([len([r for r in ds_]) for ds_ in dss]),
-                         N_RECORDS)
-        self.assertEqual(sum([len(ds_) for ds_ in dss]),
-                         N_RECORDS)
-        self.assertEqual(
-            dss[0].zmwRanges,
-            [('m140905_042212_sidney_c100564852550000001823085912221377_s1_X0',
-              1650, 32328)])
-        self.assertEqual(
-            dss[-1].zmwRanges,
-            [('m140905_042212_sidney_c100564852550000001823085912221377_s1_X0',
-              32560, 54396)])
-        ranges = sorted([c.zmwRanges[0][1:] for c in dss])
-        interspans = []
-        last = None
-        for rg in ranges:
-            if not last is None:
-                interspans.append((last, rg[0]))
-                self.assertFalse(last == rg[0])
-            last = rg[1]
-        for rg in interspans:
-            self.assertEqual(len(np.nonzero(np.logical_and(
-                ds1.index.holeNumber < rg[1],
-                ds1.index.holeNumber > rg[0]))[0]), 0)
-
-    #@unittest.skipUnless(os.path.isdir("/pbi/dept/secondary/siv/testdata"),
-    #                     "Missing testadata directory")
-    @unittest.skip("Too expensive")
-    def test_large_split_zmws(self):
-        N_RECORDS = 959539
-        test_file = ("/pbi/dept/secondary/siv/testdata/SA3-DS/lambda/"
-                     "2372215/0007/Analysis_Results/m150404_101626_42"
-                     "267_c100807920800000001823174110291514_s1_p0.al"
-                     "l.subreadset.xml")
-        ds1 = openDataFile(test_file)
-        self.assertEqual(len(ds1), N_RECORDS)
-        dss = ds1.split(chunks=1, zmws=True)
-        self.assertEqual(len(dss), 1)
-        self.assertEqual(sum([len(ds_) for ds_ in dss]),
-                         N_RECORDS)
-        dss = ds1.split(chunks=12, zmws=True)
-        self.assertEqual(len(dss), 12)
-        self.assertEqual(sum([len(ds_) for ds_ in dss]),
-                         N_RECORDS)
-        self.assertEqual(
-            dss[0].zmwRanges,
-            [('m150404_101626_42267_c100807920800000001823174110291514_s1_p0',
-              7, 14007)])
-        self.assertEqual(
-            dss[-1].zmwRanges,
-            [('m150404_101626_42267_c100807920800000001823174110291514_s1_p0',
-              149876, 163475)])
-        ranges = sorted([c.zmwRanges[0][1:] for c in dss])
-        interspans = []
-        last = None
-        for rg in ranges:
-            if not last is None:
-                interspans.append((last, rg[0]))
-                self.assertFalse(last == rg[0])
-            last = rg[1]
-        for rg in interspans:
-            self.assertEqual(len(np.nonzero(np.logical_and(
-                ds1.index.holeNumber < rg[1],
-                ds1.index.holeNumber > rg[0]))[0]), 0)
-
-
-    @unittest.skipUnless(os.path.isdir("/pbi/dept/secondary/siv/testdata"),
-                         "Missing testadata directory")
-    def test_multi_movie_split_zmws(self):
-        N_RECORDS = 1745161
-        test_file_1 = ("/pbi/dept/secondary/siv/testdata/SA3-DS/lambda/"
-                       "2372215/0007/Analysis_Results/m150404_101626_42"
-                       "267_c100807920800000001823174110291514_s1_p0.al"
-                       "l.subreadset.xml")
-        test_file_2 = ("/pbi/dept/secondary/siv/testdata/SA3-DS/lambda/"
-                       "2590980/0008/Analysis_Results/m141115_075238_et"
-                       "han_c100699872550000001823139203261572_s1_p0.al"
-                       "l.subreadset.xml")
-        ds1 = SubreadSet(test_file_1, test_file_2)
-        # used to get total:
-        #self.assertEqual(sum(1 for _ in ds1), N_RECORDS)
-        self.assertEqual(len(ds1), N_RECORDS)
-        dss = ds1.split(chunks=1, zmws=True)
-        self.assertEqual(len(dss), 1)
-        self.assertEqual(sum([len(ds_) for ds_ in dss]),
-                         N_RECORDS)
-
-        dss = ds1.split(chunks=12, zmws=True)
-        self.assertEqual(len(dss), 12)
-        self.assertEqual(sum([len(ds_) for ds_ in dss]),
-                         N_RECORDS)
-        self.assertEqual(
-            dss[0].zmwRanges,
-            [('m150404_101626_42267_c100807920800000001823174110291514_s1_p0',
-              7, 22098)])
-        self.assertEqual(
-            dss[-1].zmwRanges,
-            [('m141115_075238_ethan_c100699872550000001823139203261572_s1_p0',
-              127814, 163468)])
-
+    def test_newUuid_repeat(self):
+        ds = DataSet()
+        old = ds.uuid
+        new = ds.newUuid()
+        self.assertTrue(old != ds.uuid)
+        self.assertTrue(old != new)
+        self.assertTrue(ds.uuid == new)
+        reallynew = ds.newUuid()
+        # Note that you can keep calling newUuid(), and each time it will be
+        # different:
+        last = ds.uuid
+        for _ in range(10):
+            ds.newUuid()
+            self.assertTrue(ds.uuid != new)
+            self.assertTrue(ds.uuid != last)
+            last = ds.uuid
+        self.assertTrue(reallynew != new)
+        self.assertTrue(reallynew != old)
+
+    def test_newUuid_copy(self):
+        fn_orig = data.getXml(8)
+        fn1 = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
+        fn2 = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
+        shutil.copy(fn_orig, fn1)
+        shutil.copy(fn_orig, fn2)
+        ds1 = openDataSet(fn1)
+        ds2 = openDataSet(fn2)
+        self.assertEqual(ds1.uuid, ds2.uuid)
+        for _ in range(10):
+            ds1.newUuid()
+            ds2.newUuid()
+            self.assertEqual(ds1.uuid, ds2.uuid)
+
+    def test_newUuid_random(self):
+        fn_orig = data.getXml(8)
+        fn1 = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
+        fn2 = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
+        shutil.copy(fn_orig, fn1)
+        shutil.copy(fn_orig, fn2)
+        ds1 = openDataSet(fn1)
+        ds2 = openDataSet(fn2)
+        self.assertEqual(ds1.uuid, ds2.uuid)
+        for _ in range(10):
+            ds1.newUuid(random=True)
+            ds2.newUuid(random=True)
+            self.assertNotEqual(ds1.uuid, ds2.uuid)
+
+    def test_bad_xml_extension(self):
+        fn = tempfile.NamedTemporaryFile(
+            suffix=".alignmentset.xml.disabled").name
+        with AlignmentSet(data.getXml(8)) as aln:
+            aln.write(fn)
+        with AlignmentSet(fn) as aln:
+            self.assertEqual(len(aln), 92)
+        shutil.copy(data.getBam(), fn)
+        with self.assertRaises(IOError):
+            with AlignmentSet(fn) as aln:
+                self.assertEqual(len(aln), 92)
 
     @unittest.skipUnless(os.path.isdir("/pbi/dept/secondary/siv/testdata"),
                          "Missing testadata directory")
@@ -902,49 +835,6 @@ class TestDataSet(unittest.TestCase):
         ds3 = AlignmentSet(data.getBam())
         ds3.write(outfile)
 
-    def test_addFilters(self):
-        ds1 = DataSet()
-        filt = Filters()
-        filt.addRequirement(rq=[('>', '0.85')])
-        ds1.addFilters(filt)
-        self.assertEquals(str(ds1.filters), '( rq > 0.85 )')
-        # Or added from a source XML
-        ds2 = DataSet(data.getXml(16))
-        self.assertTrue(str(ds2.filters).startswith(
-            '( rname = E.faecalis'))
-
-    def test_setFilters(self):
-        ds1 = DataSet()
-        filt = Filters()
-        filt.addRequirement(rq=[('>', '0.85')])
-        ds1.addFilters(filt)
-        self.assertEquals(str(ds1.filters), '( rq > 0.85 )')
-        # Or added from a source XML
-        ds2 = DataSet()
-        ds2.filters = ds1.filters
-        self.assertEquals(str(ds2.filters), '( rq > 0.85 )')
-
-
-    def test_add_double_bound_filters(self):
-        ds1 = AlignmentSet(data.getXml(8))
-        ds1.filters.addRequirement(rq=[('>', '0.85'),
-                                       ('<', '0.99')])
-        self.assertEquals(str(ds1.filters), '( rq > 0.85 ) OR ( rq < 0.99 )')
-
-        ds1 = AlignmentSet(data.getXml(8))
-        self.assertEquals(str(ds1.filters), '')
-        ds1.filters.addFilter(rq=[('>', '0.85'),
-                                  ('<', '0.99')])
-        self.assertEquals(str(ds1.filters), '( rq > 0.85 AND rq < 0.99 )')
-
-        ds1.filters.addFilter(length=[('>', '1000')])
-        self.assertEquals(str(ds1.filters),
-                          '( rq > 0.85 AND rq < 0.99 ) OR ( length > 1000 )')
-
-        ds1.filters.removeFilter(0)
-        self.assertEquals(str(ds1.filters),
-                          '( length > 1000 )')
-
     def test_addMetadata(self):
         ds = DataSet()
         ds.addMetadata(None, Name='LongReadsRock')
@@ -957,47 +847,6 @@ class TestDataSet(unittest.TestCase):
         ds2._metadata.totalLength += 100000
         self.assertEquals(ds2._metadata.totalLength, 200000)
 
-    @unittest.skipIf(not _internal_data(),
-                     "Internal data not available")
-    def test_loadMetadata(self):
-        aln = AlignmentSet(data.getXml(no=8))
-        self.assertFalse(aln.metadata.collections)
-        aln.loadMetadata('/pbi/dept/secondary/siv/testdata/'
-                         'SA3-Sequel/lambda/roche_SAT/'
-                         'm54013_151205_032353.run.metadata.xml')
-        self.assertTrue(aln.metadata.collections)
-        sset_fn = ('/pbi/dept/secondary/siv/testdata/'
-                'SA3-Sequel/lambda/roche_SAT/'
-                'm54013_151205_032353.subreadset.xml')
-        sset = SubreadSet(sset_fn)
-        orig_metadata = copy.deepcopy(sset.metadata)
-        sset.metadata.collections = None
-        self.assertFalse(sset.metadata.collections)
-        sset.loadMetadata('/pbi/dept/secondary/siv/testdata/'
-                          'SA3-Sequel/lambda/roche_SAT/'
-                          'm54013_151205_032353.run.metadata.xml')
-        stack = zip(sset.metadata, orig_metadata)
-        fn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
-        sset.write(fn)
-        validateFile(fn)
-        validateFile(sset_fn)
-        self.assertEqual(sset.metadata, orig_metadata)
-
-
-        # load the wrong thing...
-        sset_fn = ('/pbi/dept/secondary/siv/testdata/'
-                'SA3-Sequel/lambda/roche_SAT/'
-                'm54013_151205_032353.subreadset.xml')
-        sset = SubreadSet(sset_fn)
-        orig_metadata = copy.deepcopy(sset.metadata)
-        sset.metadata.collections = None
-        self.assertFalse(sset.metadata.collections)
-        with self.assertRaises(InvalidDataSetIOError):
-            sset.loadMetadata('/pbi/dept/secondary/siv/testdata/'
-                              'SA3-Sequel/lambda/roche_SAT/'
-                              'm54013_151205_032353.sts.xml')
-
-
     def test_copyTo(self):
         aln = AlignmentSet(data.getXml(no=8), strict=True)
         explen = len(aln)
@@ -1036,6 +885,24 @@ class TestDataSet(unittest.TestCase):
         self.assertNotEqual(sorted(aln.toExternalFiles()),
                             sorted(aln2.toExternalFiles()))
 
+    @unittest.skipIf((not _pbtestdata() or not _check_constools()),
+                     "Internal data not available")
+    def test_copyTo_same_base_names(self):
+        import pbtestdata
+        # see bug 33778
+        tmp_bam = tempfile.NamedTemporaryFile(suffix=".bam").name
+        log.debug(tmp_bam)
+        ds = AlignmentSet(pbtestdata.get_file("aligned-ds-2"))
+        log.debug(pbtestdata.get_file("aligned-ds-2"))
+        consolidateXml(ds, tmp_bam, cleanup=True)
+        with AlignmentSet(tmp_bam) as f:
+            qnames = set()
+            for rec in f:
+                qnames.add(rec.qName)
+            assert len(qnames) == len([rec for rec in f])
+            assert len(qnames) == len(f)
+
+
     def test_addExternalResources(self):
         ds = DataSet()
         er1 = ExternalResource()
@@ -1250,373 +1117,6 @@ class TestDataSet(unittest.TestCase):
         for r1, r2 in itertools.izip(reads1, reads2):
             self.assertEqual(r1, r2)
 
-    def test_n_subreads_filter(self):
-        ds2 = AlignmentSet(data.getXml(8))
-        ds2.filters.addRequirement(n_subreads=[('>', '4')])
-        self.assertEqual(len(list(ds2.records)), 87)
-        self.assertEqual(len(ds2), 87)
-
-    def test_filter(self):
-        ds2 = AlignmentSet(data.getXml(8))
-        ds2.filters.addRequirement(rname=[('=', 'E.faecalis.1')])
-        self.assertEqual(len(list(ds2.records)), 20)
-        ds2.disableFilters()
-        self.assertEqual(len(list(ds2.records)), 92)
-        ds2.enableFilters()
-        self.assertEqual(len(list(ds2.records)), 20)
-
-    def test_context_filters(self):
-        ss = SubreadSet(upstreamdata.getUnalignedBam())
-        self.assertEqual(set(ss.index.contextFlag), {0, 1, 2, 3})
-        self.assertEqual(
-            [len(np.flatnonzero(ss.index.contextFlag == cx))
-             for cx in sorted(set(ss.index.contextFlag))],
-            [15, 33, 32, 37])
-        self.assertEqual(len(ss.index), 117)
-
-        # no adapters/barcodes
-        ss.filters.addRequirement(cx=[('=', 0)])
-        self.assertEqual(len(ss.index), 15)
-        ss.filters.removeRequirement('cx')
-        self.assertEqual(len(ss.index), 117)
-
-        # no adapters/barcodes
-        ss.filters.addRequirement(cx=[('=', 'NO_LOCAL_CONTEXT')])
-        self.assertEqual(len(ss.index), 15)
-        ss.filters.removeRequirement('cx')
-        self.assertEqual(len(ss.index), 117)
-
-        # some adapters/barcodes
-        ss.filters.addRequirement(cx=[('!=', 0)])
-        self.assertEqual(len(ss.index), 102)
-        ss.filters.removeRequirement('cx')
-        self.assertEqual(len(ss.index), 117)
-
-        # adapter before
-        ss.filters.addRequirement(cx=[('&', 1)])
-        self.assertEqual(len(ss.index), 70)
-        ss.filters.removeRequirement('cx')
-        self.assertEqual(len(ss.index), 117)
-
-        # adapter before
-        ss.filters.addRequirement(cx=[('&', 'ADAPTER_BEFORE')])
-        self.assertEqual(len(ss.index), 70)
-        ss.filters.removeRequirement('cx')
-        self.assertEqual(len(ss.index), 117)
-
-        # adapter after
-        ss.filters.addRequirement(cx=[('&', 2)])
-        self.assertEqual(len(ss.index), 69)
-        ss.filters.removeRequirement('cx')
-        self.assertEqual(len(ss.index), 117)
-
-        # adapter before or after
-        ss.filters.addRequirement(cx=[('&', 3)])
-        self.assertEqual(len(ss.index), 102)
-        ss.filters.removeRequirement('cx')
-        self.assertEqual(len(ss.index), 117)
-
-        # adapter before or after
-        ss.filters.addRequirement(cx=[('&', 'ADAPTER_BEFORE | ADAPTER_AFTER')])
-        self.assertEqual(len(ss.index), 102)
-        ss.filters.removeRequirement('cx')
-        self.assertEqual(len(ss.index), 117)
-
-        # adapter before or after but not both
-        ss.filters.addRequirement(cx=[('!=', 0)])
-        ss.filters.addRequirement(cx=[('~', 1),
-                                      ('~', 2)])
-        self.assertEqual(len(ss.index), 65)
-        ss.filters.removeRequirement('cx')
-        self.assertEqual(len(ss.index), 117)
-
-        # adapter before or after
-        ss.filters.addRequirement(cx=[('&', 1),
-                                      ('&', 2)])
-        self.assertEqual(len(ss.index), 102)
-        ss.filters.removeRequirement('cx')
-        self.assertEqual(len(ss.index), 117)
-
-        # adapter before and after
-        ss.filters.addRequirement(cx=[('&', 1)])
-        ss.filters.addRequirement(cx=[('&', 2)])
-        self.assertEqual(len(ss.index), 37)
-        ss.filters.removeRequirement('cx')
-        self.assertEqual(len(ss.index), 117)
-
-        # adapter before but not after
-        ss.filters.addRequirement(cx=[('&', 1)])
-        ss.filters.addRequirement(cx=[('~', 2)])
-        self.assertEqual(len(ss.index), 33)
-        ss.filters.removeRequirement('cx')
-        self.assertEqual(len(ss.index), 117)
-
-        # no adapter before
-        ss.filters.addRequirement(cx=[('~', 1)])
-        self.assertEqual(len(ss.index), 47)
-        ss.filters.removeRequirement('cx')
-        self.assertEqual(len(ss.index), 117)
-
-        # no adapter before or after
-        ss.filters.addRequirement(cx=[('~', 1)])
-        ss.filters.addRequirement(cx=[('~', 2)])
-        self.assertEqual(len(ss.index), 15)
-        ss.filters.removeRequirement('cx')
-        self.assertEqual(len(ss.index), 117)
-
-        # no adapter before or after
-        ss.filters.addRequirement(cx=[('~', 3)])
-        self.assertEqual(len(ss.index), 15)
-        ss.filters.removeRequirement('cx')
-        self.assertEqual(len(ss.index), 117)
-
-    @SkipTest
-    def test_split_by_contigs_presplit(self):
-        # Consumes too much memory for Jenkins
-
-        # Test to make sure the result of a split by contigs has an appropriate
-        # number of records (make sure filters are appropriately aggressive)
-        ds2 = DataSet(data.getXml(15))
-        bams = ds2.externalResources.resourceIds
-        self.assertEqual(len(bams), 2)
-        refwindows = ds2.refWindows
-        self.assertEqual(refwindows, [(0, 0, 224992)])
-        res1 = openIndexedAlignmentFile(bams[0][7:])
-        res2 = openIndexedAlignmentFile(bams[1][7:])
-        def count(iterable):
-            count = 0
-            for _ in iterable:
-                count += 1
-            return count
-        self.assertEqual(count(res1.readsInRange(*refwindows[0])), 1409)
-        self.assertEqual(count(res2.readsInRange(*refwindows[0])), 1375)
-        self.assertEqual(count(ds2.readsInRange(*refwindows[0])), 2784)
-        self.assertEqual(count(ds2.records), 2784)
-        ds2.disableFilters()
-        self.assertEqual(count(ds2.records), 53552)
-        self.assertEqual(ds2.countRecords(), 53552)
-
-    def test_split_by_contigs_with_split_and_maxChunks(self):
-        # test to make sure the refWindows work when chunks == # refs
-        ds3 = AlignmentSet(data.getBam())
-        dss = ds3.split(contigs=True)
-        self.assertEqual(len(dss), 12)
-        refWindows = sorted(reduce(lambda x, y: x + y,
-                                   [ds.refWindows for ds in dss]))
-        # not all references have something mapped to them, refWindows doesn't
-        # care...
-        self.assertNotEqual(refWindows, sorted(ds3.refWindows))
-        self.assertEqual(refWindows,
-            [('B.vulgatus.4', 0, 1449), ('B.vulgatus.5', 0, 1449),
-             ('C.beijerinckii.13', 0, 1433), ('C.beijerinckii.14', 0, 1433),
-             ('C.beijerinckii.9', 0, 1433), ('E.coli.6', 0, 1463),
-             ('E.faecalis.1', 0, 1482), ('E.faecalis.2', 0, 1482),
-             ('R.sphaeroides.1', 0, 1386), ('S.epidermidis.2', 0, 1472),
-             ('S.epidermidis.3', 0, 1472), ('S.epidermidis.4', 0, 1472)])
-        old_refWindows = refWindows
-        random_few = [('C.beijerinckii.13', 0, 1433),
-                      ('B.vulgatus.4', 0, 1449),
-                      ('E.faecalis.1', 0, 1482)]
-
-        dss = ds3.split(contigs=True, maxChunks=1)
-        self.assertEqual(len(dss), 1)
-        refWindows = sorted(reduce(lambda x, y: x + y,
-                                   [ds.refWindows for ds in dss]))
-        self.assertEqual(refWindows, old_refWindows)
-
-        dss = ds3.split(contigs=True, maxChunks=24)
-        # This isn't expected if num refs >= 100, as map check isn't made
-        # for now (too expensive)
-        # There are only 12 refs represented in this set, however...
-        self.assertEqual(len(dss), 12)
-        refWindows = sorted(reduce(lambda x, y: x + y,
-                                   [ds.refWindows for ds in dss]))
-
-        for ref in random_few:
-            found = False
-            for window in refWindows:
-                if ref == window:
-                    found = True
-            if not found:
-                log.debug(ref)
-            self.assertTrue(found)
-
-        # test with maxchunks but no breaking contigs
-        dss = ds3.split(contigs=True, maxChunks=36)
-        self.assertEqual(len(dss), 12)
-        refWindows = sorted(reduce(lambda x, y: x + y,
-                                   [ds.refWindows for ds in dss]))
-        for ref in random_few:
-            found = False
-            for window in refWindows:
-                if ref == window:
-                    found = True
-            self.assertTrue(found)
-
-        # test with maxchunks and breaking contigs is allowed (triggers
-        # targetsize, may result in fewer chunks)
-        dss = ds3.split(contigs=True, maxChunks=36, breakContigs=True)
-        self.assertEqual(len(dss), 2)
-        refWindows = sorted(reduce(lambda x, y: x + y,
-                                   [ds.refWindows for ds in dss]))
-        for ref in random_few:
-            found = False
-            for window in refWindows:
-                if ref == window:
-                    found = True
-            self.assertTrue(found)
-
-        # test with previous setup and smaller targetSize, resulting in more
-        # chunks
-        dss = ds3.split(contigs=True, maxChunks=36, breakContigs=True,
-                        targetSize=10)
-        self.assertEqual(len(dss), 9)
-        refWindows = sorted(reduce(lambda x, y: x + y,
-                                   [ds.refWindows for ds in dss]))
-        for ref in random_few:
-            found = False
-            for window in refWindows:
-                if ref == window:
-                    found = True
-            self.assertTrue(found)
-
-        # test with byRecords and fewer chunks than atoms
-        dss = ds3.split(contigs=True, chunks=3, byRecords=True)
-        self.assertEqual(len(dss), 3)
-        refWindows = sorted(reduce(lambda x, y: x + y,
-                                   [ds.refWindows for ds in dss]))
-        for ref in random_few:
-            found = False
-            for window in refWindows:
-                if ref == window:
-                    found = True
-            self.assertTrue(found)
-
-        # test with byRecords and more chunks than atoms
-        orf = random_few
-        random_few = [('C.beijerinckii.13', 0, 747),
-                      ('B.vulgatus.4', 0, 1449),
-                      ('E.faecalis.1', 0, 742)]
-        dss = ds3.split(contigs=True, chunks=16, byRecords=True)
-        self.assertEqual(len(dss), 16)
-        refWindows = sorted(reduce(lambda x, y: x + y,
-                                   [ds.refWindows for ds in dss]))
-        for ref in random_few:
-            found = False
-            for window in refWindows:
-                if ref == window:
-                    found = True
-            self.assertTrue(found)
-
-        # test with byRecords and updateCounts
-        random_few = orf
-        dss = ds3.split(contigs=True, chunks=3, byRecords=True,
-                        updateCounts=True)
-        self.assertEqual(len(dss), 3)
-        sizes = sorted([dset.numRecords for dset in dss])
-        self.assertListEqual(sizes, [30, 31, 31])
-        refWindows = sorted(reduce(lambda x, y: x + y,
-                                   [ds.refWindows for ds in dss]))
-        for ref in random_few:
-            found = False
-            for window in refWindows:
-                if ref == window:
-                    found = True
-            self.assertTrue(found)
-
-        # test with byRefLength and updateCounts
-        random_few = orf
-        dss = ds3.split(contigs=True, chunks=3, updateCounts=True)
-        self.assertEqual(len(dss), 3)
-        sizes = sorted([dset.numRecords for dset in dss])
-        self.assertListEqual(sizes, [20, 24, 48])
-        refWindows = sorted(reduce(lambda x, y: x + y,
-                                   [ds.refWindows for ds in dss]))
-        for ref in random_few:
-            found = False
-            for window in refWindows:
-                if ref == window:
-                    found = True
-            self.assertTrue(found)
-
-    def test_split_by_contigs_with_split(self):
-        # test to make sure the refWindows work when chunks == # refs
-        ds3 = AlignmentSet(data.getBam())
-        dss = ds3.split(contigs=True)
-        self.assertEqual(len(dss), 12)
-        refWindows = sorted(reduce(lambda x, y: x + y,
-                                   [ds.refWindows for ds in dss]))
-        # not all references have something mapped to them, refWindows doesn't
-        # care...
-        self.assertNotEqual(refWindows, sorted(ds3.refWindows))
-        random_few = [('C.beijerinckii.13', 0, 1433),
-                      ('B.vulgatus.4', 0, 1449),
-                      ('E.faecalis.1', 0, 1482)]
-        for reference in random_few:
-            found = False
-            for ref in refWindows:
-                if ref == reference:
-                    found = True
-            self.assertTrue(found)
-        old_refWindows = refWindows
-
-        dss = ds3.split(contigs=True, chunks=1)
-        self.assertEqual(len(dss), 1)
-        refWindows = sorted(reduce(lambda x, y: x + y,
-                                   [ds.refWindows for ds in dss]))
-        self.assertEqual(refWindows, old_refWindows)
-
-        dss = ds3.split(contigs=True, chunks=24)
-        self.assertEqual(len(dss), 24)
-        refWindows = sorted(reduce(lambda x, y: x + y,
-                                   [ds.refWindows for ds in dss]))
-
-        random_few = [('E.faecalis.2', 0, 741),
-                      ('E.faecalis.2', 741, 1482)]
-        for ref in random_few:
-            found = False
-            for window in refWindows:
-                if ref == window:
-                    found = True
-            if not found:
-                log.debug(ref)
-            self.assertTrue(found)
-
-        dss = ds3.split(contigs=True, chunks=36)
-        self.assertEqual(len(dss), 36)
-        refWindows = sorted(reduce(lambda x, y: x + y,
-                                   [ds.refWindows for ds in dss]))
-        random_few = [('E.faecalis.2', 0, 494),
-                      ('E.faecalis.2', 494, 988),
-                      ('E.faecalis.2', 988, 1482)]
-        for ref in random_few:
-            found = False
-            for window in refWindows:
-                if ref == window:
-                    found = True
-            self.assertTrue(found)
-
-    def test_filter_reference_contigs(self):
-        ds2 = ReferenceSet(data.getRef())
-        self.assertEqual(len(list(ds2.refNames)), 59)
-        self.assertEqual(len(list(ds2.records)), len(ds2.index))
-        filt = Filters()
-        filt.addRequirement(id=[('==', 'E.faecalis.1')])
-        ds2.addFilters(filt)
-        self.assertEqual(str(ds2.filters),
-                         "( id == E.faecalis.1 )")
-        self.assertEqual(len(ds2.refNames), 1)
-        self.assertEqual(len(list(ds2.records)), 1)
-        self.assertEqual(len(list(ds2.records)), len(ds2.index))
-        ds2.disableFilters()
-        self.assertEqual(len(list(ds2.refNames)), 59)
-        self.assertEqual(len(list(ds2.records)), 59)
-        self.assertEqual(len(list(ds2.records)), len(ds2.index))
-        ds2.enableFilters()
-        self.assertEqual(len(list(ds2.refNames)), 1)
-        self.assertEqual(len(list(ds2.records)), 1)
-        self.assertEqual(len(list(ds2.records)), len(ds2.index))
-
     # TODO: get this working again when adding manual subdatasets is good to go
     @SkipTest
     def test_reads_in_subdataset(self):
@@ -1644,32 +1144,6 @@ class TestDataSet(unittest.TestCase):
         #ds3 = DataSet(data.getXml(13))
         #self.assertEqual(len(list(ds3.readsInSubDatasets())), 2)
 
-    def test_refWindows(self):
-        ds = AlignmentSet(data.getBam())
-        dss = ds.split(chunks=2, contigs=True)
-        self.assertEqual(len(dss), 2)
-        log.debug(dss[0].filters)
-        log.debug(dss[1].filters)
-        self.assertTrue(
-            '( rname = E.faecalis.2 '
-            in str(dss[0].filters)
-            or
-            '( rname = E.faecalis.2 '
-            in str(dss[1].filters))
-        ds = AlignmentSet(data.getBam())
-        ds.filters.addRequirement(rname=[('=', 'E.faecalis.2'),
-                                         ('=', 'E.faecalis.2')],
-                                  tStart=[('<', '99'),
-                                          ('<', '299')],
-                                  tEnd=[('>', '0'),
-                                        ('>', '100')])
-        self.assertEqual(str(ds.filters),
-                         '( rname = E.faecalis.2 AND tstart '
-                         '< 99 AND tend > 0 ) OR ( rname = '
-                         'E.faecalis.2 AND tstart < 299 AND tend > 100 )')
-        self.assertEqual(ds.refWindows, [('E.faecalis.2', 0, 99),
-                                         ('E.faecalis.2', 100, 299)])
-
     def test_intervalContour(self):
         ds = AlignmentSet(data.getBam(0))
         coverage = ds.intervalContour('E.faecalis.1')
@@ -1964,9 +1438,10 @@ class TestDataSet(unittest.TestCase):
         self.assertEqual(len(readers[1].readGroupTable), 1)
         self.assertEqual(len(readers[2].readGroupTable), 1)
         self.assertEqual(len(aln.readGroupTable), 3)
+        self.assertTrue("BaseFeatures" in aln.readGroupTable.dtype.fields)
 
     def test_missing_file(self):
-        with self.assertRaises(InvalidDataSetIOError):
+        with self.assertRaises(IOError):
             aln = AlignmentSet("NOPE")
 
     def test_repr(self):
@@ -2104,6 +1579,36 @@ class TestDataSet(unittest.TestCase):
         self.assertEqual(ds4.metadata.summaryStats.readLenDist.bins,
                          [0, 10, 9, 8, 7, 6, 4, 2, 1, 0, 0, 1])
 
+    def test_multi_channel_dists(self):
+        ds = DataSet(data.getBam())
+        ds.loadStats(data.getStats())
+        ds2 = DataSet(data.getBam())
+        ds2.loadStats(data.getStats())
+        self.assertTrue(
+            list(ds.metadata.summaryStats.findChildren('BaselineLevelDist')))
+        self.assertTrue(ds.metadata.summaryStats.channelDists)
+        self.assertTrue(ds.metadata.summaryStats.otherDists)
+        self.assertTrue(ds.metadata.summaryStats.otherDists['PausinessDist'])
+        self.assertTrue(
+            ds.metadata.summaryStats.channelDists['HqBasPkMidDist']['G'])
+
+        # merge two
+        ds3 = ds + ds2
+
+        # unmerged dists should increase in length:
+        self.assertEqual(
+            len(ds3.metadata.summaryStats.channelDists['HqBasPkMidDist']['G']),
+            2 * len(
+                ds.metadata.summaryStats.channelDists['HqBasPkMidDist']['G']))
+        self.assertEqual(
+            len(ds3.metadata.summaryStats.otherDists['PausinessDist']),
+            2 * len(
+                ds.metadata.summaryStats.otherDists['PausinessDist']))
+        # merged dists should not:
+        self.assertEqual(
+            len(ds3.metadata.summaryStats.readLenDist),
+            len(ds.metadata.summaryStats.readLenDist))
+
     def test_stats_metadata(self):
         ds = DataSet(data.getBam())
         ds.loadStats(data.getStats())
@@ -2196,15 +1701,18 @@ class TestDataSet(unittest.TestCase):
                          [1, 1, 1, 0, 2, 2, 2])
 
         # now lets test the subdataset metadata retention:
-        ss = SubreadSet(data.getXml(10))
-        ss.loadStats(data.getStats(0))
-        ss.loadStats(data.getStats(1))
-        self.assertEqual(153168.0, ss.metadata.summaryStats.numSequencingZmws)
-        self.assertEqual(
-            2876.0, ss.subdatasets[0].metadata.summaryStats.numSequencingZmws)
-        self.assertEqual(
-            150292.0,
-            ss.subdatasets[1].metadata.summaryStats.numSequencingZmws)
+        # or not, disabling for performance reasons
+        # TODO: make this fast again, then re-enable. Copying that much
+        # metadata was prohibitively slow.
+        #ss = SubreadSet(data.getXml(10))
+        #ss.loadStats(data.getStats(0))
+        #ss.loadStats(data.getStats(1))
+        #self.assertEqual(153168.0, ss.metadata.summaryStats.numSequencingZmws)
+        #self.assertEqual(
+        #    2876.0, ss.subdatasets[0].metadata.summaryStats.numSequencingZmws)
+        #self.assertEqual(
+        #    150292.0,
+        #    ss.subdatasets[1].metadata.summaryStats.numSequencingZmws)
 
     @unittest.skipIf(not _internal_data(),
                      "Internal data not available")
@@ -2395,105 +1903,6 @@ class TestDataSet(unittest.TestCase):
         self.assertEqual(aln.referenceInfo('lambda_NEB3011'),
                          aln.referenceInfo(1))
 
-    @unittest.skipIf(not _internal_data(),
-                     "Internal data not available")
-    def test_qname_filter_scaling(self):
-        # unaligned bam
-        bam0 = ("/pbi/dept/secondary/siv/testdata/"
-                "SA3-DS/ecoli/2590956/0003/"
-                "Analysis_Results/m140913_222218_42240_c10069"
-                "9952400000001823139203261564_s1_p0.all.subreadset.xml")
-        bam1 = ("/pbi/dept/secondary/siv/testdata/"
-                "SA3-DS/ecoli/2590953/0001/"
-                "Analysis_Results/m140913_005018_42139_c10071"
-                "3652400000001823152404301534_s1_p0.all.subreadset.xml")
-        sset = SubreadSet(bam0, bam1)
-        self.assertEqual(len(sset), 178570)
-        size = 10
-        qn = [r.qName for r in sset[:size]]
-        good_qn = [('=', name) for name in qn]
-        sset.filters.addRequirement(qname=good_qn)
-        self.assertEqual(size, sum(1 for _ in sset))
-        self.assertEqual(size, len(sset))
-
-
-        sset = SubreadSet(data.getXml(10))
-        self.assertEqual(len(sset), 92)
-        size = 10
-        qn = [r.qName for r in sset[:size]]
-        good_qn = [('=', name) for name in qn]
-        sset.filters.addRequirement(qname=good_qn)
-        self.assertEqual(size, sum(1 for _ in sset))
-        self.assertEqual(size, len(sset))
-
-
-    @unittest.skipIf(not _internal_data(),
-                     "Internal data not available")
-    def test_movie_filter(self):
-        # unaligned bam
-        bam0 = ("/pbi/dept/secondary/siv/testdata/"
-                "SA3-DS/ecoli/2590956/0003/"
-                "Analysis_Results/m140913_222218_42240_c10069"
-                "9952400000001823139203261564_s1_p0.all.subreadset.xml")
-        bam1 = ("/pbi/dept/secondary/siv/testdata/"
-                "SA3-DS/ecoli/2590953/0001/"
-                "Analysis_Results/m140913_005018_42139_c10071"
-                "3652400000001823152404301534_s1_p0.all.subreadset.xml")
-        aln = SubreadSet(bam0, bam1)
-        self.assertEqual(len(set(aln.readGroupTable['ID'])),
-                         len(aln.readGroupTable['ID']))
-        self.assertEqual(len(set(aln.readGroupTable['ID'])), 2)
-        self.assertEqual(len(set(aln.readGroupTable['ID'])),
-                         len(set(aln.index.qId)))
-        self.assertEqual(len(aln), 178570)
-        aln.filters.addRequirement(movie=[(
-            '=',
-            'm140913_005018_42139_c100713652400000001823152404301534_s1_p0')])
-        self.assertEqual(len(SubreadSet(bam1)), len(aln))
-
-        # aligned bam
-        #bam0 = ("/pbi/dept/secondary/siv/testdata/"
-        #        "SA3-DS/ecoli/2590956/0003/Alignment_Results/"
-        #        "m140913_222218_42240_c1006999524000000018231"
-        #        "39203261564_s1_p0.all.alignmentset.xml")
-        bam0 = upstreamdata.getBamAndCmpH5()[0]
-        bam1 = ("/pbi/dept/secondary/siv/testdata/"
-                "SA3-DS/ecoli/2590953/0001/Alignment_Results/"
-                "m140913_005018_42139_c1007136524000000018231"
-                "52404301534_s1_p0.all.alignmentset.xml")
-        aln = AlignmentSet(bam0, bam1)
-        self.assertEqual(len(set(aln.readGroupTable['ID'])),
-                         len(aln.readGroupTable['ID']))
-        self.assertEqual(len(set(aln.readGroupTable['ID'])), 2)
-        self.assertEqual(len(set(aln.readGroupTable['ID'])),
-                         len(set(aln.index.qId)))
-        self.assertEqual(len(aln), 103144)
-        aln.filters.addRequirement(movie=[(
-            '=',
-            'm140913_005018_42139_c100713652400000001823152404301534_s1_p0')])
-        self.assertEqual(len(AlignmentSet(bam1)), len(aln))
-
-        # cmpH5
-        cmp1 = upstreamdata.getBamAndCmpH5()[1]
-        cmp2 = ("/pbi/dept/secondary/siv/testdata/"
-                "genomic_consensus-unittest/bam_c4p6_tests/"
-                "ecoli_c4p6.cmp.h5")
-        aln = AlignmentSet(cmp1, cmp2)
-        self.assertEqual(len(set(aln.readGroupTable['ID'])),
-                         len(aln.readGroupTable['ID']))
-        self.assertEqual(len(set(aln.readGroupTable['ID'])),
-                         len(set(aln.index.MovieID)))
-        self.assertEqual(len(set(aln.readGroupTable['ID'])), 2)
-        self.assertEqual(len(aln), 57147)
-        aln.filters.addRequirement(movie=[(
-            '=',
-            'm140905_042212_sidney_c100564852550000001823085912221377_s1_X0')])
-        len1 = len(AlignmentSet(cmp1))
-        self.assertEqual(len1, len(aln))
-
-        aln.filters.removeRequirement('movie')
-        self.assertEqual(len(aln), 57147)
-
     def test_exceptions(self):
         try:
             raise InvalidDataSetIOError("Wrong!")
@@ -2504,75 +1913,6 @@ class TestDataSet(unittest.TestCase):
         aln = AlignmentSet(data.getXml(8))
         self.assertEqual(aln.createdAt, '2015-08-05T10:25:18')
 
-    def test_divideKeys_keysToRanges(self):
-        keys = [0, 1, 2, 3, 5, 8, 50]
-        res = divideKeys(keys, 0)
-        self.assertEqual(res, [])
-        res = keysToRanges(res)
-        self.assertEqual(res, [])
-
-        res = divideKeys(keys, 1)
-        self.assertEqual(res, [[0, 1, 2, 3, 5, 8, 50]])
-        res = keysToRanges(res)
-        self.assertEqual(res, [[0, 50]])
-
-        res = divideKeys(keys, 2)
-        self.assertEqual(res, [[0, 1, 2], [3, 5, 8, 50]])
-        res = keysToRanges(res)
-        self.assertEqual(res, [[0, 2], [3, 50]])
-
-        res = divideKeys(keys, 3)
-        self.assertEqual(res, [[0, 1], [2, 3], [5, 8, 50]])
-        res = keysToRanges(res)
-        self.assertEqual(res, [[0, 1], [2, 3], [5, 50]])
-
-        res = divideKeys(keys, 7)
-        self.assertEqual(res, [[0], [1], [2], [3], [5], [8], [50]])
-        res = keysToRanges(res)
-        self.assertEqual(res, [[0, 0], [1, 1], [2, 2], [3, 3],
-                               [5, 5], [8, 8], [50, 50]])
-
-        res = divideKeys(keys, 8)
-        self.assertEqual(res, [[0], [1], [2], [3], [5], [8], [50]])
-        res = keysToRanges(res)
-        self.assertEqual(res, [[0, 0], [1, 1], [2, 2], [3, 3],
-                               [5, 5], [8, 8], [50, 50]])
-
-
-        keys = [0, 1, 2, 2, 3, 5, 8, 50, 50]
-        res = divideKeys(keys, 0)
-        self.assertEqual(res, [])
-        res = keysToRanges(res)
-        self.assertEqual(res, [])
-
-        res = divideKeys(keys, 1)
-        self.assertEqual(res, [[0, 1, 2, 2, 3, 5, 8, 50, 50]])
-        res = keysToRanges(res)
-        self.assertEqual(res, [[0, 50]])
-
-        res = divideKeys(keys, 2)
-        self.assertEqual(res, [[0, 1, 2, 2], [3, 5, 8, 50, 50]])
-        res = keysToRanges(res)
-        self.assertEqual(res, [[0, 2], [3, 50]])
-
-        res = divideKeys(keys, 3)
-        self.assertEqual(res, [[0, 1, 2], [2, 3, 5], [8, 50, 50]])
-        res = keysToRanges(res)
-        self.assertEqual(res, [[0, 2], [2, 5], [8, 50]])
-
-        res = divideKeys(keys, 9)
-        self.assertEqual(res, [[0], [1], [2], [2], [3], [5], [8], [50], [50]])
-        res = keysToRanges(res)
-        self.assertEqual(res, [[0, 0], [1, 1], [2, 2], [2, 2], [3, 3],
-                               [5, 5], [8, 8], [50, 50], [50, 50]])
-
-        res = divideKeys(keys, 10)
-        self.assertEqual(res, [[0], [1], [2], [2], [3], [5], [8], [50], [50]])
-        res = keysToRanges(res)
-        self.assertEqual(res, [[0, 0], [1, 1], [2, 2], [2, 2], [3, 3],
-                               [5, 5], [8, 8], [50, 50], [50, 50]])
-
-
     @unittest.skipIf(not _internal_data(),
                      "Internal data not available")
     def test_load_sts_from_extres(self):
@@ -2666,8 +2006,10 @@ class TestDataSet(unittest.TestCase):
         ss3 = ss + ss2
         self.assertEqual(ss3.metadata.summaryStats.readLenDist.bins,
                          [b1 + b2 for b1, b2 in
-                          zip(ss.metadata.summaryStats.readLenDist.bins,
-                              ss2.metadata.summaryStats.readLenDist.bins)])
+                          itertools.izip_longest(
+                              ss.metadata.summaryStats.readLenDist.bins,
+                              ss2.metadata.summaryStats.readLenDist.bins,
+                              fillvalue=0)])
         ss4 = SubreadSet(outXml, outXml2)
 
         # one full one partial
@@ -2684,8 +2026,10 @@ class TestDataSet(unittest.TestCase):
         ss3 = ss + ss2
         self.assertEqual(ss3.metadata.summaryStats.readLenDist.bins,
                          [b1 + b2 for b1, b2 in
-                          zip(ss.metadata.summaryStats.readLenDist.bins,
-                              ss2.metadata.summaryStats.readLenDist.bins)])
+                          itertools.izip_longest(
+                              ss.metadata.summaryStats.readLenDist.bins,
+                              ss2.metadata.summaryStats.readLenDist.bins,
+                              fillvalue=0)])
         ss4 = SubreadSet(outXml, outXml2)
 
         # two full
@@ -2751,29 +2095,3 @@ class TestDataSet(unittest.TestCase):
             sset.write(resXml, validate=False)
             with self.assertRaises(InvalidDataSetIOError):
                 sset = SubreadSet(resXml)
-
-
-    @unittest.skip("Too expensive")
-    def test_huge_zmw_split(self):
-        human = ('/pbi/dept/secondary/siv/testdata/SA3-DS/'
-                 'human/JCV_85x_v030/jcv_85x_v030.subreadset.xml')
-        sset = SubreadSet(human)
-        ssets = sset.split(zmws=True, maxChunks=5)
-
-    def test_get_dataset_uuid(self):
-        ds = SubreadSet(upstreamdata.getUnalignedBam(), strict=True)
-        ds_file = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
-        ds.write(ds_file)
-        uuid = getDataSetUuid(ds_file)
-        self.assertEqual(uuid, ds.uuid)
-        with open(ds_file, "w") as out:
-            out.write("hello world!")
-        uuid = getDataSetUuid(ds_file)
-        self.assertEqual(uuid, None)
-
-    def test_get_dataset_metatype(self):
-        ds = SubreadSet(upstreamdata.getUnalignedBam(), strict=True)
-        ds_file = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
-        ds.write(ds_file)
-        meta_type = getDataSetMetaType(ds_file)
-        self.assertEqual(meta_type, "PacBio.DataSet.SubreadSet")
diff --git a/tests/test_pbdataset_filters.py b/tests/test_pbdataset_filters.py
new file mode 100644
index 0000000..afbbb02
--- /dev/null
+++ b/tests/test_pbdataset_filters.py
@@ -0,0 +1,373 @@
+
+
+import logging
+import tempfile
+import unittest
+from unittest.case import SkipTest
+
+import numpy as np
+
+from pbcore.io import DataSet, SubreadSet, ReferenceSet, AlignmentSet
+from pbcore.io.dataset.DataSetMembers import Filters
+import pbcore.data.datasets as data
+import pbcore.data as upstreamdata
+
+from utils import _pbtestdata, _check_constools, _internal_data
+
+log = logging.getLogger(__name__)
+
+
+class TestDataSetFilters(unittest.TestCase):
+    """Unit and integrationt tests for the DataSet class and \
+    associated module functions"""
+
+    def test_addFilters(self):
+        ds1 = DataSet()
+        filt = Filters()
+        filt.addRequirement(rq=[('>', '0.85')])
+        ds1.addFilters(filt)
+        self.assertEquals(str(ds1.filters), '( rq > 0.85 )')
+        # Or added from a source XML
+        ds2 = DataSet(data.getXml(16))
+        self.assertTrue(str(ds2.filters).startswith(
+            '( rname = E.faecalis'))
+
+    def test_setFilters(self):
+        ds1 = DataSet()
+        filt = Filters()
+        filt.addRequirement(rq=[('>', '0.85')])
+        ds1.addFilters(filt)
+        self.assertEquals(str(ds1.filters), '( rq > 0.85 )')
+        # Or added from a source XML
+        ds2 = DataSet()
+        ds2.filters = ds1.filters
+        self.assertEquals(str(ds2.filters), '( rq > 0.85 )')
+
+    def test_add_double_bound_filters(self):
+        ds1 = AlignmentSet(data.getXml(8))
+        ds1.filters.addRequirement(rq=[('>', '0.85'),
+                                       ('<', '0.99')])
+        self.assertEquals(str(ds1.filters), '( rq > 0.85 ) OR ( rq < 0.99 )')
+
+        ds1 = AlignmentSet(data.getXml(8))
+        self.assertEquals(str(ds1.filters), '')
+        ds1.filters.addFilter(rq=[('>', '0.85'),
+                                  ('<', '0.99')])
+        self.assertEquals(str(ds1.filters), '( rq > 0.85 AND rq < 0.99 )')
+
+        ds1.filters.addFilter(length=[('>', '1000')])
+        self.assertEquals(str(ds1.filters),
+                          '( rq > 0.85 AND rq < 0.99 ) OR ( length > 1000 )')
+
+        ds1.filters.removeFilter(0)
+        self.assertEquals(str(ds1.filters),
+                          '( length > 1000 )')
+
+    def test_n_subreads_filter(self):
+        ds2 = AlignmentSet(data.getXml(8))
+        ds2.filters.addRequirement(n_subreads=[('>', '4')])
+        self.assertEqual(len(list(ds2.records)), 87)
+        self.assertEqual(len(ds2), 87)
+
+    def test_filter(self):
+        ds2 = AlignmentSet(data.getXml(8))
+        ds2.filters.addRequirement(rname=[('=', 'E.faecalis.1')])
+        self.assertEqual(len(list(ds2.records)), 20)
+        ds2.disableFilters()
+        self.assertEqual(len(list(ds2.records)), 92)
+        ds2.enableFilters()
+        self.assertEqual(len(list(ds2.records)), 20)
+
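+    # The cx filters below use the LocalContextFlags convention exercised by
+    # the assertions: bit 1 means an adapter before the subread and bit 2 an
+    # adapter after it, so '&' keeps reads with the given bits set while '~'
+    # keeps reads where they are clear.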
+    def test_context_filters(self):
+        ss = SubreadSet(upstreamdata.getUnalignedBam())
+        self.assertEqual(set(ss.index.contextFlag), {0, 1, 2, 3})
+        self.assertEqual(
+            [len(np.flatnonzero(ss.index.contextFlag == cx))
+             for cx in sorted(set(ss.index.contextFlag))],
+            [15, 33, 32, 37])
+        self.assertEqual(len(ss.index), 117)
+
+        # no adapters/barcodes
+        ss.filters.addRequirement(cx=[('=', 0)])
+        self.assertEqual(len(ss.index), 15)
+        ss.filters.removeRequirement('cx')
+        self.assertEqual(len(ss.index), 117)
+
+        # no adapters/barcodes
+        ss.filters.addRequirement(cx=[('=', 'NO_LOCAL_CONTEXT')])
+        self.assertEqual(len(ss.index), 15)
+        ss.filters.removeRequirement('cx')
+        self.assertEqual(len(ss.index), 117)
+
+        # some adapters/barcodes
+        ss.filters.addRequirement(cx=[('!=', 0)])
+        self.assertEqual(len(ss.index), 102)
+        ss.filters.removeRequirement('cx')
+        self.assertEqual(len(ss.index), 117)
+
+        # adapter before
+        ss.filters.addRequirement(cx=[('&', 1)])
+        self.assertEqual(len(ss.index), 70)
+        ss.filters.removeRequirement('cx')
+        self.assertEqual(len(ss.index), 117)
+
+        # adapter before
+        ss.filters.addRequirement(cx=[('&', 'ADAPTER_BEFORE')])
+        self.assertEqual(len(ss.index), 70)
+        ss.filters.removeRequirement('cx')
+        self.assertEqual(len(ss.index), 117)
+
+        # adapter after
+        ss.filters.addRequirement(cx=[('&', 2)])
+        self.assertEqual(len(ss.index), 69)
+        ss.filters.removeRequirement('cx')
+        self.assertEqual(len(ss.index), 117)
+
+        # adapter before or after
+        ss.filters.addRequirement(cx=[('&', 3)])
+        self.assertEqual(len(ss.index), 102)
+        ss.filters.removeRequirement('cx')
+        self.assertEqual(len(ss.index), 117)
+
+        # adapter before or after
+        ss.filters.addRequirement(cx=[('&', 'ADAPTER_BEFORE | ADAPTER_AFTER')])
+        self.assertEqual(len(ss.index), 102)
+        ss.filters.removeRequirement('cx')
+        self.assertEqual(len(ss.index), 117)
+
+        # adapter before or after but not both
+        ss.filters.addRequirement(cx=[('!=', 0)])
+        ss.filters.addRequirement(cx=[('~', 1),
+                                      ('~', 2)])
+        self.assertEqual(len(ss.index), 65)
+        ss.filters.removeRequirement('cx')
+        self.assertEqual(len(ss.index), 117)
+
+        # adapter before or after
+        ss.filters.addRequirement(cx=[('&', 1),
+                                      ('&', 2)])
+        self.assertEqual(len(ss.index), 102)
+        ss.filters.removeRequirement('cx')
+        self.assertEqual(len(ss.index), 117)
+
+        # adapter before and after
+        ss.filters.addRequirement(cx=[('&', 1)])
+        ss.filters.addRequirement(cx=[('&', 2)])
+        self.assertEqual(len(ss.index), 37)
+        ss.filters.removeRequirement('cx')
+        self.assertEqual(len(ss.index), 117)
+
+        # adapter before but not after
+        ss.filters.addRequirement(cx=[('&', 1)])
+        ss.filters.addRequirement(cx=[('~', 2)])
+        self.assertEqual(len(ss.index), 33)
+        ss.filters.removeRequirement('cx')
+        self.assertEqual(len(ss.index), 117)
+
+        # no adapter before
+        ss.filters.addRequirement(cx=[('~', 1)])
+        self.assertEqual(len(ss.index), 47)
+        ss.filters.removeRequirement('cx')
+        self.assertEqual(len(ss.index), 117)
+
+        # no adapter before or after
+        ss.filters.addRequirement(cx=[('~', 1)])
+        ss.filters.addRequirement(cx=[('~', 2)])
+        self.assertEqual(len(ss.index), 15)
+        ss.filters.removeRequirement('cx')
+        self.assertEqual(len(ss.index), 117)
+
+        # no adapter before or after
+        ss.filters.addRequirement(cx=[('~', 3)])
+        self.assertEqual(len(ss.index), 15)
+        ss.filters.removeRequirement('cx')
+        self.assertEqual(len(ss.index), 117)
+
+    def test_filter_reference_contigs(self):
+        ds2 = ReferenceSet(data.getRef())
+        self.assertEqual(len(list(ds2.refNames)), 59)
+        self.assertEqual(len(list(ds2.records)), len(ds2.index))
+        filt = Filters()
+        filt.addRequirement(id=[('==', 'E.faecalis.1')])
+        ds2.addFilters(filt)
+        self.assertEqual(str(ds2.filters),
+                         "( id == E.faecalis.1 )")
+        self.assertEqual(len(ds2.refNames), 1)
+        self.assertEqual(len(list(ds2.records)), 1)
+        self.assertEqual(len(list(ds2.records)), len(ds2.index))
+        ds2.disableFilters()
+        self.assertEqual(len(list(ds2.refNames)), 59)
+        self.assertEqual(len(list(ds2.records)), 59)
+        self.assertEqual(len(list(ds2.records)), len(ds2.index))
+        ds2.enableFilters()
+        self.assertEqual(len(list(ds2.refNames)), 1)
+        self.assertEqual(len(list(ds2.records)), 1)
+        self.assertEqual(len(list(ds2.records)), len(ds2.index))
+
+    @unittest.skipIf(not _internal_data(),
+                     "Internal data not available")
+    def test_qname_filter_scaling(self):
+        # unaligned bam
+        bam0 = ("/pbi/dept/secondary/siv/testdata/"
+                "SA3-DS/ecoli/2590956/0003/"
+                "Analysis_Results/m140913_222218_42240_c10069"
+                "9952400000001823139203261564_s1_p0.all.subreadset.xml")
+        bam1 = ("/pbi/dept/secondary/siv/testdata/"
+                "SA3-DS/ecoli/2590953/0001/"
+                "Analysis_Results/m140913_005018_42139_c10071"
+                "3652400000001823152404301534_s1_p0.all.subreadset.xml")
+        sset = SubreadSet(bam0, bam1)
+        self.assertEqual(len(sset), 178570)
+        size = 10
+        qn = [r.qName for r in sset[:size]]
+        good_qn = [('=', name) for name in qn]
+        sset.filters.addRequirement(qname=good_qn)
+        self.assertEqual(size, sum(1 for _ in sset))
+        self.assertEqual(size, len(sset))
+
+
+        sset = SubreadSet(data.getXml(10))
+        self.assertEqual(len(sset), 92)
+        size = 10
+        qn = [r.qName for r in sset[:size]]
+        good_qn = [('=', name) for name in qn]
+        sset.filters.addRequirement(qname=good_qn)
+        self.assertEqual(size, sum(1 for _ in sset))
+        self.assertEqual(size, len(sset))
+
+
+    @unittest.skipIf(not _internal_data(),
+                     "Internal data not available")
+    def test_movie_filter(self):
+        # unaligned bam
+        bam0 = ("/pbi/dept/secondary/siv/testdata/"
+                "SA3-DS/ecoli/2590956/0003/"
+                "Analysis_Results/m140913_222218_42240_c10069"
+                "9952400000001823139203261564_s1_p0.all.subreadset.xml")
+        bam1 = ("/pbi/dept/secondary/siv/testdata/"
+                "SA3-DS/ecoli/2590953/0001/"
+                "Analysis_Results/m140913_005018_42139_c10071"
+                "3652400000001823152404301534_s1_p0.all.subreadset.xml")
+        aln = SubreadSet(bam0, bam1)
+        self.assertEqual(len(set(aln.readGroupTable['ID'])),
+                         len(aln.readGroupTable['ID']))
+        self.assertEqual(len(set(aln.readGroupTable['ID'])), 2)
+        self.assertEqual(len(set(aln.readGroupTable['ID'])),
+                         len(set(aln.index.qId)))
+        self.assertEqual(len(aln), 178570)
+        aln.filters.addRequirement(movie=[(
+            '=',
+            'm140913_005018_42139_c100713652400000001823152404301534_s1_p0')])
+        self.assertEqual(len(SubreadSet(bam1)), len(aln))
+
+        # aligned bam
+        #bam0 = ("/pbi/dept/secondary/siv/testdata/"
+        #        "SA3-DS/ecoli/2590956/0003/Alignment_Results/"
+        #        "m140913_222218_42240_c1006999524000000018231"
+        #        "39203261564_s1_p0.all.alignmentset.xml")
+        bam0 = upstreamdata.getBamAndCmpH5()[0]
+        bam1 = ("/pbi/dept/secondary/siv/testdata/"
+                "SA3-DS/ecoli/2590953/0001/Alignment_Results/"
+                "m140913_005018_42139_c1007136524000000018231"
+                "52404301534_s1_p0.all.alignmentset.xml")
+        aln = AlignmentSet(bam0, bam1)
+        self.assertEqual(len(set(aln.readGroupTable['ID'])),
+                         len(aln.readGroupTable['ID']))
+        self.assertEqual(len(set(aln.readGroupTable['ID'])), 2)
+        self.assertEqual(len(set(aln.readGroupTable['ID'])),
+                         len(set(aln.index.qId)))
+        self.assertEqual(len(aln), 103144)
+        aln.filters.addRequirement(movie=[(
+            '=',
+            'm140913_005018_42139_c100713652400000001823152404301534_s1_p0')])
+        self.assertEqual(len(AlignmentSet(bam1)), len(aln))
+
+        # cmpH5
+        cmp1 = upstreamdata.getBamAndCmpH5()[1]
+        cmp2 = ("/pbi/dept/secondary/siv/testdata/"
+                "genomic_consensus-unittest/bam_c4p6_tests/"
+                "ecoli_c4p6.cmp.h5")
+        aln = AlignmentSet(cmp1, cmp2)
+        self.assertEqual(len(set(aln.readGroupTable['ID'])),
+                         len(aln.readGroupTable['ID']))
+        self.assertEqual(len(set(aln.readGroupTable['ID'])),
+                         len(set(aln.index.MovieID)))
+        self.assertEqual(len(set(aln.readGroupTable['ID'])), 2)
+        self.assertEqual(len(aln), 57147)
+        aln.filters.addRequirement(movie=[(
+            '=',
+            'm140905_042212_sidney_c100564852550000001823085912221377_s1_X0')])
+        len1 = len(AlignmentSet(cmp1))
+        self.assertEqual(len1, len(aln))
+
+        aln.filters.removeRequirement('movie')
+        self.assertEqual(len(aln), 57147)
+
+    def test_accuracy_filter(self):
+        aln = AlignmentSet(data.getXml(12))
+        self.assertEqual(len(list(aln)), 177)
+        aln.filters.addRequirement(accuracy=[('>', '.85')])
+        self.assertEqual(len(list(aln)), 174)
+
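+    # Membership ('in') requirements are exercised below with numpy arrays,
+    # lists and sets of hole numbers as well as qname strings, and should
+    # behave identically regardless of the container type.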
+    def test_membership_filter(self):
+        aln = AlignmentSet(data.getXml(12))
+        self.assertEqual(len(list(aln)), 177)
+        hns = np.unique(aln.index.holeNumber)[:1]
+        aln.filters.addRequirement(zm=[('in', hns)])
+        self.assertEqual(len(list(aln)), 5)
+
+        aln = AlignmentSet(data.getXml(12))
+        self.assertEqual(len(list(aln)), 177)
+        hns = np.unique(aln.index.holeNumber)
+        aln.filters.addRequirement(zm=[('in', hns)])
+        self.assertEqual(len(list(aln)), 177)
+
+        aln = AlignmentSet(data.getXml(12))
+        self.assertEqual(len(list(aln)), 177)
+        hns = np.unique(aln.index.holeNumber)
+        hns = [n for _ in range(10000) for n in hns]
+        hns = np.array(hns)
+        aln.filters.addRequirement(zm=[('in', hns)])
+        self.assertEqual(len(list(aln)), 177)
+
+        aln = AlignmentSet(data.getXml(12))
+        self.assertEqual(len(list(aln)), 177)
+        hns = np.unique(aln.index.holeNumber)[:1]
+        hns = list(hns)
+        aln.filters.addRequirement(zm=[('in', hns)])
+        self.assertEqual(len(list(aln)), 5)
+
+        aln = AlignmentSet(data.getXml(12))
+        self.assertEqual(len(list(aln)), 177)
+        hns = np.unique(aln.index.holeNumber)[:1]
+        hns = set(hns)
+        aln.filters.addRequirement(zm=[('in', hns)])
+        self.assertEqual(len(list(aln)), 5)
+
+        aln = AlignmentSet(data.getXml(12))
+        self.assertEqual(len(list(aln)), 177)
+        qnames = [r.qName for r in aln[:10]]
+        aln.filters.addRequirement(qname=[('in', qnames)])
+        self.assertEqual(len(list(aln)), 10)
+
+        fn = tempfile.NamedTemporaryFile(suffix="alignmentset.xml").name
+        aln = AlignmentSet(data.getXml(12))
+        self.assertEqual(len(list(aln)), 177)
+        hns = np.unique(aln.index.holeNumber)[:1]
+        aln.filters.addRequirement(zm=[('in', hns)])
+        aln.write(fn)
+        aln.close()
+        aln2 = AlignmentSet(fn)
+        self.assertEqual(len(list(aln2)), 5)
+
+    def test_contigset_filter(self):
+        ref = ReferenceSet(data.getXml(9))
+        self.assertEqual(len(list(ref)), 59)
+        ref.filters.addRequirement(length=[('>', '1450')])
+        self.assertEqual(len(list(ref)), 34)
+
+    def test_cmp_alignmentset_filters(self):
+        aln = AlignmentSet(upstreamdata.getBamAndCmpH5()[1], strict=True)
+        self.assertEqual(len(aln), 112)
+        aln.filters.addRequirement(length=[('>=', 1000)])
+        self.assertEqual(len(aln), 12)
diff --git a/tests/test_pbdataset_metadata.py b/tests/test_pbdataset_metadata.py
new file mode 100644
index 0000000..e20f7c1
--- /dev/null
+++ b/tests/test_pbdataset_metadata.py
@@ -0,0 +1,114 @@
+
+import logging
+import unittest
+import tempfile
+import copy
+
+from pbcore.io import SubreadSet, AlignmentSet
+from pbcore.io.dataset.DataSetErrors import InvalidDataSetIOError
+from pbcore.io.dataset.DataSetMembers import CollectionMetadata
+import pbcore.data.datasets as data
+from pbcore.io.dataset.DataSetValidator import validateFile
+
+from utils import _pbtestdata, _check_constools, _internal_data
+
+log = logging.getLogger(__name__)
+
+class TestDataSet(unittest.TestCase):
+    """Unit and integrationt tests for the DataSet class and \
+    associated module functions"""
+
+    def test_existing(self):
+        ds = SubreadSet(data.getSubreadSet(), skipMissing=True)
+        self.assertEqual(ds.metadata.bioSamples[0].name,
+                         'consectetur purus')
+
+        self.assertTrue(ds.metadata.collections[0].getV('children',
+                                                        'Automation'))
+        self.assertTrue(ds.metadata.collections[0].automation)
+        ds.metadata.collections[
+            0].automation.automationParameters.addParameter('foo', 'bar')
+        self.assertEqual(
+            ds.metadata.collections[
+                0].automation.automationParameters['foo'].value,
+            'bar')
+        self.assertEqual(
+            ds.metadata.collections[
+                0].automation.automationParameters.parameterNames,
+            [None, 'foo'])
+
+
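+    # Builds a CollectionMetadata record from scratch, fills in the common
+    # fields and writes the result to check that de novo metadata survives
+    # serialization.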
+    def test_de_novo(self):
+        ofn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
+        log.info(ofn)
+        ss = SubreadSet(data.getXml(10))
+        col = CollectionMetadata()
+        self.assertFalse(ss.metadata.collections)
+
+        ss.metadata.collections.append(col)
+        self.assertTrue(ss.metadata.collections)
+
+        col.cellIndex = 1
+        self.assertEqual(ss.metadata.collections[0].cellIndex, 1)
+
+        col.instrumentName = "foo"
+        self.assertEqual(ss.metadata.collections[0].instrumentName, "foo")
+
+        col.context = 'bar'
+        self.assertEqual(ss.metadata.collections[0].context, "bar")
+
+        ss.metadata.collections[0].runDetails.name = 'foo'
+        self.assertEqual('foo', ss.metadata.collections[0].runDetails.name)
+
+        ss.metadata.collections[0].wellSample.name = 'bar'
+        self.assertEqual('bar', ss.metadata.collections[0].wellSample.name)
+
+        ss.metadata.collections[0].wellSample.wellName = 'baz'
+        self.assertEqual('baz', ss.metadata.collections[0].wellSample.wellName)
+
+        ss.metadata.collections[0].wellSample.concentration = 'baz'
+        self.assertEqual('baz',
+                         ss.metadata.collections[0].wellSample.concentration)
+        ss.write(ofn, validate=False)
+
+    @unittest.skipIf(not _internal_data(),
+                     "Internal data not available")
+    def test_loadMetadata(self):
+        aln = AlignmentSet(data.getXml(no=8))
+        self.assertFalse(aln.metadata.collections)
+        aln.loadMetadata('/pbi/dept/secondary/siv/testdata/'
+                         'SA3-Sequel/lambda/roche_SAT/'
+                         'm54013_151205_032353.run.metadata.xml')
+        self.assertTrue(aln.metadata.collections)
+        sset_fn = ('/pbi/dept/secondary/siv/testdata/'
+                'SA3-Sequel/lambda/roche_SAT/'
+                'm54013_151205_032353.subreadset.xml')
+        sset = SubreadSet(sset_fn)
+        orig_metadata = copy.deepcopy(sset.metadata)
+        sset.metadata.collections = None
+        self.assertFalse(sset.metadata.collections)
+        sset.loadMetadata('/pbi/dept/secondary/siv/testdata/'
+                          'SA3-Sequel/lambda/roche_SAT/'
+                          'm54013_151205_032353.run.metadata.xml')
+        stack = zip(sset.metadata, orig_metadata)
+        fn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
+        sset.write(fn)
+        validateFile(fn)
+        validateFile(sset_fn)
+        self.assertEqual(sset.metadata, orig_metadata)
+
+
+        # load the wrong thing...
+        sset_fn = ('/pbi/dept/secondary/siv/testdata/'
+                'SA3-Sequel/lambda/roche_SAT/'
+                'm54013_151205_032353.subreadset.xml')
+        sset = SubreadSet(sset_fn)
+        orig_metadata = copy.deepcopy(sset.metadata)
+        sset.metadata.collections = None
+        self.assertFalse(sset.metadata.collections)
+        with self.assertRaises(InvalidDataSetIOError):
+            sset.loadMetadata('/pbi/dept/secondary/siv/testdata/'
+                              'SA3-Sequel/lambda/roche_SAT/'
+                              'm54013_151205_032353.sts.xml')
+
+
diff --git a/tests/test_pbdataset_split.py b/tests/test_pbdataset_split.py
new file mode 100644
index 0000000..9ca3b26
--- /dev/null
+++ b/tests/test_pbdataset_split.py
@@ -0,0 +1,506 @@
+
+import os
+import logging
+import tempfile
+import unittest
+from unittest.case import SkipTest
+
+import numpy as np
+
+from pbcore.io import openIndexedAlignmentFile
+from pbcore.io import (DataSet, SubreadSet, ReferenceSet, AlignmentSet,
+                       openDataSet, HdfSubreadSet,
+                       openDataFile)
+import pbcore.data.datasets as data
+import pbcore.data as upstreamdata
+
+from utils import _pbtestdata, _check_constools, _internal_data
+
+log = logging.getLogger(__name__)
+
+class TestDataSetSplit(unittest.TestCase):
+    """Unit and integrationt tests for the DataSet class and \
+    associated module functions"""
+
+    def test_split(self):
+        ds1 = openDataSet(data.getXml(12))
+        self.assertTrue(ds1.numExternalResources > 1)
+        dss = ds1.split()
+        self.assertTrue(len(dss) == ds1.numExternalResources)
+        self.assertEqual(sum(ds.numRecords for ds in dss), ds1.numRecords)
+        self.assertEqual(sum(ds.totalLength for ds in dss), ds1.totalLength)
+        self.assertEqual(sum(len(ds) for ds in dss), len(ds1))
+        dss = ds1.split(chunks=1)
+        self.assertTrue(len(dss) == 1)
+        self.assertEqual(sum(ds.numRecords for ds in dss), ds1.numRecords)
+        self.assertEqual(sum(ds.totalLength for ds in dss), ds1.totalLength)
+        self.assertEqual(sum(len(ds) for ds in dss), len(ds1))
+        dss = ds1.split(chunks=2)
+        self.assertTrue(len(dss) == 2)
+        self.assertEqual(sum(ds.numRecords for ds in dss), ds1.numRecords)
+        self.assertEqual(sum(ds.totalLength for ds in dss), ds1.totalLength)
+        self.assertEqual(sum(len(ds) for ds in dss), len(ds1))
+        dss = ds1.split(chunks=2, ignoreSubDatasets=True)
+        self.assertTrue(len(dss) == 2)
+        self.assertEqual(sum(ds.numRecords for ds in dss), ds1.numRecords)
+        self.assertEqual(sum(ds.totalLength for ds in dss), ds1.totalLength)
+        self.assertEqual(sum(len(ds) for ds in dss), len(ds1))
+        self.assertFalse(dss[0].uuid == dss[1].uuid)
+        self.assertTrue(dss[0].name == dss[1].name)
+        # Lets try merging and splitting on subdatasets
+        ds1 = openDataSet(data.getXml(8))
+        self.assertEquals(ds1.totalLength, 123588)
+        ds1tl = ds1.totalLength
+        ds2 = openDataSet(data.getXml(11))
+        self.assertEquals(ds2.totalLength, 117086)
+        ds2tl = ds2.totalLength
+        dss = ds1 + ds2
+        self.assertTrue(dss.totalLength == (ds1tl + ds2tl))
+        ds1, ds2 = sorted(dss.split(2, ignoreSubDatasets=False),
+                          key=lambda x: x.totalLength,
+                          reverse=True)
+        self.assertTrue(ds1.totalLength == ds1tl)
+        self.assertTrue(ds2.totalLength == ds2tl)
+
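+    # Splitting on ZMWs should yield chunks covering disjoint hole-number
+    # ranges per movie; the interspan checks below confirm that no records
+    # fall between consecutive chunk boundaries.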
+    def test_split_zmws(self):
+        N_RECORDS = 117
+        test_file = upstreamdata.getUnalignedBam()
+        ds1 = openDataFile(test_file)
+        self.assertEqual(len([r for r in ds1]), N_RECORDS)
+        self.assertEqual(len(ds1), N_RECORDS)
+        dss = ds1.split(chunks=1, zmws=True)
+        self.assertEqual(len(dss), 1)
+        self.assertEqual(sum([len([r for r in ds_]) for ds_ in dss]),
+                         N_RECORDS)
+        self.assertEqual(sum([len(ds_) for ds_ in dss]),
+                         N_RECORDS)
+
+        # We have a lower limit on the number of zmws, now
+        dss = ds1.split(chunks=12, zmws=True)
+        self.assertEqual(len(dss), 2)
+        self.assertEqual(sum([len([r for r in ds_]) for ds_ in dss]),
+                         N_RECORDS)
+        self.assertEqual(sum([len(ds_) for ds_ in dss]),
+                         N_RECORDS)
+        self.assertEqual(
+            dss[0].zmwRanges,
+            [('m140905_042212_sidney_c100564852550000001823085912221377_s1_X0',
+              1650, 32328)])
+        self.assertEqual(
+            dss[-1].zmwRanges,
+            [('m140905_042212_sidney_c100564852550000001823085912221377_s1_X0',
+              32560, 54396)])
+        ranges = sorted([c.zmwRanges[0][1:] for c in dss])
+        interspans = []
+        last = None
+        for rg in ranges:
+            if last is not None:
+                interspans.append((last, rg[0]))
+                self.assertFalse(last == rg[0])
+            last = rg[1]
+        for rg in interspans:
+            self.assertEqual(len(np.nonzero(np.logical_and(
+                ds1.index.holeNumber < rg[1],
+                ds1.index.holeNumber > rg[0]))[0]), 0)
+
+    #@unittest.skipUnless(os.path.isdir("/pbi/dept/secondary/siv/testdata"),
+    #                     "Missing testdata directory")
+    @unittest.skip("Too expensive")
+    def test_large_split_zmws(self):
+        N_RECORDS = 959539
+        test_file = ("/pbi/dept/secondary/siv/testdata/SA3-DS/lambda/"
+                     "2372215/0007/Analysis_Results/m150404_101626_42"
+                     "267_c100807920800000001823174110291514_s1_p0.al"
+                     "l.subreadset.xml")
+        ds1 = openDataFile(test_file)
+        self.assertEqual(len(ds1), N_RECORDS)
+        dss = ds1.split(chunks=1, zmws=True)
+        self.assertEqual(len(dss), 1)
+        self.assertEqual(sum([len(ds_) for ds_ in dss]),
+                         N_RECORDS)
+        dss = ds1.split(chunks=12, zmws=True)
+        self.assertEqual(len(dss), 12)
+        self.assertEqual(sum([len(ds_) for ds_ in dss]),
+                         N_RECORDS)
+        self.assertEqual(
+            dss[0].zmwRanges,
+            [('m150404_101626_42267_c100807920800000001823174110291514_s1_p0',
+              7, 14007)])
+        self.assertEqual(
+            dss[-1].zmwRanges,
+            [('m150404_101626_42267_c100807920800000001823174110291514_s1_p0',
+              149876, 163475)])
+        ranges = sorted([c.zmwRanges[0][1:] for c in dss])
+        interspans = []
+        last = None
+        for rg in ranges:
+            if last is not None:
+                interspans.append((last, rg[0]))
+                self.assertFalse(last == rg[0])
+            last = rg[1]
+        for rg in interspans:
+            self.assertEqual(len(np.nonzero(np.logical_and(
+                ds1.index.holeNumber < rg[1],
+                ds1.index.holeNumber > rg[0]))[0]), 0)
+
+
+    @unittest.skipUnless(os.path.isdir("/pbi/dept/secondary/siv/testdata"),
+                         "Missing testdata directory")
+    def test_multi_movie_split_zmws(self):
+        N_RECORDS = 1745161
+        test_file_1 = ("/pbi/dept/secondary/siv/testdata/SA3-DS/lambda/"
+                       "2372215/0007/Analysis_Results/m150404_101626_42"
+                       "267_c100807920800000001823174110291514_s1_p0.al"
+                       "l.subreadset.xml")
+        test_file_2 = ("/pbi/dept/secondary/siv/testdata/SA3-DS/lambda/"
+                       "2590980/0008/Analysis_Results/m141115_075238_et"
+                       "han_c100699872550000001823139203261572_s1_p0.al"
+                       "l.subreadset.xml")
+        ds1 = SubreadSet(test_file_1, test_file_2)
+        # used to get total:
+        #self.assertEqual(sum(1 for _ in ds1), N_RECORDS)
+        self.assertEqual(len(ds1), N_RECORDS)
+        dss = ds1.split(chunks=1, zmws=True)
+        self.assertEqual(len(dss), 1)
+        self.assertEqual(sum([len(ds_) for ds_ in dss]),
+                         N_RECORDS)
+
+        dss = ds1.split(chunks=12, zmws=True)
+        self.assertEqual(len(dss), 12)
+        self.assertEqual(sum([len(ds_) for ds_ in dss]),
+                         N_RECORDS)
+        self.assertEqual(
+            dss[0].zmwRanges,
+            [('m150404_101626_42267_c100807920800000001823174110291514_s1_p0',
+              7, 22098)])
+        self.assertEqual(
+            dss[-1].zmwRanges,
+            [('m141115_075238_ethan_c100699872550000001823139203261572_s1_p0',
+              127814, 163468)])
+
+
+    @SkipTest
+    def test_split_by_contigs_presplit(self):
+        # Consumes too much memory for Jenkins
+
+        # Test to make sure the result of a split by contigs has an appropriate
+        # number of records (make sure filters are appropriately aggressive)
+        ds2 = DataSet(data.getXml(15))
+        bams = ds2.externalResources.resourceIds
+        self.assertEqual(len(bams), 2)
+        refwindows = ds2.refWindows
+        self.assertEqual(refwindows, [(0, 0, 224992)])
+        res1 = openIndexedAlignmentFile(bams[0][7:])
+        res2 = openIndexedAlignmentFile(bams[1][7:])
+        def count(iterable):
+            count = 0
+            for _ in iterable:
+                count += 1
+            return count
+        self.assertEqual(count(res1.readsInRange(*refwindows[0])), 1409)
+        self.assertEqual(count(res2.readsInRange(*refwindows[0])), 1375)
+        self.assertEqual(count(ds2.readsInRange(*refwindows[0])), 2784)
+        self.assertEqual(count(ds2.records), 2784)
+        ds2.disableFilters()
+        self.assertEqual(count(ds2.records), 53552)
+        self.assertEqual(ds2.countRecords(), 53552)
+
+    def test_split_by_contigs_with_split_and_maxChunks(self):
+        # test to make sure the refWindows work when chunks == # refs
+        ds3 = AlignmentSet(data.getBam())
+        dss = ds3.split(contigs=True)
+        self.assertEqual(len(dss), 12)
+        refWindows = sorted(reduce(lambda x, y: x + y,
+                                   [ds.refWindows for ds in dss]))
+        # not all references have something mapped to them, refWindows doesn't
+        # care...
+        self.assertNotEqual(refWindows, sorted(ds3.refWindows))
+        self.assertEqual(refWindows,
+            [('B.vulgatus.4', 0, 1449), ('B.vulgatus.5', 0, 1449),
+             ('C.beijerinckii.13', 0, 1433), ('C.beijerinckii.14', 0, 1433),
+             ('C.beijerinckii.9', 0, 1433), ('E.coli.6', 0, 1463),
+             ('E.faecalis.1', 0, 1482), ('E.faecalis.2', 0, 1482),
+             ('R.sphaeroides.1', 0, 1386), ('S.epidermidis.2', 0, 1472),
+             ('S.epidermidis.3', 0, 1472), ('S.epidermidis.4', 0, 1472)])
+        old_refWindows = refWindows
+        random_few = [('C.beijerinckii.13', 0, 1433),
+                      ('B.vulgatus.4', 0, 1449),
+                      ('E.faecalis.1', 0, 1482)]
+
+        dss = ds3.split(contigs=True, maxChunks=1)
+        self.assertEqual(len(dss), 1)
+        refWindows = sorted(reduce(lambda x, y: x + y,
+                                   [ds.refWindows for ds in dss]))
+        self.assertEqual(refWindows, old_refWindows)
+
+        dss = ds3.split(contigs=True, maxChunks=24)
+        # This isn't expected if num refs >= 100, as map check isn't made
+        # for now (too expensive)
+        # There are only 12 refs represented in this set, however...
+        self.assertEqual(len(dss), 12)
+        refWindows = sorted(reduce(lambda x, y: x + y,
+                                   [ds.refWindows for ds in dss]))
+
+        for ref in random_few:
+            found = False
+            for window in refWindows:
+                if ref == window:
+                    found = True
+            if not found:
+                log.debug(ref)
+            self.assertTrue(found)
+
+        # test with maxchunks but no breaking contigs
+        dss = ds3.split(contigs=True, maxChunks=36)
+        self.assertEqual(len(dss), 12)
+        refWindows = sorted(reduce(lambda x, y: x + y,
+                                   [ds.refWindows for ds in dss]))
+        for ref in random_few:
+            found = False
+            for window in refWindows:
+                if ref == window:
+                    found = True
+            self.assertTrue(found)
+
+        # test with maxchunks and breaking contigs is allowed (triggers
+        # targetsize, may result in fewer chunks)
+        dss = ds3.split(contigs=True, maxChunks=36, breakContigs=True)
+        self.assertEqual(len(dss), 2)
+        refWindows = sorted(reduce(lambda x, y: x + y,
+                                   [ds.refWindows for ds in dss]))
+        for ref in random_few:
+            found = False
+            for window in refWindows:
+                if ref == window:
+                    found = True
+            self.assertTrue(found)
+
+        # test with previous setup and smaller targetSize, resulting in more
+        # chunks
+        dss = ds3.split(contigs=True, maxChunks=36, breakContigs=True,
+                        targetSize=10)
+        self.assertEqual(len(dss), 9)
+        refWindows = sorted(reduce(lambda x, y: x + y,
+                                   [ds.refWindows for ds in dss]))
+        for ref in random_few:
+            found = False
+            for window in refWindows:
+                if ref == window:
+                    found = True
+            self.assertTrue(found)
+
+        # test with byRecords and fewer chunks than atoms
+        dss = ds3.split(contigs=True, chunks=3, byRecords=True)
+        self.assertEqual(len(dss), 3)
+        refWindows = sorted(reduce(lambda x, y: x + y,
+                                   [ds.refWindows for ds in dss]))
+        for ref in random_few:
+            found = False
+            for window in refWindows:
+                if ref == window:
+                    found = True
+            self.assertTrue(found)
+
+        # test with byRecords and more chunks than atoms
+        orf = random_few
+        random_few = [('C.beijerinckii.13', 0, 747),
+                      ('B.vulgatus.4', 0, 1449),
+                      ('E.faecalis.1', 0, 742)]
+        dss = ds3.split(contigs=True, chunks=16, byRecords=True)
+        self.assertEqual(len(dss), 16)
+        refWindows = sorted(reduce(lambda x, y: x + y,
+                                   [ds.refWindows for ds in dss]))
+        for ref in random_few:
+            found = False
+            for window in refWindows:
+                if ref == window:
+                    found = True
+            self.assertTrue(found)
+
+        # test with byRecords and updateCounts
+        random_few = orf
+        dss = ds3.split(contigs=True, chunks=3, byRecords=True,
+                        updateCounts=True)
+        self.assertEqual(len(dss), 3)
+        sizes = sorted([dset.numRecords for dset in dss])
+        self.assertListEqual(sizes, [30, 31, 31])
+        refWindows = sorted(reduce(lambda x, y: x + y,
+                                   [ds.refWindows for ds in dss]))
+        for ref in random_few:
+            found = False
+            for window in refWindows:
+                if ref == window:
+                    found = True
+            self.assertTrue(found)
+
+        # test with byRefLength and updateCounts
+        random_few = orf
+        dss = ds3.split(contigs=True, chunks=3, updateCounts=True)
+        self.assertEqual(len(dss), 3)
+        sizes = sorted([dset.numRecords for dset in dss])
+        self.assertListEqual(sizes, [20, 24, 48])
+        refWindows = sorted(reduce(lambda x, y: x + y,
+                                   [ds.refWindows for ds in dss]))
+        for ref in random_few:
+            found = False
+            for window in refWindows:
+                if ref == window:
+                    found = True
+            self.assertTrue(found)
+
+    def test_split_by_contigs_with_split(self):
+        # test to make sure the refWindows work when chunks == # refs
+        ds3 = AlignmentSet(data.getBam())
+        dss = ds3.split(contigs=True)
+        self.assertEqual(len(dss), 12)
+        refWindows = sorted(reduce(lambda x, y: x + y,
+                                   [ds.refWindows for ds in dss]))
+        # not all references have something mapped to them, refWindows doesn't
+        # care...
+        self.assertNotEqual(refWindows, sorted(ds3.refWindows))
+        random_few = [('C.beijerinckii.13', 0, 1433),
+                      ('B.vulgatus.4', 0, 1449),
+                      ('E.faecalis.1', 0, 1482)]
+        for reference in random_few:
+            found = False
+            for ref in refWindows:
+                if ref == reference:
+                    found = True
+            self.assertTrue(found)
+        old_refWindows = refWindows
+
+        dss = ds3.split(contigs=True, chunks=1)
+        self.assertEqual(len(dss), 1)
+        refWindows = sorted(reduce(lambda x, y: x + y,
+                                   [ds.refWindows for ds in dss]))
+        self.assertEqual(refWindows, old_refWindows)
+
+        dss = ds3.split(contigs=True, chunks=24)
+        self.assertEqual(len(dss), 24)
+        refWindows = sorted(reduce(lambda x, y: x + y,
+                                   [ds.refWindows for ds in dss]))
+
+        random_few = [('E.faecalis.2', 0, 741),
+                      ('E.faecalis.2', 741, 1482)]
+        for ref in random_few:
+            found = False
+            for window in refWindows:
+                if ref == window:
+                    found = True
+            if not found:
+                log.debug(ref)
+            self.assertTrue(found)
+
+        dss = ds3.split(contigs=True, chunks=36)
+        self.assertEqual(len(dss), 36)
+        refWindows = sorted(reduce(lambda x, y: x + y,
+                                   [ds.refWindows for ds in dss]))
+        random_few = [('E.faecalis.2', 0, 494),
+                      ('E.faecalis.2', 494, 988),
+                      ('E.faecalis.2', 988, 1482)]
+        for ref in random_few:
+            found = False
+            for window in refWindows:
+                if ref == window:
+                    found = True
+            self.assertTrue(found)
+
+    def test_refWindows(self):
+        ds = AlignmentSet(data.getBam())
+        dss = ds.split(chunks=2, contigs=True)
+        self.assertEqual(len(dss), 2)
+        log.debug(dss[0].filters)
+        log.debug(dss[1].filters)
+        self.assertTrue(
+            '( rname = E.faecalis.2 '
+            in str(dss[0].filters)
+            or
+            '( rname = E.faecalis.2 '
+            in str(dss[1].filters))
+        ds = AlignmentSet(data.getBam())
+        ds.filters.addRequirement(rname=[('=', 'E.faecalis.2'),
+                                         ('=', 'E.faecalis.2')],
+                                  tStart=[('<', '99'),
+                                          ('<', '299')],
+                                  tEnd=[('>', '0'),
+                                        ('>', '100')])
+        self.assertEqual(str(ds.filters),
+                         '( rname = E.faecalis.2 AND tstart '
+                         '< 99 AND tend > 0 ) OR ( rname = '
+                         'E.faecalis.2 AND tstart < 299 AND tend > 100 )')
+        self.assertEqual(ds.refWindows, [('E.faecalis.2', 0, 99),
+                                         ('E.faecalis.2', 100, 299)])
+
+
+    @unittest.skip("Too expensive")
+    def test_huge_zmw_split(self):
+        human = ('/pbi/dept/secondary/siv/testdata/SA3-DS/'
+                 'human/JCV_85x_v030/jcv_85x_v030.subreadset.xml')
+        sset = SubreadSet(human)
+        ssets = sset.split(zmws=True, maxChunks=5)
+
+    @unittest.skipIf(not _internal_data(),
+                     "Internal data not found, skipping")
+    def test_subreadset_split_metadata_element_name(self):
+        fn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
+        log.debug(fn)
+        sset = SubreadSet("/pbi/dept/secondary/siv/testdata/"
+                          "SA3-Sequel/phi29/315/3150101/"
+                          "r54008_20160219_002905/1_A01/"
+                          "m54008_160219_003234.subreadset.xml")
+        chunks = sset.split(chunks=5, zmws=False, ignoreSubDatasets=True)
+        chunks[0].write(fn)
+
+    def test_contigset_split(self):
+        ref = ReferenceSet(data.getXml(9))
+        exp_n_contigs = len(ref)
+        refs = ref.split(10)
+        self.assertEqual(len(refs), 10)
+        obs_n_contigs = 0
+        for r in refs:
+            obs_n_contigs += sum(1 for _ in r)
+        self.assertEqual(obs_n_contigs, exp_n_contigs)
+
+
+    def test_split_hdfsubreadset(self):
+        hdfds = HdfSubreadSet(*upstreamdata.getBaxH5_v23())
+        self.assertEqual(len(hdfds.toExternalFiles()), 3)
+        hdfdss = hdfds.split(chunks=2, ignoreSubDatasets=True)
+        self.assertEqual(len(hdfdss), 2)
+        self.assertEqual(len(hdfdss[0].toExternalFiles()), 2)
+        self.assertEqual(len(hdfdss[1].toExternalFiles()), 1)
+
+    @unittest.skipIf(not _internal_data(),
+                     "Internal data not found, skipping")
+    def test_barcode_split_cornercases(self):
+        fn = ('/pbi/dept/secondary/siv/testdata/'
+              'pblaa-unittest/Sequel/Phi29/m54008_160219_003234'
+              '.tiny.subreadset.xml')
+        sset = SubreadSet(fn, skipMissing=True)
+        ssets = sset.split(chunks=3, barcodes=True)
+        self.assertEqual([str(ss.filters) for ss in ssets],
+                         ["( bc = [0, 0] )",
+                          "( bc = [1, 1] )",
+                          "( bc = [2, 2] )"])
+        sset = SubreadSet(fn, skipMissing=True)
+        self.assertEqual(len(sset), 15133)
+        sset.filters = None
+        self.assertEqual(str(sset.filters), "")
+        sset.updateCounts()
+        self.assertEqual(len(sset), 2667562)
+
+        sset.filters.addRequirement(bc=[('=', '[2, 2]')])
+        self.assertEqual(str(sset.filters), "( bc = [2, 2] )")
+        sset.updateCounts()
+        self.assertEqual(len(sset), 4710)
+
+        sset.filters = None
+        self.assertEqual(str(sset.filters), "")
+        sset.updateCounts()
+        self.assertEqual(len(sset), 2667562)
+
+        sset.filters.addRequirement(bc=[('=', '[2,2]')])
+        self.assertEqual(str(sset.filters), "( bc = [2,2] )")
+        sset.updateCounts()
+        self.assertEqual(len(sset), 4710)
+
diff --git a/tests/test_pbdataset_subtypes.py b/tests/test_pbdataset_subtypes.py
index 71282a7..9d74aeb 100644
--- a/tests/test_pbdataset_subtypes.py
+++ b/tests/test_pbdataset_subtypes.py
@@ -5,22 +5,20 @@ import unittest
 import tempfile
 import os
 import itertools
-import numpy as np
-
-import pysam
+import xml.etree.ElementTree as ET
 
 from pbcore.util.Process import backticks
 from pbcore.io.dataset.utils import (consolidateBams, _infixFname,
-                                    BamtoolsVersion)
-from pbcore.io import (DataSet, SubreadSet, ConsensusReadSet,
+                                     BamtoolsVersion, consolidateXml)
+from pbcore.io import (SubreadSet, ConsensusReadSet,
                        ReferenceSet, ContigSet, AlignmentSet, BarcodeSet,
                        FastaReader, FastaWriter, IndexedFastaReader,
                        HdfSubreadSet, ConsensusAlignmentSet,
-                       openDataFile, FastaWriter, FastqReader, openDataSet)
+                       openDataFile, FastqReader,
+                       GmapReferenceSet)
 import pbcore.data as upstreamData
 import pbcore.data.datasets as data
 from pbcore.io.dataset.DataSetValidator import validateXml
-import xml.etree.ElementTree as ET
 
 log = logging.getLogger(__name__)
 
@@ -49,7 +47,6 @@ class TestDataSet(unittest.TestCase):
     """Unit and integrationt tests for the DataSet class and \
     associated module functions"""
 
-
     def test_subread_build(self):
         ds1 = SubreadSet(data.getXml(no=5), skipMissing=True)
         ds2 = SubreadSet(data.getXml(no=5), skipMissing=True)
@@ -67,6 +64,38 @@ class TestDataSet(unittest.TestCase):
         self.assertEquals(type(ds4._metadata).__name__, 'SubreadSetMetadata')
         self.assertEquals(len(ds4.metadata.collections), 1)
 
+    def test_subreadset_metadata_element_name(self):
+        # without touching the element:
+        sset = SubreadSet(data.getXml(10))
+        log.debug(data.getXml(10))
+        fn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
+        log.debug(fn)
+        sset.write(fn)
+        f = ET.parse(fn)
+        self.assertEqual(len(f.getroot().findall(
+            '{http://pacificbiosciences.com/PacBioDatasets.xsd}'
+            'SubreadSetMetadata')),
+            0)
+        self.assertEqual(len(f.getroot().findall(
+            '{http://pacificbiosciences.com/PacBioDatasets.xsd}'
+            'DataSetMetadata')),
+            1)
+
+        # with touching the element:
+        sset = SubreadSet(data.getXml(10))
+        sset.metadata.description = 'foo'
+        fn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
+        sset.write(fn, validate=False)
+        f = ET.parse(fn)
+        self.assertEqual(len(f.getroot().findall(
+            '{http://pacificbiosciences.com/PacBioDatasets.xsd}'
+            'SubreadSetMetadata')),
+            0)
+        self.assertEqual(len(f.getroot().findall(
+            '{http://pacificbiosciences.com/PacBioDatasets.xsd}'
+            'DataSetMetadata')),
+            1)
+
     def test_valid_referencesets(self):
         validateXml(ET.parse(data.getXml(9)).getroot(), skipResources=True)
 
@@ -134,17 +163,6 @@ class TestDataSet(unittest.TestCase):
         for name in names:
             self.assertTrue(ds[name].id == name)
 
-    def test_contigset_split(self):
-        ref = ReferenceSet(data.getXml(9))
-        exp_n_contigs = len(ref)
-        refs = ref.split(10)
-        self.assertEqual(len(refs), 10)
-        obs_n_contigs = 0
-        for r in refs:
-            obs_n_contigs += sum(1 for _ in r)
-        self.assertEqual(obs_n_contigs, exp_n_contigs)
-
-
     def test_contigset_len(self):
         ref = ReferenceSet(data.getXml(9))
         exp_n_contigs = len(ref)
@@ -155,6 +173,30 @@ class TestDataSet(unittest.TestCase):
             obs_n_contigs += len(r)
         self.assertEqual(obs_n_contigs, exp_n_contigs)
 
+    @unittest.skipIf(not _internal_data(),
+                     "Internal data not found, skipping")
+    def test_gmapreferenceset_len(self):
+        fas = ('/pbi/dept/secondary/siv/testdata/isoseq/'
+               'lexigoen-ground-truth/reference/SIRV_150601a.fasta')
+        ref = GmapReferenceSet(fas)
+        fn = '/pbi/dept/secondary/siv/testdata/isoseq/gmap_db'
+        ver = '2014-12-19'
+        name = 'SIRV'
+        ref.gmap = fn
+        self.assertEqual(ref.gmap.resourceId, fn)
+        ref.gmap.version = ver
+        self.assertEqual(ref.gmap.version, ver)
+        ref.gmap.name = name
+        self.assertEqual(ref.gmap.name, name)
+        fn = tempfile.NamedTemporaryFile(suffix=".gmapreferenceset.xml").name
+        # TODO: Turn validation back on when xsd sync happens
+        ref.write(fn, validate=False)
+        log.debug(fn)
+        gref = GmapReferenceSet(fn)
+        gref.gmap.resourceId
+        gref.gmap.version
+        gref.gmap.name
+        gref.gmap.timeStampedName
 
     def test_ccsread_build(self):
         ds1 = ConsensusReadSet(data.getXml(2), strict=False, skipMissing=True)
@@ -255,6 +297,31 @@ class TestDataSet(unittest.TestCase):
 
     @unittest.skipIf(not _check_constools(),
                      "bamtools or pbindex not found, skipping")
+    def test_pbmerge_indexing(self):
+        log.debug("Test through API")
+        aln = AlignmentSet(data.getXml(12))
+        self.assertEqual(len(aln.toExternalFiles()), 2)
+        outdir = tempfile.mkdtemp(suffix="dataset-unittest")
+        outfn = os.path.join(outdir, 'merged.bam')
+        log.info(outfn)
+        consolidateXml(aln, outfn, cleanup=False)
+        self.assertTrue(os.path.exists(outfn))
+        self.assertTrue(os.path.exists(outfn + '.pbi'))
+        cons = AlignmentSet(outfn)
+        self.assertEqual(len(aln), len(cons))
+        orig_stats = os.stat(outfn + '.pbi')
+        cons.externalResources[0].pbi = None
+        self.assertEqual(None, cons.externalResources[0].pbi)
+        cons.induceIndices()
+        self.assertEqual(outfn + '.pbi', cons.externalResources[0].pbi)
+        self.assertEqual(orig_stats, os.stat(cons.externalResources[0].pbi))
+        cons.externalResources[0].pbi = None
+        self.assertEqual(None, cons.externalResources[0].pbi)
+        cons.induceIndices(force=True)
+        self.assertNotEqual(orig_stats, os.stat(cons.externalResources[0].pbi))
+
+    @unittest.skipIf(not _check_constools(),
+                     "bamtools or pbindex not found, skipping")
     def test_alignmentset_consolidate(self):
         log.debug("Test methods directly")
         aln = AlignmentSet(data.getXml(12))
@@ -399,18 +466,6 @@ class TestDataSet(unittest.TestCase):
         self.assertEqual(aln.externalResources[0].reference, reference)
 
 
-    def test_accuracy_filter(self):
-        aln = AlignmentSet(data.getXml(12))
-        self.assertEqual(len(list(aln)), 177)
-        aln.filters.addRequirement(accuracy=[('>', '.85')])
-        self.assertEqual(len(list(aln)), 174)
-
-    def test_contigset_filter(self):
-        ref = ReferenceSet(data.getXml(9))
-        self.assertEqual(len(list(ref)), 59)
-        ref.filters.addRequirement(length=[('>', '1450')])
-        self.assertEqual(len(list(ref)), 34)
-
     @unittest.skipIf(not _check_constools() or not _internal_data(),
                      "bamtools, pbindex or data not found, skipping")
     def test_alignmentset_partial_consolidate(self):
@@ -585,14 +640,6 @@ class TestDataSet(unittest.TestCase):
                 self.assertEqual(len([rec for rec in ds_new]), 1,
                                  "failed on %d" % i)
 
-    def test_split_hdfsubreadset(self):
-        hdfds = HdfSubreadSet(*upstreamData.getBaxH5_v23())
-        self.assertEqual(len(hdfds.toExternalFiles()), 3)
-        hdfdss = hdfds.split(chunks=2, ignoreSubDatasets=True)
-        self.assertEqual(len(hdfdss), 2)
-        self.assertEqual(len(hdfdss[0].toExternalFiles()), 2)
-        self.assertEqual(len(hdfdss[1].toExternalFiles()), 1)
-
     @unittest.skipIf(not _internal_data(),
                      "Internal data not found, skipping")
     def test_len_fastq(self):
@@ -812,25 +859,32 @@ class TestDataSet(unittest.TestCase):
         self.assertEqual(sset.numRecords, 59)
 
     def test_alignment_reference(self):
+        rfn = data.getXml(9)
         rs1 = ReferenceSet(data.getXml(9))
         fasta_res = rs1.externalResources[0]
         fasta_file = urlparse(fasta_res.resourceId).path
 
         ds1 = AlignmentSet(data.getXml(8),
-            referenceFastaFname=rs1)
+                           referenceFastaFname=rs1)
         aln_ref = None
         for aln in ds1:
             aln_ref = aln.reference()
             break
         self.assertTrue(aln_ref is not None)
+        self.assertEqual(ds1.externalResources[0].reference, fasta_file)
+        self.assertEqual(ds1.resourceReaders()[0].referenceFasta.filename,
+                         fasta_file)
 
         ds1 = AlignmentSet(data.getXml(8),
-            referenceFastaFname=fasta_file)
+                           referenceFastaFname=fasta_file)
         aln_ref = None
         for aln in ds1:
             aln_ref = aln.reference()
             break
         self.assertTrue(aln_ref is not None)
+        self.assertEqual(ds1.externalResources[0].reference, fasta_file)
+        self.assertEqual(ds1.resourceReaders()[0].referenceFasta.filename,
+                         fasta_file)
 
         ds1 = AlignmentSet(data.getXml(8))
         ds1.addReference(fasta_file)
@@ -839,6 +893,41 @@ class TestDataSet(unittest.TestCase):
             aln_ref = aln.reference()
             break
         self.assertTrue(aln_ref is not None)
+        self.assertEqual(ds1.externalResources[0].reference, fasta_file)
+        self.assertEqual(ds1.resourceReaders()[0].referenceFasta.filename,
+                         fasta_file)
+
+        fofn_out = tempfile.NamedTemporaryFile(suffix=".fofn").name
+        log.debug(fofn_out)
+        with open(fofn_out, 'w') as f:
+            f.write(data.getXml(8))
+            f.write('\n')
+            f.write(data.getXml(11))
+            f.write('\n')
+        ds1 = AlignmentSet(fofn_out,
+                           referenceFastaFname=fasta_file)
+        aln_ref = None
+        for aln in ds1:
+            aln_ref = aln.reference()
+            break
+        self.assertTrue(aln_ref is not None)
+        self.assertEqual(ds1.externalResources[0].reference, fasta_file)
+        self.assertEqual(ds1.resourceReaders()[0].referenceFasta.filename,
+                         fasta_file)
+
+        # Re-enable when referenceset is acceptable reference
+        #ds1 = AlignmentSet(data.getXml(8),
+        #                   referenceFastaFname=rfn)
+        #aln_ref = None
+        #for aln in ds1:
+        #    aln_ref = aln.reference()
+        #    break
+        #self.assertTrue(aln_ref is not None)
+        #self.assertTrue(isinstance(aln_ref, basestring))
+        #self.assertEqual(
+        #    ds1.externalResources[0]._getSubExtResByMetaType(
+        #        'PacBio.ReferenceFile.ReferenceFastaFile').uniqueId,
+        #    rs1.uuid)
 
     def test_nested_external_resources(self):
         log.debug("Testing nested externalResources in AlignmentSets")
@@ -931,40 +1020,6 @@ class TestDataSet(unittest.TestCase):
         ds_out = tempfile.NamedTemporaryFile(suffix=".barcodeset.xml").name
         ds.write(ds_out)
 
-    @unittest.skipIf(not _internal_data(),
-                     "Internal data not found, skipping")
-    def test_barcode_split_cornercases(self):
-        fn = ('/pbi/dept/secondary/siv/testdata/'
-              'pblaa-unittest/Sequel/Phi29/m54008_160219_003234'
-              '.tiny.subreadset.xml')
-        sset = SubreadSet(fn)
-        ssets = sset.split(chunks=3, barcodes=True)
-        self.assertEqual([str(ss.filters) for ss in ssets],
-                         ["( bc = [0, 0] )",
-                          "( bc = [1, 1] )",
-                          "( bc = [2, 2] )"])
-        sset = SubreadSet(fn)
-        self.assertEqual(len(sset), 15133)
-        sset.filters = None
-        self.assertEqual(str(sset.filters), "")
-        sset.updateCounts()
-        self.assertEqual(len(sset), 2667562)
-
-        sset.filters.addRequirement(bc=[('=', '[2, 2]')])
-        self.assertEqual(str(sset.filters), "( bc = [2, 2] )")
-        sset.updateCounts()
-        self.assertEqual(len(sset), 4710)
-
-        sset.filters = None
-        self.assertEqual(str(sset.filters), "")
-        sset.updateCounts()
-        self.assertEqual(len(sset), 2667562)
-
-        sset.filters.addRequirement(bc=[('=', '[2,2]')])
-        self.assertEqual(str(sset.filters), "( bc = [2,2] )")
-        sset.updateCounts()
-        self.assertEqual(len(sset), 4710)
-
     def test_merged_contigset(self):
         fn = tempfile.NamedTemporaryFile(suffix=".contigset.xml").name
         with ContigSet(upstreamData.getLambdaFasta(),
diff --git a/tests/test_pbdataset_utils.py b/tests/test_pbdataset_utils.py
new file mode 100644
index 0000000..e19e02c
--- /dev/null
+++ b/tests/test_pbdataset_utils.py
@@ -0,0 +1,183 @@
+
+import logging
+import tempfile
+import unittest
+
+from pbcore.io.dataset.DataSetMetaTypes import dsIdToSuffix
+from pbcore.io import (DataSetMetaTypes, divideKeys, keysToRanges,
+                       SubreadSet, getDataSetUuid, getDataSetMetaType)
+from pbcore.io.dataset.utils import (xy_to_hn, hn_to_xy, sampleHolesUniformly,
+                                     sampleUniformly, quadratic_expand,
+                                     prodround)
+
+import pbcore.data as upstreamdata
+
+from utils import _pbtestdata, _check_constools, _internal_data
+
+log = logging.getLogger(__name__)
+
+class TestDataSetUtils(unittest.TestCase):
+    """Unit and integrationt tests for the DataSet class and \
+    associated module functions"""
+
+    def test_hn_xy_converters(self):
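+        # Round trip: (x, y) chip coordinates -> hole number -> (x, y) should
+        # reproduce the original coordinates.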
+        for x in range(64, 1024, 10):
+            for y in range(64, 1144, 10):
+                hn = xy_to_hn(x, y)
+                ox, oy = hn_to_xy(hn)
+                self.assertEqual(x, ox)
+                self.assertEqual(y, oy)
+
+    def test_quadratic_expand(self):
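+        # quadratic_expand yields the Cartesian product of the input lists,
+        # as the expected values below illustrate.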
+        i = [[0, 1, 2]]
+        e = [[0], [1], [2]]
+        o = quadratic_expand(i)
+        self.assertEqual(e, o)
+
+        i = [[0, 1, 2], [3, 4, 5]]
+        e = [[0, 3], [1, 3], [2, 3],
+             [0, 4], [1, 4], [2, 4],
+             [0, 5], [1, 5], [2, 5]]
+        o = quadratic_expand(i)
+        self.assertEqual(e, o)
+
+    def test_prodround(self):
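+        # Based on the assertions below, prodround rounds each value up or
+        # down so that the product of the rounded values approximates the
+        # requested target.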
+        i = [1.1, 2.5, 3.8]
+        e = [1, 2, 4]
+        o = prodround(i, 8)
+        self.assertEqual(e, o)
+        e = [1, 3, 3]
+        o = prodround(i, 9)
+        self.assertEqual(e, o)
+        e = [1, 3, 4]
+        o = prodround(i, 12)
+        self.assertEqual(e, o)
+
+    def test_sampleUniformly(self):
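+        # sampleUniformly picks the requested number of points spread evenly
+        # within the given (min, max) bounds for each dimension.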
+        hns = sampleUniformly(4, [(0, 1), (0, 1)])
+        self.assertEqual(hns, [[0, 0], [1, 0], [0, 1], [1, 1]])
+
+        hns = sampleUniformly(4, [(0, 4), (0, 4)])
+        self.assertEqual(hns, [[1, 1], [3, 1], [1, 3], [3, 3]])
+
+        hns = sampleUniformly(4, [(0, 4), (0, 8)])
+        self.assertEqual(hns, [[1, 3], [3, 3], [1, 5], [3, 5]])
+
+        hns = sampleUniformly(3, [(0, 4), (0, 8)])
+        self.assertEqual(hns, [[2, 2], [2, 4], [2, 6]])
+
+    def test_sampleHolesUniformly(self):
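+        # Sampling hole numbers directly should agree with uniform (x, y)
+        # sampling over the full chip bounds, converted to hole numbers.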
+        ncols = 1144
+        nrows = 1024
+        bounds = [(64, ncols), (64, nrows)]
+        expected = [xy_to_hn(x, y) for x, y in sampleUniformly(100, bounds)]
+        all_xy = quadratic_expand([range(64, ncols), range(64, nrows)])
+        all_holes = [xy_to_hn(x, y) for x, y in all_xy]
+        samples = sampleHolesUniformly(100, all_holes)
+        self.assertEqual(expected, samples)
+
+    @unittest.skipIf(not _internal_data(),
+                     "Internal data not available")
+    def test_sampleHolesUniformly_realdata(self):
+        path = ('/pbi/dept/secondary/siv/testdata/SA3-Sequel/'
+                'lambda/315/3150128/r54008_20160308_001811/'
+                '2_B01/m54008_160308_053311.subreads.bam')
+        ds = SubreadSet(path, strict=True)
+        samples = sampleHolesUniformly(100, ds.index.holeNumber)
+        self.assertEqual(len(samples), 100)
+
+    def test_get_dataset_uuid(self):
+        ds = SubreadSet(upstreamdata.getUnalignedBam(), strict=True)
+        ds_file = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
+        ds.write(ds_file)
+        uuid = getDataSetUuid(ds_file)
+        self.assertEqual(uuid, ds.uuid)
+        with open(ds_file, "w") as out:
+            out.write("hello world!")
+        uuid = getDataSetUuid(ds_file)
+        self.assertEqual(uuid, None)
+
+    def test_get_dataset_metatype(self):
+        ds = SubreadSet(upstreamdata.getUnalignedBam(), strict=True)
+        ds_file = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
+        ds.write(ds_file)
+        meta_type = getDataSetMetaType(ds_file)
+        self.assertEqual(meta_type, "PacBio.DataSet.SubreadSet")
+
+    def test_dsIdToSuffix(self):
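+        # dsIdToSuffix maps each dataset metatype ID to its canonical file
+        # name suffix, in the same order as DataSetMetaTypes.ALL.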
+        suffixes = ['subreadset.xml', 'hdfsubreadset.xml', 'alignmentset.xml',
+                    'barcodeset.xml', 'consensusreadset.xml',
+                    'consensusalignmentset.xml',
+                    'referenceset.xml', 'contigset.xml']
+        for dsId, exp in zip(DataSetMetaTypes.ALL, suffixes):
+            self.assertEqual(dsIdToSuffix(dsId), exp)
+
+    def test_divideKeys_keysToRanges(self):
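+        # divideKeys splits the sorted keys into at most n contiguous chunks;
+        # keysToRanges then reduces each chunk to its [min, max] span.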
+        keys = [0, 1, 2, 3, 5, 8, 50]
+        res = divideKeys(keys, 0)
+        self.assertEqual(res, [])
+        res = keysToRanges(res)
+        self.assertEqual(res, [])
+
+        res = divideKeys(keys, 1)
+        self.assertEqual(res, [[0, 1, 2, 3, 5, 8, 50]])
+        res = keysToRanges(res)
+        self.assertEqual(res, [[0, 50]])
+
+        res = divideKeys(keys, 2)
+        self.assertEqual(res, [[0, 1, 2], [3, 5, 8, 50]])
+        res = keysToRanges(res)
+        self.assertEqual(res, [[0, 2], [3, 50]])
+
+        res = divideKeys(keys, 3)
+        self.assertEqual(res, [[0, 1], [2, 3], [5, 8, 50]])
+        res = keysToRanges(res)
+        self.assertEqual(res, [[0, 1], [2, 3], [5, 50]])
+
+        res = divideKeys(keys, 7)
+        self.assertEqual(res, [[0], [1], [2], [3], [5], [8], [50]])
+        res = keysToRanges(res)
+        self.assertEqual(res, [[0, 0], [1, 1], [2, 2], [3, 3],
+                               [5, 5], [8, 8], [50, 50]])
+
+        res = divideKeys(keys, 8)
+        self.assertEqual(res, [[0], [1], [2], [3], [5], [8], [50]])
+        res = keysToRanges(res)
+        self.assertEqual(res, [[0, 0], [1, 1], [2, 2], [3, 3],
+                               [5, 5], [8, 8], [50, 50]])
+
+
+        keys = [0, 1, 2, 2, 3, 5, 8, 50, 50]
+        res = divideKeys(keys, 0)
+        self.assertEqual(res, [])
+        res = keysToRanges(res)
+        self.assertEqual(res, [])
+
+        res = divideKeys(keys, 1)
+        self.assertEqual(res, [[0, 1, 2, 2, 3, 5, 8, 50, 50]])
+        res = keysToRanges(res)
+        self.assertEqual(res, [[0, 50]])
+
+        res = divideKeys(keys, 2)
+        self.assertEqual(res, [[0, 1, 2, 2], [3, 5, 8, 50, 50]])
+        res = keysToRanges(res)
+        self.assertEqual(res, [[0, 2], [3, 50]])
+
+        res = divideKeys(keys, 3)
+        self.assertEqual(res, [[0, 1, 2], [2, 3, 5], [8, 50, 50]])
+        res = keysToRanges(res)
+        self.assertEqual(res, [[0, 2], [2, 5], [8, 50]])
+
+        res = divideKeys(keys, 9)
+        self.assertEqual(res, [[0], [1], [2], [2], [3], [5], [8], [50], [50]])
+        res = keysToRanges(res)
+        self.assertEqual(res, [[0, 0], [1, 1], [2, 2], [2, 2], [3, 3],
+                               [5, 5], [8, 8], [50, 50], [50, 50]])
+
+        res = divideKeys(keys, 10)
+        self.assertEqual(res, [[0], [1], [2], [2], [3], [5], [8], [50], [50]])
+        res = keysToRanges(res)
+        self.assertEqual(res, [[0, 0], [1, 1], [2, 2], [2, 2], [3, 3],
+                               [5, 5], [8, 8], [50, 50], [50, 50]])
+
+
diff --git a/tests/utils.py b/tests/utils.py
new file mode 100644
index 0000000..4b0d2da
--- /dev/null
+++ b/tests/utils.py
@@ -0,0 +1,40 @@
+
+import logging
+import os
+from pbcore.util.Process import backticks
+from pbcore.io.dataset.utils import BamtoolsVersion
+
+log = logging.getLogger(__name__)
+
+def _pbtestdata():
+    try:
+        import pbtestdata
+        return True
+    except ImportError:
+        return False
+
+def _check_constools():
+    if not BamtoolsVersion().good:
+        log.warn("Bamtools not found or out of date")
+        return False
+
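+    # Each tool, run with no arguments, presumably exits with status 1 when
+    # installed; any other return code is treated as "tool not available".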
+    cmd = "pbindex"
+    o, r, m = backticks(cmd)
+    if r != 1:
+        return False
+
+    cmd = "samtools"
+    o, r, m = backticks(cmd)
+    if r != 1:
+        return False
+
+    cmd = "pbmerge"
+    o, r, m = backticks(cmd)
+    if r != 1:
+        return False
+    return True
+
+def _internal_data():
+    if os.path.exists("/pbi/dept/secondary/siv/testdata"):
+        return True
+    return False
