[med-svn] [python-pbcore] 04/11: Imported Upstream version 1.2.2+dfsg1

Afif Elghraoui afif-guest at moszumanska.debian.org
Tue Aug 11 07:48:18 UTC 2015


This is an automated email from the git hooks/post-receive script.

afif-guest pushed a commit to branch master
in repository python-pbcore.

commit cd3bf3529d696597f98846c32c28999d5534aea3
Author: Afif Elghraoui <afif at ghraoui.name>
Date:   Mon Aug 10 22:55:45 2015 -0700

    Imported Upstream version 1.2.2+dfsg1
---
 CHANGELOG.org                                      |    7 +
 bin/dataset.py                                     |    2 +
 pbcore/__init__.py                                 |    2 +-
 pbcore/data/Makefile                               |    7 +
 pbcore/data/datasets/__init__.py                   |    6 +-
 pbcore/data/datasets/alignment.dataset.xml         |    8 +
 .../datasets/m150430_142051_Mon_p1_b25.sts.xml     |    1 +
 ...100710482550000001823136404221563_s1_p0.sts.xml |    1 +
 pbcore/data/datasets/pbalchemysim0.reference.xml   |   79 +-
 pbcore/data/datasets/subreadSetWithStats.xml       | 1139 +++++++++++++++-
 pbcore/io/FastaIO.py                               |    2 +-
 pbcore/io/align/PacBioBamIndex.py                  |   83 +-
 pbcore/io/dataset/DataSetIO.py                     | 1400 ++++++++++++++------
 pbcore/io/dataset/DataSetMembers.py                |  206 ++-
 pbcore/io/dataset/DataSetReader.py                 |  233 ++--
 pbcore/io/dataset/DataSetValidator.py              |   53 +-
 pbcore/io/dataset/DataSetWriter.py                 |   19 +-
 pbcore/io/dataset/EntryPoints.py                   |   53 +-
 pbcore/util/ToolRunner.py                          |    3 +
 tests/test_pbdataset.py                            |  205 ++-
 tests/test_pbdataset_subtypes.py                   |  174 ++-
 21 files changed, 2996 insertions(+), 687 deletions(-)

diff --git a/CHANGELOG.org b/CHANGELOG.org
index 46802d3..466bdf7 100644
--- a/CHANGELOG.org
+++ b/CHANGELOG.org
@@ -1,3 +1,10 @@
+* Version 1.2.2
+- Bug fixes and speed improvements to the dataset API
+- Dataset constructor APIs enhanced
+
+* Version 1.2.1
+- Same as 1.2.0; first version on PyPI
+
 * Version 1.2.0
 - Support for the new BGZF-based bam.pbi file format
 - Support for PacBio BAM spec version 3.0b7; previous versions are
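For context on the "Dataset constructor APIs enhanced" entry above: a minimal usage sketch, assuming the SubreadSet constructor exported from pbcore.io accepts a subreads BAM path (as the test-data Makefile below does through "dataset.py create"); the attribute and method names in the sketch are assumptions, not taken from this commit.

    # Illustrative sketch only, not part of the commit.
    from pbcore.io import SubreadSet

    sset = SubreadSet("datasets/pbalchemysim0.subreads.bam")   # hypothetical path
    print(sset.numRecords, sset.totalLength)                   # assumed attributes
    sset.write("pbalchemysim0.subread.xml")                    # assumed writer method
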
diff --git a/bin/dataset.py b/bin/dataset.py
index 5570603..1d3cc3d 100755
--- a/bin/dataset.py
+++ b/bin/dataset.py
@@ -42,6 +42,8 @@ def get_parser():
             description=description)
     parser.add_argument("--debug", default=False, action='store_true',
                         help="Turn on debug level logging")
+    parser.add_argument("--strict", default=False, action='store_true',
+                        help="Turn on strict tests, raise all errors")
     subparser_list = get_subparsers()
     parser = add_subparsers(parser, subparser_list)
     return parser
diff --git a/pbcore/__init__.py b/pbcore/__init__.py
index 78b5851..937e2f1 100644
--- a/pbcore/__init__.py
+++ b/pbcore/__init__.py
@@ -28,4 +28,4 @@
 # POSSIBILITY OF SUCH DAMAGE.
 #################################################################################
 
-__VERSION__ = "1.2.1"
+__VERSION__ = "1.2.2"
diff --git a/pbcore/data/Makefile b/pbcore/data/Makefile
index 002e84e..99fbcf0 100644
--- a/pbcore/data/Makefile
+++ b/pbcore/data/Makefile
@@ -18,15 +18,22 @@ $(MOVIE1).aligned_subreads.cmp.h5: $(MOVIE1).1.bax.h5 lambdaNEB.fa
 testdata:
 	dataset.py create --type ReferenceSet --relative datasets/pbalchemysim0.reference.xml datasets/pbalchemysim0.reference.fasta
 	bax2bam datasets/pbalchemysim0.bas.h5 -o datasets/pbalchemysim0 --pulsefeatures DeletionTag,DeletionQV,InsertionQV,MergeQV,SubstitutionQV && \
+	samtools index datasets/pbalchemysim0.subreads.bam
+	pbindex datasets/pbalchemysim0.subreads.bam
 	dataset.py create --type SubreadSet --relative datasets/pbalchemysim0.subread.xml datasets/pbalchemysim0.subreads.bam
 	pbalign datasets/pbalchemysim0.subreads.bam datasets/pbalchemysim0.reference.fasta datasets/pbalchemysim0.pbalign.bam && \
 	dataset.py create --type AlignmentSet --relative datasets/pbalchemysim0.pbalign.xml datasets/pbalchemysim0.pbalign.bam
 	dataset.py split --contigs --chunks 2 datasets/pbalchemysim0.pbalign.xml
 	bax2bam datasets/pbalchemysim1.bas.h5 -o datasets/pbalchemysim1 --pulsefeatures DeletionTag,DeletionQV,InsertionQV,MergeQV,SubstitutionQV && \
+	samtools index datasets/pbalchemysim1.subreads.bam
+	pbindex datasets/pbalchemysim1.subreads.bam
 	dataset.py create --type SubreadSet --relative datasets/pbalchemysim1.subread.xml datasets/pbalchemysim1.subreads.bam
 	pbalign datasets/pbalchemysim1.subreads.bam datasets/pbalchemysim0.reference.fasta datasets/pbalchemysim1.pbalign.bam && \
 	dataset.py create --type AlignmentSet --relative datasets/pbalchemysim1.pbalign.xml datasets/pbalchemysim1.pbalign.bam
 	dataset.py create --type AlignmentSet --relative datasets/pbalchemysim.pbalign.xml datasets/pbalchemysim0.pbalign.bam datasets/pbalchemysim1.pbalign.bam
+	dataset.py create --type SubreadSet --relative datasets/subreadSetWithStats.xml datasets/pbalchemysim0.subread.xml
+	dataset.py loadstats datasets/subreadSetWithStats.xml datasets/m150430_142051_Mon_p1_b25.sts.xml
+	dataset.py loadstats datasets/subreadSetWithStats.xml datasets/m150616_053259_ethan_c100710482550000001823136404221563_s1_p0.sts.xml
 
 upstream_testdata:
 	dataset.py create --type ReferenceSet --relative datasets/lambda.reference.xml lambdaNEB.fa
diff --git a/pbcore/data/datasets/__init__.py b/pbcore/data/datasets/__init__.py
index 87d8fd5..30357fb 100755
--- a/pbcore/data/datasets/__init__.py
+++ b/pbcore/data/datasets/__init__.py
@@ -15,9 +15,9 @@ XML_FILES = ["alignment.dataset.xml", #0
              "pbalchemysim0.reference.xml", #9
              "pbalchemysim0.subread.xml",
              "pbalchemysim1.pbalign.xml",
-             "pbalchemysim.pbalign.xml", # both 0 and 1 bam files
+             "pbalchemysim.pbalign.xml", # both sim0 and sim1 bam files
              "pbalchemysim1.subread.xml",
-             "subreadSetWithStats.xml", #14 # TODO: replace w regenable+new XSD
+             "subreadSetWithStats.xml", #14
              "pbalchemysim0.pbalign.chunk0contigs.xml",
              "pbalchemysim0.pbalign.chunk1contigs.xml",
             ]
@@ -37,7 +37,7 @@ def getXml(no=0):
     return _getAbsPath(XML_FILES[no])
 
 def getXmlWithStats():
-    return _getAbsPath(XML_FILES[9])
+    return _getAbsPath(XML_FILES[14])
 
 def getBam(no=0):
     return _getAbsPath(BAM_FILES[no])
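For context on the getXmlWithStats() change (index 9 -> 14, now pointing at subreadSetWithStats.xml): a minimal sketch of reading the bundled stats back, assuming the dataset's resource paths resolve on the test system and that the summary statistics are exposed through metadata.summaryStats; those accessor names are assumptions, not confirmed by this diff.

    # Illustrative sketch only, not part of the commit.
    import pbcore.data.datasets as data
    from pbcore.io import SubreadSet

    sset = SubreadSet(data.getXmlWithStats())   # now XML_FILES[14]
    stats = sset.metadata.summaryStats          # assumed accessor name
    print(stats.numSequencingZmws)              # assumed field name
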
diff --git a/pbcore/data/datasets/alignment.dataset.xml b/pbcore/data/datasets/alignment.dataset.xml
index ceb7a1a..b45f24e 100644
--- a/pbcore/data/datasets/alignment.dataset.xml
+++ b/pbcore/data/datasets/alignment.dataset.xml
@@ -5,6 +5,14 @@
             <FileIndices>
                 <FileIndex MetaType="PacBio.Index.PacBioIndex" ResourceId="file:///mnt/path/to/alignments0.pbi"/>
             </FileIndices>
+            <ExternalResources>
+                <ExternalResource Name="Reference used to produce BAM file" Description="Points to the reference used to generate the above BAM file." MetaType="PacBio.ReferenceFile.ReferenceFastaFile" ResourceId="file:///mnt/path/to/reference.fasta" Tags="Example">
+                    <FileIndices>
+                        <FileIndex MetaType="PacBio.Index.SaWriterIndex" ResourceId="file:///mnt/path/to/reference.fasta.sa"/>
+                        <FileIndex MetaType="PacBio.Index.SamIndex" ResourceId="file:///mnt/path/to/reference.fasta.fai"/>
+                    </FileIndices>
+                </ExternalResource>
+            </ExternalResources>
         </ExternalResource>
         <ExternalResource Name="Second Alignments BAM" Description="Points to another example Alignments BAM file, by relative path." MetaType="PacBio.AlignmentFile.AlignmentBamFile" ResourceId="file:./alignments1.bam" Tags="Example">
             <FileIndices>
diff --git a/pbcore/data/datasets/m150430_142051_Mon_p1_b25.sts.xml b/pbcore/data/datasets/m150430_142051_Mon_p1_b25.sts.xml
new file mode 100644
index 0000000..abc631c
--- /dev/null
+++ b/pbcore/data/datasets/m150430_142051_Mon_p1_b25.sts.xml
@@ -0,0 +1 @@
+<?xml version="1.0" encoding="utf-8"?><PipeStats xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns="http://pacificbiosciences.com/PipelineStats/PipeStats.xsd"><MovieName>m150430_142051_Mon_p1_b25</MovieName><MovieLength>40</MovieLength><NumFramesDropped>-1</NumFramesDropped><NumSequencingZmws>2876</NumSequencingZmws><TraceFileSize>2891592848</TraceFileSize><PulseFileSize>255406259</PulseFileSize><BaseFileSize>44432421</BaseFileSize> [...]
\ No newline at end of file
diff --git a/pbcore/data/datasets/m150616_053259_ethan_c100710482550000001823136404221563_s1_p0.sts.xml b/pbcore/data/datasets/m150616_053259_ethan_c100710482550000001823136404221563_s1_p0.sts.xml
new file mode 100755
index 0000000..2ebd8c0
--- /dev/null
+++ b/pbcore/data/datasets/m150616_053259_ethan_c100710482550000001823136404221563_s1_p0.sts.xml
@@ -0,0 +1 @@
+<?xml version="1.0" encoding="utf-8"?><PipeStats xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns="http://pacificbiosciences.com/PipelineStats/PipeStats.xsd"><MovieName>m150616_053259_ethan_c100710482550000001823136404221563_s1_p0</MovieName><MovieLength>239.97777777777779</MovieLength><NumFramesDropped>0</NumFramesDropped><NumSequencingZmws>150292</NumSequencingZmws><TraceFileSize>707245733286</TraceFileSize><PulseFileSize>2809917 [...]
\ No newline at end of file
diff --git a/pbcore/data/datasets/pbalchemysim0.reference.xml b/pbcore/data/datasets/pbalchemysim0.reference.xml
index ab6e126..0bc2b06 100644
--- a/pbcore/data/datasets/pbalchemysim0.reference.xml
+++ b/pbcore/data/datasets/pbalchemysim0.reference.xml
@@ -1,2 +1,77 @@
-<?xml version='1.0' encoding='UTF-8'?>
-<ReferenceSet xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" CreatedAt="2015-07-09T17:04:24" MetaType="PacBio.DataSet.ReferenceSet" Name="" Tags="" UniqueId="6294e841-d71c-5d45-2bc9-4dafef1a9116" Version="2.3.0" xmlns="http://pacificbiosciences.com/PacBioDataModel.xsd" xsi:schemaLocation="http://pacificbiosciences.com/PacBioDataModel.xsd"><ExternalResources><ExternalResource MetaType="PacBio.ReferenceFile.ReferenceFastaFile" ResourceId="pbalchemysim0.reference.fasta"><FileIndices> [...]
\ No newline at end of file
+<?xml version="1.0" ?>
+<ReferenceSet CreatedAt="2015-07-20T15:34:19" MetaType="PacBio.DataSet.ReferenceSet" Name="" Tags="" UniqueId="45a23d37-50b1-9d86-2253-085f10cc0bc1" Version="3.0.0" xmlns="http://pacificbiosciences.com/PacBioDataModel.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://pacificbiosciences.com/PacBioDataModel.xsd">
+	<ExternalResources>
+		<ExternalResource MetaType="PacBio.ReferenceFile.ReferenceFastaFile" ResourceId="pbalchemysim0.reference.fasta">
+			<FileIndices>
+				<FileIndex MetaType="PacBio.Index.SamIndex" ResourceId="pbalchemysim0.reference.fasta.fai"/>
+			</FileIndices>
+		</ExternalResource>
+	</ExternalResources>
+	<DataSetMetadata>
+		<TotalLength>85774</TotalLength>
+		<NumRecords>59</NumRecords>
+		<Organism/>
+		<Ploidy/>
+		<Contigs>
+			<Contig Description="" Digest="DEPRECATED" Length="1458" Name="A.baumannii.1"/>
+			<Contig Description="" Digest="DEPRECATED" Length="1462" Name="A.odontolyticus.1"/>
+			<Contig Description="" Digest="DEPRECATED" Length="1472" Name="B.cereus.1"/>
+			<Contig Description="" Digest="DEPRECATED" Length="1473" Name="B.cereus.2"/>
+			<Contig Description="" Digest="DEPRECATED" Length="1472" Name="B.cereus.4"/>
+			<Contig Description="" Digest="DEPRECATED" Length="1472" Name="B.cereus.6"/>
+			<Contig Description="" Digest="DEPRECATED" Length="1449" Name="B.vulgatus.1"/>
+			<Contig Description="" Digest="DEPRECATED" Length="1449" Name="B.vulgatus.2"/>
+			<Contig Description="" Digest="DEPRECATED" Length="1449" Name="B.vulgatus.3"/>
+			<Contig Description="" Digest="DEPRECATED" Length="1449" Name="B.vulgatus.4"/>
+			<Contig Description="" Digest="DEPRECATED" Length="1449" Name="B.vulgatus.5"/>
+			<Contig Description="" Digest="DEPRECATED" Length="1433" Name="C.beijerinckii.1"/>
+			<Contig Description="" Digest="DEPRECATED" Length="1433" Name="C.beijerinckii.2"/>
+			<Contig Description="" Digest="DEPRECATED" Length="1433" Name="C.beijerinckii.3"/>
+			<Contig Description="" Digest="DEPRECATED" Length="1433" Name="C.beijerinckii.4"/>
+			<Contig Description="" Digest="DEPRECATED" Length="1433" Name="C.beijerinckii.5"/>
+			<Contig Description="" Digest="DEPRECATED" Length="1433" Name="C.beijerinckii.6"/>
+			<Contig Description="" Digest="DEPRECATED" Length="1433" Name="C.beijerinckii.7"/>
+			<Contig Description="" Digest="DEPRECATED" Length="1433" Name="C.beijerinckii.8"/>
+			<Contig Description="" Digest="DEPRECATED" Length="1433" Name="C.beijerinckii.9"/>
+			<Contig Description="" Digest="DEPRECATED" Length="1433" Name="C.beijerinckii.10"/>
+			<Contig Description="" Digest="DEPRECATED" Length="1433" Name="C.beijerinckii.11"/>
+			<Contig Description="" Digest="DEPRECATED" Length="1433" Name="C.beijerinckii.12"/>
+			<Contig Description="" Digest="DEPRECATED" Length="1433" Name="C.beijerinckii.13"/>
+			<Contig Description="" Digest="DEPRECATED" Length="1433" Name="C.beijerinckii.14"/>
+			<Contig Description="" Digest="DEPRECATED" Length="1423" Name="D.radiodurans.1"/>
+			<Contig Description="" Digest="DEPRECATED" Length="1423" Name="D.radiodurans.2"/>
+			<Contig Description="" Digest="DEPRECATED" Length="1482" Name="E.faecalis.1"/>
+			<Contig Description="" Digest="DEPRECATED" Length="1482" Name="E.faecalis.2"/>
+			<Contig Description="" Digest="DEPRECATED" Length="1463" Name="E.coli.1"/>
+			<Contig Description="" Digest="DEPRECATED" Length="1463" Name="E.coli.2"/>
+			<Contig Description="" Digest="DEPRECATED" Length="1463" Name="E.coli.4"/>
+			<Contig Description="" Digest="DEPRECATED" Length="1463" Name="E.coli.5"/>
+			<Contig Description="" Digest="DEPRECATED" Length="1463" Name="E.coli.6"/>
+			<Contig Description="" Digest="DEPRECATED" Length="1463" Name="E.coli.7"/>
+			<Contig Description="" Digest="DEPRECATED" Length="1424" Name="H.pylori.1"/>
+			<Contig Description="" Digest="DEPRECATED" Length="1494" Name="L.gasseri.1"/>
+			<Contig Description="" Digest="DEPRECATED" Length="1471" Name="L.monocytogenes.1"/>
+			<Contig Description="" Digest="DEPRECATED" Length="1471" Name="L.monocytogenes.2"/>
+			<Contig Description="" Digest="DEPRECATED" Length="1471" Name="L.monocytogenes.3"/>
+			<Contig Description="" Digest="DEPRECATED" Length="1471" Name="L.monocytogenes.5"/>
+			<Contig Description="" Digest="DEPRECATED" Length="1462" Name="N.meningitidis.1"/>
+			<Contig Description="" Digest="DEPRECATED" Length="1446" Name="P.acnes.1"/>
+			<Contig Description="" Digest="DEPRECATED" Length="1457" Name="P.aeruginosa.1"/>
+			<Contig Description="" Digest="DEPRECATED" Length="1457" Name="P.aeruginosa.2"/>
+			<Contig Description="" Digest="DEPRECATED" Length="1386" Name="R.sphaeroides.1"/>
+			<Contig Description="" Digest="DEPRECATED" Length="1388" Name="R.sphaeroides.3"/>
+			<Contig Description="" Digest="DEPRECATED" Length="1473" Name="S.aureus.1"/>
+			<Contig Description="" Digest="DEPRECATED" Length="1473" Name="S.aureus.4"/>
+			<Contig Description="" Digest="DEPRECATED" Length="1473" Name="S.aureus.5"/>
+			<Contig Description="" Digest="DEPRECATED" Length="1472" Name="S.epidermidis.1"/>
+			<Contig Description="" Digest="DEPRECATED" Length="1472" Name="S.epidermidis.2"/>
+			<Contig Description="" Digest="DEPRECATED" Length="1472" Name="S.epidermidis.3"/>
+			<Contig Description="" Digest="DEPRECATED" Length="1472" Name="S.epidermidis.4"/>
+			<Contig Description="" Digest="DEPRECATED" Length="1472" Name="S.epidermidis.5"/>
+			<Contig Description="" Digest="DEPRECATED" Length="1470" Name="S.agalactiae.1"/>
+			<Contig Description="" Digest="DEPRECATED" Length="1478" Name="S.mutans.1"/>
+			<Contig Description="" Digest="DEPRECATED" Length="1478" Name="S.mutans.2"/>
+			<Contig Description="" Digest="DEPRECATED" Length="1467" Name="S.pneumoniae.1"/>
+		</Contigs>
+	</DataSetMetadata>
+</ReferenceSet>
diff --git a/pbcore/data/datasets/subreadSetWithStats.xml b/pbcore/data/datasets/subreadSetWithStats.xml
index b550a72..6bacfad 100644
--- a/pbcore/data/datasets/subreadSetWithStats.xml
+++ b/pbcore/data/datasets/subreadSetWithStats.xml
@@ -1,2 +1,1137 @@
-<?xml version='1.0' encoding='UTF-8'?>
-<SubreadSet xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" CreatedAt="2015-01-27T09:00:01" MetaType="PacBio.DataSet.SubreadSet" Name="DataSet_SubreadSet" Tags="barcode moreTags mapping mytags" UniqueId="7fd49e97-512d-7e1d-9bde-b3889b5295f5" Version="2.3.0" xmlns="http://pacificbiosciences.com/PacBioDataModel.xsd" xsi:schemaLocation="http://pacificbiosciences.com/PacBioDataModel.xsd"><ExternalResources><ExternalResource Description="Points to an example Subreads BAM file." MetaType [...]
+<?xml version="1.0" ?>
+<DataSet CreatedAt="2015-07-09T17:04:25" MetaType="PacBio.DataSet.SubreadSet" Name="" Tags="" UniqueId="a3c23749-a427-3072-bc56-5e2409e287b4" Version="2.3.0" xmlns="http://pacificbiosciences.com/PacBioDataModel.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://pacificbiosciences.com/PacBioDataModel.xsd">
+	<ExternalResources>
+		<ExternalResource MetaType="PacBio.SubreadFile.SubreadBamFile" ResourceId="/home/UNIXHOME/mdsmith/p4/mainline/software/smrtanalysis/bioinformatics/lib/python/pbcore/pbcore/data/datasets/pbalchemysim0.subreads.bam">
+			<FileIndices/>
+		</ExternalResource>
+	</ExternalResources>
+	<DataSetMetadata>
+		<NumRecords>-1</NumRecords>
+		<TotalLength>-1</TotalLength>
+		<SummaryStats>
+			<NumSequencingZmws>153168.0</NumSequencingZmws>
+			<AdapterDimerFraction>1.28301765719e-05</AdapterDimerFraction>
+			<ShortInsertFraction>3.20754414298e-05</ShortInsertFraction>
+			<ProdDist>
+				<NumBins>4</NumBins>
+				<MetricDescription>Productivity</MetricDescription>
+				<BinCounts>
+					<BinCount>120979</BinCount>
+					<BinCount>25123</BinCount>
+					<BinCount>7066</BinCount>
+					<BinCount>0</BinCount>
+				</BinCounts>
+				<BinLabels>
+					<BinLabel>Empty</BinLabel>
+					<BinLabel>Productive</BinLabel>
+					<BinLabel>Other</BinLabel>
+					<BinLabel>NotDefined</BinLabel>
+				</BinLabels>
+			</ProdDist>
+			<ReadTypeDist>
+				<NumBins>9</NumBins>
+				<MetricDescription>Read Type</MetricDescription>
+				<BinCounts>
+					<BinCount>89042</BinCount>
+					<BinCount>14534</BinCount>
+					<BinCount>199</BinCount>
+					<BinCount>4367</BinCount>
+					<BinCount>8126</BinCount>
+					<BinCount>110</BinCount>
+					<BinCount>0</BinCount>
+					<BinCount>36790</BinCount>
+					<BinCount>0</BinCount>
+				</BinCounts>
+				<BinLabels>
+					<BinLabel>Empty</BinLabel>
+					<BinLabel>FullHqRead0</BinLabel>
+					<BinLabel>FullHqRead1</BinLabel>
+					<BinLabel>PartialHqRead0</BinLabel>
+					<BinLabel>PartialHqRead1</BinLabel>
+					<BinLabel>PartialHqRead2</BinLabel>
+					<BinLabel>Multiload</BinLabel>
+					<BinLabel>Indeterminate</BinLabel>
+					<BinLabel>NotDefined</BinLabel>
+				</BinLabels>
+			</ReadTypeDist>
+			<ReadLenDist>
+				<SampleSize>901</SampleSize>
+				<SampleMean>4528.69384765625</SampleMean>
+				<SampleMed>5227</SampleMed>
+				<SampleStd>2322.8055598026981</SampleStd>
+				<Sample95thPct>7367</Sample95thPct>
+				<NumBins>30</NumBins>
+				<BinWidth>418.89999389648438</BinWidth>
+				<MinOutlierValue>77</MinOutlierValue>
+				<MinBinValue>77</MinBinValue>
+				<MaxBinValue>12225.1025390625</MaxBinValue>
+				<MaxOutlierValue>12644</MaxOutlierValue>
+				<MetricDescription>Polymerase Read Length</MetricDescription>
+				<BinCounts>
+					<BinCount>0</BinCount>
+					<BinCount>62</BinCount>
+					<BinCount>39</BinCount>
+					<BinCount>36</BinCount>
+					<BinCount>29</BinCount>
+					<BinCount>37</BinCount>
+					<BinCount>19</BinCount>
+					<BinCount>29</BinCount>
+					<BinCount>37</BinCount>
+					<BinCount>32</BinCount>
+					<BinCount>32</BinCount>
+					<BinCount>40</BinCount>
+					<BinCount>45</BinCount>
+					<BinCount>54</BinCount>
+					<BinCount>73</BinCount>
+					<BinCount>77</BinCount>
+					<BinCount>97</BinCount>
+					<BinCount>95</BinCount>
+					<BinCount>49</BinCount>
+					<BinCount>17</BinCount>
+					<BinCount>2</BinCount>
+					<BinCount>0</BinCount>
+					<BinCount>0</BinCount>
+					<BinCount>0</BinCount>
+					<BinCount>0</BinCount>
+					<BinCount>0</BinCount>
+					<BinCount>0</BinCount>
+					<BinCount>0</BinCount>
+					<BinCount>0</BinCount>
+					<BinCount>0</BinCount>
+					<BinCount>0</BinCount>
+					<BinCount>0</BinCount>
+				</BinCounts>
+			</ReadLenDist>
+			<ReadQualDist>
+				<SampleSize>901</SampleSize>
+				<SampleMean>0.82736450433731079</SampleMean>
+				<SampleMed>0.83167940378189087</SampleMed>
+				<SampleStd>0.029663275550147809</SampleStd>
+				<Sample95thPct>0.86801999807357788</Sample95thPct>
+				<NumBins>30</NumBins>
+				<BinWidth>0.0049052196554839611</BinWidth>
+				<MinOutlierValue>0.7500004768371582</MinOutlierValue>
+				<MinBinValue>0.7500004768371582</MinBinValue>
+				<MaxBinValue>0.89225196838378906</MaxBinValue>
+				<MaxOutlierValue>0.89715707302093506</MaxOutlierValue>
+				<MetricDescription>Polymerase Read Quality</MetricDescription>
+				<BinCounts>
+					<BinCount>0</BinCount>
+					<BinCount>12</BinCount>
+					<BinCount>8</BinCount>
+					<BinCount>11</BinCount>
+					<BinCount>10</BinCount>
+					<BinCount>17</BinCount>
+					<BinCount>11</BinCount>
+					<BinCount>20</BinCount>
+					<BinCount>24</BinCount>
+					<BinCount>24</BinCount>
+					<BinCount>34</BinCount>
+					<BinCount>25</BinCount>
+					<BinCount>38</BinCount>
+					<BinCount>52</BinCount>
+					<BinCount>33</BinCount>
+					<BinCount>45</BinCount>
+					<BinCount>48</BinCount>
+					<BinCount>57</BinCount>
+					<BinCount>47</BinCount>
+					<BinCount>69</BinCount>
+					<BinCount>65</BinCount>
+					<BinCount>55</BinCount>
+					<BinCount>51</BinCount>
+					<BinCount>57</BinCount>
+					<BinCount>42</BinCount>
+					<BinCount>29</BinCount>
+					<BinCount>12</BinCount>
+					<BinCount>3</BinCount>
+					<BinCount>0</BinCount>
+					<BinCount>1</BinCount>
+					<BinCount>0</BinCount>
+					<BinCount>1</BinCount>
+				</BinCounts>
+			</ReadQualDist>
+			<InsertReadLenDist>
+				<SampleSize>901</SampleSize>
+				<SampleMean>460.4583740234375</SampleMean>
+				<SampleMed>350</SampleMed>
+				<SampleStd>592.13342124584449</SampleStd>
+				<Sample95thPct>751</Sample95thPct>
+				<NumBins>30</NumBins>
+				<BinWidth>98</BinWidth>
+				<MinOutlierValue>78</MinOutlierValue>
+				<MinBinValue>78</MinBinValue>
+				<MaxBinValue>2920</MaxBinValue>
+				<MaxOutlierValue>5969</MaxOutlierValue>
+				<MetricDescription>Read Length of Insert</MetricDescription>
+				<BinCounts>
+					<BinCount>0</BinCount>
+					<BinCount>21</BinCount>
+					<BinCount>25</BinCount>
+					<BinCount>621</BinCount>
+					<BinCount>151</BinCount>
+					<BinCount>23</BinCount>
+					<BinCount>10</BinCount>
+					<BinCount>7</BinCount>
+					<BinCount>4</BinCount>
+					<BinCount>4</BinCount>
+					<BinCount>3</BinCount>
+					<BinCount>1</BinCount>
+					<BinCount>2</BinCount>
+					<BinCount>5</BinCount>
+					<BinCount>0</BinCount>
+					<BinCount>1</BinCount>
+					<BinCount>1</BinCount>
+					<BinCount>1</BinCount>
+					<BinCount>1</BinCount>
+					<BinCount>0</BinCount>
+					<BinCount>0</BinCount>
+					<BinCount>1</BinCount>
+					<BinCount>0</BinCount>
+					<BinCount>0</BinCount>
+					<BinCount>0</BinCount>
+					<BinCount>0</BinCount>
+					<BinCount>0</BinCount>
+					<BinCount>1</BinCount>
+					<BinCount>1</BinCount>
+					<BinCount>0</BinCount>
+					<BinCount>2</BinCount>
+					<BinCount>15</BinCount>
+				</BinCounts>
+			</InsertReadLenDist>
+			<InsertReadQualDist>
+				<SampleSize>901</SampleSize>
+				<SampleMean>0.96519893407821655</SampleMean>
+				<SampleMed>0.99638187885284424</SampleMed>
+				<SampleStd>0.063829474832070937</SampleStd>
+				<Sample95thPct>0.9987713098526001</Sample95thPct>
+				<NumBins>30</NumBins>
+				<BinWidth>0.0082235373556613922</BinWidth>
+				<MinOutlierValue>0.75223779678344727</MinOutlierValue>
+				<MinBinValue>0.75223779678344727</MinBinValue>
+				<MaxBinValue>0.990720272064209</MaxBinValue>
+				<MaxOutlierValue>0.99894392490386963</MaxOutlierValue>
+				<MetricDescription>Read Quality of Insert</MetricDescription>
+				<BinCounts>
+					<BinCount>0</BinCount>
+					<BinCount>6</BinCount>
+					<BinCount>3</BinCount>
+					<BinCount>7</BinCount>
+					<BinCount>11</BinCount>
+					<BinCount>14</BinCount>
+					<BinCount>13</BinCount>
+					<BinCount>12</BinCount>
+					<BinCount>13</BinCount>
+					<BinCount>6</BinCount>
+					<BinCount>5</BinCount>
+					<BinCount>3</BinCount>
+					<BinCount>9</BinCount>
+					<BinCount>3</BinCount>
+					<BinCount>2</BinCount>
+					<BinCount>3</BinCount>
+					<BinCount>3</BinCount>
+					<BinCount>1</BinCount>
+					<BinCount>5</BinCount>
+					<BinCount>3</BinCount>
+					<BinCount>9</BinCount>
+					<BinCount>9</BinCount>
+					<BinCount>11</BinCount>
+					<BinCount>16</BinCount>
+					<BinCount>10</BinCount>
+					<BinCount>10</BinCount>
+					<BinCount>16</BinCount>
+					<BinCount>24</BinCount>
+					<BinCount>35</BinCount>
+					<BinCount>59</BinCount>
+					<BinCount>579</BinCount>
+					<BinCount>1</BinCount>
+				</BinCounts>
+			</InsertReadQualDist>
+			<MedianInsertDist>
+				<SampleSize>973</SampleSize>
+				<SampleMean>513.21990966796875</SampleMean>
+				<SampleMed>350</SampleMed>
+				<SampleStd>575.83020626506971</SampleStd>
+				<Sample95thPct>1322</Sample95thPct>
+				<NumBins>30</NumBins>
+				<BinWidth>93.333335876464844</BinWidth>
+				<MinOutlierValue>289</MinOutlierValue>
+				<MinBinValue>289</MinBinValue>
+				<MaxBinValue>2995.666259765625</MaxBinValue>
+				<MaxOutlierValue>6278</MaxOutlierValue>
+				<MetricDescription>Median Insert</MetricDescription>
+				<BinCounts>
+					<BinCount>0</BinCount>
+					<BinCount>714</BinCount>
+					<BinCount>101</BinCount>
+					<BinCount>32</BinCount>
+					<BinCount>18</BinCount>
+					<BinCount>13</BinCount>
+					<BinCount>8</BinCount>
+					<BinCount>5</BinCount>
+					<BinCount>6</BinCount>
+					<BinCount>7</BinCount>
+					<BinCount>10</BinCount>
+					<BinCount>10</BinCount>
+					<BinCount>1</BinCount>
+					<BinCount>2</BinCount>
+					<BinCount>0</BinCount>
+					<BinCount>4</BinCount>
+					<BinCount>6</BinCount>
+					<BinCount>3</BinCount>
+					<BinCount>1</BinCount>
+					<BinCount>7</BinCount>
+					<BinCount>1</BinCount>
+					<BinCount>3</BinCount>
+					<BinCount>4</BinCount>
+					<BinCount>1</BinCount>
+					<BinCount>0</BinCount>
+					<BinCount>3</BinCount>
+					<BinCount>0</BinCount>
+					<BinCount>1</BinCount>
+					<BinCount>0</BinCount>
+					<BinCount>2</BinCount>
+					<BinCount>1</BinCount>
+					<BinCount>9</BinCount>
+				</BinCounts>
+			</MedianInsertDist>
+			<ReadLenDist>
+				<SampleSize>24222</SampleSize>
+				<SampleMean>13965.3232421875</SampleMean>
+				<SampleMed>11484</SampleMed>
+				<SampleStd>10580.691922120142</SampleStd>
+				<Sample95thPct>34936</Sample95thPct>
+				<NumBins>30</NumBins>
+				<BinWidth>2078.2333984375</BinWidth>
+				<MinOutlierValue>63</MinOutlierValue>
+				<MinBinValue>63</MinBinValue>
+				<MaxBinValue>60331.7890625</MaxBinValue>
+				<MaxOutlierValue>62410</MaxOutlierValue>
+				<MetricDescription>Polymerase Read Length</MetricDescription>
+				<BinCounts>
+					<BinCount>0</BinCount>
+					<BinCount>2564</BinCount>
+					<BinCount>2649</BinCount>
+					<BinCount>2135</BinCount>
+					<BinCount>1875</BinCount>
+					<BinCount>1977</BinCount>
+					<BinCount>1829</BinCount>
+					<BinCount>1533</BinCount>
+					<BinCount>1304</BinCount>
+					<BinCount>1148</BinCount>
+					<BinCount>1065</BinCount>
+					<BinCount>919</BinCount>
+					<BinCount>854</BinCount>
+					<BinCount>803</BinCount>
+					<BinCount>650</BinCount>
+					<BinCount>605</BinCount>
+					<BinCount>609</BinCount>
+					<BinCount>646</BinCount>
+					<BinCount>562</BinCount>
+					<BinCount>333</BinCount>
+					<BinCount>132</BinCount>
+					<BinCount>23</BinCount>
+					<BinCount>5</BinCount>
+					<BinCount>2</BinCount>
+					<BinCount>0</BinCount>
+					<BinCount>0</BinCount>
+					<BinCount>0</BinCount>
+					<BinCount>0</BinCount>
+					<BinCount>0</BinCount>
+					<BinCount>0</BinCount>
+					<BinCount>0</BinCount>
+					<BinCount>0</BinCount>
+				</BinCounts>
+			</ReadLenDist>
+			<ReadQualDist>
+				<SampleSize>24222</SampleSize>
+				<SampleMean>0.85828709602355957</SampleMean>
+				<SampleMed>0.868025004863739</SampleMed>
+				<SampleStd>0.030875069589002442</SampleStd>
+				<Sample95thPct>0.88995635509490967</Sample95thPct>
+				<NumBins>30</NumBins>
+				<BinWidth>0.00517143402248621</BinWidth>
+				<MinOutlierValue>0.75007820129394531</MinOutlierValue>
+				<MinBinValue>0.75007820129394531</MinBinValue>
+				<MaxBinValue>0.90004932880401611</MaxBinValue>
+				<MaxOutlierValue>0.90522122383117676</MaxOutlierValue>
+				<MetricDescription>Polymerase Read Quality</MetricDescription>
+				<BinCounts>
+					<BinCount>0</BinCount>
+					<BinCount>122</BinCount>
+					<BinCount>136</BinCount>
+					<BinCount>130</BinCount>
+					<BinCount>144</BinCount>
+					<BinCount>163</BinCount>
+					<BinCount>177</BinCount>
+					<BinCount>186</BinCount>
+					<BinCount>213</BinCount>
+					<BinCount>237</BinCount>
+					<BinCount>251</BinCount>
+					<BinCount>280</BinCount>
+					<BinCount>338</BinCount>
+					<BinCount>412</BinCount>
+					<BinCount>426</BinCount>
+					<BinCount>527</BinCount>
+					<BinCount>563</BinCount>
+					<BinCount>686</BinCount>
+					<BinCount>790</BinCount>
+					<BinCount>925</BinCount>
+					<BinCount>1095</BinCount>
+					<BinCount>1377</BinCount>
+					<BinCount>1525</BinCount>
+					<BinCount>1775</BinCount>
+					<BinCount>2133</BinCount>
+					<BinCount>2515</BinCount>
+					<BinCount>2973</BinCount>
+					<BinCount>2823</BinCount>
+					<BinCount>1192</BinCount>
+					<BinCount>103</BinCount>
+					<BinCount>4</BinCount>
+					<BinCount>1</BinCount>
+				</BinCounts>
+			</ReadQualDist>
+			<MedianInsertDist>
+				<SampleSize>570</SampleSize>
+				<SampleMean>5406.65625</SampleMean>
+				<SampleMed>2860</SampleMed>
+				<SampleStd>6322.1968204214045</SampleStd>
+				<Sample95thPct>18508</Sample95thPct>
+				<NumBins>30</NumBins>
+				<BinWidth>1090.199951171875</BinWidth>
+				<MinOutlierValue>0</MinOutlierValue>
+				<MinBinValue>0</MinBinValue>
+				<MaxBinValue>31615.791015625</MaxBinValue>
+				<MaxOutlierValue>64092</MaxOutlierValue>
+				<MetricDescription>Median Insert</MetricDescription>
+				<BinCounts>
+					<BinCount>0</BinCount>
+					<BinCount>28</BinCount>
+					<BinCount>159</BinCount>
+					<BinCount>125</BinCount>
+					<BinCount>65</BinCount>
+					<BinCount>30</BinCount>
+					<BinCount>26</BinCount>
+					<BinCount>25</BinCount>
+					<BinCount>9</BinCount>
+					<BinCount>10</BinCount>
+					<BinCount>10</BinCount>
+					<BinCount>12</BinCount>
+					<BinCount>7</BinCount>
+					<BinCount>11</BinCount>
+					<BinCount>11</BinCount>
+					<BinCount>3</BinCount>
+					<BinCount>3</BinCount>
+					<BinCount>9</BinCount>
+					<BinCount>3</BinCount>
+					<BinCount>5</BinCount>
+					<BinCount>4</BinCount>
+					<BinCount>2</BinCount>
+					<BinCount>2</BinCount>
+					<BinCount>3</BinCount>
+					<BinCount>1</BinCount>
+					<BinCount>1</BinCount>
+					<BinCount>0</BinCount>
+					<BinCount>0</BinCount>
+					<BinCount>0</BinCount>
+					<BinCount>0</BinCount>
+					<BinCount>2</BinCount>
+					<BinCount>4</BinCount>
+				</BinCounts>
+			</MedianInsertDist>
+			<InsertReadQualDist>
+				<SampleSize>24222</SampleSize>
+				<SampleMean>0.86046397686004639</SampleMean>
+				<SampleMed>0.8684546947479248</SampleMed>
+				<SampleStd>0.035067464845073885</SampleStd>
+				<Sample95thPct>0.89117431640625</Sample95thPct>
+				<NumBins>30</NumBins>
+				<BinWidth>0.0082973875105381012</BinWidth>
+				<MinOutlierValue>0.75007820129394531</MinOutlierValue>
+				<MinBinValue>0.75007820129394531</MinBinValue>
+				<MaxBinValue>0.99070233106613159</MaxBinValue>
+				<MaxOutlierValue>0.998999834060669</MaxOutlierValue>
+				<MetricDescription>Read Quality of Insert</MetricDescription>
+				<BinCounts>
+					<BinCount>0</BinCount>
+					<BinCount>196</BinCount>
+					<BinCount>221</BinCount>
+					<BinCount>249</BinCount>
+					<BinCount>282</BinCount>
+					<BinCount>308</BinCount>
+					<BinCount>392</BinCount>
+					<BinCount>438</BinCount>
+					<BinCount>619</BinCount>
+					<BinCount>701</BinCount>
+					<BinCount>871</BinCount>
+					<BinCount>1141</BinCount>
+					<BinCount>1465</BinCount>
+					<BinCount>1989</BinCount>
+					<BinCount>2463</BinCount>
+					<BinCount>3213</BinCount>
+					<BinCount>4186</BinCount>
+					<BinCount>4264</BinCount>
+					<BinCount>744</BinCount>
+					<BinCount>10</BinCount>
+					<BinCount>9</BinCount>
+					<BinCount>4</BinCount>
+					<BinCount>7</BinCount>
+					<BinCount>10</BinCount>
+					<BinCount>32</BinCount>
+					<BinCount>51</BinCount>
+					<BinCount>13</BinCount>
+					<BinCount>21</BinCount>
+					<BinCount>29</BinCount>
+					<BinCount>46</BinCount>
+					<BinCount>247</BinCount>
+					<BinCount>1</BinCount>
+				</BinCounts>
+			</InsertReadQualDist>
+			<InsertReadLenDist>
+				<SampleSize>24222</SampleSize>
+				<SampleMean>12747.7919921875</SampleMean>
+				<SampleMed>10937</SampleMed>
+				<SampleStd>9364.0618727600377</SampleStd>
+				<Sample95thPct>29948</Sample95thPct>
+				<NumBins>30</NumBins>
+				<BinWidth>1436.566650390625</BinWidth>
+				<MinOutlierValue>1</MinOutlierValue>
+				<MinBinValue>1</MinBinValue>
+				<MaxBinValue>41661.4296875</MaxBinValue>
+				<MaxOutlierValue>43098</MaxOutlierValue>
+				<MetricDescription>Read Length of Insert</MetricDescription>
+				<BinCounts>
+					<BinCount>0</BinCount>
+					<BinCount>1660</BinCount>
+					<BinCount>2384</BinCount>
+					<BinCount>1828</BinCount>
+					<BinCount>1455</BinCount>
+					<BinCount>1315</BinCount>
+					<BinCount>1302</BinCount>
+					<BinCount>1365</BinCount>
+					<BinCount>1296</BinCount>
+					<BinCount>1200</BinCount>
+					<BinCount>1088</BinCount>
+					<BinCount>965</BinCount>
+					<BinCount>856</BinCount>
+					<BinCount>822</BinCount>
+					<BinCount>820</BinCount>
+					<BinCount>878</BinCount>
+					<BinCount>756</BinCount>
+					<BinCount>747</BinCount>
+					<BinCount>754</BinCount>
+					<BinCount>654</BinCount>
+					<BinCount>535</BinCount>
+					<BinCount>382</BinCount>
+					<BinCount>321</BinCount>
+					<BinCount>275</BinCount>
+					<BinCount>181</BinCount>
+					<BinCount>167</BinCount>
+					<BinCount>117</BinCount>
+					<BinCount>64</BinCount>
+					<BinCount>22</BinCount>
+					<BinCount>9</BinCount>
+					<BinCount>3</BinCount>
+					<BinCount>1</BinCount>
+				</BinCounts>
+			</InsertReadLenDist>
+		</SummaryStats>
+	</DataSetMetadata>
+	<DataSets>
+		<DataSet CreatedAt="2015-07-09T17:04:25" MetaType="PacBio.DataSet.SubreadSet" Name="" Tags="" UniqueId="d3ece804-9945-e736-3be0-91b7a104dd71" Version="2.3.0" xmlns="http://pacificbiosciences.com/PacBioDataModel.xsd" xsi:schemaLocation="http://pacificbiosciences.com/PacBioDataModel.xsd">
+			<ExternalResources>
+				<ExternalResource MetaType="PacBio.SubreadFile.SubreadBamFile" ResourceId="/home/UNIXHOME/mdsmith/p4/mainline/software/smrtanalysis/bioinformatics/lib/python/pbcore/pbcore/data/datasets/pbalchemysim0.subreads.bam">
+					<FileIndices/>
+				</ExternalResource>
+			</ExternalResources>
+			<DataSetMetadata>
+				<NumRecords>-1</NumRecords>
+				<TotalLength>-1</TotalLength>
+				<SummaryStats>
+					<NumSequencingZmws>2876</NumSequencingZmws>
+					<AdapterDimerFraction>0</AdapterDimerFraction>
+					<ShortInsertFraction>0</ShortInsertFraction>
+					<ProdDist>
+						<NumBins>4</NumBins>
+						<MetricDescription>Productivity</MetricDescription>
+						<BinCounts>
+							<BinCount>1576</BinCount>
+							<BinCount>901</BinCount>
+							<BinCount>399</BinCount>
+							<BinCount>0</BinCount>
+						</BinCounts>
+						<BinLabels>
+							<BinLabel>Empty</BinLabel>
+							<BinLabel>Productive</BinLabel>
+							<BinLabel>Other</BinLabel>
+							<BinLabel>NotDefined</BinLabel>
+						</BinLabels>
+					</ProdDist>
+					<ReadTypeDist>
+						<NumBins>9</NumBins>
+						<MetricDescription>Read Type</MetricDescription>
+						<BinCounts>
+							<BinCount>1474</BinCount>
+							<BinCount>799</BinCount>
+							<BinCount>4</BinCount>
+							<BinCount>181</BinCount>
+							<BinCount>92</BinCount>
+							<BinCount>0</BinCount>
+							<BinCount>0</BinCount>
+							<BinCount>326</BinCount>
+							<BinCount>0</BinCount>
+						</BinCounts>
+						<BinLabels>
+							<BinLabel>Empty</BinLabel>
+							<BinLabel>FullHqRead0</BinLabel>
+							<BinLabel>FullHqRead1</BinLabel>
+							<BinLabel>PartialHqRead0</BinLabel>
+							<BinLabel>PartialHqRead1</BinLabel>
+							<BinLabel>PartialHqRead2</BinLabel>
+							<BinLabel>Multiload</BinLabel>
+							<BinLabel>Indeterminate</BinLabel>
+							<BinLabel>NotDefined</BinLabel>
+						</BinLabels>
+					</ReadTypeDist>
+					<ReadLenDist>
+						<SampleSize>901</SampleSize>
+						<SampleMean>4528.69384765625</SampleMean>
+						<SampleMed>5227</SampleMed>
+						<SampleStd>2322.8055598026981</SampleStd>
+						<Sample95thPct>7367</Sample95thPct>
+						<NumBins>30</NumBins>
+						<BinWidth>418.89999389648438</BinWidth>
+						<MinOutlierValue>77</MinOutlierValue>
+						<MinBinValue>77</MinBinValue>
+						<MaxBinValue>12225.1025390625</MaxBinValue>
+						<MaxOutlierValue>12644</MaxOutlierValue>
+						<MetricDescription>Polymerase Read Length</MetricDescription>
+						<BinCounts>
+							<BinCount>0</BinCount>
+							<BinCount>62</BinCount>
+							<BinCount>39</BinCount>
+							<BinCount>36</BinCount>
+							<BinCount>29</BinCount>
+							<BinCount>37</BinCount>
+							<BinCount>19</BinCount>
+							<BinCount>29</BinCount>
+							<BinCount>37</BinCount>
+							<BinCount>32</BinCount>
+							<BinCount>32</BinCount>
+							<BinCount>40</BinCount>
+							<BinCount>45</BinCount>
+							<BinCount>54</BinCount>
+							<BinCount>73</BinCount>
+							<BinCount>77</BinCount>
+							<BinCount>97</BinCount>
+							<BinCount>95</BinCount>
+							<BinCount>49</BinCount>
+							<BinCount>17</BinCount>
+							<BinCount>2</BinCount>
+							<BinCount>0</BinCount>
+							<BinCount>0</BinCount>
+							<BinCount>0</BinCount>
+							<BinCount>0</BinCount>
+							<BinCount>0</BinCount>
+							<BinCount>0</BinCount>
+							<BinCount>0</BinCount>
+							<BinCount>0</BinCount>
+							<BinCount>0</BinCount>
+							<BinCount>0</BinCount>
+							<BinCount>0</BinCount>
+						</BinCounts>
+					</ReadLenDist>
+					<ReadQualDist>
+						<SampleSize>901</SampleSize>
+						<SampleMean>0.82736450433731079</SampleMean>
+						<SampleMed>0.83167940378189087</SampleMed>
+						<SampleStd>0.029663275550147809</SampleStd>
+						<Sample95thPct>0.86801999807357788</Sample95thPct>
+						<NumBins>30</NumBins>
+						<BinWidth>0.0049052196554839611</BinWidth>
+						<MinOutlierValue>0.7500004768371582</MinOutlierValue>
+						<MinBinValue>0.7500004768371582</MinBinValue>
+						<MaxBinValue>0.89225196838378906</MaxBinValue>
+						<MaxOutlierValue>0.89715707302093506</MaxOutlierValue>
+						<MetricDescription>Polymerase Read Quality</MetricDescription>
+						<BinCounts>
+							<BinCount>0</BinCount>
+							<BinCount>12</BinCount>
+							<BinCount>8</BinCount>
+							<BinCount>11</BinCount>
+							<BinCount>10</BinCount>
+							<BinCount>17</BinCount>
+							<BinCount>11</BinCount>
+							<BinCount>20</BinCount>
+							<BinCount>24</BinCount>
+							<BinCount>24</BinCount>
+							<BinCount>34</BinCount>
+							<BinCount>25</BinCount>
+							<BinCount>38</BinCount>
+							<BinCount>52</BinCount>
+							<BinCount>33</BinCount>
+							<BinCount>45</BinCount>
+							<BinCount>48</BinCount>
+							<BinCount>57</BinCount>
+							<BinCount>47</BinCount>
+							<BinCount>69</BinCount>
+							<BinCount>65</BinCount>
+							<BinCount>55</BinCount>
+							<BinCount>51</BinCount>
+							<BinCount>57</BinCount>
+							<BinCount>42</BinCount>
+							<BinCount>29</BinCount>
+							<BinCount>12</BinCount>
+							<BinCount>3</BinCount>
+							<BinCount>0</BinCount>
+							<BinCount>1</BinCount>
+							<BinCount>0</BinCount>
+							<BinCount>1</BinCount>
+						</BinCounts>
+					</ReadQualDist>
+					<InsertReadLenDist>
+						<SampleSize>901</SampleSize>
+						<SampleMean>460.4583740234375</SampleMean>
+						<SampleMed>350</SampleMed>
+						<SampleStd>592.13342124584449</SampleStd>
+						<Sample95thPct>751</Sample95thPct>
+						<NumBins>30</NumBins>
+						<BinWidth>98</BinWidth>
+						<MinOutlierValue>78</MinOutlierValue>
+						<MinBinValue>78</MinBinValue>
+						<MaxBinValue>2920</MaxBinValue>
+						<MaxOutlierValue>5969</MaxOutlierValue>
+						<MetricDescription>Read Length of Insert</MetricDescription>
+						<BinCounts>
+							<BinCount>0</BinCount>
+							<BinCount>21</BinCount>
+							<BinCount>25</BinCount>
+							<BinCount>621</BinCount>
+							<BinCount>151</BinCount>
+							<BinCount>23</BinCount>
+							<BinCount>10</BinCount>
+							<BinCount>7</BinCount>
+							<BinCount>4</BinCount>
+							<BinCount>4</BinCount>
+							<BinCount>3</BinCount>
+							<BinCount>1</BinCount>
+							<BinCount>2</BinCount>
+							<BinCount>5</BinCount>
+							<BinCount>0</BinCount>
+							<BinCount>1</BinCount>
+							<BinCount>1</BinCount>
+							<BinCount>1</BinCount>
+							<BinCount>1</BinCount>
+							<BinCount>0</BinCount>
+							<BinCount>0</BinCount>
+							<BinCount>1</BinCount>
+							<BinCount>0</BinCount>
+							<BinCount>0</BinCount>
+							<BinCount>0</BinCount>
+							<BinCount>0</BinCount>
+							<BinCount>0</BinCount>
+							<BinCount>1</BinCount>
+							<BinCount>1</BinCount>
+							<BinCount>0</BinCount>
+							<BinCount>2</BinCount>
+							<BinCount>15</BinCount>
+						</BinCounts>
+					</InsertReadLenDist>
+					<InsertReadQualDist>
+						<SampleSize>901</SampleSize>
+						<SampleMean>0.96519893407821655</SampleMean>
+						<SampleMed>0.99638187885284424</SampleMed>
+						<SampleStd>0.063829474832070937</SampleStd>
+						<Sample95thPct>0.9987713098526001</Sample95thPct>
+						<NumBins>30</NumBins>
+						<BinWidth>0.0082235373556613922</BinWidth>
+						<MinOutlierValue>0.75223779678344727</MinOutlierValue>
+						<MinBinValue>0.75223779678344727</MinBinValue>
+						<MaxBinValue>0.990720272064209</MaxBinValue>
+						<MaxOutlierValue>0.99894392490386963</MaxOutlierValue>
+						<MetricDescription>Read Quality of Insert</MetricDescription>
+						<BinCounts>
+							<BinCount>0</BinCount>
+							<BinCount>6</BinCount>
+							<BinCount>3</BinCount>
+							<BinCount>7</BinCount>
+							<BinCount>11</BinCount>
+							<BinCount>14</BinCount>
+							<BinCount>13</BinCount>
+							<BinCount>12</BinCount>
+							<BinCount>13</BinCount>
+							<BinCount>6</BinCount>
+							<BinCount>5</BinCount>
+							<BinCount>3</BinCount>
+							<BinCount>9</BinCount>
+							<BinCount>3</BinCount>
+							<BinCount>2</BinCount>
+							<BinCount>3</BinCount>
+							<BinCount>3</BinCount>
+							<BinCount>1</BinCount>
+							<BinCount>5</BinCount>
+							<BinCount>3</BinCount>
+							<BinCount>9</BinCount>
+							<BinCount>9</BinCount>
+							<BinCount>11</BinCount>
+							<BinCount>16</BinCount>
+							<BinCount>10</BinCount>
+							<BinCount>10</BinCount>
+							<BinCount>16</BinCount>
+							<BinCount>24</BinCount>
+							<BinCount>35</BinCount>
+							<BinCount>59</BinCount>
+							<BinCount>579</BinCount>
+							<BinCount>1</BinCount>
+						</BinCounts>
+					</InsertReadQualDist>
+					<MedianInsertDist>
+						<SampleSize>973</SampleSize>
+						<SampleMean>513.21990966796875</SampleMean>
+						<SampleMed>350</SampleMed>
+						<SampleStd>575.83020626506971</SampleStd>
+						<Sample95thPct>1322</Sample95thPct>
+						<NumBins>30</NumBins>
+						<BinWidth>93.333335876464844</BinWidth>
+						<MinOutlierValue>289</MinOutlierValue>
+						<MinBinValue>289</MinBinValue>
+						<MaxBinValue>2995.666259765625</MaxBinValue>
+						<MaxOutlierValue>6278</MaxOutlierValue>
+						<MetricDescription>Median Insert</MetricDescription>
+						<BinCounts>
+							<BinCount>0</BinCount>
+							<BinCount>714</BinCount>
+							<BinCount>101</BinCount>
+							<BinCount>32</BinCount>
+							<BinCount>18</BinCount>
+							<BinCount>13</BinCount>
+							<BinCount>8</BinCount>
+							<BinCount>5</BinCount>
+							<BinCount>6</BinCount>
+							<BinCount>7</BinCount>
+							<BinCount>10</BinCount>
+							<BinCount>10</BinCount>
+							<BinCount>1</BinCount>
+							<BinCount>2</BinCount>
+							<BinCount>0</BinCount>
+							<BinCount>4</BinCount>
+							<BinCount>6</BinCount>
+							<BinCount>3</BinCount>
+							<BinCount>1</BinCount>
+							<BinCount>7</BinCount>
+							<BinCount>1</BinCount>
+							<BinCount>3</BinCount>
+							<BinCount>4</BinCount>
+							<BinCount>1</BinCount>
+							<BinCount>0</BinCount>
+							<BinCount>3</BinCount>
+							<BinCount>0</BinCount>
+							<BinCount>1</BinCount>
+							<BinCount>0</BinCount>
+							<BinCount>2</BinCount>
+							<BinCount>1</BinCount>
+							<BinCount>9</BinCount>
+						</BinCounts>
+					</MedianInsertDist>
+				</SummaryStats>
+			</DataSetMetadata>
+		</DataSet>
+		<DataSet CreatedAt="2015-07-09T17:04:25" MetaType="PacBio.DataSet.SubreadSet" Name="" Tags="" UniqueId="d3ece804-9945-e736-3be0-91b7a104dd71" Version="2.3.0" xmlns="http://pacificbiosciences.com/PacBioDataModel.xsd" xsi:schemaLocation="http://pacificbiosciences.com/PacBioDataModel.xsd">
+			<ExternalResources>
+				<ExternalResource MetaType="PacBio.SubreadFile.SubreadBamFile" ResourceId="/home/UNIXHOME/mdsmith/p4/mainline/software/smrtanalysis/bioinformatics/lib/python/pbcore/pbcore/data/datasets/pbalchemysim0.subreads.bam">
+					<FileIndices/>
+				</ExternalResource>
+			</ExternalResources>
+			<DataSetMetadata>
+				<NumRecords>-1</NumRecords>
+				<TotalLength>-1</TotalLength>
+				<SummaryStats>
+					<NumSequencingZmws>150292</NumSequencingZmws>
+					<AdapterDimerFraction>1.3307428206424827E-05</AdapterDimerFraction>
+					<ShortInsertFraction>3.3268570516062067E-05</ShortInsertFraction>
+					<ProdDist>
+						<NumBins>4</NumBins>
+						<MetricDescription>Productivity</MetricDescription>
+						<BinCounts>
+							<BinCount>119403</BinCount>
+							<BinCount>24222</BinCount>
+							<BinCount>6667</BinCount>
+							<BinCount>0</BinCount>
+						</BinCounts>
+						<BinLabels>
+							<BinLabel>Empty</BinLabel>
+							<BinLabel>Productive</BinLabel>
+							<BinLabel>Other</BinLabel>
+							<BinLabel>NotDefined</BinLabel>
+						</BinLabels>
+					</ProdDist>
+					<ReadTypeDist>
+						<NumBins>9</NumBins>
+						<MetricDescription>Read Type</MetricDescription>
+						<BinCounts>
+							<BinCount>87568</BinCount>
+							<BinCount>13735</BinCount>
+							<BinCount>195</BinCount>
+							<BinCount>4186</BinCount>
+							<BinCount>8034</BinCount>
+							<BinCount>110</BinCount>
+							<BinCount>0</BinCount>
+							<BinCount>36464</BinCount>
+							<BinCount>0</BinCount>
+						</BinCounts>
+						<BinLabels>
+							<BinLabel>Empty</BinLabel>
+							<BinLabel>FullHqRead0</BinLabel>
+							<BinLabel>FullHqRead1</BinLabel>
+							<BinLabel>PartialHqRead0</BinLabel>
+							<BinLabel>PartialHqRead1</BinLabel>
+							<BinLabel>PartialHqRead2</BinLabel>
+							<BinLabel>Multiload</BinLabel>
+							<BinLabel>Indeterminate</BinLabel>
+							<BinLabel>NotDefined</BinLabel>
+						</BinLabels>
+					</ReadTypeDist>
+					<ReadLenDist>
+						<SampleSize>24222</SampleSize>
+						<SampleMean>13965.3232421875</SampleMean>
+						<SampleMed>11484</SampleMed>
+						<SampleStd>10580.691922120142</SampleStd>
+						<Sample95thPct>34936</Sample95thPct>
+						<NumBins>30</NumBins>
+						<BinWidth>2078.2333984375</BinWidth>
+						<MinOutlierValue>63</MinOutlierValue>
+						<MinBinValue>63</MinBinValue>
+						<MaxBinValue>60331.7890625</MaxBinValue>
+						<MaxOutlierValue>62410</MaxOutlierValue>
+						<MetricDescription>Polymerase Read Length</MetricDescription>
+						<BinCounts>
+							<BinCount>0</BinCount>
+							<BinCount>2564</BinCount>
+							<BinCount>2649</BinCount>
+							<BinCount>2135</BinCount>
+							<BinCount>1875</BinCount>
+							<BinCount>1977</BinCount>
+							<BinCount>1829</BinCount>
+							<BinCount>1533</BinCount>
+							<BinCount>1304</BinCount>
+							<BinCount>1148</BinCount>
+							<BinCount>1065</BinCount>
+							<BinCount>919</BinCount>
+							<BinCount>854</BinCount>
+							<BinCount>803</BinCount>
+							<BinCount>650</BinCount>
+							<BinCount>605</BinCount>
+							<BinCount>609</BinCount>
+							<BinCount>646</BinCount>
+							<BinCount>562</BinCount>
+							<BinCount>333</BinCount>
+							<BinCount>132</BinCount>
+							<BinCount>23</BinCount>
+							<BinCount>5</BinCount>
+							<BinCount>2</BinCount>
+							<BinCount>0</BinCount>
+							<BinCount>0</BinCount>
+							<BinCount>0</BinCount>
+							<BinCount>0</BinCount>
+							<BinCount>0</BinCount>
+							<BinCount>0</BinCount>
+							<BinCount>0</BinCount>
+							<BinCount>0</BinCount>
+						</BinCounts>
+					</ReadLenDist>
+					<ReadQualDist>
+						<SampleSize>24222</SampleSize>
+						<SampleMean>0.85828709602355957</SampleMean>
+						<SampleMed>0.868025004863739</SampleMed>
+						<SampleStd>0.030875069589002442</SampleStd>
+						<Sample95thPct>0.88995635509490967</Sample95thPct>
+						<NumBins>30</NumBins>
+						<BinWidth>0.00517143402248621</BinWidth>
+						<MinOutlierValue>0.75007820129394531</MinOutlierValue>
+						<MinBinValue>0.75007820129394531</MinBinValue>
+						<MaxBinValue>0.90004932880401611</MaxBinValue>
+						<MaxOutlierValue>0.90522122383117676</MaxOutlierValue>
+						<MetricDescription>Polymerase Read Quality</MetricDescription>
+						<BinCounts>
+							<BinCount>0</BinCount>
+							<BinCount>122</BinCount>
+							<BinCount>136</BinCount>
+							<BinCount>130</BinCount>
+							<BinCount>144</BinCount>
+							<BinCount>163</BinCount>
+							<BinCount>177</BinCount>
+							<BinCount>186</BinCount>
+							<BinCount>213</BinCount>
+							<BinCount>237</BinCount>
+							<BinCount>251</BinCount>
+							<BinCount>280</BinCount>
+							<BinCount>338</BinCount>
+							<BinCount>412</BinCount>
+							<BinCount>426</BinCount>
+							<BinCount>527</BinCount>
+							<BinCount>563</BinCount>
+							<BinCount>686</BinCount>
+							<BinCount>790</BinCount>
+							<BinCount>925</BinCount>
+							<BinCount>1095</BinCount>
+							<BinCount>1377</BinCount>
+							<BinCount>1525</BinCount>
+							<BinCount>1775</BinCount>
+							<BinCount>2133</BinCount>
+							<BinCount>2515</BinCount>
+							<BinCount>2973</BinCount>
+							<BinCount>2823</BinCount>
+							<BinCount>1192</BinCount>
+							<BinCount>103</BinCount>
+							<BinCount>4</BinCount>
+							<BinCount>1</BinCount>
+						</BinCounts>
+					</ReadQualDist>
+					<InsertReadLenDist>
+						<SampleSize>24222</SampleSize>
+						<SampleMean>12747.7919921875</SampleMean>
+						<SampleMed>10937</SampleMed>
+						<SampleStd>9364.0618727600377</SampleStd>
+						<Sample95thPct>29948</Sample95thPct>
+						<NumBins>30</NumBins>
+						<BinWidth>1436.566650390625</BinWidth>
+						<MinOutlierValue>1</MinOutlierValue>
+						<MinBinValue>1</MinBinValue>
+						<MaxBinValue>41661.4296875</MaxBinValue>
+						<MaxOutlierValue>43098</MaxOutlierValue>
+						<MetricDescription>Read Length of Insert</MetricDescription>
+						<BinCounts>
+							<BinCount>0</BinCount>
+							<BinCount>1660</BinCount>
+							<BinCount>2384</BinCount>
+							<BinCount>1828</BinCount>
+							<BinCount>1455</BinCount>
+							<BinCount>1315</BinCount>
+							<BinCount>1302</BinCount>
+							<BinCount>1365</BinCount>
+							<BinCount>1296</BinCount>
+							<BinCount>1200</BinCount>
+							<BinCount>1088</BinCount>
+							<BinCount>965</BinCount>
+							<BinCount>856</BinCount>
+							<BinCount>822</BinCount>
+							<BinCount>820</BinCount>
+							<BinCount>878</BinCount>
+							<BinCount>756</BinCount>
+							<BinCount>747</BinCount>
+							<BinCount>754</BinCount>
+							<BinCount>654</BinCount>
+							<BinCount>535</BinCount>
+							<BinCount>382</BinCount>
+							<BinCount>321</BinCount>
+							<BinCount>275</BinCount>
+							<BinCount>181</BinCount>
+							<BinCount>167</BinCount>
+							<BinCount>117</BinCount>
+							<BinCount>64</BinCount>
+							<BinCount>22</BinCount>
+							<BinCount>9</BinCount>
+							<BinCount>3</BinCount>
+							<BinCount>1</BinCount>
+						</BinCounts>
+					</InsertReadLenDist>
+					<InsertReadQualDist>
+						<SampleSize>24222</SampleSize>
+						<SampleMean>0.86046397686004639</SampleMean>
+						<SampleMed>0.8684546947479248</SampleMed>
+						<SampleStd>0.035067464845073885</SampleStd>
+						<Sample95thPct>0.89117431640625</Sample95thPct>
+						<NumBins>30</NumBins>
+						<BinWidth>0.0082973875105381012</BinWidth>
+						<MinOutlierValue>0.75007820129394531</MinOutlierValue>
+						<MinBinValue>0.75007820129394531</MinBinValue>
+						<MaxBinValue>0.99070233106613159</MaxBinValue>
+						<MaxOutlierValue>0.998999834060669</MaxOutlierValue>
+						<MetricDescription>Read Quality of Insert</MetricDescription>
+						<BinCounts>
+							<BinCount>0</BinCount>
+							<BinCount>196</BinCount>
+							<BinCount>221</BinCount>
+							<BinCount>249</BinCount>
+							<BinCount>282</BinCount>
+							<BinCount>308</BinCount>
+							<BinCount>392</BinCount>
+							<BinCount>438</BinCount>
+							<BinCount>619</BinCount>
+							<BinCount>701</BinCount>
+							<BinCount>871</BinCount>
+							<BinCount>1141</BinCount>
+							<BinCount>1465</BinCount>
+							<BinCount>1989</BinCount>
+							<BinCount>2463</BinCount>
+							<BinCount>3213</BinCount>
+							<BinCount>4186</BinCount>
+							<BinCount>4264</BinCount>
+							<BinCount>744</BinCount>
+							<BinCount>10</BinCount>
+							<BinCount>9</BinCount>
+							<BinCount>4</BinCount>
+							<BinCount>7</BinCount>
+							<BinCount>10</BinCount>
+							<BinCount>32</BinCount>
+							<BinCount>51</BinCount>
+							<BinCount>13</BinCount>
+							<BinCount>21</BinCount>
+							<BinCount>29</BinCount>
+							<BinCount>46</BinCount>
+							<BinCount>247</BinCount>
+							<BinCount>1</BinCount>
+						</BinCounts>
+					</InsertReadQualDist>
+					<MedianInsertDist>
+						<SampleSize>570</SampleSize>
+						<SampleMean>5406.65625</SampleMean>
+						<SampleMed>2860</SampleMed>
+						<SampleStd>6322.1968204214045</SampleStd>
+						<Sample95thPct>18508</Sample95thPct>
+						<NumBins>30</NumBins>
+						<BinWidth>1090.199951171875</BinWidth>
+						<MinOutlierValue>0</MinOutlierValue>
+						<MinBinValue>0</MinBinValue>
+						<MaxBinValue>31615.791015625</MaxBinValue>
+						<MaxOutlierValue>64092</MaxOutlierValue>
+						<MetricDescription>Median Insert</MetricDescription>
+						<BinCounts>
+							<BinCount>0</BinCount>
+							<BinCount>28</BinCount>
+							<BinCount>159</BinCount>
+							<BinCount>125</BinCount>
+							<BinCount>65</BinCount>
+							<BinCount>30</BinCount>
+							<BinCount>26</BinCount>
+							<BinCount>25</BinCount>
+							<BinCount>9</BinCount>
+							<BinCount>10</BinCount>
+							<BinCount>10</BinCount>
+							<BinCount>12</BinCount>
+							<BinCount>7</BinCount>
+							<BinCount>11</BinCount>
+							<BinCount>11</BinCount>
+							<BinCount>3</BinCount>
+							<BinCount>3</BinCount>
+							<BinCount>9</BinCount>
+							<BinCount>3</BinCount>
+							<BinCount>5</BinCount>
+							<BinCount>4</BinCount>
+							<BinCount>2</BinCount>
+							<BinCount>2</BinCount>
+							<BinCount>3</BinCount>
+							<BinCount>1</BinCount>
+							<BinCount>1</BinCount>
+							<BinCount>0</BinCount>
+							<BinCount>0</BinCount>
+							<BinCount>0</BinCount>
+							<BinCount>0</BinCount>
+							<BinCount>2</BinCount>
+							<BinCount>4</BinCount>
+						</BinCounts>
+					</MedianInsertDist>
+				</SummaryStats>
+			</DataSetMetadata>
+		</DataSet>
+	</DataSets>
+</DataSet>
diff --git a/pbcore/io/FastaIO.py b/pbcore/io/FastaIO.py
index 8afda41..5e2d7bf 100644
--- a/pbcore/io/FastaIO.py
+++ b/pbcore/io/FastaIO.py
@@ -218,7 +218,7 @@ class FastaReader(ReaderBase):
             for part in parts:
                 yield FastaRecord.fromString(">" + part)
         except AssertionError:
-            raise ValueError("Invalid FASTA file")
+            raise ValueError("Invalid FASTA file {f}".format(f=self.filename))
 
 
 class FastaWriter(WriterBase):
diff --git a/pbcore/io/align/PacBioBamIndex.py b/pbcore/io/align/PacBioBamIndex.py
index 83e96af..d9cff21 100644
--- a/pbcore/io/align/PacBioBamIndex.py
+++ b/pbcore/io/align/PacBioBamIndex.py
@@ -63,7 +63,45 @@ class PacBioBamIndex(object):
         (self.magic, self.vPatch, self.vMinor,
          self.vMajor, self.pbiFlags, self.nReads) = header
 
+    @property
+    def hasMappingInfo(self):
+        return (self.pbiFlags & PBI_FLAGS_MAPPED)
+
+    @property
+    def hasAdapterInfo(self):
+        return (self.pbiFlags & PBI_FLAGS_BARCODE_ADAPTER)
+
     def _loadMainIndex(self, f):
+        # Main index holds subread and mapping info
+        SUBREADS_INDEX_DTYPE = [
+            ("qId"               , "i4"),
+            ("qStart"            , "i4"),
+            ("qEnd"              , "i4"),
+            ("holeNumber"        , "i4"),
+            ("readQual"          , "u2"),
+            ("virtualFileOffset" , "i8") ]
+
+        MAPPING_INDEX_DTYPE = [
+            ("tId"               , "i4"),
+            ("tStart"            , "u4"),
+            ("tEnd"              , "u4"),
+            ("aStart"            , "u4"),
+            ("aEnd"              , "u4"),
+            ("isReverseStrand"   , "u1"),
+            ("nM"                , "u4"),
+            ("nMM"               , "u4"),
+            ("mapQV"             , "u1") ]
+
+        COMPUTED_COLUMNS_DTYPE = [
+            ("nIns"              , "u4"),
+            ("nDel"              , "u4") ]
+
+        JOINT_DTYPE = SUBREADS_INDEX_DTYPE + MAPPING_INDEX_DTYPE + COMPUTED_COLUMNS_DTYPE
+
+        if self.hasMappingInfo:
+            tbl = np.zeros(shape=(self.nReads,), dtype=JOINT_DTYPE).view(np.recarray)
+        else:
+            tbl = np.zeros(shape=(self.nReads,), dtype=SUBREADS_INDEX_DTYPE).view(np.recarray)
 
         def peek(type_, length):
             flavor, width = type_
@@ -71,41 +109,16 @@ class PacBioBamIndex(object):
 
         if True:
             # BASIC data always present
-            qId               = peek("i4", self.nReads)
-            qStart            = peek("i4", self.nReads)
-            qEnd              = peek("i4", self.nReads)
-            holeNumber        = peek("i4", self.nReads)
-            readQual          = peek("u2", self.nReads)
-            virtualFileOffset = peek("i8", self.nReads)
-
-            tbl = np.rec.fromarrays(
-                [qId, qStart, qEnd, holeNumber, readQual, virtualFileOffset],
-                names="qId, qStart, qEnd, holeNumber, readQual, virtualFileOffset")
-
-        if (self.pbiFlags & PBI_FLAGS_MAPPED):
-            tId               = peek("i4", self.nReads)
-            tStart            = peek("u4", self.nReads)
-            tEnd              = peek("u4", self.nReads)
-            aStart            = peek("u4", self.nReads)
-            aEnd              = peek("u4", self.nReads)
-            isReverseStrand   = peek("u1", self.nReads)
-            nM                = peek("u4", self.nReads)
-            nMM               = peek("u4", self.nReads)
-            mapQV             = peek("u1", self.nReads)
+            for columnName, columnType in SUBREADS_INDEX_DTYPE:
+                tbl[columnName] = peek(columnType, self.nReads)
 
-            # Computed columns
-            nIns = aEnd - aStart - nM - nMM
-            nDel = tEnd - tStart - nM - nMM
+        if self.hasMappingInfo:
+            for columnName, columnType in MAPPING_INDEX_DTYPE:
+                tbl[columnName] = peek(columnType, self.nReads)
 
-            mapping = np.rec.fromarrays(
-                [tId, tStart, tEnd, aStart, aEnd, isReverseStrand, nM, nMM, nIns, nDel, mapQV],
-                names="tId, tStart, tEnd, aStart, aEnd, isReverseStrand, nM, nMM, nIns, nDel, mapQV")
-
-            tbl = nlr.merge_arrays([tbl, mapping], flatten=True).view(np.recarray)
-
-        if (self.pbiFlags & PBI_FLAGS_BARCODE_ADAPTER):
-            # TODO!
-            pass
+            # Computed columns
+            tbl.nIns = tbl.aEnd - tbl.aStart - tbl.nM - tbl.nMM
+            tbl.nDel = tbl.tEnd - tbl.tStart - tbl.nM - tbl.nMM
 
         self._tbl = tbl
         self._checkForBrokenColumns()
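The rewritten loader above preallocates one recarray from a dtype list and fills it column by column, mirroring the pbi file's column-major layout, instead of building intermediate arrays and merging them. A minimal sketch of that pattern with a hypothetical two-column layout (not the real pbi schema):

    import numpy as np

    # Hypothetical column-major data, analogous to (but not) the pbi format
    DTYPE = [("qStart", "i4"), ("qEnd", "i4")]
    nReads = 4
    columns = {"qStart": np.array([0, 10, 20, 30], dtype="i4"),
               "qEnd":   np.array([9, 19, 29, 39], dtype="i4")}

    # Preallocate the joint table, then fill one column at a time
    tbl = np.zeros(shape=(nReads,), dtype=DTYPE).view(np.recarray)
    for name, _ in DTYPE:
        tbl[name] = columns[name]

    # Derived columns can then be computed vectorially over the whole table
    readLengths = tbl.qEnd - tbl.qStart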
@@ -176,8 +189,8 @@ class PacBioBamIndex(object):
         return ix
 
     def _checkForBrokenColumns(self):
-        if np.all( (self.nM  == 0) &
-                   (self.nMM == 0) ):
+        if ((self.pbiFlags & PBI_FLAGS_MAPPED) and
+            np.all((self.nM  == 0) & (self.nMM == 0))):
             raise IncompatibleFile, \
                 "This bam.pbi file was generated by a version of pbindex with" \
                 " a bug.  Please rerun pbindex."
diff --git a/pbcore/io/dataset/DataSetIO.py b/pbcore/io/dataset/DataSetIO.py
index c4d12df..02b4868 100755
--- a/pbcore/io/dataset/DataSetIO.py
+++ b/pbcore/io/dataset/DataSetIO.py
@@ -8,21 +8,28 @@ where they are stored, manipulated, filtered, merged, etc.
 import hashlib
 import copy
 import os
+import errno
 import logging
+import xml.dom.minidom
+import tempfile
 from functools import wraps
+import itertools
 import numpy as np
 from urlparse import urlparse
+from pbcore.util.Process import backticks
 from pbcore.io.opener import (openAlignmentFile, openIndexedAlignmentFile,
                               FastaReader, IndexedFastaReader, CmpH5Reader,
                               IndexedBamReader)
-from pbcore.io.FastaIO import splitFastaHeader
-
-from pbcore.io.dataset import DataSetReader
+from pbcore.io.FastaIO import splitFastaHeader, FastaWriter
+from pbcore.io import PacBioBamIndex, BaxH5Reader
+from pbcore.io.align._BamSupport import UnavailableFeature
+from pbcore.io.dataset.DataSetReader import (parseStats, populateDataSet,
+                                             resolveLocation, wrapNewResource,
+                                             xmlRootType)
 from pbcore.io.dataset.DataSetWriter import toXml
 from pbcore.io.dataset.DataSetValidator import validateString
 from pbcore.io.dataset.DataSetMembers import (DataSetMetadata,
                                               SubreadSetMetadata,
-                                              ReferenceSetMetadata,
                                               ContigSetMetadata,
                                               BarcodeSetMetadata,
                                               ExternalResources, Filters)
@@ -48,6 +55,42 @@ def filtered(generator):
 def _toDsId(x):
     return "PacBio.DataSet.{x}".format(x=x)
 
+def _dsIdToName(x):
+    if DataSetMetaTypes.isValid(x):
+        return x.split('.')[-1]
+
+def _dsIdToType(x):
+    if DataSetMetaTypes.isValid(x):
+        types = DataSet.castableTypes()
+        return types[_dsIdToName(x)]
+
+def _dsIdToSuffix(x):
+    dsIds = DataSetMetaTypes.ALL
+    suffixMap = {dsId: _dsIdToName(dsId) for dsId in dsIds}
+    suffixMap[_toDsId("DataSet")] = 'DataSet'
+    if DataSetMetaTypes.isValid(x):
+        suffix = suffixMap[x]
+        suffix = suffix.lower()
+        suffix += '.xml'
+        return suffix
+
+def openDataSet(*files, **kwargs):
+    # infer from the first:
+    first = DataSet(files[0], **kwargs)
+    dsId = first.objMetadata.get('MetaType')
+    # hdfsubreadset metatypes are subreadset. Fix:
+    if files[0].endswith('xml'):
+        xml_rt = xmlRootType(files[0])
+        if _dsIdToName(dsId) != xml_rt:
+            log.warn("XML metatype does not match root tag")
+            if xml_rt == 'HdfSubreadSet':
+                dsId = _toDsId(xml_rt)
+    tbrType = _dsIdToType(dsId)
+    if tbrType:
+        return tbrType(*files, **kwargs)
+    else:
+        return DataSet(*files, **kwargs)
+
 
 class DataSetMetaTypes(object):
     """
@@ -80,78 +123,22 @@ def _fileType(fname):
     ftype = ftype.strip('.')
     return ftype
 
-
-def openDataSet(*files):
-    """Generic factory function to open DataSets (or attempt to open a subtype
-    if one can be inferred from the file(s).
-
-    A replacement for the generic DataSet(*files) constructor"""
-    if files:
-        # The factory that does the heavy lifting:
-        dataset = DataSetReader.parseFiles(files)
-    else:
-        dataset = DataSet()
-    if not dataset.uuid:
-        dataset.newUuid()
-    log.debug('Updating counts')
-    dataset.fileNames = files
-    dataset.updateCounts()
-    dataset.close()
-    log.debug('Done creating {c}'.format(c=str(dataset)))
-    return dataset
-
-
-class MetaDataSet(type):
-    """This metaclass acts as a factory for DataSet and subtypes,
-    intercepting constructor calls and returning a DataSet or subclass
-    instance as appropriate (inferred from file contents)."""
-
-
-    def __call__(cls, *files):
-        """Factory function for DataSet and subtypes
-
-        Args:
-            files: one or more files to parse
-
-        Returns:
-            A dataset (or subtype) object.
-
-        """
-        if files:
-            # The factory that does the heavy lifting:
-            dataset = DataSetReader.parseFiles(files)
-            # give the user what they call for, usually
-            if cls != DataSet and type(dataset) == DataSet:
-                dataset = dataset.copy(asType=cls.__name__)
-        else:
-            dataset = object.__new__(cls)
-            dataset.__init__()
-        if not dataset.uuid:
-            dataset.newUuid()
-        log.debug('Updating counts')
-        dataset.fileNames = files
-        dataset.updateCounts()
-        dataset.close()
-        log.debug('Done creating {c}'.format(c=cls.__name__))
-        return dataset
-
-
 class DataSet(object):
     """The record containing the DataSet information, with possible type
     specific subclasses"""
 
-    __metaclass__ = MetaDataSet
     datasetType = DataSetMetaTypes.ALL
 
-    def __init__(self, *files):
+    def __init__(self, *files, **kwargs):
         """DataSet constructor
 
         Initialize representations of the ExternalResources, MetaData,
         Filters, and LabeledSubsets, parse inputs if possible
 
         Args:
-            (HANDLED BY METACLASS __call__) *files: one or more filenames or
-                                                    uris to read
+            *files: one or more filenames or uris to read
+            strict=False: strictly require all index files
+            skipCounts=False: skip updating counts for faster opening
 
         Doctest:
             >>> import os, tempfile
@@ -185,10 +172,7 @@ class DataSet(object):
             >>> ds1 = DataSet(data.getFofn())
             >>> ds1.numExternalResources
             2
-            >>> # DataSet types are autodetected:
-            >>> DataSet(data.getSubreadSet()) # doctest:+ELLIPSIS
-            <SubreadSet...
-            >>> # But can also be used directly
+            >>> # Constructors should be used directly
             >>> SubreadSet(data.getSubreadSet()) # doctest:+ELLIPSIS
             <SubreadSet...
             >>> # Even with untyped inputs
@@ -217,6 +201,9 @@ class DataSet(object):
             True
 
         """
+        self._strict = kwargs.get('strict', False)
+        self._skipCounts = kwargs.get('skipCounts', False)
+
         # The metadata concerning the DataSet or subtype itself (e.g.
         # name, version, UniqueId)
         self.objMetadata = {}
@@ -232,12 +219,43 @@ class DataSet(object):
         self.subdatasets = []
 
         # Why not keep this around... (filled by file reader)
-        self.fileNames = []
+        self.fileNames = files
+
+        # parse files
+        log.debug('Containers specified')
+        populateDataSet(self, files)
+        log.debug('Done populating')
+        # update uuid
+        if not self.uuid:
+            self.newUuid()
+
+        self.objMetadata.setdefault("Name", "")
+        self.objMetadata.setdefault("Tags", "")
+        dsType = self.objMetadata.setdefault(
+            "MetaType", "PacBio.DataSet." + self.__class__.__name__)
+        if self._strict:
+            if dsType != _toDsId('DataSet'):
+                if dsType not in self._castableDataSetTypes:
+                    if not (isinstance(self, HdfSubreadSet) and
+                            dsType == 'PacBio.DataSet.SubreadSet'):
+                        raise IOError(errno.EIO,
+                                      "Cannot create {c} from {f}".format(
+                                          c=self.datasetType, f=dsType),
+                                      files[0])
 
         # State tracking:
         self._cachedFilters = []
         self.noFiltering = False
         self._openReaders = []
+        self._referenceInfoTable = None
+        self._referenceDict = {}
+
+        # update counts
+        if files:
+            if not self.totalLength or not self.numRecords:
+                self.updateCounts()
+            elif self.totalLength <= 0 or self.numRecords <= 0:
+                self.updateCounts()
 
     def __repr__(self):
         """Represent the dataset with an informative string:
@@ -283,35 +301,60 @@ class DataSet(object):
             >>> ds3.numExternalResources == expected
             True
         """
-        if (otherDataset.__class__.__name__ == self.__class__.__name__ or
-                otherDataset.__class__.__name__ == 'DataSet'):
-            if not self.filters.testCompatibility(otherDataset.filters):
+        return self.merge(otherDataset)
+
+    def merge(self, other, copyOnMerge=True):
+        if (other.__class__.__name__ == self.__class__.__name__ or
+                other.__class__.__name__ == 'DataSet' or
+                self.__class__.__name__ == 'DataSet'):
+            firstIn = False
+            if not self.toExternalFiles():
+                firstIn = True
+            if (not firstIn and
+                    not self.filters.testCompatibility(other.filters)):
                 log.warning("Filter incompatibility has blocked the addition "
                             "of two datasets")
                 return None
+            else:
+                self.addFilters(other.filters)
             self._cachedFilters = []
-            self._checkObjMetadata(otherDataset.objMetadata)
-            result = self.copy()
-            result.addMetadata(otherDataset.metadata)
-            result.addExternalResources(otherDataset.externalResources)
+            self._checkObjMetadata(other.objMetadata)
+            # There is probably a cleaner way to do this:
+            self.objMetadata.update(other.objMetadata)
+            if copyOnMerge:
+                result = self.copy()
+            else:
+                result = self
+            result.addMetadata(other.metadata)
+            # skip updating counts because other's metadata should be up to
+            # date
+            result.addExternalResources(other.externalResources,
+                                        updateCount=False)
             # DataSets may only be merged if they have identical filters,
             # So there is nothing new to add.
-            result.addDatasets(otherDataset.copy())
+            if other.subdatasets or not firstIn:
+                if copyOnMerge:
+                    result.addDatasets(other.copy())
+                else:
+                    result.addDatasets(other)
             # If this dataset has no subsets representing it, add self as a
             # subdataset to the result
-            if not self.subdatasets:
+            # TODO: this is a stopgap to prevent spurious subdatasets when
+            # creating datasets from dataset xml files...
+            if not self.subdatasets and not firstIn:
                 result.addDatasets(self.copy())
             return result
         else:
             raise TypeError('DataSets can only be merged with records of the '
                             'same type or of type DataSet')
 
+
     def __deepcopy__(self, memo):
         """Deep copy this Dataset by recursively deep copying the members
         (objMetadata, DataSet metadata, externalResources, filters and
         subdatasets)
         """
-        tbr = type(self)()
+        tbr = type(self)(skipCounts=True)
         memo[id(self)] = tbr
         tbr.objMetadata = copy.deepcopy(self.objMetadata, memo)
         tbr.metadata = copy.deepcopy(self._metadata, memo)
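The merge() method above now backs the + operator; the only behavioral difference is that merge(other, copyOnMerge=False) folds the other dataset into self rather than returning a deep copy. A small sketch with placeholder paths:

    from pbcore.io import AlignmentSet

    ds1 = AlignmentSet("lane1.alignmentset.xml")
    ds2 = AlignmentSet("lane2.alignmentset.xml")

    combined = ds1 + ds2               # equivalent to ds1.merge(ds2)
    ds1.merge(ds2, copyOnMerge=False)  # modifies ds1 in place instead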
@@ -348,21 +391,27 @@ class DataSet(object):
             try:
                 reader.close()
             except AttributeError:
-                log.info("Reader not opened properly, therefore not closed "
-                         "properly.")
+                if not self._strict:
+                    log.info("Reader not opened properly, therefore not "
+                             "closed properly.")
+                else:
+                    raise
         self._openReaders = []
 
     def __exit__(self, *exec_info):
         self.close()
 
     def __len__(self):
-        count = 0
-        if self._filters:
-            count = len(self.indexRecords)
-        else:
-            for reader in self.resourceReaders():
-                count += sum(1 for _ in reader)
-        return count
+        if self.numRecords == -1:
+            if self._filters:
+                log.warn("Base class DataSet length cannot be calculate when "
+                         "filters present")
+            else:
+                count = 0
+                for reader in self.resourceReaders():
+                    count += len(reader)
+                self.numRecords = count
+        return self.numRecords
 
     def newUuid(self, setter=True):
         """Generate and enforce the uniqueness of an ID for a new DataSet.
@@ -440,7 +489,8 @@ class DataSet(object):
             ...                    ds1.subdatasets for ds2d in
             ...                    ds2.subdatasets])
             >>> # But types are maintained:
-            >>> ds1 = DataSet(data.getXml(no=10))
+            >>> # TODO: turn strict back on once sim sets are indexable
+            >>> ds1 = SubreadSet(data.getXml(no=10), strict=False)
             >>> ds1.metadata # doctest:+ELLIPSIS
             <SubreadSetMetadata...
             >>> ds2 = ds1.copy()
@@ -464,11 +514,12 @@ class DataSet(object):
         """
         if asType:
             try:
-                tbr = self._castableTypes[asType]()
+                tbr = self.__class__.castableTypes()[asType]()
             except KeyError:
                 raise TypeError("Cannot cast from {s} to "
-                                "{t}".format(s=type(self).__name__, t=asType))
-            tbr.__dict__.update(copy.deepcopy(self.__dict__))
+                                "{t}".format(s=type(self).__name__,
+                                             t=asType))
+            tbr.merge(self)
             # update the metatypes: (TODO: abstract out 'iterate over all
             # resources and modify the element that contains them')
             tbr.makePathsAbsolute()
@@ -477,10 +528,25 @@ class DataSet(object):
         result.newUuid()
         return result
 
-    def split(self, chunks=0, ignoreSubDatasets=False, contigs=False):
+    def split(self, chunks=0, ignoreSubDatasets=False, contigs=False,
+              maxChunks=0, breakContigs=False):
         """Deep copy the DataSet into a number of new DataSets containing
         roughly equal chunks of the ExternalResources or subdatasets.
 
+        Examples:
+            - split into exactly n datasets where each addresses a different
+              piece of the collection of contigs:
+                dset.split(contigs=True, chunks=n)
+
+            - split into at most n datasets where each addresses a different
+              piece of the collection of contigs, but contigs are kept whole:
+                dset.split(contigs=True, maxChunks=n)
+
+            - split into at most n datasets where each addresses a different
+              piece of the collection of contigs and the number of chunks is in
+              part based on the number of reads:
+                dset.split(contigs=True, maxChunks=n, breakContigs=True)
+
         Args:
             chunks: the number of chunks to split the DataSet. When chunks=0,
                     create one DataSet per subdataset, or failing that
@@ -489,6 +555,12 @@ class DataSet(object):
                                on ExternalResources
             contigs: (False) split on contigs instead of external resources or
                      subdatasets
+            maxChunks: The upper limit on the number of chunks. If chunks=0
+                       and breakContigs=False, keeping contigs whole places an
+                       additional upper bound on the number of chunks
+            breakContigs: Whether or not to break contigs when using maxChunks.
+                          If True, something resembling an efficient number of
+                          chunks for the dataset size will be produced.
         Returns:
             A list of new DataSet objects (all other information deep copied).
 
@@ -540,7 +612,7 @@ class DataSet(object):
             True
         """
         if contigs:
-            return self._split_contigs(chunks)
+            return self._split_contigs(chunks, maxChunks, breakContigs)
 
         # Lets only split on datasets if actual splitting will occur,
         # And if all subdatasets have the required balancing key (totalLength)
@@ -548,7 +620,7 @@ class DataSet(object):
                 and not ignoreSubDatasets):
             return self._split_subdatasets(chunks)
 
-        atoms = self.externalResources
+        atoms = self.externalResources.resources
         balanceKey = len
 
         # If chunks not specified, split to one DataSet per
@@ -593,7 +665,8 @@ class DataSet(object):
             largest[2] += 1
         return atoms
 
-    def _split_contigs(self, chunks):
+    def _split_contigs(self, chunks, maxChunks=0, breakContigs=False,
+                       targetSize=5000):
         """Split a dataset into reference windows based on contigs.
 
         Args:
@@ -608,25 +681,44 @@ class DataSet(object):
         # contigs with associated records
 
         # The format is rID, start, end, for a reference window
-        refNames = self.refNames
+        log.debug("Fetching reference names and lengths")
+        # pull both at once so you only have to mess with the
+        # referenceInfoTable once.
+        refLens = self.refLengths
+        refNames = refLens.keys()
         log.debug("{i} references found".format(i=len(refNames)))
         log.debug("Finding contigs")
         if len(refNames) < 100:
-            atoms = [(rn, None, None) for rn in refNames if
-                     next(self.readsInReference(rn), None)]
+            atoms = [(rn, 0, 0) for rn in refNames if
+                     next(self._indexReadsInReference(rn), None)]
         else:
             log.debug("Skipping records for each reference check")
-            atoms = [(rn, None, None) for rn in self.refNames]
+            atoms = [(rn, 0, 0) for rn in refNames]
 
         # The window length is used for balancing
-        # TODO switch it to countRecords
+        # TODO switch it to countRecords?
         balanceKey = lambda x: x[2] - x[1]
+
+        # If maxChunks is given but chunks is not, the effective chunk count
+        # becomes min(len(atoms), maxChunks)
         if not chunks:
             chunks = len(atoms)
-        log.debug("Fetching reference lengths")
-        refLens = self.refLengths
+        if maxChunks and chunks > maxChunks:
+            chunks = maxChunks
+
+        # Decide whether to intelligently limit chunk size:
+        if maxChunks and breakContigs:
+            # The bounds:
+            minNum = 2
+            maxNum = maxChunks
+            # Adjust
+            dataSize = self.numRecords
+            chunks = int(dataSize/targetSize)
+            # Respect bounds:
+            chunks = minNum if chunks < minNum else chunks
+            chunks = maxNum if chunks > maxNum else chunks
+
         # refwindow format: rId, start, end
-        atoms = [(rn, 0, refLens[rn]) for rn, _, _ in atoms]
         if chunks > len(atoms):
             # splitting atom format is slightly different (but more compatible
             # going forward with countRecords): (rId, size, segments)
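When maxChunks and breakContigs are both given, the chunk count above is derived from the record count (numRecords divided by targetSize) and then clamped to the range [2, maxChunks]. Worked through with illustrative numbers only:

    # e.g. numRecords=117000, targetSize=5000, maxChunks=16 (illustrative)
    chunks = int(117000 / 5000)   # 23
    chunks = max(chunks, 2)       # lower bound of 2
    chunks = min(chunks, 16)      # upper bound of maxChunks -> 16 chunks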
@@ -645,7 +737,7 @@ class DataSet(object):
                 sub_segments.append(tmp)
                 segments.extend(sub_segments)
             atoms = segments
-        log.debug("Done defining chunks")
+        log.debug("Done defining {n} chunks".format(n=chunks))
 
         # duplicate
         log.debug("Making copies")
@@ -657,7 +749,7 @@ class DataSet(object):
         log.debug("Done chunking")
         log.debug("Modifying filters or resources")
         for result, chunk in zip(results, chunks):
-            if not atoms[0][2] is None:
+            if atoms[0][2]:
                 result.filters.addRequirement(
                     rname=[('=', c[0]) for c in chunk],
                     tStart=[('>', c[1]) for c in chunk],
@@ -708,7 +800,7 @@ class DataSet(object):
             results.append(newCopy)
         return results
 
-    def write(self, outFile, validate=True, relPaths=False):
+    def write(self, outFile, validate=True, relPaths=False, pretty=True):
         """Write to disk as an XML file
 
         Args:
@@ -729,23 +821,27 @@ class DataSet(object):
             >>> ds1 == ds2
             True
         """
-        # prep for writing, fill some fields that aren't filled elsewhere:
-        self.objMetadata.setdefault(
-            "MetaType", "PacBio.DataSet." + self.__class__.__name__)
-        self.objMetadata.setdefault("Name", "")
-        self.objMetadata.setdefault("Tags", "")
         # fix paths if validate:
         if validate:
             if relPaths:
                 self.makePathsRelative(os.path.dirname(outFile))
             else:
                 self.makePathsAbsolute()
-        xml = toXml(self)
+        xml_string = toXml(self)
+        if pretty:
+            xml_string = xml.dom.minidom.parseString(xml_string).toprettyxml()
         if validate:
-            validateString(xml, relTo=outFile)
+            validateString(xml_string, relTo=outFile)
         fileName = urlparse(outFile).path.strip()
+        if self._strict and not isinstance(self.datasetType, tuple):
+            if not fileName.endswith(_dsIdToSuffix(self.datasetType)):
+                raise IOError(errno.EIO,
+                              "Given filename does not meet standards, "
+                              "should end with {s}".format(
+                                  s=_dsIdToSuffix(self.datasetType)),
+                              fileName)
         with open(fileName, 'w') as outFile:
-            outFile.writelines(xml)
+            outFile.writelines(xml_string)
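write() now optionally round-trips the serialized dataset through xml.dom.minidom before validation so the output is human-readable. The prettification step in isolation, using a toy XML string rather than a real DataSet:

    import xml.dom.minidom

    raw = "<DataSet><ExternalResources/><Filters/></DataSet>"
    pretty = xml.dom.minidom.parseString(raw).toprettyxml()
    print pretty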
 
     def loadStats(self, filename):
         """Load pipeline statistics from a <moviename>.sts.xml file. The subset
@@ -760,21 +856,31 @@ class DataSet(object):
             >>> import pbcore.data.datasets as data
             >>> from pbcore.io import AlignmentSet
             >>> ds1 = AlignmentSet(data.getXml(8))
-            >>> # TODO: renable when up to date sts.xml test files available
-            >>> #ds1.loadStats(data.getStats())
-            >>> #ds2 = AlignmentSet(data.getXml(11))
-            >>> #ds2.loadStats(data.getStats())
-            >>> #ds3 = ds1 + ds2
-            >>> #ds1.metadata.summaryStats.prodDist.bins
-            >>> #[1576, 901, 399, 0]
-            >>> #ds2.metadata.summaryStats.prodDist.bins
-            >>> #[1576, 901, 399, 0]
-            >>> #ds3.metadata.summaryStats.prodDist.bins
-            >>> #[3152, 1802, 798, 0]
+            >>> ds1.loadStats(data.getStats())
+            >>> ds2 = AlignmentSet(data.getXml(11))
+            >>> ds2.loadStats(data.getStats())
+            >>> ds3 = ds1 + ds2
+            >>> ds1.metadata.summaryStats.prodDist.bins
+            [1576, 901, 399, 0]
+            >>> ds2.metadata.summaryStats.prodDist.bins
+            [1576, 901, 399, 0]
+            >>> ds3.metadata.summaryStats.prodDist.bins
+            [3152, 1802, 798, 0]
 
         """
-        statsMetadata = DataSetReader.parseStats(filename)
-        self.metadata.append(statsMetadata)
+        if isinstance(filename, str):
+            statsMetadata = parseStats(filename)
+        else:
+            statsMetadata = filename
+        if self.metadata.summaryStats:
+            newSub = self.copy()
+            newSub.metadata.removeChildren('SummaryStats')
+            newSub.loadStats(statsMetadata)
+            self.addDatasets(self.copy())
+            self.addDatasets(newSub)
+            self.metadata.summaryStats.merge(statsMetadata)
+        else:
+            self.metadata.append(statsMetadata)
 
     def processFilters(self):
         """Generate a list of functions to apply to a read, all of which return
@@ -797,17 +903,6 @@ class DataSet(object):
         self._cachedFilters = filters
         return filters
 
-    def _resolveLocation(self, fname, possibleRelStart):
-        """Find the absolute path of a file that exists relative to '.' or
-        possibleRelStart."""
-        if os.path.exists(fname):
-            return os.path.abspath(fname)
-        if os.path.exists(possibleRelStart):
-            if os.path.exists(os.path.join(possibleRelStart, fname)):
-                return os.path.abspath(os.path.join(possibleRelStart, fname))
-        log.error("Including unresolved file: {f}".format(f=fname))
-        return fname
-
     def makePathsAbsolute(self, curStart="."):
         """As part of the validation process, make all ResourceIds absolute
         URIs rather than relative paths. Generally not called by API users.
@@ -816,7 +911,7 @@ class DataSet(object):
             curStart: The location from which relative paths should emanate.
         """
         self._changePaths(
-            lambda x, s=curStart: self._resolveLocation(x, s))
+            lambda x, s=curStart: resolveLocation(x, s))
 
     def makePathsRelative(self, outDir=False):
         """Make things easier for writing test cases: make all
@@ -832,6 +927,25 @@ class DataSet(object):
         else:
             self._changePaths(os.path.relpath)
 
+    def _modResources(self, func):
+        # check all ExternalResources
+        stack = list(self.externalResources)
+        while stack:
+            item = stack.pop()
+            resId = item.resourceId
+            if not resId:
+                continue
+            func(item)
+            try:
+                stack.extend(list(item.indices))
+            except AttributeError:
+                # Some things just don't have indices
+                pass
+
+        # check all SubDatasets
+        for dataset in self.subdatasets:
+            dataset._modResources(func)
+
     def _changePaths(self, osPathFunc, checkMetaType=True):
         """Change all resourceId paths in this DataSet according to the
         osPathFunc provided.
@@ -866,6 +980,9 @@ class DataSet(object):
         for dataset in self.subdatasets:
             dataset._changePaths(osPathFunc)
 
+    def _populateMetaTypes(self):
+        self._modResources(self._updateMetaType)
+
     def _updateMetaType(self, resource):
         """Infer and set the metatype of 'resource' if it doesn't already have
         one."""
@@ -880,6 +997,10 @@ class DataSet(object):
         # no metatypes for generic DataSet
         return {}
 
+    def copyFiles(self, outdir):
+        backticks('cp {i} {o}'.format(i=' '.join(self.toExternalFiles()),
+                                      o=outdir))
+
     def disableFilters(self):
         """Disable read filtering for this object"""
         self.noFiltering = True
@@ -925,10 +1046,12 @@ class DataSet(object):
             newMetadata: The object metadata of a DataSet being considered for
                          merging
         """
-        if newMetadata.get('Version') > self.objMetadata.get('Version'):
-            raise ValueError("Wrong dataset version for merging {v1} vs "
-                             "{v2}".format(v1=newMetadata.get('Version'),
-                                           v2=self.objMetadata.get('Version')))
+        if self.objMetadata.get('Version'):
+            if newMetadata.get('Version') > self.objMetadata.get('Version'):
+                raise ValueError("Wrong dataset version for merging {v1} vs "
+                                 "{v2}".format(
+                                        v1=newMetadata.get('Version'),
+                                        v2=self.objMetadata.get('Version')))
 
 
     def addMetadata(self, newMetadata, **kwargs):
@@ -969,12 +1092,11 @@ class DataSet(object):
             >>> ds2._metadata.totalLength += 100000
             >>> ds2._metadata.totalLength
             200000
-            >>> #TODO: renable when up to date sts.xml test files available
-            >>> #ds3 = DataSet(data.getXml(no=8))
-            >>> #ds3.loadStats(data.getStats())
-            >>> #ds4 = DataSet(data.getXml(no=11))
-            >>> #ds4.loadStats(data.getStats())
-            >>> #ds5 = ds3 + ds4
+            >>> ds3 = DataSet(data.getXml(no=8))
+            >>> ds3.loadStats(data.getStats())
+            >>> ds4 = DataSet(data.getXml(no=11))
+            >>> ds4.loadStats(data.getStats())
+            >>> ds5 = ds3 + ds4
         """
         if newMetadata:
             # if this record is not wrapped, wrap for easier merging
@@ -991,21 +1113,32 @@ class DataSet(object):
             self.metadata.addMetadata(key, value)
 
     def updateCounts(self):
-        try:
-            if not self.hasPbi:
-                log.debug("Updating counts not supported without pbi")
-                return
-            self.metadata.numRecords = len(self)
-            self.metadata.totalLength = self._totalLength
-        # I would prefer to just catch IOError and UnavailableFeature
-        except Exception:
-            log.debug("Files not found, metadata not "
-                      "populated")
-            self.metadata.numRecords = -1
-            self.metadata.totalLength = -1
+        self.metadata.totalLength = -1
+        self.metadata.numRecords = -1
+
+    def assertIndexed(self):
+        if not self._openReaders:
+            try:
+                tmp = self._strict
+                self._openFiles()
+            except Exception:
+                # Catch everything to recover the strictness status, then raise
+                # whatever error was found.
+                self._strict = tmp
+                raise
+            finally:
+                self._strict = tmp
+        else:
+            for fname, reader in zip(self.toExternalFiles(),
+                                     self.resourceReaders()):
+                if (not isinstance(reader, IndexedBamReader) and
+                        not isinstance(reader, CmpH5Reader)):
+                    raise IOError(errno.EIO,
+                                  "File not indexed: {f}".format(f=fname),
+                                  fname)
 
 
-    def addExternalResources(self, newExtResources):
+    def addExternalResources(self, newExtResources, updateCount=True):
         """Add additional ExternalResource objects, ensuring no duplicate
         resourceIds. Most often used by the __add__ method, rather than
         directly.
@@ -1026,15 +1159,15 @@ class DataSet(object):
             >>> er2.resourceId = "test2.bam"
             >>> er3 = ExternalResource()
             >>> er3.resourceId = "test1.bam"
-            >>> ds.addExternalResources([er1])
+            >>> ds.addExternalResources([er1], updateCount=False)
             >>> len(ds.externalResources)
             1
             >>> # different resourceId: succeeds
-            >>> ds.addExternalResources([er2])
+            >>> ds.addExternalResources([er2], updateCount=False)
             >>> len(ds.externalResources)
             2
             >>> # same resourceId: fails
-            >>> ds.addExternalResources([er3])
+            >>> ds.addExternalResources([er3], updateCount=False)
             >>> len(ds.externalResources)
             2
             >>> # but it is probably better to add them a little deeper:
@@ -1046,6 +1179,9 @@ class DataSet(object):
                        self.externalResources]
 
         for newExtRes in newExtResources:
+            if isinstance(newExtRes, str):
+                newExtRes = wrapNewResource(newExtRes)
+
             # merge duplicates instead of adding them
             if newExtRes.resourceId in resourceIds:
                 first = resourceIds.index(newExtRes.resourceId)
@@ -1055,6 +1191,9 @@ class DataSet(object):
             else:
                 self.externalResources.append(newExtRes)
                 resourceIds.append(newExtRes.resourceId)
+        if updateCount:
+            self._openFiles()
+            self.updateCounts()
 
     def addDatasets(self, otherDataSet):
         """Add subsets to a DataSet object using other DataSets.
@@ -1076,6 +1215,9 @@ class DataSet(object):
         """Open the files (assert they exist, assert they are of the proper
         type before accessing any file)
         """
+        if self._openReaders:
+            log.debug("Closing old readers...")
+            self.close()
         log.debug("Opening resources")
         for extRes in self.externalResources:
             location = urlparse(extRes.resourceId).path
@@ -1084,13 +1226,18 @@ class DataSet(object):
                     location,
                     referenceFastaFname=refFile)
             except (IOError, ValueError):
-                log.info("pbi file missing for {f}, operating with "
-                         "reduced speed and functionality".format(
-                             f=location))
-                resource = openAlignmentFile(location,
-                                             referenceFastaFname=refFile)
+                if not self._strict:
+                    log.info("pbi file missing for {f}, operating with "
+                             "reduced speed and functionality".format(
+                                 f=location))
+                    resource = openAlignmentFile(location,
+                                                 referenceFastaFname=refFile)
+                else:
+                    raise
             if not resource:
-                raise IOError("{f} fails to open".format(f=location))
+                raise IOError(errno.EIO,
+                              "{f} fails to open".format(f=location),
+                              location)
             self._openReaders.append(resource)
         log.debug("Done opening resources")
 
@@ -1133,32 +1280,24 @@ class DataSet(object):
 
     @property
     def indexRecords(self):
-        """Return a recarray summarizing all of the records in all of the
-        resources that conform to those filters addressing parameters cached in
-        the pbi.
-
+        """Yields chunks of recarray summarizing all of the records in all of
+        the resources that conform to those filters addressing parameters
+        cached in the pbi.
         """
-        assert self.hasPbi
-        tbr = []
+        self.assertIndexed()
         for rr in self.resourceReaders():
             indices = rr.index
-            tIdMap = {n: name
-                      for n, name in enumerate(rr.referenceInfoTable['Name'])}
-            filts = self._filters.tests(readType='pbi', tIdMap=tIdMap)
-
-            if not filts or self.noFiltering:
-                tbr.append(indices)
+            if not self._filters or self.noFiltering:
+                yield indices
                 continue
+            nameMap = {name: n
+                       for n, name in enumerate(rr.referenceInfoTable['Name'])}
 
-            mask = np.zeros(len(indices), dtype=bool)
-            log.info("{n} rows found in this pbi".format(n=len(indices)))
-            # remove reads per filter
-            for i, read in enumerate(indices):
-                if any([filt(read) for filt in filts]):
-                    mask[i] = True
-            tbr.append(np.extract(mask, indices))
-        tbr = reduce(np.append, tbr)
-        return tbr
+            passes = self._filters.filterIndexRecords(indices, nameMap)
+            # This is faster than np.extract or iterating over zip
+            for i in xrange(len(indices)):
+                if passes[i]:
+                    yield indices[i]
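indexRecords is now a generator rather than one merged recarray: a resource's whole pbi index is yielded when no filters apply, and individual passing rows are yielded otherwise. A minimal consumer sketch; the path and reference name are placeholders:

    from pbcore.io import AlignmentSet

    ds = AlignmentSet("aligned.alignmentset.xml")
    ds.filters.addRequirement(rname=[('=', 'lambda_NEB3011')])

    # With a filter in place, each yield is a single passing pbi row
    nPassing = sum(1 for _ in ds.indexRecords)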
 
     @property
     @filtered
@@ -1188,16 +1327,6 @@ class DataSet(object):
         for record in self.records:
             yield record
 
-
-    @property
-    def _totalLength(self):
-        """Used to populate metadata in updateCounts"""
-        tot = 0
-        for record in self.indexRecords:
-            tot += record.aEnd - record.aStart
-        return tot
-
-
     @property
     def subSetNames(self):
         """The subdataset names present in this DataSet"""
@@ -1278,6 +1407,11 @@ class DataSet(object):
         """A dict of refName: refLength"""
         return {name: length for name, length in self.refInfo('Length')}
 
+    @property
+    def refIds(self):
+        """A dict of refName: refId"""
+        return {name: rId for name, rId in self.refInfo('ID')}
+
     def refLength(self, rname):
         """The length of reference 'rname'. This is expensive, so if you're
         going to do many lookups cache self.refLengths locally and use that."""
@@ -1296,26 +1430,49 @@ class DataSet(object):
         Args:
             key: a key for the referenceInfoTable of each resource
         Returns:
-            A dictionary of refrence name: key_result pairs"""
-        # sample
-        names = []
-        infos = []
-        #for resource in self.resourceReaders():
-            #names.extend(resource.referenceInfoTable['FullName'])
-            #infos.extend(resource.referenceInfoTable[key])
+            A list of (reference name, key_result) tuples
+
+        """
+        log.debug("Sampling references")
         names = self.referenceInfoTable['Name']
         infos = self.referenceInfoTable[key]
-        # remove dupes
+
+        log.debug("Removing duplicate reference entries")
         sampled = zip(names, infos)
-        sampled = list(set(sampled))
-        # filter
-        if not self.noFiltering:
+        sampled = set(sampled)
+
+        log.debug("Filtering reference entries")
+        if not self.noFiltering or not self._filters:
             sampled = [(name, info) for name, info in sampled
                        if self._filters.testParam('rname', name)]
         return sampled
 
+    def _indexReadsInReference(self, refName):
+        if isinstance(refName, np.int64):
+            refName = str(refName)
+        if refName.isdigit():
+            if (not refName in self.refNames
+                    and not refName in self.fullRefNames):
+                try:
+                    refName = self._idToRname(int(refName))
+                except AttributeError:
+                    raise StopIteration
+
+        # I would love to use readsInRange(refName, None, None), but
+        # IndexedBamReader breaks this (works for regular BamReader).
+        # So I have to do a little hacking...
+        refLen = 0
+        for resource in self.resourceReaders():
+            if (refName in resource.referenceInfoTable['Name'] or
+                    refName in resource.referenceInfoTable['FullName']):
+                refLen = resource.referenceInfo(refName).Length
+                break
+        if refLen:
+            for read in self._indexReadsInRange(refName, 0, refLen):
+                yield read
+
     @filtered
-    def readsInReference(self, refName, justIndices=False):
+    def readsInReference(self, refName):
         """A generator of (usually) BamAlignment objects for the
         reads in one or more Bam files pointed to by the ExternalResources in
         this DataSet that are mapped to the specified reference genome.
@@ -1335,32 +1492,56 @@ class DataSet(object):
             hn: ...
         """
 
-        if not refName in self.refNames and not refName in self.fullRefNames:
-            try:
-                _ = int(refName)
-            except ValueError:
-                raise StopIteration
-            refName = self._idToRname(refName)
+        if isinstance(refName, np.int64):
+            refName = str(refName)
+        if refName.isdigit():
+            if (not refName in self.refNames
+                    and not refName in self.fullRefNames):
+                try:
+                    refName = self._idToRname(int(refName))
+                except AttributeError:
+                    raise StopIteration
 
         # I would love to use readsInRange(refName, None, None), but
         # IndexedBamReader breaks this (works for regular BamReader).
-
         # So I have to do a little hacking...
         refLen = 0
         for resource in self.resourceReaders():
             if (refName in resource.referenceInfoTable['Name'] or
-                    #refName in resource.referenceInfoTable['ID'] or
                     refName in resource.referenceInfoTable['FullName']):
                 refLen = resource.referenceInfo(refName).Length
+                break
         if refLen:
-            # TODO if the bam file is indexed readsInRange returns a list.
-            # Calling this on the alignment results in every read in a list all
-            # at once. We can block it into smaller calls...
-            for read in self.readsInRange(refName, 0, refLen, justIndices):
+            for read in self.readsInRange(refName, 0, refLen):
                 yield read
 
+
+    def _indexReadsInRange(self, refName, start, end):
+        if isinstance(refName, np.int64):
+            refName = str(refName)
+        if refName.isdigit():
+            if (not refName in self.refNames and
+                    not refName in self.fullRefNames):
+                # we need the real refName, which may be hidden behind a
+                # mapping to resolve duplicate refIds between resources...
+                refName = self._idToRname(int(refName))
+        for reader in self.resourceReaders():
+            tIdMap = {n: name
+                      for n, name in enumerate(
+                          reader.referenceInfoTable['Name'])}
+            filts = self._filters.tests(readType='pbi', tIdMap=tIdMap)
+            index = reader.index
+            winId = reader.referenceInfo(refName).ID
+            for rec_i in index.rangeQuery(winId, start, end):
+                read = index[rec_i]
+                if filts:
+                    if any([filt(read) for filt in filts]):
+                        yield read
+                else:
+                    yield read
+
     @filtered
-    def readsInRange(self, refName, start, end, justIndices=False):
+    def readsInRange(self, refName, start, end, buffsize=50):
         """A generator of (usually) BamAlignment objects for the reads in one
         or more Bam files pointed to by the ExternalResources in this DataSet
         that have at least one coordinate within the specified range in the
@@ -1375,7 +1556,6 @@ class DataSet(object):
             start: the start of the range (inclusive, index relative to
                    reference)
             end: the end of the range (inclusive, index relative to reference)
-            justIndices: Not yet implemented
 
         Yields:
             BamAlignment objects
@@ -1388,44 +1568,90 @@ class DataSet(object):
             ...     print 'hn: %i' % read.holeNumber # doctest:+ELLIPSIS
             hn: ...
         """
-        # This all breaks pretty horribly if refName is actually a refId.
-        if not refName in self.refNames and not refName in self.fullRefNames:
-            # we can think of this as an id then, I guess
-            _ = int(refName)
-            # we need the real refName, which may be hidden behind a mapping to
-            # resolve duplicate refIds between resources...
-            refName = self._idToRname(refName)
+        if isinstance(refName, np.int64):
+            refName = str(refName)
+        if refName.isdigit():
+            if (not refName in self.refNames and
+                    not refName in self.fullRefNames):
+                # we need the real refName, which may be hidden behind a
+                # mapping to resolve duplicate refIds between resources...
+                refName = self._idToRname(int(refName))
+
+        # correct the cmp.h5 reference names before reads go out the door
+        if self.isCmpH5:
+            for res in self.resourceReaders():
+                for row in res.referenceInfoTable:
+                    row.FullName = row.FullName.split(' ')[0]
 
         # merge sort before yield
         if self.numExternalResources > 1:
-            if justIndices:
-                try:
-                    read_its = [iter(rr.readsInRange(refName, start, end,
-                                                     justIndices))
-                                for rr in self.resourceReaders()]
-                except Exception:
-                    log.warn("This would be faster with a .pbi file")
-                    read_its = [iter(rr.readsInRange(refName, start, end))
-                                for rr in self.resourceReaders()]
+            if buffsize > 1:
+                # create read/reader caches
+                read_its = [iter(rr.readsInRange(refName, start, end))
+                            for rr in self.resourceReaders()]
+                deep_buf = [[next(it, None) for _ in range(buffsize)]
+                            for it in read_its]
+
+                # remove empty iterators
+                read_its = [it for it, cur in zip(read_its, deep_buf)
+                            if cur[0]]
+                deep_buf = [buf for buf in deep_buf if buf[0]]
+
+                # populate starting values/scratch caches
+                buf_indices = [0 for _ in read_its]
+                tStarts = [cur[0].tStart for cur in deep_buf]
+
+                while len(read_its) != 0:
+                    # pick the first one to yield
+                    # this should be a bit faster than taking the min of an
+                    # enumeration of currents with a key function accessing a
+                    # field...
+                    first = min(tStarts)
+                    first_i = tStarts.index(first)
+                    buf_index = buf_indices[first_i]
+                    first = deep_buf[first_i][buf_index]
+                    # update the buffers
+                    buf_index += 1
+                    buf_indices[first_i] += 1
+                    if buf_index == buffsize:
+                        buf_index = 0
+                        buf_indices[first_i] = 0
+                        deep_buf[first_i] = [next(read_its[first_i], None)
+                                             for _ in range(buffsize)]
+                    if not deep_buf[first_i][buf_index]:
+                        del read_its[first_i]
+                        del tStarts[first_i]
+                        del deep_buf[first_i]
+                        del buf_indices[first_i]
+                    else:
+                        tStarts[first_i] = deep_buf[first_i][buf_index].tStart
+                    yield first
             else:
                 read_its = [iter(rr.readsInRange(refName, start, end))
                             for rr in self.resourceReaders()]
-            # buffer one element from each generator
-            currents = [next(its, None) for its in read_its]
-            # remove empty iterators
-            read_its = [it for it, cur in zip(read_its, currents) if cur]
-            currents = [cur for cur in currents if cur]
-            while len(read_its) != 0:
-                # pick the first one to yield
-                first_i, first = min(enumerate(currents),
-                                     key=lambda x: x[1].tStart)
-                # update the buffer
-                try:
-                    currents[first_i] = next(read_its[first_i])
-                except StopIteration:
-                    del read_its[first_i]
-                    del currents[first_i]
-                yield first
+                # buffer one element from each generator
+                currents = [next(its, None) for its in read_its]
+                # remove empty iterators
+                read_its = [it for it, cur in zip(read_its, currents) if cur]
+                currents = [cur for cur in currents if cur]
+                tStarts = [cur.tStart for cur in currents]
+                while len(read_its) != 0:
+                    # pick the first one to yield
+                    # this should be a bit faster than taking the min of an
+                    # enumeration of currents with a key function accessing a
+                    # field...
+                    first = min(tStarts)
+                    first_i = tStarts.index(first)
+                    first = currents[first_i]
+                    # update the buffers
+                    try:
+                        currents[first_i] = next(read_its[first_i])
+                        tStarts[first_i] = currents[first_i].tStart
+                    except StopIteration:
+                        del read_its[first_i]
+                        del currents[first_i]
+                        del tStarts[first_i]
+                    yield first
         else:
             # the above will work in either case, but this might be ever so
             # slightly faster
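The buffered loop above is a hand-rolled k-way merge keyed on tStart, with a per-resource cache of buffsize reads to cut down on generator overhead. The ordering idea, minus the buffering, can be sketched with heapq.merge; this is not the code path used here, and merged_reads/_keyed are hypothetical helpers:

    import heapq

    def _keyed(reads, tiebreak):
        # Decorate each read so heapq compares positions, not read objects
        for read in reads:
            yield (read.tStart, tiebreak, read)

    def merged_reads(readers, refName, start, end):
        # One positionally sorted stream per resource, merged lazily
        streams = [_keyed(rr.readsInRange(refName, start, end), i)
                   for i, rr in enumerate(readers)]
        for _, _, read in heapq.merge(*streams):
            yield read

Something like merged_reads(ds.resourceReaders(), refName, 0, refLen) would then yield reads in tStart order across all resources.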
@@ -1447,8 +1673,7 @@ class DataSet(object):
         if self.isCmpH5:
             rId -= 1
         if self.isCmpH5:
-            # This is so not how it is supposed to be. This is what CmpIO
-            # recognizes as the 'shortname' pretty much throughout...
+            # This is what CmpIO recognizes as the 'shortname'
             refName = self.resourceReaders()[
                 resNo].referenceInfoTable[rId]['FullName']
         else:
@@ -1463,8 +1688,8 @@ class DataSet(object):
             outfn: (None) the file to which the resouce filenames are to be
                    written. If None, the only emission is a returned list of
                    file names.
-            uri: T/F (False) write the resource filenames as URIs.
-            relative: (False) emit paths relative to outfofn or '.' if no
+            uri: (t/F) write the resource filenames as URIs.
+            relative: (t/F) emit paths relative to outfofn or '.' if no
                       outfofn
 
         Returns:
@@ -1475,7 +1700,7 @@ class DataSet(object):
 
         Doctest:
             >>> from pbcore.io import DataSet
-            >>> DataSet("bam1.bam", "bam2.bam").toFofn(uri=False)
+            >>> DataSet("bam1.bam", "bam2.bam", strict=False).toFofn(uri=False)
             ['bam1.bam', 'bam2.bam']
         """
         lines = [er.resourceId for er in self.externalResources]
@@ -1499,11 +1724,18 @@ class DataSet(object):
         files = self.externalResources.resourceIds
         tbr = []
         for fname in files:
-            tbr.append(self._resolveLocation(fname, '.'))
+            tbr.append(resolveLocation(fname, '.'))
         return tbr
 
     @property
-    def _castableTypes(self):
+    def _castableDataSetTypes(self):
+        if isinstance(self.datasetType, tuple):
+            return self.datasetType
+        else:
+            return (_toDsId('DataSet'), self.datasetType)
+
+    @classmethod
+    def castableTypes(cls):
         """The types to which this DataSet type may be cast. This is a property
         instead of a member variable as we can enforce casting limits here (and
         modify if needed by overriding them in subclasses).
@@ -1511,9 +1743,9 @@ class DataSet(object):
         Returns:
             A dictionary of MetaType->Class mappings, e.g. 'DataSet': DataSet
         """
-        if type(self).__name__ != 'DataSet':
+        if cls.__name__ != 'DataSet':
             return {'DataSet': DataSet,
-                    type(self).__name__: type(self)}
+                    cls.__name__: cls}
         return {'DataSet': DataSet,
                 'SubreadSet': SubreadSet,
                 'HdfSubreadSet': HdfSubreadSet,
@@ -1557,10 +1789,10 @@ class DataSet(object):
         needed
         """
         self._cachedFilters = []
-        self.updateCounts()
-        # TODO, somethign like this but that works:
-        #if self.metadata.summaryStats:
-            #self.metadata.summaryStats = None
+        self.metadata.totalLength = -1
+        self.metadata.numRecords = -1
+        if self.metadata.summaryStats:
+            self.metadata.removeChildren('SummaryStats')
 
     @property
     def numRecords(self):
@@ -1637,12 +1869,16 @@ class DataSet(object):
                                                            IndexedBamReader))
             return self._unifyResponses(res)
         except ResourceMismatchError:
-            log.error("Resources inconsistently indexed")
-            return False
+            if not self._strict:
+                log.error("Resources inconsistently indexed")
+                return False
+            else:
+                raise
 
     def referenceInfo(self, refName):
         """Select a row from the DataSet.referenceInfoTable using the reference
         name as a unique key"""
+        # TODO: upgrade to use _referenceDict
         if not self.isCmpH5:
             for row in self.referenceInfoTable:
                 if row.Name == refName:
@@ -1659,22 +1895,27 @@ class DataSet(object):
         is preferred). Record.Names are remapped for cmp.h5 files to be
         consistent with bam files.
         """
-        # This isn't really possible for cmp.h5 files (rowStart, rowEnd, for
-        # instance). Use the resource readers directly instead.
-        responses = self._pollResources(lambda x: x.referenceInfoTable)
-        if len(responses) > 1:
-            assert not self.isCmpH5 # see above
-            tbr = reduce(np.append, responses)
-            tbr = np.unique(tbr)
-            for i, rec in enumerate(tbr):
-                rec.ID = i
-            return tbr
-        else:
-            table = responses[0]
-            if self.isCmpH5:
-                for rec in table:
-                    rec.Name = self._cleanCmpName(rec.FullName)
-            return table
+        if self._referenceInfoTable is None:
+            # This isn't really right for cmp.h5 files (rowStart, rowEnd, for
+            # instance). Use the resource readers directly instead.
+            responses = self._pollResources(lambda x: x.referenceInfoTable)
+            if len(responses) > 1:
+                assert not self.isCmpH5 # see above
+                tbr = np.concatenate(responses)
+                tbr = np.unique(tbr)
+                for i, rec in enumerate(tbr):
+                    rec.ID = i
+                self._referenceInfoTable = tbr
+            else:
+                table = responses[0]
+                if self.isCmpH5:
+                    for rec in table:
+                        rec.Name = self._cleanCmpName(rec.FullName)
+                self._referenceInfoTable = table
+            #TODO: Turn on when needed
+            #self._referenceDict.update(zip(self.refIds.values(),
+                                           #self._referenceInfoTable))
+        return self._referenceInfoTable
 
     @property
     def _referenceIdMap(self):
@@ -1716,15 +1957,33 @@ class DataSet(object):
         responses = self._pollResources(lambda x: x.pulseFeaturesAvailable())
         return self._unifyResponses(responses)
 
-    def __getattr__(self, key):
-        identicalList = ['sequencingChemistry', 'isSorted', 'isEmpty',
-                         'readType', 'tStart', 'tEnd']
-        if key in identicalList:
-            responses = self._pollResources(lambda x: getattr(x, key))
-            return self._unifyResponses(responses)
-        else:
-            raise AttributeError("{c} has no attribute {a}".format(
-                c=self.__class__.__name__, a=key))
+    @property
+    def sequencingChemistry(self):
+        return self._checkIdentical('sequencingChemistry')
+
+    @property
+    def isSorted(self):
+        return self._checkIdentical('isSorted')
+
+    @property
+    def isEmpty(self):
+        return self._checkIdentical('isEmpty')
+
+    @property
+    def readType(self):
+        return self._checkIdentical('readType')
+
+    @property
+    def tStart(self):
+        return self._checkIdentical('tStart')
+
+    @property
+    def tEnd(self):
+        return self._checkIdentical('tEnd')
+
+    def _checkIdentical(self, key):
+        responses = self._pollResources(lambda x: getattr(x, key))
+        return self._unifyResponses(responses)
 
     def _chunkList(self, inlist, chunknum, balanceKey=len):
         """Divide <inlist> members into <chunknum> chunks roughly evenly using
@@ -1740,10 +1999,17 @@ class DataSet(object):
             chunkSizes.sort()
             # Add one to the emptiest bin
             chunks[chunkSizes[0][1]].append(item)
-            chunkSizes[0][0] += balanceKey(item)
+            mass = balanceKey(item)
+            if mass == 0:
+                mass += 1
+            chunkSizes[0][0] += mass
         return chunks
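
A minimal standalone sketch of the greedy balancing used by _chunkList
(hypothetical helper name, not the pbcore code): each item goes to the
currently lightest chunk, and zero-mass items still count as one so empty
records get spread around rather than piling up.

    def chunk_list(items, nchunks, balance_key=len):
        # [running mass, chunk index] pairs; always fill the lightest chunk
        chunks = [[] for _ in range(nchunks)]
        sizes = [[0, i] for i in range(nchunks)]
        for item in items:
            sizes.sort()
            chunks[sizes[0][1]].append(item)
            sizes[0][0] += balance_key(item) or 1
        return chunks

    print(chunk_list(['aaaa', 'bbb', 'cc', 'd', ''], 2))
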
 
-class ResourceMismatchError(Exception):
+class InvalidDataSetIOError(Exception):
+    """The base class for all DataSetIO related custom exceptions (hopefully)
+    """
+
+class ResourceMismatchError(InvalidDataSetIOError):
 
     def __init__(self, responses):
         super(ResourceMismatchError, self).__init__()
@@ -1753,13 +2019,46 @@ class ResourceMismatchError(Exception):
         return "Resources responded differently: " + ', '.join(
             map(str, self.responses))
 
+
 class ReadSet(DataSet):
     """Base type for read sets, should probably never be used as a concrete
     class"""
 
-    def __init__(self, *files):
-        super(ReadSet, self).__init__()
-        self._metadata = SubreadSetMetadata()
+    def __init__(self, *files, **kwargs):
+        super(ReadSet, self).__init__(*files, **kwargs)
+        self._metadata = SubreadSetMetadata(self._metadata)
+
+    def _openFiles(self):
+        """Open the files (assert they exist, assert they are of the proper
+        type before accessing any file)
+        """
+        if self._openReaders:
+            log.debug("Closing old readers...")
+            self.close()
+        log.debug("Opening SubreadSet resources")
+        for extRes in self.externalResources:
+            refFile = extRes.reference
+            log.debug("Using reference: {r}".format(r=refFile))
+            location = urlparse(extRes.resourceId).path
+            try:
+                resource = openIndexedAlignmentFile(
+                    location,
+                    referenceFastaFname=refFile)
+            except (IOError, ValueError):
+                if not self._strict:
+                    log.info("pbi file missing for {f}, operating with "
+                             "reduced speed and functionality".format(
+                                 f=location))
+                    resource = openAlignmentFile(
+                        location, referenceFastaFname=refFile)
+                else:
+                    raise
+            if not resource:
+                raise IOError(errno.EIO,
+                              "{f} fails to open".format(f=location),
+                              location)
+            self._openReaders.append(resource)
+        log.debug("Done opening resources")
 
     def addMetadata(self, newMetadata, **kwargs):
         """Add metadata specific to this subtype, while leaning on the
@@ -1779,10 +2078,47 @@ class ReadSet(DataSet):
         # Pull generic values, kwargs, general treatment in super
         super(ReadSet, self).addMetadata(newMetadata, **kwargs)
 
+    @property
+    def _length(self):
+        """Used to populate metadata in updateCounts"""
+        length = -1
+        count = -1
+        return count, length
+
+    def __len__(self):
+        if self.numRecords == -1:
+            if self._filters:
+                self.updateCounts()
+            else:
+                count = 0
+                for reader in self.resourceReaders():
+                    count += len(reader)
+                self.numRecords = count
+        return self.numRecords
+
 class HdfSubreadSet(ReadSet):
 
     datasetType = DataSetMetaTypes.HDF_SUBREAD
 
+    def __init__(self, *files, **kwargs):
+        log.debug("Opening HdfSubreadSet")
+        kwargs['skipCounts'] = True
+        super(HdfSubreadSet, self).__init__(*files, **kwargs)
+
+    def _openFiles(self):
+        """Open the files (assert they exist, assert they are of the proper
+        type before accessing any file)
+        """
+        if self._openReaders:
+            log.debug("Closing old readers...")
+            self.close()
+        log.debug("Opening resources")
+        for extRes in self.externalResources:
+            location = urlparse(extRes.resourceId).path
+            resource = BaxH5Reader(location)
+            self._openReaders.append(resource)
+        log.debug("Done opening resources")
+
     @staticmethod
     def _metaTypeMapping():
         return {'bax.h5':'PacBio.SubreadFile.SubreadBaxFile', }
@@ -1793,11 +2129,11 @@ class SubreadSet(ReadSet):
 
     DocTest:
 
-        >>> from pbcore.io import DataSet, SubreadSet
+        >>> from pbcore.io import SubreadSet
         >>> from pbcore.io.dataset.DataSetMembers import ExternalResources
         >>> import pbcore.data.datasets as data
-        >>> ds1 = DataSet(data.getXml(no=5))
-        >>> ds2 = DataSet(data.getXml(no=5))
+        >>> ds1 = SubreadSet(data.getXml(no=5))
+        >>> ds2 = SubreadSet(data.getXml(no=5))
         >>> # So they don't conflict:
         >>> ds2.externalResources = ExternalResources()
         >>> ds1 # doctest:+ELLIPSIS
@@ -1826,6 +2162,29 @@ class SubreadSet(ReadSet):
 
     datasetType = DataSetMetaTypes.SUBREAD
 
+    def __init__(self, *files, **kwargs):
+        log.debug("Opening SubreadSet")
+        super(SubreadSet, self).__init__(*files, **kwargs)
+
+    @property
+    def _length(self):
+        """Used to populate metadata in updateCounts"""
+        length = 0
+        count = 0
+        endkey = 'qEnd'
+        startkey = 'qStart'
+        for rec in self.indexRecords:
+            if isinstance(rec, np.ndarray):
+                count += len(rec)
+                length += sum(rec[endkey] - rec[startkey])
+            elif isinstance(rec, PacBioBamIndex):
+                count += len(rec)
+                length += sum(rec.aEnd - rec.aStart)
+            else:
+                count += 1
+                length += rec.aEnd - rec.aStart
+        return count, length
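
The totals here are just column arithmetic on the index (qStart/qEnd for
subreads); a tiny numpy illustration of the same sums with made-up data:

    import numpy as np

    pbi = np.array([(0, 450), (500, 980), (0, 300)],
                   dtype=[('qStart', int), ('qEnd', int)])
    count = len(pbi)
    length = int((pbi['qEnd'] - pbi['qStart']).sum())
    print("%d %d" % (count, length))  # 3 1230
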
+
     @staticmethod
     def _metaTypeMapping():
         # This doesn't work for scraps.bam, whenever that is implemented
@@ -1842,13 +2201,8 @@ class ConsensusReadSet(ReadSet):
 
     Doctest:
         >>> import pbcore.data.datasets as data
-        >>> from pbcore.io import DataSet, ConsensusReadSet
-        >>> ds1 = DataSet(data.getXml(2))
-        >>> ds1 # doctest:+ELLIPSIS
-        <ConsensusReadSet...
-        >>> ds1._metadata # doctest:+ELLIPSIS
-        <SubreadSetMetadata...
-        >>> ds2 = ConsensusReadSet(data.getXml(2))
+        >>> from pbcore.io import ConsensusReadSet
+        >>> ds2 = ConsensusReadSet(data.getXml(2), strict=False)
         >>> ds2 # doctest:+ELLIPSIS
         <ConsensusReadSet...
         >>> ds2._metadata # doctest:+ELLIPSIS
@@ -1857,22 +2211,107 @@ class ConsensusReadSet(ReadSet):
 
     datasetType = DataSetMetaTypes.CCS
 
-class AlignmentSet(DataSet):
+class AlignmentSet(ReadSet):
     """DataSet type specific to Alignments. No type specific Metadata exists,
     so the base class version is OK (this just ensures type representation on
     output and expandability)"""
 
     datasetType = DataSetMetaTypes.ALIGNMENT
 
+    def __init__(self, *files, **kwargs):
+        """ An AlignmentSet
+
+        Args:
+            *files: handled by super
+            referenceFastaFname=None: the reference fasta filename for this
+                                      alignment.
+            strict=False: see base class
+            skipCounts=False: see base class
+        """
+        log.debug("Opening AlignmentSet with {f}".format(f=files))
+        super(AlignmentSet, self).__init__(*files, **kwargs)
+        fname = kwargs.get('referenceFastaFname', None)
+        if fname:
+            self.addReference(fname)
+
     def addReference(self, fname):
-        reference = ReferenceSet(fname).externalResources.resourceIds
+        if isinstance(fname, ReferenceSet):
+            reference = fname.externalResources.resourceIds
+        else:
+            reference = ReferenceSet(fname).externalResources.resourceIds
         if len(reference) > 1:
-            log.warn("Multiple references found, cannot open with reads")
+            if len(reference) != self.numExternalResources:
+                raise ResourceMismatchError(
+                    "More than one reference found, but not enough for one "
+                    "per resource")
+            for res, ref in zip(self.externalResources, reference):
+                res.reference = ref
         else:
-            self._openFiles(refFile=reference[0])
+            for res in self.externalResources:
+                res.reference = reference[0]
+            self._openFiles()
 
     @property
-    def records(self):
+    def _referenceFile(self):
+        responses = [res.reference for res in self.externalResources]
+        return self._unifyResponses(responses)
+
+    def updateCounts(self):
+        if self._skipCounts:
+            log.debug("SkipCounts is true, skipping updateCounts()")
+            self.metadata.totalLength = -1
+            self.metadata.numRecords = -1
+            return
+        try:
+            self.assertIndexed()
+            log.debug('Updating counts')
+            numRecords, totalLength = self._length
+            self.metadata.totalLength = totalLength
+            self.metadata.numRecords = numRecords
+        # I would prefer to just catch IOError and UnavailableFeature
+        except (IOError, UnavailableFeature):
+            if not self._strict:
+                log.debug("File problem, metadata not populated")
+                self.metadata.totalLength = -1
+                self.metadata.numRecords = -1
+            else:
+                raise
+
+    @property
+    def _length(self):
+        """Used to populate metadata in updateCounts"""
+        length = 0
+        count = 0
+        endkey = 'aEnd'
+        startkey = 'aStart'
+        if self.isCmpH5:
+            endkey = 'rEnd'
+            startkey = 'rStart'
+        for rec in self.indexRecords:
+            if isinstance(rec, np.ndarray):
+                count += len(rec)
+                length += sum(rec[endkey] - rec[startkey])
+            elif isinstance(rec, PacBioBamIndex):
+                count += len(rec)
+                length += sum(rec.aEnd - rec.aStart)
+            else:
+                count += 1
+                length += rec.aEnd - rec.aStart
+        return count, length
+
+    def __len__(self):
+        if self.numRecords == -1:
+            if self._filters:
+                self.updateCounts()
+            else:
+                count = 0
+                for reader in self.resourceReaders():
+                    count += len(reader)
+                self.numRecords = count
+        return self.numRecords
+
+    @property
+    def recordsByReference(self):
         """ The records in this AlignmentSet, sorted by tStart. """
         # we only care about aligned sequences here, so we can make this a
         # chain of readsInReferences to add pre-filtering by rname, instead of
@@ -1891,46 +2330,158 @@ class AlignmentSet(DataSet):
                }
 
 
-class ReferenceSet(DataSet):
-    """DataSet type specific to References"""
+class ContigSet(DataSet):
+    """DataSet type specific to Contigs"""
 
-    datasetType = DataSetMetaTypes.REFERENCE
+    datasetType = DataSetMetaTypes.CONTIG
+
+    def __init__(self, *files, **kwargs):
+        log.debug("Opening ContigSet")
+        super(ContigSet, self).__init__(*files, **kwargs)
+        self._metadata = ContigSetMetadata(self._metadata)
+        self._updateMetadata()
+
+    def consolidate(self, outfn=None):
+        """Consolidation should be implemented for window text in names and
+        for filters in ContigSets"""
+
+        # In general "name" here refers to the contig.id only, which is why we
+        # also have to keep track of comments.
+        log.debug("Beginning consolidation")
+        # Get the possible keys
+        names = self.contigNames
+        winless_names = [self._removeWindow(name) for name in names]
+
+        # Put them into buckets
+        matches = {}
+        for con in self.contigs:
+            conId = con.id
+            window = self._parseWindow(conId)
+            if not window is None:
+                conId = self._removeWindow(conId)
+            if conId not in matches:
+                matches[conId] = [con]
+            else:
+                matches[conId].append(con)
+        for name, match_list in matches.items():
+            matches[name] = np.array(match_list)
+
+        writeTemp = False
+        # consolidate multiple files into one
+        if len(self.toExternalFiles()) > 1:
+            writeTemp = True
+        writeMatches = {}
+        writeComments = {}
+        for name, match_list in matches.items():
+            if len(match_list) > 1:
+                log.debug("Multiple matches found for {i}".format(i=name))
+                # look for the window indication scheme used by quiver:
+                windows = np.array([self._parseWindow(match.id)
+                                    for match in match_list])
+                for win in windows:
+                    if win is None:
+                        log.debug("Windows not found for all items with a "
+                                  "matching id, consolidation aborted")
+                        return
+                # order windows
+                order = np.argsort([window[0] for window in windows])
+                match_list = match_list[order]
+                windows = windows[order]
+                # TODO: check to make sure windows/lengths are compatible,
+                # complete
+
+                # collapse matches
+                new_name = self._removeWindow(name)
+                new_seq = ''.join([match.sequence for match in match_list])
+
+                # set to write
+                writeTemp = True
+                writeMatches[new_name] = new_seq
+                writeComments[new_name] = match_list[0].comment
+            else:
+                log.debug("One match found for {i}".format(i=name))
+                writeMatches[name] = match_list[0].sequence
+                writeComments[name] = match_list[0].comment
+        if writeTemp:
+            log.debug("Writing a new file is necessary")
+            if not outfn:
+                log.debug("Writing to a temp directory as no path given")
+                outdir = tempfile.mkdtemp(suffix="consolidated-contigset")
+                outfn = os.path.join(outdir,
+                                     'consolidated.contigset.xml')
+            with FastaWriter(outfn) as outfile:
+                log.debug("Writing new resource {o}".format(o=outfn))
+                for name, seq in writeMatches.items():
+                    if writeComments[name]:
+                        name = ' '.join([name, writeComments[name]])
+                    outfile.writeRecord(name, seq)
+            # replace resources
+            log.debug("Replacing resources")
+            self.externalResources = ExternalResources()
+            self.addExternalResources([outfn])
+            # replace contig info
+            log.debug("Replacing metadata")
+            self._metadata.contigs = []
+            self._populateContigMetadata()
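
A hedged usage sketch, assuming ContigSet is exported from pbcore.io like
the other set types and that the input is quiver output where a contig was
polished in windows (file names illustrative):

    from pbcore.io import ContigSet

    cset = ContigSet('consensus_windows.fasta')
    # merges window pieces like 'contig1_0_5000|quiver' back into one
    # record per contig, writing a new consolidated fasta
    cset.consolidate(outfn='consolidated.fasta')
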
+
+    def _popSuffix(self, name):
+        observedSuffixes = ['|quiver']
+        for suff in observedSuffixes:
+            if name.endswith(suff):
+                log.debug("Suffix found: {s}".format(s=suff))
+                return name.replace(suff, ''), suff
+        return name, ''
+
+    def _removeWindow(self, name):
+        if isinstance(self._parseWindow(name), np.ndarray):
+            name, suff = self._popSuffix(name)
+            return '_'.join(name.split('_')[:-2]) + suff
+        return name
+
+    def _parseWindow(self, name):
+        name, _ = self._popSuffix(name)
+        possibilities = name.split('_')[-2:]
+        for pos in possibilities:
+            if not pos.isdigit():
+                return None
+        return np.array(map(int, possibilities))
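
The window parsing relies on quiver-style ids of the form
'<contig>_<start>_<end>', optionally ending in '|quiver'; a small
standalone illustration of that naming convention (hypothetical function,
not the class method):

    import numpy as np

    def parse_window(name, suffixes=('|quiver',)):
        # strip a known suffix, then read the last two '_' fields as start/end
        for suff in suffixes:
            if name.endswith(suff):
                name = name[:-len(suff)]
        fields = name.split('_')[-2:]
        if len(fields) == 2 and all(f.isdigit() for f in fields):
            return np.array([int(f) for f in fields])
        return None

    print(parse_window('contig1_0_5000|quiver'))  # [   0 5000]
    print(parse_window('contig1'))                # None
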
+
+    def _updateMetadata(self):
+        # update contig specific metadata:
+        if not self._metadata.organism:
+            self._metadata.organism = ''
+        if not self._metadata.ploidy:
+            self._metadata.ploidy = ''
+        if not self._metadata.contigs:
+            self._metadata.contigs = []
+            self._populateContigMetadata()
+
+    def _populateContigMetadata(self):
+        for contig in self.contigs:
+            self._metadata.addContig(contig)
 
     def updateCounts(self):
+        if self._skipCounts:
+            self.metadata.totalLength = -1
+            self.metadata.numRecords = -1
+            return
         try:
-            self.metadata.numRecords = 0
+            log.debug('Updating counts')
             self.metadata.totalLength = 0
+            self.metadata.numRecords = 0
             for res in self.resourceReaders():
-                #self.metadata.numRecords += len(res)
-                self.metadata.numRecords += sum(1 for _ in res)
+                self.metadata.numRecords += len(res)
                 for index in res.fai:
                     self.metadata.totalLength += index.length
-        except Exception:
-            # This will be fixed soon, or log an appropriate error
-            # (additionally, unindexed fasta files will soon be disallowed)
-            self.metadata.numRecords = -1
-            self.metadata.totalLength = -1
-
-    def __init__(self, *files):
-        super(ReferenceSet, self).__init__()
-        self._metadata = ReferenceSetMetadata()
-        self._indexedOnly = False
-
-    def processFilters(self):
-        # Allows us to not process all of the filters each time. This is marked
-        # as dirty (= []) by addFilters etc. Filtration can be turned off by
-        # setting this to [lambda x: True], which can be reversed by marking
-        # the cache dirty. See disableFilters/enableFilters
-        if self._cachedFilters:
-            return self._cachedFilters
-        filters = self.filters.tests(readType="fasta")
-        # Having no filters means no opportunity to pass. Fix by filling with
-        # always-true (e.g. disableFilters())
-        if not filters:
-            self._cachedFilters = [lambda x: True]
-            return self._cachedFilters
-        self._cachedFilters = filters
-        return filters
+        except (IOError, UnavailableFeature, TypeError):
+            # IOError for missing files
+            # UnavailableFeature for files without companion files
+            # TypeError for FastaReader, which doesn't have a len()
+            if not self._strict:
+                self.metadata.totalLength = -1
+                self.metadata.numRecords = -1
+            else:
+                raise
 
     def addMetadata(self, newMetadata, **kwargs):
         """Add metadata specific to this subtype, while leaning on the
@@ -1939,16 +2490,16 @@ class ReferenceSet(DataSet):
         # Validate, clean and prep input data
         if newMetadata:
             if isinstance(newMetadata, dict):
-                newMetadata = ReferenceSetMetadata(newMetadata)
-            elif isinstance(newMetadata, ReferenceSetMetadata) or (
+                newMetadata = ContigSetMetadata(newMetadata)
+            elif isinstance(newMetadata, ContigSetMetadata) or (
                     type(newMetadata).__name__ == 'DataSetMetadata'):
-                newMetadata = ReferenceSetMetadata(newMetadata.record)
+                newMetadata = ContigSetMetadata(newMetadata.record)
             else:
-                raise TypeError("Cannot extend ReferenceSetMetadata with "
+                raise TypeError("Cannot extend ContigSetMetadata with "
                                 "{t}".format(t=type(newMetadata).__name__))
 
         # Pull generic values, kwargs, general treatment in super
-        super(ReferenceSet, self).addMetadata(newMetadata, **kwargs)
+        super(ContigSet, self).addMetadata(newMetadata, **kwargs)
 
         # Pull subtype specific values where important
         if newMetadata:
@@ -1968,30 +2519,16 @@ class ReferenceSet(DataSet):
             try:
                 resource = IndexedFastaReader(location)
             except IOError:
-                if not self._indexedOnly:
+                if not self._strict:
                     log.debug('Companion reference index (.fai) missing. '
-                             'Use "samtools faidx <refname>" to generate one.')
+                              'Use "samtools faidx <refname>" to generate '
+                              'one.')
                     resource = FastaReader(location)
                 else:
                     raise
             self._openReaders.append(resource)
         log.debug("Done opening resources")
 
-    def assertIndexed(self):
-        self._indexedOnly = True
-        self._openFiles()
-        return True
-
-    @property
-    def isIndexed(self):
-        try:
-            res = self._pollResources(
-                lambda x: isinstance(x, IndexedFastaReader))
-            return self._unifyResponses(res)
-        except ResourceMismatchError:
-            log.error("Not all resource are equally indexed.")
-            return False
-
     def resourceReaders(self, refName=None):
         """A generator of fastaReader objects for the ExternalResources in this
         ReferenceSet.
@@ -2012,20 +2549,6 @@ class ReferenceSet(DataSet):
             log.error("Specifying a contig name not yet implemented")
         self._openFiles()
         return self._openReaders
-        #for resource in self._openReaders:
-            #yield resource
-        #self.close()
-
-    @property
-    def refNames(self):
-        """The reference names assigned to the External Resources, or contigs,
-        if no name assigned."""
-        refNames = []
-        for contig in self.contigs:
-            if (self.noFiltering
-                    or self._filters.testParam('id', contig.id, str)):
-                refNames.append(contig.id)
-        return sorted(list(set(refNames)))
 
     @property
     @filtered
@@ -2052,58 +2575,97 @@ class ReferenceSet(DataSet):
 
     def get_contig(self, contig_id):
         """Get a contig by ID"""
+        # TODO: have this use getitem for indexed fasta readers:
         for contig in self.contigs:
             if contig.id == contig_id or contig.name == contig_id:
                 return contig
 
+    def assertIndexed(self):
+        self._strict = True
+        self._openFiles()
+        return True
+
+    @property
+    def isIndexed(self):
+        try:
+            res = self._pollResources(
+                lambda x: isinstance(x, IndexedFastaReader))
+            return self._unifyResponses(res)
+        except ResourceMismatchError:
+            if not self._strict:
+                log.error("Not all resource are equally indexed.")
+                return False
+            else:
+                raise
+
+    @property
+    def contigNames(self):
+        """The names assigned to the External Resources, or contigs if no name
+        assigned."""
+        names = []
+        for contig in self.contigs:
+            if (self.noFiltering
+                    or self._filters.testParam('id', contig.id, str)):
+                names.append(contig.id)
+        return sorted(list(set(names)))
+
     @staticmethod
     def _metaTypeMapping():
-        return {'fasta':'PacBio.ReferenceFile.ReferenceFastaFile',
+        return {'fasta':'PacBio.ContigFile.ContigFastaFile',
                 'fai':'PacBio.Index.SamIndex',
                 'sa':'PacBio.Index.SaWriterIndex',
                }
 
 
-class ContigSet(DataSet):
-    """DataSet type specific to Contigs"""
+class ReferenceSet(ContigSet):
+    """DataSet type specific to References"""
 
-    datasetType = DataSetMetaTypes.CONTIG
+    datasetType = DataSetMetaTypes.REFERENCE
 
-    def __init__(self, *files):
-        super(ContigSet, self).__init__()
-        self._metadata = ContigSetMetadata()
+    def __init__(self, *files, **kwargs):
+        log.debug("Opening ReferenceSet")
+        super(ReferenceSet, self).__init__(*files, **kwargs)
 
-    def addMetadata(self, newMetadata, **kwargs):
-        """Add metadata specific to this subtype, while leaning on the
-        superclass method for generic metadata. Also enforce metadata type
-        correctness."""
-        # Validate, clean and prep input data
-        if newMetadata:
-            if isinstance(newMetadata, dict):
-                newMetadata = ContigSetMetadata(newMetadata)
-            elif isinstance(newMetadata, ContigSetMetadata) or (
-                    type(newMetadata).__name__ == 'DataSetMetadata'):
-                newMetadata = ContigSetMetadata(newMetadata.record)
-            else:
-                raise TypeError("Cannot extend ContigSetMetadata with "
-                                "{t}".format(t=type(newMetadata).__name__))
+    def processFilters(self):
+        # Allows us to not process all of the filters each time. This is marked
+        # as dirty (= []) by addFilters etc. Filtration can be turned off by
+        # setting this to [lambda x: True], which can be reversed by marking
+        # the cache dirty. See disableFilters/enableFilters
+        if self._cachedFilters:
+            return self._cachedFilters
+        filters = self.filters.tests(readType="fasta")
+        # Having no filters means no opportunity to pass. Fix by filling with
+        # always-true (e.g. disableFilters())
+        if not filters:
+            self._cachedFilters = [lambda x: True]
+            return self._cachedFilters
+        self._cachedFilters = filters
+        return filters
 
-        # Pull generic values, kwargs, general treatment in super
-        super(ContigSet, self).addMetadata(newMetadata, **kwargs)
+    @property
+    def refNames(self):
+        """The reference names assigned to the External Resources, or contigs
+        if no name assigned."""
+        return self.contigNames
+
+
+    @staticmethod
+    def _metaTypeMapping():
+        return {'fasta':'PacBio.ReferenceFile.ReferenceFastaFile',
+                'fai':'PacBio.Index.SamIndex',
+                'sa':'PacBio.Index.SaWriterIndex',
+               }
 
-        # Pull subtype specific values where important
-        if newMetadata:
-            if newMetadata.contigs:
-                self._metadata.contigs.extend(newMetadata.contigs)
 
 class BarcodeSet(DataSet):
     """DataSet type specific to Barcodes"""
 
     datasetType = DataSetMetaTypes.BARCODE
 
-    def __init__(self, *files):
-        super(BarcodeSet, self).__init__()
-        self._metadata = BarcodeSetMetadata()
+    def __init__(self, *files, **kwargs):
+        log.debug("Opening BarcodeSet")
+        super(BarcodeSet, self).__init__(*files, **kwargs)
+        self._metadata = BarcodeSetMetadata(self._metadata)
 
     def addMetadata(self, newMetadata, **kwargs):
         """Add metadata specific to this subtype, while leaning on the
diff --git a/pbcore/io/dataset/DataSetMembers.py b/pbcore/io/dataset/DataSetMembers.py
index 809a218..4a1f891 100755
--- a/pbcore/io/dataset/DataSetMembers.py
+++ b/pbcore/io/dataset/DataSetMembers.py
@@ -49,6 +49,7 @@ serve two purposes:
 import copy
 import operator as OP
 import re
+import numpy as np
 import logging
 from functools import partial as P
 
@@ -288,9 +289,14 @@ class RecordWrapper(object):
     def description(self):
         return self.getV('attrib', 'Description')
 
+    @description.setter
+    def description(self, value):
+        return self.setV(value, 'attrib', 'Description')
+
 def filter_read(accessor, operator, value, read):
     return operator(accessor(read), value)
 
+
 class Filters(RecordWrapper):
 
     def __init__(self, record=None):
@@ -419,6 +425,21 @@ class Filters(RecordWrapper):
                 'tend': (lambda x: int(x.tStart)), # see above
                }
 
+    def _pbiVecAccMap(self, tIdMap):
+        # tStart/tEnd is a hack for overlapping ranges. you're not testing
+        # whether the tStart/tEnd is within a range, you're testing if it
+        # overlaps with the tStart/tEnd in the filter (overlaps with a
+        # reference window)
+        return {'rname': (lambda x, m=tIdMap: m[x.tId]),
+                'length': (lambda x: x.aEnd - x.aStart),
+                'qname': (lambda x: x.qId),
+                'zm': (lambda x: x.holeNumber),
+                'pos': (lambda x: x.tStart),
+                'readstart': (lambda x: x.aStart),
+                'tstart': (lambda x: x.tEnd), # see above
+                'tend': (lambda x: x.tStart), # see above
+               }
+
     @property
     def _bamTypeMap(self):
         return {'rname': str,
@@ -463,9 +484,31 @@ class Filters(RecordWrapper):
                 value = typeMap[param](req.value)
                 operator = self.opMap(req.operator)
                 reqTests.append(P(filter_read, accMap[param], operator, value))
-            tests.append(lambda x, reqTests=reqTests: all([f(x) for f in reqTests]))
+            tests.append(
+                lambda x, reqTests=reqTests: all([f(x) for f in reqTests]))
         return tests
 
+    def filterIndexRecords(self, indexRecords, nameMap):
+        filterResultList = []
+        typeMap = self._bamTypeMap
+        accMap = self._pbiVecAccMap({})
+        accMap['rname'] = lambda x: x.tId
+        for filt in self:
+            thisFilterResultList = []
+            for req in filt:
+                param = req.name
+                value = typeMap[param](req.value)
+                if param == 'rname':
+                    value = nameMap[value]
+                operator = self.opMap(req.operator)
+                reqResultsForRecords = operator(accMap[param](indexRecords),
+                                                value)
+                thisFilterResultList.append(reqResultsForRecords)
+            thisFilterResultList = np.logical_and.reduce(thisFilterResultList)
+            filterResultList.append(thisFilterResultList)
+        filterResultList = np.logical_or.reduce(filterResultList)
+        return filterResultList
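
filterIndexRecords evaluates each requirement as one vectorized comparison
over the whole index, ANDs the requirements within a filter, and ORs the
filters together; a self-contained sketch of that shape on a plain numpy
record array (field names and data are illustrative):

    import numpy as np
    import operator as OP

    # stand-in for a .pbi index: one row per record
    index = np.array([(0, 100, 500), (1, 0, 50), (0, 600, 900)],
                     dtype=[('tId', int), ('tStart', int), ('tEnd', int)])

    # two alternative filters (OR of filters, AND of requirements)
    filters = [[('tId', OP.eq, 0), ('tStart', OP.ge, 200)],
               [('tEnd', OP.le, 60)]]

    per_filter = [np.logical_and.reduce(
                      [op(index[field], value) for field, op, value in reqs])
                  for reqs in filters]
    keep = np.logical_or.reduce(per_filter)
    print(index[keep])  # rows passing at least one filter
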
+
     def addRequirement(self, **kwargs):
         """Use this to add requirements. Members of the list will be considered
         options for fulfilling this requirement, all other filters will be
@@ -684,13 +727,6 @@ class ExternalResource(RecordWrapper):
             return True
         return False
 
-    #def __getitem__(self, index):
-        #return FileIndex(self.record['children'][index])
-
-    #def __iter__(self):
-        #for child in self.record['children']:
-            #yield FileIndex(child)
-
     def merge(self, other):
         if self.metaType:
             if self.metaType != other.metaType:
@@ -724,13 +760,76 @@ class ExternalResource(RecordWrapper):
         self.setV(value, 'attrib', 'Tags')
 
     @property
+    def bam(self):
+        return self.resourceId
+
+    @property
+    def pbi(self):
+        indices = self.indices
+        for index in indices:
+            if index.metaType == 'PacBio.Index.PacBioIndex':
+                return index.resourceId
+
+    @property
+    def bai(self):
+        indices = self.indices
+        for index in indices:
+            if index.metaType == 'PacBio.Index.BamIndex':
+                return index.resourceId
+
+    @property
+    def scraps(self):
+        return self._getSubResByMetaType('PacBio.SubreadFile.ScrapsBamFile')
+
+    @scraps.setter
+    def scraps(self, value):
+        self._setSubResByMetaType('PacBio.SubreadFile.ScrapsBamFile', value)
+
+    @property
+    def reference(self):
+        return self._getSubResByMetaType(
+            'PacBio.ReferenceFile.ReferenceFastaFile')
+
+    @reference.setter
+    def reference(self, value):
+        self._setSubResByMetaType('PacBio.ReferenceFile.ReferenceFastaFile',
+                                  value)
+
+    def _getSubResByMetaType(self, mType):
+        resources = self.externalResources
+        for res in resources:
+            if res.metaType == mType:
+                return res.resourceId
+
+    def _setSubResByMetaType(self, mType, value):
+        tmp = ExternalResource()
+        tmp.resourceId = value
+        tmp.metaType = mType
+        resources = self.externalResources
+        # externalresources objects have a tag by default, which means their
+        # truthiness is true. Perhaps a truthiness change is in order
+        if len(resources) == 0:
+            resources = ExternalResources()
+            resources.append(tmp)
+            self.append(resources)
+        else:
+            resources.append(tmp)
+
+    @property
+    def externalResources(self):
+        current = list(self.findChildren('ExternalResources'))
+        if current:
+            return ExternalResources(current[0])
+        else:
+            return ExternalResources()
+
+    @property
     def indices(self):
         current = list(self.findChildren('FileIndices'))
         if current:
             return FileIndices(current[0])
         else:
             return FileIndices()
-        #return [ind for ind in self]
 
     @indices.setter
     def indices(self, indexList):
@@ -890,18 +989,18 @@ class SubreadSetMetadata(DataSetMetadata):
                                             container='children'))
 
 
-class ReferenceSetMetadata(DataSetMetadata):
-    """The DataSetMetadata subtype specific to ReferenceSets."""
+class ContigSetMetadata(DataSetMetadata):
+    """The DataSetMetadata subtype specific to ContigSets."""
 
 
     def __init__(self, record=None):
         if record:
             if (not isinstance(record, dict) and
-                    not isinstance(record, ReferenceSetMetadata) and
+                    not isinstance(record, ContigSetMetadata) and
                     type(record).__name__ != 'DataSetMetadata'):
-                raise TypeError("Cannot create ReferenceSetMetadata from "
+                raise TypeError("Cannot create ContigSetMetadata from "
                                 "{t}".format(t=type(record).__name__))
-        super(ReferenceSetMetadata, self).__init__(record)
+        super(ContigSetMetadata, self).__init__(record)
 
     def merge(self, other):
         super(self.__class__, self).merge(other)
@@ -909,37 +1008,48 @@ class ReferenceSetMetadata(DataSetMetadata):
 
     @property
     def organism(self):
-        return self.getMemberV('Organism')
+        try:
+            return self.getMemberV('Organism')
+        except ValueError:
+            return None
+
+    @organism.setter
+    def organism(self, value):
+        self.setMemberV('Organism', value)
 
     @property
     def ploidy(self):
-        return self.getMemberV('Ploidy')
+        try:
+            return self.getMemberV('Ploidy')
+        except ValueError:
+            return None
+
+    @ploidy.setter
+    def ploidy(self, value):
+        self.setMemberV('Ploidy', value)
 
     @property
     def contigs(self):
+        # return None so that the absence of contigs is adequately conveyed.
+        if not list(self.findChildren('Contigs')):
+            return None
         return ContigsMetadata(self.getV('children', 'Contigs'))
 
+    @contigs.setter
+    def contigs(self, value):
+        if not value:
+            self.removeChildren('Contigs')
+            self.append(ContigsMetadata())
 
-class ContigSetMetadata(DataSetMetadata):
-    """The DataSetMetadata subtype specific to ContigSets."""
-
-
-    def __init__(self, record=None):
-        if record:
-            if (not isinstance(record, dict) and
-                    not isinstance(record, ContigMetadata) and
-                    type(record).__name__ != 'DataSetMetadata'):
-                raise TypeError("Cannot create ContigSetMetadata from "
-                                "{t}".format(t=type(record).__name__))
-        super(ContigSetMetadata, self).__init__(record)
-
-    def merge(self, other):
-        super(self.__class__, self).merge(other)
-        self.contigs.merge(other.contigs)
-
-    @property
-    def contigs(self):
-        return ContigsMetadata(self.getV('children', 'Contigs'))
+    def addContig(self, newContig):
+        if not self.contigs:
+            self.contigs = []
+        tmp = ContigMetadata()
+        tmp.name = newContig.id if newContig.id else ''
+        tmp.description = newContig.comment if newContig.comment else ''
+        tmp.digest = 'DEPRECATED'
+        tmp.length = len(newContig)
+        self.contigs.append(tmp)
 
 
 class BarcodeSetMetadata(DataSetMetadata):
@@ -962,6 +1072,9 @@ class BarcodeSetMetadata(DataSetMetadata):
 
 class ContigsMetadata(RecordWrapper):
 
+    def __init__(self, record=None):
+        super(self.__class__, self).__init__(record)
+        self.record['tag'] = 'Contigs'
 
     def __getitem__(self, index):
         return ContigMetadata(self.record['children'][index])
@@ -976,14 +1089,25 @@ class ContigsMetadata(RecordWrapper):
 
 class ContigMetadata(RecordWrapper):
 
+    def __init__(self, record=None):
+        super(self.__class__, self).__init__(record)
+        self.record['tag'] = 'Contig'
 
     @property
     def digest(self):
         return self.getV('attrib', 'Digest')
 
+    @digest.setter
+    def digest(self, value):
+        return self.setV(value, 'attrib', 'Digest')
+
     @property
     def length(self):
-        return self.getV('attrib', 'Length')
+        return int(self.getV('attrib', 'Length'))
+
+    @length.setter
+    def length(self, value):
+        return self.setV(str(value), 'attrib', 'Length')
 
 
 class CollectionsMetadata(RecordWrapper):
@@ -1395,7 +1519,9 @@ class DiscreteDistribution(RecordWrapper):
 
     @property
     def labels(self):
-        return [child.metavalue for child in self.findChildren('BinLabel')]
+        binLabels = RecordWrapper(self.getV('children', 'BinLabels'))
+        return [child.metavalue
+                for child in binLabels.findChildren('BinLabel')]
 
     @property
     def description(self):
@@ -1527,9 +1653,9 @@ class BioSamplesMetadata(RecordWrapper):
     """The metadata for the list of BioSamples
 
         Doctest:
-            >>> from pbcore.io import DataSet
+            >>> from pbcore.io import SubreadSet
             >>> import pbcore.data.datasets as data
-            >>> ds = DataSet(data.getSubreadSet())
+            >>> ds = SubreadSet(data.getSubreadSet())
             >>> ds.metadata.bioSamples[0].name
             'consectetur purus'
             >>> for bs in ds.metadata.bioSamples:
diff --git a/pbcore/io/dataset/DataSetReader.py b/pbcore/io/dataset/DataSetReader.py
index 22b3b39..1031935 100755
--- a/pbcore/io/dataset/DataSetReader.py
+++ b/pbcore/io/dataset/DataSetReader.py
@@ -5,53 +5,45 @@ import functools
 import xml.etree.ElementTree as ET
 import logging
 from urlparse import urlparse
-import DataSetIO
 from pbcore.io.dataset.DataSetMembers import (ExternalResource,
                                               ExternalResources,
-                                              DataSetMetadata, RecordWrapper,
-                                              Filters, AutomationParameters)
+                                              DataSetMetadata,
+                                              Filters, AutomationParameters,
+                                              StatsMetadata)
 from pbcore.io.dataset.DataSetWriter import _eleFromDictList
 
 XMLNS = "http://pacificbiosciences.com/PacBioDataModel.xsd"
 
-__all__ = ['parseFiles']
-
 log = logging.getLogger(__name__)
 
-def parseFiles(filenames):
-    """Open files with a helper function, feed any available members into a
-    dictionary to be collected by the DataSet object
+def resolveLocation(fname, possibleRelStart='.'):
+    """Find the absolute path of a file that exists relative to '.' or
+    possibleRelStart."""
+    if os.path.exists(fname):
+        return os.path.abspath(fname)
+    if os.path.exists(possibleRelStart):
+        if os.path.exists(os.path.join(possibleRelStart, fname)):
+            return os.path.abspath(os.path.join(possibleRelStart, fname))
+    log.error("Including unresolved file: {f}".format(f=fname))
+    return fname
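
resolveLocation prefers a path that exists as given, then one relative to
the supplied starting directory, and otherwise logs an error and returns the
name unresolved; a short usage sketch (paths hypothetical):

    from pbcore.io.dataset.DataSetReader import resolveLocation

    # a ResourceId stored relative to the directory of its dataset XML
    print(resolveLocation('movie.subreads.bam',
                          possibleRelStart='/data/run_01'))
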
 
-    Args:
-        filenames: a list of filenames to parse
-    Returns:
-        A list of dictionaries of what will be copied into DataSet members
-    Doctest:
-        >>> from pbcore.io import DataSet
-        >>> import pbcore.data.datasets as data
-        >>> inBam = data.getBam()
-        >>> ds = parseFiles([inBam])
-        >>> type(ds).__name__
-        'DataSet'
-    """
-    dataSetRecords = []
-    # Create a DataSet object for each filename
+def populateDataSet(dset, filenames):
     for filename in filenames:
-        dataSetRecords.append(_parseFile(filename))
-    tbr = reduce(lambda x, y: x + y, dataSetRecords)
-    log.debug("Done parsing files")
-    return tbr
+        _addFile(dset, filename)
+    dset._populateMetaTypes()
 
-def _parseFile(filename):
-    """Opens a single filename, returns a list of one or more member
-    dictionaries (more than one in the case of XML files or malformed
-    concatenated XML
-    files)."""
-    handledTypes = {'xml': _openXmlFile,
-                    'bam': _openBamFile,
-                    'fofn': _openFofnFile,
-                    '': _openUnknownFile,
-                    'file': _openUnknownFile}
+def xmlRootType(fname):
+    with open(fname, 'rb') as xml_file:
+        tree = ET.parse(xml_file)
+    root = tree.getroot()
+    return _tagCleaner(root.tag)
+
+def _addFile(dset, filename):
+    handledTypes = {'xml': _addXmlFile,
+                    'bam': _addGenericFile,
+                    'fofn': _addFofnFile,
+                    '': _addUnknownFile,
+                    'file': _addUnknownFile}
     url = urlparse(filename)
     fileType = url.scheme
     fileLocation = url.path.strip()
@@ -59,13 +51,17 @@ def _parseFile(filename):
         fileLocation = url.netloc
     elif os.path.exists(fileLocation):
         fileLocation = os.path.abspath(fileLocation)
-    else:
-        log.error("{f} not found".format(f=fileLocation))
-    dataSetRecord = handledTypes[fileType](fileLocation)
-    dataSetRecord.makePathsAbsolute(curStart=os.path.dirname(fileLocation))
-    return dataSetRecord
+    handledTypes[fileType](dset, fileLocation)
 
-def _openFofnFile(path):
+def _addXmlFile(dset, path):
+    with open(path, 'rb') as xml_file:
+        tree = ET.parse(xml_file)
+    root = tree.getroot()
+    tmp = _parseXml(dset, root)
+    tmp.makePathsAbsolute(curStart=os.path.dirname(path))
+    dset.merge(tmp, copyOnMerge=False)
+
+def _addFofnFile(dset, path):
     """Open a fofn file by calling parseFiles on the new filename list"""
     with open(path, 'r') as fofnFile:
         files = []
@@ -75,71 +71,47 @@ def _openFofnFile(path):
                 files.append(tmp)
             else:
                 files.append(os.path.join(os.path.dirname(path), infn))
-        return parseFiles(files)
+        populateDataSet(dset, files)
 
-def _openUnknownFile(path):
+def _addUnknownFile(dset, path):
     """Open non-uri files
     """
     if path.endswith('xml'):
-        return _openXmlFile(path)
+        return _addXmlFile(dset, path)
     elif path.endswith('bam'):
-        return _openBamFile(path)
+        return _addGenericFile(dset, path)
     elif path.endswith('fofn'):
-        return _openFofnFile(path)
+        return _addFofnFile(dset, path)
     else:
-        return _openGenericFile(path)
+        return _addGenericFile(dset, path)
 
-def _openGenericFile(path):
+def _addGenericFile(dset, path):
     """Create and populate an Element object, put it in an available members
     dictionary, return"""
-    extRes = ExternalResource()
-    extRes.resourceId = path
-    # Perhaps this should be in its own _openFastaFile function. Or
-    # _openBamFile should be rolled into this...
-    possible_indices = ['.fai']
-    extRes.addIndices([path + ext for ext in possible_indices if
-                       os.path.exists(path + ext)])
+    extRes = wrapNewResource(path)
     extRess = ExternalResources()
     extRess.append(extRes)
-    tbr = DataSetIO.DataSet()
-    tbr.addExternalResources(extRess)
-    tbr.newUuid()
-    return tbr
+    # We'll update them all at the end, skip updating counts for now
+    dset.addExternalResources(extRess, updateCount=False)
 
-def _openBamFile(path):
-    """Create and populate an Element object, put it in an available members
-    dictionary, return"""
+def wrapNewResource(path):
+    possible_indices = ['.fai', '.pbi', '.bai', '.metadata.xml']
+    for ext in possible_indices:
+        if path.endswith(ext):
+            log.debug('Index file {f} given as regular file, will be treated '
+                      'as an index file instead'.format(f=path))
+            return
     extRes = ExternalResource()
+    path = resolveLocation(path)
     extRes.resourceId = path
-    possible_indices = ['.pbi', '.bai', '.metadata.xml']
-    extRes.addIndices([path + ext for ext in possible_indices if
-                       os.path.exists(path + ext)])
-    extRess = ExternalResources()
-    extRess.append(extRes)
-    tbr = DataSetIO.DataSet()
-    tbr.addExternalResources(extRess)
-    tbr.newUuid()
-    return tbr
+    index_files = [path + ext for ext in possible_indices if
+                   os.path.exists(path + ext)]
+    if index_files:
+        extRes.addIndices(index_files)
+    return extRes
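
wrapNewResource also attaches any companion index files it finds sitting
next to the resource; a hedged usage sketch (paths hypothetical):

    from pbcore.io.dataset.DataSetReader import wrapNewResource

    # picks up movie.subreads.bam.pbi / .bai when they sit beside the BAM
    res = wrapNewResource('/data/run_01/movie.subreads.bam')
    print(res.resourceId)
    print([index.resourceId for index in res.indices])
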
 
-def _openXmlFile(path):
-    """Open the XML file, extract information, create and populate Element
-    objects, put them in an available members dictionary, return
 
-    Doctest:
-        >>> import pbcore.data.datasets as data
-        >>> import xml.etree.ElementTree as ET
-        >>> dsr = _openXmlFile(data.getXml(8).split(':')[1])
-        >>> dsr.externalResources != None
-        True
-        >>> dsr.filters != None
-        True
-    """
-    with open(path, 'rb') as xml_file:
-        tree = ET.parse(xml_file)
-    root = tree.getroot()
-    return _parseXmlDataSet(root)
-
-def _parseXmlDataSet(element):
+def _parseXml(dset, element):
     """Parse an XML DataSet tag, or the root tag of a DataSet XML file (they
     should be equivalent)
 
@@ -149,20 +121,7 @@ def _parseXmlDataSet(element):
         >>> type(ds).__name__
         'SubreadSet'
     """
-    dsTypeMap = {'DataSet': DataSetIO.DataSet,
-                 'SubreadSet': DataSetIO.SubreadSet,
-                 'HdfSubreadSet': DataSetIO.HdfSubreadSet,
-                 'ConsensusReadSet': DataSetIO.ConsensusReadSet,
-                 'AlignmentSet': DataSetIO.AlignmentSet,
-                 'ReferenceSet': DataSetIO.ReferenceSet,
-                 'ContigSet': DataSetIO.ContigSet,
-                 'BarcodeSet': DataSetIO.BarcodeSet}
-    try:
-        result = dsTypeMap[_tagCleaner(element.tag)]()
-    except KeyError:
-        # Fall back to the base type (from which the others are formed) for
-        # unkown DataSet types
-        result = dsTypeMap['DataSet']()
+    result = type(dset)()
     result.objMetadata = element.attrib
     namer = functools.partial(_namespaceTag, XMLNS)
     element = _updateDataset(element)
@@ -170,7 +129,7 @@ def _parseXmlDataSet(element):
         if child.tag == namer('ExternalResources'):
             result.externalResources = _parseXmlExtResources(child)
         elif child.tag == namer('DataSets'):
-            result.subdatasets = _parseXmlDataSets(child)
+            result.subdatasets = _parseXmls(dset, child)
         elif child.tag == namer('Filters'):
             result.filters = _parseXmlFilters(child)
         elif child.tag == namer('DataSetMetadata'):
@@ -180,6 +139,14 @@ def _parseXmlDataSet(element):
             pass
     return result
 
+def _parseXmls(dset, element):
+    """DataSets can exist as elements in other datasets, representing subsets.
+    Pull these datasets, parse them, and return a list of them."""
+    result = []
+    for child in element:
+        result.append(_parseXml(dset, child))
+    return result
+
 def _updateDataset(element):
     namer = functools.partial(_namespaceTag, XMLNS)
     updateMap = {
@@ -210,6 +177,51 @@ def _updateDataset(element):
         autonele.append(_eleFromDictList(newParams.record))
     return element
 
+def _updateStats(element):
+    namer = functools.partial(
+        _namespaceTag,
+        "http://pacificbiosciences.com/PipelineStats/PipeStats.xsd")
+    binCounts = [
+        './/' + namer('MedianInsertDist') + '/' + namer('BinCount'),
+        './/' + namer('ProdDist') + '/' + namer('BinCount'),
+        './/' + namer('ReadTypeDist') + '/' + namer('BinCount'),
+        './/' + namer('ReadLenDist') + '/' + namer('BinCount'),
+        './/' + namer('ReadQualDist') + '/' + namer('BinCount'),
+        './/' + namer('InsertReadQualDist') + '/' + namer('BinCount'),
+        './/' + namer('InsertReadLenDist') + '/' + namer('BinCount'),
+        './/' + namer('ControlReadQualDist') + '/' + namer('BinCount'),
+        './/' + namer('ControlReadLenDist') + '/' + namer('BinCount'),
+        ]
+    binLabels = [
+        './/' + namer('ProdDist') + '/' + namer('BinLabel'),
+        './/' + namer('ReadTypeDist') + '/' + namer('BinLabel'),
+        ]
+    for tag in binCounts:
+        if element.findall(tag):
+            log.error("Outdated stats XML received")
+            finds = element.findall(tag)
+            parent = tag.split('Dist')[0] + 'Dist'
+            parent = element.find(parent)
+            for sub_ele in finds:
+                parent.remove(sub_ele)
+            binCounts = ET.Element(namer('BinCounts'))
+            for sub_ele in finds:
+                binCounts.append(sub_ele)
+            parent.append(binCounts)
+    for tag in binLabels:
+        if element.findall(tag):
+            log.error("Outdated stats XML received")
+            finds = element.findall(tag)
+            parent = tag.split('Dist')[0] + 'Dist'
+            parent = element.find(parent)
+            for sub_ele in finds:
+                parent.remove(sub_ele)
+            binCounts = ET.Element(namer('BinLabels'))
+            for sub_ele in finds:
+                binCounts.append(sub_ele)
+            parent.append(binCounts)
+    return element
+
 def _namespaceTag(xmlns, tagName):
     """Preface an XML tag name with the provided namespace"""
     return ''.join(["{", xmlns, "}", tagName])
@@ -246,14 +258,6 @@ def _eleToDictList(element):
     return {'tag': tag, 'text': text, 'attrib': attrib,
             'children': children}
 
-def _parseXmlDataSets(element):
-    """DataSets can exist as elements in other datasets, representing subsets.
-    Pull these datasets, parse them, and return a list of them."""
-    result = []
-    for child in element:
-        result.append(_parseXmlDataSet(child))
-    return result
-
 def _parseXmlFilters(element):
     """Pull filters from XML file, put them in a list of dictionaries, where
     each dictionary contains a representation of a Filter tag: key, value pairs
@@ -292,7 +296,8 @@ def parseStats(filename):
         fileLocation = url.netloc
     tree = ET.parse(fileLocation)
     root = tree.getroot()
-    stats = RecordWrapper(_eleToDictList(root))
+    root = _updateStats(root)
+    stats = StatsMetadata(_eleToDictList(root))
     stats.record['tag'] = 'SummaryStats'
     whitelist = ['ShortInsertFraction', 'AdapterDimerFraction',
                  'MedianInsertDist', 'ProdDist', 'ReadTypeDist',
diff --git a/pbcore/io/dataset/DataSetValidator.py b/pbcore/io/dataset/DataSetValidator.py
index 20f12d0..58441b3 100755
--- a/pbcore/io/dataset/DataSetValidator.py
+++ b/pbcore/io/dataset/DataSetValidator.py
@@ -4,10 +4,21 @@ import os
 import re
 from urlparse import urlparse
 import xml.etree.ElementTree as ET
+import logging
 
 XMLNS = "http://pacificbiosciences.com/PacBioDataModel.xsd"
 
-def validateResources(xmlroot, relTo=False):
+log = logging.getLogger(__name__)
+
+def validateResources(xmlroot, relTo='.'):
+    """Validate the resources in an XML file.
+
+    Args:
+        xmlroot: The ET root of an xml tree
+        relTo: ('.') The path relative to which resources may reside. This will
+               work poorly if relTo is not set to the location of the incoming
+               XML file.
+    """
     stack = [xmlroot]
     while stack:
         element = stack.pop()
@@ -21,30 +32,52 @@ def validateResources(xmlroot, relTo=False):
                                                    rfn)):
                     raise IOError, "{f} not found".format(f=rfn)
 
-def validateXml(xmlroot):
+def validateLxml(xml_fn, xsd_fn):
+    try:
+        from lxml import etree
+        schema = etree.XMLSchema(etree.parse(xsd_fn))
+        xml_file = etree.parse(xml_fn)
+        if not schema.validate(xml_file):
+            print schema.error_log
+    except ImportError:
+        log.debug('lxml not found, validation disabled')
+
+def validateMiniXsv(xml_fn, xsd_fn):
+    try:
+        from minixsv import pyxsval
+        pyxsval.parseAndValidate(xml_fn, xsd_fn)
+    except ImportError:
+        log.debug('minixsv not found, validation disabled')
+
+def validateXml(xmlroot, skipResources=False):
     # pyxb precompiles and therefore does not need the original xsd file.
     #if not os.path.isfile(XSD):
         #raise SystemExit, "Validation xsd {s} not found".format(s=XSD)
 
-    validateResources(xmlroot)
+    if not skipResources: # XXX generally a bad idea, but useful for pbvalidate
+        validateResources(xmlroot)
 
     # Conceal the first characters of UniqueIds if they are legal numbers that
     # would for some odd reason be considered invalid. Let all illegal
     # characters fall through to the validator.
-    from pbcore.io.dataset import DataSetXsd
-    fixedString = re.sub('UniqueId="[0-9]', 'UniqueId="f',
-                         ET.tostring(xmlroot))
-    DataSetXsd.CreateFromDocument(fixedString)
+    try:
+        from pbcore.io.dataset import DataSetXsd
+        log.debug('Validating with PyXb')
+        fixedString = re.sub('UniqueId="[0-9]', 'UniqueId="f',
+                             ET.tostring(xmlroot))
+        DataSetXsd.CreateFromDocument(fixedString)
+    except ImportError:
+        log.debug('PyXb not found, validation disabled')
 
-def validateFile(xmlfn):
+def validateFile(xmlfn, skipResources=False):
     if ':' in xmlfn:
         xmlfn = urlparse(xmlfn).path.strip()
     with open(xmlfn, 'r') as xmlfile:
         #root = etree.parse(xmlfile)
         root = ET.parse(xmlfile).getroot()
-        return validateXml(root)
+        return validateXml(root, skipResources=skipResources)
 
-def validateString(xmlstring, relTo=None):
+def validateString(xmlstring, relTo='.'):
     #root = etree.parse(xmlfile)
     root = ET.fromstring(xmlstring)
     validateResources(root, relTo=relTo)
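
With these changes, validation degrades gracefully when the optional backends
(lxml, minixsv, PyXb) are not installed, and the resource-existence check can
be skipped. A hedged usage sketch based on the signatures above; the file
names are placeholders:

    from pbcore.io.dataset.DataSetValidator import validateFile, validateLxml

    # Schema-validate a DataSet XML but skip checking that the referenced
    # BAM/FASTA resources exist on disk:
    validateFile('example.alignmentset.xml', skipResources=True)

    # Validate against an explicit XSD with lxml; if lxml is missing, the
    # call logs a debug message and returns without validating:
    validateLxml('example.alignmentset.xml', 'PacBioDataModel.xsd')
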
diff --git a/pbcore/io/dataset/DataSetWriter.py b/pbcore/io/dataset/DataSetWriter.py
index 160fb70..85f7bb8 100755
--- a/pbcore/io/dataset/DataSetWriter.py
+++ b/pbcore/io/dataset/DataSetWriter.py
@@ -5,7 +5,7 @@ import xml.etree.ElementTree as ET
 
 XMLNS = "http://pacificbiosciences.com/PacBioDataModel.xsd"
 # Not actually sure what this should be:
-XMLVERSION = '2.3.0'
+XMLVERSION = '3.0.0'
 
 __all__ = ['toXml']
 
@@ -41,8 +41,6 @@ def _toElementTree(dataSet, root=None, core=False):
     # 'if not root:' would conflict with testing root for children
     if root is None:
         rootType = type(dataSet).__name__
-        #rootType = dataSet.metadata.get(
-            #'MetaType', 'PacBio.DataSet.SubreadSet').split('.')[2]
         attribs = dataSet.objMetadata
         if core:
             attribs = _coreClean(attribs)
@@ -113,15 +111,11 @@ def _addDataSetMetadataElement(dataSet, root):
     """
     if dataSet.metadata:
         dsmd = ET.SubElement(root, 'DataSetMetadata')
-        #for key, value in dataSet.datasetMetadata.items():
-        #for key, value in dataSet.metadata.record.items():
         for child in dataSet.metadata.record['children']:
-            #dsmd.append(_eleFromEleDict(key, value))
-            #if isinstance(value, (int, float, str)):
-                #ET.SubElement(dsmd, key).text = value
-            #elif isinstance(value, dict):
-                #dsmd.append(_eleFromDict(key, value))
             dsmd.append(_eleFromDictList(child))
+        #tl = dsmd.find('TotalLength')
+        #tl = dsmd.remove(tl)
+        #dsmd.insert(0, tl)
 
 def _eleFromDict(tag, eleAsDict):
     """A last ditch capture method for uknown Elements"""
@@ -154,11 +148,6 @@ def _addFiltersElement(dataset, root, core=False):
         filters = ET.SubElement(root, 'Filters')
         for child in dataset.filters.record['children']:
             filters.append(_eleFromDictList(child, core))
-        #for filt in dataset.filters:
-            #filtElement = ET.SubElement(filters, 'Filter')
-            #parameters = ET.SubElement(filtElement, 'Parameters')
-            #for key, value in filt.items():
-                #ET.SubElement(parameters, 'Parameter', Name=key, Value=value)
 
 def _addDataSetsElement(dataset, root):
     """Add DataSet Elements to root, which essentially nests ElementTrees.
diff --git a/pbcore/io/dataset/EntryPoints.py b/pbcore/io/dataset/EntryPoints.py
index d212288..fcc2e1f 100755
--- a/pbcore/io/dataset/EntryPoints.py
+++ b/pbcore/io/dataset/EntryPoints.py
@@ -2,17 +2,16 @@
 
 import os
 import argparse
-from pbcore.io import DataSet
+from pbcore.io import DataSet, ContigSet
 from pbcore.io.dataset.DataSetValidator import validateFile
 import logging
 
 log = logging.getLogger(__name__)
 
 def createXml(args):
-    dset = DataSet(*args.infile)
+    dsTypes = DataSet.castableTypes()
+    dset = dsTypes[args.dsType](*args.infile)
     log.debug("Dataset created")
-    dset = dset.copy(asType=args.dsType)
-    log.debug("Dataset cast")
     dset.write(args.outfile, validate=args.novalidate, relPaths=args.relative)
     log.debug("Dataset written")
 
@@ -34,10 +33,10 @@ def create_options(parser):
     parser.set_defaults(func=createXml)
 
 def filterXml(args):
-    log.error("Filtering is temporarily out of order")
+    log.error("Adding filters via CLI is temporarily out of order")
     exit(1)
     if args.infile.endswith('xml'):
-        dataSet = DataSet(args.infile)
+        dataSet = DataSet(args.infile, strict=args.strict)
         filters = []
         separators = ['<=', '>=', '!=', '==', '>', '<', '=']
         for filt in args.filters:
@@ -66,13 +65,16 @@ def filter_options(parser):
 
 def splitXml(args):
     log.debug("Starting split")
-    dataSet = DataSet(args.infile)
+    dataSet = DataSet(args.infile, strict=args.strict)
     chunks = len(args.outfiles)
     if args.chunks:
         chunks = args.chunks
     dss = dataSet.split(chunks=chunks,
                         ignoreSubDatasets=(not args.subdatasets),
-                        contigs=args.contigs)
+                        contigs=args.contigs,
+                        maxChunks=args.maxChunks,
+                        breakContigs=args.breakContigs)
+    log.debug("Split into {i} chunks".format(i=len(dss)))
     infix = 'chunk{i}'
     if args.contigs:
         infix += 'contigs'
@@ -88,9 +90,18 @@ def splitXml(args):
             args.outfiles = [os.path.join(args.outdir,
                                           os.path.basename(outfn))
                              for outfn in args.outfiles]
-    log.debug("Finished splitting")
+            num = len(dss)
+            end = ''
+            if num > 5:
+                num = 5
+                end = '...'
+            log.debug("Emitting {f} {e}".format(
+                f=', '.join(args.outfiles[:num]),
+                e=end))
+    log.debug("Finished splitting, now writing")
     for out_fn, dset in zip(args.outfiles, dss):
         dset.write(out_fn)
+    log.debug("Done writing files")
 
 def split_options(parser):
     parser.description = "Split the dataset"
@@ -100,6 +111,10 @@ def split_options(parser):
                         help="Split on contigs")
     parser.add_argument("--chunks", default=False, type=int,
                         help="Split contigs into <chunks> total windows")
+    parser.add_argument("--maxChunks", default=False, type=int,
+                        help="Split contig list into at most <chunks> groups")
+    parser.add_argument("--breakContigs", default=False, action='store_true',
+                        help="Break contigs to get closer to maxCounts")
     parser.add_argument("--subdatasets", default=False, action='store_true',
                         help="Split on subdatasets")
     parser.add_argument("--outdir", default=False, type=str,
@@ -111,7 +126,7 @@ def split_options(parser):
 def mergeXml(args):
     dss = []
     for infn in args.infiles:
-        dss.append(DataSet(infn))
+        dss.append(DataSet(infn, strict=args.strict))
     reduce(lambda ds1, ds2: ds1 + ds2, dss).write(args.outfile)
 
 def merge_options(parser):
@@ -124,7 +139,7 @@ def merge_options(parser):
     parser.set_defaults(func=mergeXml)
 
 def loadStatsXml(args):
-    dset = DataSet(args.infile)
+    dset = DataSet(args.infile, strict=args.strict)
     dset.loadStats(args.statsfile)
     if args.outfile:
         dset.write(args.outfile, validate=False)
@@ -142,29 +157,33 @@ def loadStatsXml_options(parser):
     parser.set_defaults(func=loadStatsXml)
 
 def validateXml(args):
-    validateFile(args.infile)
+    validateFile(args.infile, args.skipFiles)
     print("{f} is valid DataSet XML with valid ResourceId "
           "references".format(f=args.infile))
 
 def validate_options(parser):
     parser.description = 'Validate XML and ResourceId files'
-    #parser.add_argument("infile", type=validate_file,
     parser.add_argument("infile", type=str,
                         help="The XML file to validate")
+    parser.add_argument("--skipFiles",
+                        default=False, action='store_true',
+                        help="Skip validating external resources")
     parser.set_defaults(func=validateXml)
 
 def consolidateXml(args):
     """Combine BAMs and apply the filters described in the XML file, producing
     one consolidated XML"""
-    raise NotImplementedError(
-        "Consolidation requires sequence file manipulation, which may never "
-        "be in the scope of this API")
+    dset = ContigSet(args.infile)
+    dset.consolidate(args.datafile)
+    dset.write(args.xmlfile)
 
 def consolidate_options(parser):
     parser.description = 'Consolidate the XML files'
     #parser.add_argument("infile", type=validate_file,
     parser.add_argument("infile", type=str,
                         help="The XML file to consolidate")
-    parser.add_argument("outfile", type=str,
+    parser.add_argument("datafile", type=str,
+                        help="The resulting data file")
+    parser.add_argument("xmlfile", type=str,
                         help="The resulting XML file")
     parser.set_defaults(func=consolidateXml)
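
createXml now looks the constructor up by dataset type instead of building a
generic DataSet and casting it afterwards. A sketch of the same pattern
outside the CLI; the type name and file names are placeholders, not taken
from the commit:

    from pbcore.io import DataSet

    # presumably maps type names ('SubreadSet', ...) to DataSet subclasses
    dsTypes = DataSet.castableTypes()
    dset = dsTypes['SubreadSet']('movie1.subreads.bam', 'movie2.subreads.bam')
    dset.write('combined.subreadset.xml', validate=True, relPaths=True)
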
diff --git a/pbcore/util/ToolRunner.py b/pbcore/util/ToolRunner.py
index 37ba71e..66dd07f 100644
--- a/pbcore/util/ToolRunner.py
+++ b/pbcore/util/ToolRunner.py
@@ -56,6 +56,9 @@ class PBToolRunner(object):
     #
     def __init__(self, description):
         self._setupParsers(description)
+        self._addStandardArguments()
+
+    def _addStandardArguments(self):
         self.parser.add_argument(
             "--verbose", "-v",
             dest="verbosity", action="count",
diff --git a/tests/test_pbdataset.py b/tests/test_pbdataset.py
index 4803e9a..9f3739d 100644
--- a/tests/test_pbdataset.py
+++ b/tests/test_pbdataset.py
@@ -9,9 +9,13 @@ import unittest
 from unittest.case import SkipTest
 
 from pbcore.io import openIndexedAlignmentFile
-from pbcore.io import DataSet, SubreadSet, ReferenceSet, AlignmentSet
+from pbcore.io import (DataSet, SubreadSet, ReferenceSet, AlignmentSet,
+                       openDataSet, DataSetMetaTypes)
+from pbcore.io.dataset.DataSetIO import _dsIdToSuffix
 from pbcore.io.dataset.DataSetMembers import ExternalResource, Filters
+from pbcore.io.dataset.DataSetWriter import toXml
 from pbcore.io.dataset.DataSetValidator import validateFile
+from pbcore.util.Process import backticks
 import pbcore.data.datasets as data
 
 log = logging.getLogger(__name__)
@@ -45,10 +49,7 @@ class TestDataSet(unittest.TestCase):
         self.assertEquals(ds1.numExternalResources, 2)
         ds1 = DataSet(data.getFofn())
         self.assertEquals(ds1.numExternalResources, 2)
-        # DataSet types are autodetected:
-        self.assertEquals(type(DataSet(data.getSubreadSet())).__name__,
-                          'SubreadSet')
-        # But can also be used directly
+        # New! Use the correct constructor:
         self.assertEquals(type(SubreadSet(data.getSubreadSet())).__name__,
                           'SubreadSet')
         # Even with untyped inputs
@@ -77,13 +78,81 @@ class TestDataSet(unittest.TestCase):
             ds.externalResources[-1].indices[0].resourceId ==
             "IdontExist.bam.pbi")
 
-
     def test_empty_metatype(self):
         inBam = data.getBam()
         d = DataSet(inBam)
         for extRes in d.externalResources:
             self.assertEqual(extRes.metaType, "")
 
+    def test_nonempty_metatype(self):
+        inBam = data.getBam()
+        d = AlignmentSet(inBam)
+        for extRes in d.externalResources:
+            self.assertEqual(extRes.metaType,
+                             "PacBio.SubreadFile.SubreadBamFile")
+
+    def test_loading_reference(self):
+        log.info('Opening Reference')
+        r = ReferenceSet(data.getRef()).toExternalFiles()[0]
+        log.info('Done Opening Reference')
+        log.info('Opening AlignmentSet')
+        d = AlignmentSet(data.getBam(), referenceFastaFname=r)
+        log.info('Done Opening AlignmentSet')
+        bfile = openIndexedAlignmentFile(data.getBam(),
+                                         referenceFastaFname=r)
+        self.assertTrue(bfile.isReferenceLoaded)
+        for res in d.resourceReaders():
+            self.assertTrue(res.isReferenceLoaded)
+
+        aln = AlignmentSet(data.getBam())
+        aln.addReference(r)
+        for res in aln.resourceReaders():
+            self.assertTrue(res.isReferenceLoaded)
+
+    def test_factory_function(self):
+        bam = data.getBam()
+        aln = data.getXml(8)
+        ref = data.getXml(9)
+        sub = data.getXml(10)
+        inTypes = [bam, aln, ref, sub]
+        expTypes = [DataSet, AlignmentSet, ReferenceSet, SubreadSet]
+        for infn, exp in zip(inTypes, expTypes):
+            # TODO enable this for all when simulated subread files can be
+            # pbi'd
+            if exp in [DataSet, ReferenceSet, AlignmentSet]:
+                ds = openDataSet(infn, strict=True)
+            else:
+                ds = openDataSet(infn)
+            self.assertEqual(type(ds), exp)
+
+
+    def test_dsIdToSuffix(self):
+        suffixes = ['subreadset.xml', 'hdfsubreadset.xml', 'alignmentset.xml',
+                    'barcodeset.xml', 'consensusreadset.xml',
+                    'consensusalignmentset.xml',
+                    'referenceset.xml', 'contigset.xml']
+        for dsId, exp in zip(DataSetMetaTypes.ALL, suffixes):
+            self.assertEqual(_dsIdToSuffix(dsId), exp)
+
+    def test_updateCounts_without_pbi(self):
+        log.info("Testing updateCounts without pbi")
+        data_fname = data.getBam(0)
+        outdir = tempfile.mkdtemp(suffix="dataset-unittest")
+        tempout = os.path.join(outdir, os.path.basename(data_fname))
+        backticks('cp {i} {o}'.format(i=data_fname, o=tempout))
+        aln = AlignmentSet(tempout, strict=False)
+        self.assertEqual(aln.totalLength, -1)
+        self.assertEqual(aln.numRecords, -1)
+
+    def test_attributes(self):
+        aln = AlignmentSet(data.getBam(0))
+        self.assertEqual(aln.sequencingChemistry, ['unknown'])
+        self.assertEqual(aln.isSorted, True)
+        self.assertEqual(aln.isEmpty, False)
+        self.assertEqual(aln.readType, 'standard')
+        self.assertEqual(len(aln.tStart), aln.metadata.numRecords)
+        self.assertEqual(len(aln.tEnd), aln.metadata.numRecords)
+
     def test_updateCounts(self):
         log.info("Testing updateCounts without filters")
         aln = AlignmentSet(data.getBam(0))
@@ -110,6 +179,7 @@ class TestDataSet(unittest.TestCase):
 
         log.info("Testing whether filters are respected")
         aln.filters.addRequirement(rname=[('=', 'E.faecalis.1')])
+        aln.updateCounts()
         accLen = aln.metadata.totalLength
         accNum = aln.metadata.numRecords
 
@@ -235,7 +305,8 @@ class TestDataSet(unittest.TestCase):
                           [ds1d is ds2d for ds1d in
                            ds1.subdatasets for ds2d in
                            ds2.subdatasets])
-        ds1 = DataSet(data.getXml(no=10))
+        # TODO: once simulated files are indexable, turn on strict:
+        ds1 = SubreadSet(data.getXml(no=10), strict=False)
         self.assertEquals(type(ds1.metadata).__name__,
                           'SubreadSetMetadata')
         ds2 = ds1.copy()
@@ -258,13 +329,17 @@ class TestDataSet(unittest.TestCase):
     def test_write(self):
         outdir = tempfile.mkdtemp(suffix="dataset-unittest")
         outfile = os.path.join(outdir, 'tempfile.xml')
-        ds1 = DataSet(data.getBam())
+        ds1 = AlignmentSet(data.getBam())
         ds1.write(outfile)
-        # TODO: turn back on when pyxb present:
-        #validateFile(outfile)
-        ds2 = DataSet(outfile)
+        log.debug('Validating file: {f}'.format(f=outfile))
+        validateFile(outfile)
+        ds2 = AlignmentSet(outfile)
         self.assertTrue(ds1 == ds2)
 
+        # Should fail when strict:
+        ds3 = AlignmentSet(data.getBam())
+        ds3.write(outfile)
+
     def test_addFilters(self):
         ds1 = DataSet()
         filt = Filters()
@@ -296,13 +371,13 @@ class TestDataSet(unittest.TestCase):
         er2.resourceId = "test2.bam"
         er3 = ExternalResource()
         er3.resourceId = "test1.bam"
-        ds.addExternalResources([er1])
+        ds.addExternalResources([er1], updateCount=False)
         self.assertEquals(ds.numExternalResources, 1)
         # different resourceId: succeeds
-        ds.addExternalResources([er2])
+        ds.addExternalResources([er2], updateCount=False)
         self.assertEquals(ds.numExternalResources, 2)
         # same resourceId: fails
-        ds.addExternalResources([er3])
+        ds.addExternalResources([er3], updateCount=False)
         self.assertEquals(ds.numExternalResources, 2)
         for extRef in ds.externalResources:
             self.assertEqual(type(extRef).__name__, "ExternalResource")
@@ -325,7 +400,8 @@ class TestDataSet(unittest.TestCase):
         self.assertTrue(len(list(ds.records)), 112)
 
     def test_toFofn(self):
-        self.assertEquals(DataSet("bam1.bam", "bam2.bam").toFofn(),
+        self.assertEquals(DataSet("bam1.bam", "bam2.bam",
+                                  strict=False).toFofn(),
                           ['bam1.bam', 'bam2.bam'])
         realDS = DataSet(data.getXml(8))
         files = realDS.toFofn()
@@ -338,10 +414,11 @@ class TestDataSet(unittest.TestCase):
         self.assertFalse(os.path.isabs(files[0]))
 
     def test_toExternalFiles(self):
-        bogusDS = DataSet("bam1.bam", "bam2.bam")
+        bogusDS = DataSet("bam1.bam", "bam2.bam", strict=False)
         self.assertEqual(['bam1.bam', 'bam2.bam'],
                          bogusDS.externalResources.resourceIds)
-        self.assertEquals(DataSet("bam1.bam", "bam2.bam").toExternalFiles(),
+        self.assertEquals(DataSet("bam1.bam", "bam2.bam",
+                                  strict=False).toExternalFiles(),
                           ['bam1.bam', 'bam2.bam'])
         realDS = DataSet(data.getXml(8))
         files = realDS.toExternalFiles()
@@ -395,6 +472,16 @@ class TestDataSet(unittest.TestCase):
         reads = ds2.readsInRange("E.faecalis.1", 0, 1400)
         self.assertEqual(len(list(reads)), 20)
 
+        lengths = ds.refLengths
+        for rname, rId in ds.refInfo('ID'):
+            rn = ds._idToRname(rId)
+            self.assertEqual(rname, rn)
+            rlen = lengths[rn]
+            self.assertEqual(len(list(ds.readsInReference(rn))),
+                             len(list(ds.readsInReference(rId))))
+            self.assertEqual(len(list(ds.readsInRange(rn, 0, rlen))),
+                             len(list(ds.readsInRange(rId, 0, rlen))))
+
     def test_filter(self):
         ds2 = DataSet(data.getXml(8))
         ds2.filters.addRequirement(rname=[('=', 'E.faecalis.1')])
@@ -430,6 +517,72 @@ class TestDataSet(unittest.TestCase):
         self.assertEqual(count(ds2.records), 53552)
         self.assertEqual(ds2.countRecords(), 53552)
 
+    def test_split_by_contigs_with_split_and_maxChunks(self):
+        # test to make sure the refWindows work when chunks == # refs
+        ds3 = DataSet(data.getBam())
+        dss = ds3.split(contigs=True)
+        self.assertEqual(len(dss), 12)
+        refWindows = sorted(reduce(lambda x, y: x + y,
+                                   [ds.refWindows for ds in dss]))
+        # not all references have something mapped to them, refWindows doesn't
+        # care...
+        self.assertNotEqual(refWindows, sorted(ds3.refWindows))
+        random_few = [('C.beijerinckii.13', 0, 1433),
+                      ('B.vulgatus.4', 0, 1449),
+                      ('E.faecalis.1', 0, 1482)]
+        for reference in random_few:
+            found = False
+            for ref in refWindows:
+                if ref == reference:
+                    found = True
+            self.assertTrue(found)
+        old_refWindows = refWindows
+
+        dss = ds3.split(contigs=True, maxChunks=1)
+        self.assertEqual(len(dss), 1)
+        refWindows = sorted(reduce(lambda x, y: x + y,
+                                   [ds.refWindows for ds in dss]))
+        self.assertEqual(refWindows, old_refWindows)
+
+        dss = ds3.split(contigs=True, maxChunks=24)
+        # This isn't expected if num refs >= 100, as map check isn't made
+        # for now (too expensive)
+        # There are only 12 refs represented in this set, however...
+        self.assertEqual(len(dss), 12)
+        refWindows = sorted(reduce(lambda x, y: x + y,
+                                   [ds.refWindows for ds in dss]))
+
+        for ref in random_few:
+            found = False
+            for window in refWindows:
+                if ref == window:
+                    found = True
+            if not found:
+                log.debug(ref)
+            self.assertTrue(found)
+
+        dss = ds3.split(contigs=True, maxChunks=36)
+        self.assertEqual(len(dss), 12)
+        refWindows = sorted(reduce(lambda x, y: x + y,
+                                   [ds.refWindows for ds in dss]))
+        for ref in random_few:
+            found = False
+            for window in refWindows:
+                if ref == window:
+                    found = True
+            self.assertTrue(found)
+
+        dss = ds3.split(contigs=True, maxChunks=36, breakContigs=True)
+        self.assertEqual(len(dss), 2)
+        refWindows = sorted(reduce(lambda x, y: x + y,
+                                   [ds.refWindows for ds in dss]))
+        for ref in random_few:
+            found = False
+            for window in refWindows:
+                if ref == window:
+                    found = True
+            self.assertTrue(found)
+
     def test_split_by_contigs_with_split(self):
         # test to make sure the refWindows work when chunks == # refs
         ds3 = DataSet(data.getBam())
@@ -458,6 +611,7 @@ class TestDataSet(unittest.TestCase):
         self.assertEqual(refWindows, old_refWindows)
 
         dss = ds3.split(contigs=True, chunks=24)
+        self.assertEqual(len(dss), 24)
         refWindows = sorted(reduce(lambda x, y: x + y,
                                    [ds.refWindows for ds in dss]))
 
@@ -473,6 +627,7 @@ class TestDataSet(unittest.TestCase):
             self.assertTrue(found)
 
         dss = ds3.split(contigs=True, chunks=36)
+        self.assertEqual(len(dss), 36)
         refWindows = sorted(reduce(lambda x, y: x + y,
                                    [ds.refWindows for ds in dss]))
         random_few = [('E.faecalis.2', 0, 494),
@@ -536,10 +691,10 @@ class TestDataSet(unittest.TestCase):
         log.debug(dss[0].filters)
         log.debug(dss[1].filters)
         self.assertTrue(
-            '( rname = E.faecalis.2 AND tstart > 0 AND tend < 1482 ) '
+            '( rname = E.faecalis.2 ) '
             in str(dss[0].filters)
             or
-            '( rname = E.faecalis.2 AND tstart > 0 AND tend < 1482 ) '
+            '( rname = E.faecalis.2 ) '
             in str(dss[1].filters))
         ds = DataSet(data.getBam())
         ds.filters.addRequirement(rname=[('=', 'lambda_NEB3011'),
@@ -750,8 +905,6 @@ class TestDataSet(unittest.TestCase):
                                   rep))
         self.assertTrue(re.search('pbalchemysim0.pbalign.bam', rep))
 
-    # TODO Turn back on when up to date stats metadata is available
-    @SkipTest
     def test_stats_metadata(self):
         ds = DataSet(data.getBam())
         ds.loadStats(data.getStats())
@@ -842,3 +995,13 @@ class TestDataSet(unittest.TestCase):
         ds3 = ds1 + ds2
         self.assertEqual(ds3.metadata.summaryStats.readLenDist.bins,
                          [1, 1, 1, 0, 2, 2, 2])
+
+        # now lets test the subdataset metadata retention:
+        ss = SubreadSet(data.getXml(10))
+        ss.loadStats(data.getStats(0))
+        ss.loadStats(data.getStats(1))
+        self.assertEqual(153168.0, ss.metadata.summaryStats.numSequencingZmws)
+        self.assertEqual(
+            2876.0, ss.subdatasets[0].metadata.summaryStats.numSequencingZmws)
+        self.assertEqual(
+            150292.0, ss.subdatasets[1].metadata.summaryStats.numSequencingZmws)
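
The new tests above exercise openDataSet, the type-detecting factory that
replaces the old pattern of constructing a bare DataSet and relying on
autodetection. A short hedged sketch; the file name is a placeholder:

    from pbcore.io import openDataSet, AlignmentSet

    ds = openDataSet('example.alignmentset.xml', strict=True)
    assert isinstance(ds, AlignmentSet)
    print(type(ds).__name__, ds.numRecords, ds.totalLength)
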
diff --git a/tests/test_pbdataset_subtypes.py b/tests/test_pbdataset_subtypes.py
index fb7b383..3344cdc 100644
--- a/tests/test_pbdataset_subtypes.py
+++ b/tests/test_pbdataset_subtypes.py
@@ -1,11 +1,19 @@
 
 import logging
+from urlparse import urlparse
 import unittest
+import tempfile
+import os
 
-#from pbcore.util.Process import backticks
+from pbcore.util.Process import backticks
 from pbcore.io import (DataSet, SubreadSet, ConsensusReadSet,
-                       ReferenceSet, ContigSet, AlignmentSet)
+                       ReferenceSet, ContigSet, AlignmentSet,
+                       FastaReader, FastaWriter, IndexedFastaReader,
+                       HdfSubreadSet)
+import pbcore.data as upstreamData
 import pbcore.data.datasets as data
+from pbcore.io.dataset.DataSetValidator import validateXml
+import xml.etree.ElementTree as ET
 
 log = logging.getLogger(__name__)
 
@@ -15,8 +23,8 @@ class TestDataSet(unittest.TestCase):
 
 
     def test_subread_build(self):
-        ds1 = DataSet(data.getXml(no=5))
-        ds2 = DataSet(data.getXml(no=5))
+        ds1 = SubreadSet(data.getXml(no=5))
+        ds2 = SubreadSet(data.getXml(no=5))
         self.assertEquals(type(ds1).__name__, 'SubreadSet')
         self.assertEquals(ds1._metadata.__class__.__name__,
                           'SubreadSetMetadata')
@@ -31,6 +39,10 @@ class TestDataSet(unittest.TestCase):
         self.assertEquals(type(ds4._metadata).__name__, 'SubreadSetMetadata')
         self.assertEquals(len(ds4.metadata.collections), 1)
 
+    @unittest.skip("XSD can't handle multiple contigs?")
+    def test_valid_referencesets(self):
+        validateXml(ET.parse(data.getXml(9)).getroot(), skipResources=True)
+
     def test_autofilled_metatypes(self):
         ds = ReferenceSet(data.getXml(9))
         for extRes in ds.externalResources:
@@ -88,15 +100,15 @@ class TestDataSet(unittest.TestCase):
             self.assertTrue(ds.get_contig(name))
 
     def test_ccsread_build(self):
-        ds1 = DataSet(data.getXml(2))
+        ds1 = ConsensusReadSet(data.getXml(2), strict=False)
         self.assertEquals(type(ds1).__name__, 'ConsensusReadSet')
         self.assertEquals(type(ds1._metadata).__name__, 'SubreadSetMetadata')
-        ds2 = ConsensusReadSet(data.getXml(2))
+        ds2 = ConsensusReadSet(data.getXml(2), strict=False)
         self.assertEquals(type(ds2).__name__, 'ConsensusReadSet')
         self.assertEquals(type(ds2._metadata).__name__, 'SubreadSetMetadata')
 
     def test_contigset_build(self):
-        ds1 = DataSet(data.getXml(3))
+        ds1 = ContigSet(data.getXml(3))
         self.assertEquals(type(ds1).__name__, 'ContigSet')
         self.assertEquals(type(ds1._metadata).__name__, 'ContigSetMetadata')
         ds2 = ContigSet(data.getXml(3))
@@ -105,4 +117,152 @@ class TestDataSet(unittest.TestCase):
         for contigmd in ds2.metadata.contigs:
             self.assertEquals(type(contigmd).__name__, 'ContigMetadata')
 
+    def test_contigset_consolidate(self):
+        #build set to merge
+        outdir = tempfile.mkdtemp(suffix="dataset-unittest")
+
+        inFas = os.path.join(outdir, 'infile.fasta')
+        outFas1 = os.path.join(outdir, 'tempfile1.fasta')
+        outFas2 = os.path.join(outdir, 'tempfile2.fasta')
+
+        # copy fasta reference to hide fai and ensure FastaReader is used
+        backticks('cp {i} {o}'.format(
+                      i=ReferenceSet(data.getXml(9)).toExternalFiles()[0],
+                      o=inFas))
+        rs1 = ContigSet(inFas)
+
+        singletons = ['A.baumannii.1', 'A.odontolyticus.1']
+        double = 'B.cereus.1'
+        reader = rs1.resourceReaders()[0]
+        exp_double = rs1.get_contig(double)
+        exp_singles = [rs1.get_contig(name) for name in singletons]
+
+        # todo: modify the names first:
+        with FastaWriter(outFas1) as writer:
+            writer.writeRecord(exp_singles[0])
+            writer.writeRecord(exp_double.name + '_10_20', exp_double.sequence)
+        with FastaWriter(outFas2) as writer:
+            writer.writeRecord(exp_double.name + '_0_10',
+                               exp_double.sequence + 'ATCGATCGATCG')
+            writer.writeRecord(exp_singles[1])
+
+        exp_double_seq = ''.join([exp_double.sequence,
+                                  'ATCGATCGATCG',
+                                  exp_double.sequence])
+        exp_single_seqs = [rec.sequence for rec in exp_singles]
+
+        acc_file = ContigSet(outFas1, outFas2)
+        log.debug(acc_file.toExternalFiles())
+        acc_file.consolidate()
+        log.debug(acc_file.toExternalFiles())
+
+        # open acc and compare to exp
+        for name, seq in zip(singletons, exp_single_seqs):
+            self.assertEqual(acc_file.get_contig(name).sequence, seq)
+        self.assertEqual(acc_file.get_contig(double).sequence, exp_double_seq)
+
+
+    def test_split_hdfsubreadset(self):
+        hdfds = HdfSubreadSet(*upstreamData.getBaxH5_v23())
+        self.assertEqual(len(hdfds.toExternalFiles()), 3)
+        hdfdss = hdfds.split(chunks=2, ignoreSubDatasets=True)
+        self.assertEqual(len(hdfdss), 2)
+        self.assertEqual(len(hdfdss[0].toExternalFiles()), 2)
+        self.assertEqual(len(hdfdss[1].toExternalFiles()), 1)
+
+
+    def test_alignment_reference(self):
+        rs1 = ReferenceSet(data.getXml(9))
+        fasta_res = rs1.externalResources[0]
+        fasta_file = urlparse(fasta_res.resourceId).path
+
+        ds1 = AlignmentSet(data.getXml(8),
+            referenceFastaFname=rs1)
+        aln_ref = None
+        for aln in ds1:
+            aln_ref = aln.reference()
+            break
+        self.assertTrue(aln_ref is not None)
+
+        ds1 = AlignmentSet(data.getXml(8),
+            referenceFastaFname=fasta_file)
+        aln_ref = None
+        for aln in ds1:
+            aln_ref = aln.reference()
+            break
+        self.assertTrue(aln_ref is not None)
+
+        ds1 = AlignmentSet(data.getXml(8))
+        ds1.addReference(fasta_file)
+        aln_ref = None
+        for aln in ds1:
+            aln_ref = aln.reference()
+            break
+        self.assertTrue(aln_ref is not None)
+
+    def test_nested_external_resources(self):
+        log.debug("Testing nested externalResources in AlignmentSets")
+        aln = AlignmentSet(data.getXml(0))
+        self.assertTrue(aln.externalResources[0].pbi)
+        self.assertTrue(aln.externalResources[0].reference)
+        self.assertEqual(
+            aln.externalResources[0].externalResources[0].metaType,
+            'PacBio.ReferenceFile.ReferenceFastaFile')
+        self.assertEqual(aln.externalResources[0].scraps, None)
+
+        log.debug("Testing nested externalResources in SubreadSets")
+        subs = SubreadSet(data.getXml(5))
+        self.assertTrue(subs.externalResources[0].scraps)
+        self.assertEqual(
+            subs.externalResources[0].externalResources[0].metaType,
+            'PacBio.SubreadFile.ScrapsBamFile')
+        self.assertEqual(subs.externalResources[0].reference, None)
+
+        log.debug("Testing added nested externalResoruces to SubreadSet")
+        subs = SubreadSet(data.getXml(10))
+        self.assertFalse(subs.externalResources[0].scraps)
+        subs.externalResources[0].scraps = 'fake.fasta'
+        self.assertTrue(subs.externalResources[0].scraps)
+        self.assertEqual(
+            subs.externalResources[0].externalResources[0].metaType,
+            'PacBio.SubreadFile.ScrapsBamFile')
+
+        log.debug("Testing adding nested externalResources to AlignmetnSet "
+                  "manually")
+        aln = AlignmentSet(data.getXml(8))
+        self.assertTrue(aln.externalResources[0].bai)
+        self.assertTrue(aln.externalResources[0].pbi)
+        self.assertFalse(aln.externalResources[0].reference)
+        aln.externalResources[0].reference = 'fake.fasta'
+        self.assertTrue(aln.externalResources[0].reference)
+        self.assertEqual(
+            aln.externalResources[0].externalResources[0].metaType,
+            'PacBio.ReferenceFile.ReferenceFastaFile')
+
+        # Disabling until this feature is considered valuable. At the moment I
+        # think it might cause accidental pollution.
+        #log.debug("Testing adding nested externalResources to AlignmetnSet "
+        #          "on construction")
+        #aln = AlignmentSet(data.getXml(8), referenceFastaFname=data.getXml(9))
+        #self.assertTrue(aln.externalResources[0].bai)
+        #self.assertTrue(aln.externalResources[0].pbi)
+        #self.assertTrue(aln.externalResources[0].reference)
+        #self.assertEqual(
+        #    aln.externalResources[0].externalResources[0].metaType,
+        #    'PacBio.ReferenceFile.ReferenceFastaFile')
+
+        #log.debug("Testing adding nested externalResources to "
+        #          "AlignmentSets with multiple external resources "
+        #          "on construction")
+        #aln = AlignmentSet(data.getXml(12), referenceFastaFname=data.getXml(9))
+        #for i in range(1):
+        #    self.assertTrue(aln.externalResources[i].bai)
+        #    self.assertTrue(aln.externalResources[i].pbi)
+        #    self.assertTrue(aln.externalResources[i].reference)
+        #    self.assertEqual(
+        #        aln.externalResources[i].externalResources[0].metaType,
+        #        'PacBio.ReferenceFile.ReferenceFastaFile')
+
+
+
 

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/python-pbcore.git


