[med-svn] [python-pbcore] 01/03: Imported Upstream version 1.2.4+dfsg

Afif Elghraoui afif-guest at moszumanska.debian.org
Mon Oct 26 08:24:42 UTC 2015


This is an automated email from the git hooks/post-receive script.

afif-guest pushed a commit to branch master
in repository python-pbcore.

commit a8e05e349e793653d6fcaa1a8d466d4fd4271028
Author: Afif Elghraoui <afif at ghraoui.name>
Date:   Mon Oct 26 01:12:01 2015 -0700

    Imported Upstream version 1.2.4+dfsg
---
 CHANGELOG.org                                      |   4 +
 doc/pacbio-theme/static/headerGradient.jpg         | Bin 7099 -> 0 bytes
 doc/pacbio-theme/static/pacbio.css                 | 238 ----------------
 doc/pacbio-theme/static/pacbioLogo.png             | Bin 3128 -> 0 bytes
 doc/pacbio-theme/static/pygments.css               |  55 ----
 doc/pacbio-theme/theme.conf                        |   4 -
 doc/pbcore.io.dataset.rst                          |  95 ++++++-
 pbcore/__init__.py                                 |   2 +-
 pbcore/data/datasets/__init__.py                   |   3 +
 ...00001823085912221377_s1_X0.subreads.fake_bc.bam | Bin 0 -> 192565 bytes
 ...1823085912221377_s1_X0.subreads.fake_bc.bam.bai | Bin 0 -> 16 bytes
 ...1823085912221377_s1_X0.subreads.fake_bc.bam.pbi | Bin 0 -> 1523 bytes
 pbcore/data/datasets/pbalchemysim0.pbalign.bam     | Bin 303745 -> 303947 bytes
 pbcore/data/datasets/pbalchemysim0.pbalign.bam.pbi | Bin 2147 -> 2159 bytes
 pbcore/data/datasets/pbalchemysim0.subreads.bam    | Bin 301012 -> 301048 bytes
 .../data/datasets/pbalchemysim0.subreads.bam.pbi   | Bin 1003 -> 1030 bytes
 pbcore/data/datasets/pbalchemysim1.pbalign.bam     | Bin 287718 -> 287902 bytes
 pbcore/data/datasets/pbalchemysim1.pbalign.bam.pbi | Bin 2028 -> 2082 bytes
 pbcore/data/datasets/pbalchemysim1.subreads.bam    | Bin 284875 -> 284903 bytes
 .../data/datasets/pbalchemysim1.subreads.bam.pbi   | Bin 945 -> 965 bytes
 pbcore/data/empty.ccs.bam                          | Bin 540 -> 507 bytes
 pbcore/data/empty.ccs.bam.pbi                      | Bin 67 -> 67 bytes
 ...100569412550000001823090301191423_s1_p0.ccs.bam | Bin 280793 -> 278396 bytes
 ...00001823085912221377_s1_X0.aligned_subreads.bam | Bin 201867 -> 201960 bytes
 ...1823085912221377_s1_X0.aligned_subreads.bam.bai | Bin 112 -> 112 bytes
 ...1823085912221377_s1_X0.aligned_subreads.bam.pbi | Bin 2521 -> 2609 bytes
 ...564852550000001823085912221377_s1_X0.scraps.bam | Bin 0 -> 608536 bytes
 ...4852550000001823085912221377_s1_X0.subreads.bam | Bin 204413 -> 204543 bytes
 ...550000001823085912221377_s1_X0.subreads.bam.pbi | Bin 1314 -> 1463 bytes
 pbcore/io/BasH5IO.py                               |   9 +
 pbcore/io/FastaIO.py                               |   2 +
 pbcore/io/GffIO.py                                 |  97 +++++--
 pbcore/io/align/BamAlignment.py                    |   4 +-
 pbcore/io/align/BamIO.py                           |  12 +-
 pbcore/io/align/PacBioBamIndex.py                  |  43 ++-
 pbcore/io/dataset/DataSetIO.py                     | 306 ++++++++++++++++-----
 pbcore/io/dataset/DataSetMembers.py                |  73 +++--
 pbcore/io/dataset/DataSetReader.py                 |   1 +
 pbcore/io/dataset/DataSetValidator.py              |  30 +-
 pbcore/io/dataset/DataSetWriter.py                 |  21 +-
 pbcore/io/dataset/EntryPoints.py                   |  38 ++-
 pbcore/io/dataset/_pbds.py                         |   2 +-
 pbcore/io/dataset/utils.py                         |  79 ++++--
 tests/test_pbcore_extract_bam_sequence.py          |   4 +-
 tests/test_pbcore_io_AlnFileReaders.py             |  39 ++-
 tests/test_pbcore_io_GffIO.py                      |  27 +-
 tests/test_pbdataset.py                            | 185 +++++++++++--
 tests/test_pbdataset_subtypes.py                   |  82 +++++-
 48 files changed, 921 insertions(+), 534 deletions(-)

diff --git a/CHANGELOG.org b/CHANGELOG.org
index c00d28e..1a8c09f 100644
--- a/CHANGELOG.org
+++ b/CHANGELOG.org
@@ -1,3 +1,7 @@
+* Version 1.2.4
+- add GFF gather functionality
+- support for 3.0.1 BAM format spec
+
 * Version 1.2.3
 - Updated for newer dataset schema
 - BAM access speed improvements
diff --git a/doc/pacbio-theme/static/headerGradient.jpg b/doc/pacbio-theme/static/headerGradient.jpg
deleted file mode 100644
index 883f147..0000000
Binary files a/doc/pacbio-theme/static/headerGradient.jpg and /dev/null differ
diff --git a/doc/pacbio-theme/static/pacbio.css b/doc/pacbio-theme/static/pacbio.css
deleted file mode 100644
index b4ab87f..0000000
--- a/doc/pacbio-theme/static/pacbio.css
+++ /dev/null
@@ -1,238 +0,0 @@
-/**
- * Sphinx stylesheet -- default theme
- * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- */
- 
-@import url("basic.css");
- 
-/* -- page layout ----------------------------------------------------------- */
- 
-body {
-    font-family: Arial, sans-serif;
-    font-size: 100%;
-    background-color: #555;
-    color: #555;
-    margin: 0;
-    padding: 0;
-    min-width: 500px;
-    max-width: 956px;
-    margin: 0 auto;
-}
-
-div.documentwrapper {
-    float: left;
-    width: 100%;
-}
-
-div.bodywrapper {
-    margin: 0 0 0 230px;
-}
-
-hr{
-    border: 1px solid #B1B4B6;
-    
-}
- 
-div.document {
-    background-color: #eee;
-}
- 
-div.body {
-    background-color: #ffffff;
-    color: #3E4349;
-    padding: 30px 30px 30px 30px;
-    font-size: 0.8em;
-}
- 
-div.footer {
-    color: #555;
-	background-color: #fff;
-    padding: 13px 0;
-    text-align: center;
-    font-size: 75%;
-
-}
-div.footer a {
-    color: #444;
-    text-decoration: underline;
-}
- 
-div.related {
-    background: #fff url(headerGradient.jpg);
-    line-height: 80px;
-    color: #fff;
-    font-size: 0.80em;
-    height: 79px;
-    z-index: -1;
-}
-
-div.related ul {
-    background: url(pacbioLogo.png) 10px no-repeat;
-    padding: 0 0 0 200px;
-}
- 
-div.related a {
-    color: #E2F3CC;
-}
- 
-div.sphinxsidebar {
-    font-size: 0.75em;
-    line-height: 1.5em;
-}
-
-div.sphinxsidebarwrapper{
-    padding: 20px 0;
-}
- 
-div.sphinxsidebar h3,
-div.sphinxsidebar h4 {
-    font-family: Arial, sans-serif;
-    color: #222;
-    font-size: 1.2em;
-    font-weight: bold;
-    margin: 0;
-    padding: 5px 10px 0 10px;
-}
-
-div.sphinxsidebar h4{
-    font-size: 1.1em;
-}
- 
-div.sphinxsidebar h3 a {
-    color: #444;
-}
- 
- 
-div.sphinxsidebar p {
-    color: #888;
-    padding: 0px 20px;
-	margin-top: 5px;
-}
- 
-div.sphinxsidebar p.topless {
-}
- 
-div.sphinxsidebar ul {
-    margin: 5px 20px 10px 20px;
-    padding: 0;
-    color: #000;
-}
- 
-div.sphinxsidebar a {
-    color: #444;
-}
- 
-div.sphinxsidebar input {
-    border: 1px solid #ccc;
-    font-family: sans-serif;
-    font-size: 1em;
-}
-
-div.sphinxsidebar input[type=text]{
-    margin-left: 20px;
-}
- 
-/* -- body styles ----------------------------------------------------------- */
- 
-a {
-    color: #005B81;
-    text-decoration: none;
-}
- 
-a:hover {
-    color: #E32E00;
-    text-decoration: underline;
-}
- 
-div.body h1,
-div.body h2,
-div.body h3,
-div.body h4,
-div.body h5,
-div.body h6 {
-    font-family: Arial, sans-serif;
-    font-weight: bold;
-    color: #264868;
-    margin: 30px 0px 10px 0px;
-    padding: 5px 0 5px 0px;
-}
- 
-div.body h1 { border-top: 20px solid white; margin-top: 0; font-size: 180%; font-weight: normal; }
-div.body h2 { font-size: 125%; }
-div.body h3 { font-size: 110%; }
-div.body h4 { font-size: 100%; }
-div.body h5 { font-size: 100%; }
-div.body h6 { font-size: 100%; }
- 
-a.headerlink {
-    color: #c60f0f;
-    font-size: 0.8em;
-    padding: 0 4px 0 4px;
-    text-decoration: none;
-}
- 
-a.headerlink:hover {
-    background-color: #c60f0f;
-    color: white;
-}
- 
-div.body p, div.body dd, div.body li {
-    line-height: 1.5em;
-    font-size: 1em;
-}
- 
-div.admonition p.admonition-title + p {
-    display: inline;
-}
-
-div.highlight{
-    background-color: white;
-}
-
-div.note {
-    background-color: #eee;
-    border: 1px solid #ccc;
-}
- 
-div.seealso {
-    background-color: #ffc;
-    border: 1px solid #ff6;
-}
- 
-div.topic {
-    background-color: #eee;
-}
- 
-div.warning {
-    background-color: #ffe4e4;
-    border: 1px solid #f66;
-}
- 
-p.admonition-title {
-    display: inline;
-}
- 
-p.admonition-title:after {
-    content: ":";
-}
- 
-pre {
-    padding: 10px;
-    background-color: White;
-    color: #222;
-    line-height: 1.2em;
-    border: 1px solid #C6C9CB;
-    font-size: 1.2em;
-    margin: 1.5em 0 1.5em 0;
-    -webkit-box-shadow: 1px 1px 1px #d8d8d8;
-    -moz-box-shadow: 1px 1px 1px #d8d8d8;
-}
- 
-tt {
-    background-color: #ecf0f3;
-    color: #222;
-    padding: 1px 2px;
-    font-size: 1.2em;
-    font-family: monospace;
-}
-
diff --git a/doc/pacbio-theme/static/pacbioLogo.png b/doc/pacbio-theme/static/pacbioLogo.png
deleted file mode 100644
index b2e4887..0000000
Binary files a/doc/pacbio-theme/static/pacbioLogo.png and /dev/null differ
diff --git a/doc/pacbio-theme/static/pygments.css b/doc/pacbio-theme/static/pygments.css
deleted file mode 100644
index 4588cde..0000000
--- a/doc/pacbio-theme/static/pygments.css
+++ /dev/null
@@ -1,55 +0,0 @@
-.c { color: #999988; font-style: italic } /* Comment */
-.k { font-weight: bold } /* Keyword */
-.o { font-weight: bold } /* Operator */
-.cm { color: #999988; font-style: italic } /* Comment.Multiline */
-.cp { color: #999999; font-weight: bold } /* Comment.preproc */
-.c1 { color: #999988; font-style: italic } /* Comment.Single */
-.gd { color: #000000; background-color: #ffdddd } /* Generic.Deleted */
-.ge { font-style: italic } /* Generic.Emph */
-.gr { color: #aa0000 } /* Generic.Error */
-.gh { color: #999999 } /* Generic.Heading */
-.gi { color: #000000; background-color: #ddffdd } /* Generic.Inserted */
-.go { color: #111 } /* Generic.Output */
-.gp { color: #555555 } /* Generic.Prompt */
-.gs { font-weight: bold } /* Generic.Strong */
-.gu { color: #aaaaaa } /* Generic.Subheading */
-.gt { color: #aa0000 } /* Generic.Traceback */
-.kc { font-weight: bold } /* Keyword.Constant */
-.kd { font-weight: bold } /* Keyword.Declaration */
-.kp { font-weight: bold } /* Keyword.Pseudo */
-.kr { font-weight: bold } /* Keyword.Reserved */
-.kt { color: #445588; font-weight: bold } /* Keyword.Type */
-.m { color: #009999 } /* Literal.Number */
-.s { color: #bb8844 } /* Literal.String */
-.na { color: #008080 } /* Name.Attribute */
-.nb { color: #999999 } /* Name.Builtin */
-.nc { color: #445588; font-weight: bold } /* Name.Class */
-.no { color: #ff99ff } /* Name.Constant */
-.ni { color: #800080 } /* Name.Entity */
-.ne { color: #990000; font-weight: bold } /* Name.Exception */
-.nf { color: #990000; font-weight: bold } /* Name.Function */
-.nn { color: #555555 } /* Name.Namespace */
-.nt { color: #000080 } /* Name.Tag */
-.nv { color: purple } /* Name.Variable */
-.ow { font-weight: bold } /* Operator.Word */
-.mf { color: #009999 } /* Literal.Number.Float */
-.mh { color: #009999 } /* Literal.Number.Hex */
-.mi { color: #009999 } /* Literal.Number.Integer */
-.mo { color: #009999 } /* Literal.Number.Oct */
-.sb { color: #bb8844 } /* Literal.String.Backtick */
-.sc { color: #bb8844 } /* Literal.String.Char */
-.sd { color: #bb8844 } /* Literal.String.Doc */
-.s2 { color: #bb8844 } /* Literal.String.Double */
-.se { color: #bb8844 } /* Literal.String.Escape */
-.sh { color: #bb8844 } /* Literal.String.Heredoc */
-.si { color: #bb8844 } /* Literal.String.Interpol */
-.sx { color: #bb8844 } /* Literal.String.Other */
-.sr { color: #808000 } /* Literal.String.Regex */
-.s1 { color: #bb8844 } /* Literal.String.Single */
-.ss { color: #bb8844 } /* Literal.String.Symbol */
-.bp { color: #999999 } /* Name.Builtin.Pseudo */
-.vc { color: #ff99ff } /* Name.Variable.Class */
-.vg { color: #ff99ff } /* Name.Variable.Global */
-.vi { color: #ff99ff } /* Name.Variable.Instance */
-.il { color: #009999 } /* Literal.Number.Integer.Long */
-
diff --git a/doc/pacbio-theme/theme.conf b/doc/pacbio-theme/theme.conf
deleted file mode 100644
index dd24a1a..0000000
--- a/doc/pacbio-theme/theme.conf
+++ /dev/null
@@ -1,4 +0,0 @@
-[theme]
-inherit = default 
-stylesheet = pacbio.css
-pygments_style = tango
diff --git a/doc/pbcore.io.dataset.rst b/doc/pbcore.io.dataset.rst
index acf2269..032aab1 100644
--- a/doc/pbcore.io.dataset.rst
+++ b/doc/pbcore.io.dataset.rst
@@ -55,12 +55,15 @@ Filter::
 
     usage: dataset.py filter [-h] infile outfile filters [filters ...]
 
-    Add filters to an XML file
+    Add filters to an XML file. Suggested fields: ['bcf', 'bcq', 'bcr',
+    'length', 'pos', 'qend', 'qname', 'qstart', 'readstart', 'rname', 'rq',
+    'tend', 'tstart', 'zm']. More expensive fields: ['accuracy', 'bc', 'movie',
+    'qs']
 
     positional arguments:
       infile      The xml file to filter
       outfile     The resulting xml file
-      filters     The values and thresholds to filter (e.g. rq>0.85)
+      filters     The values and thresholds to filter (e.g. 'rq>0.85')
 
     optional arguments:
       -h, --help  show this help message and exit
@@ -123,13 +126,96 @@ Split::
       --subdatasets    Split on subdatasets
       --outdir OUTDIR  Specify an output directory
 
-Consolidation requires in-depth filtering and BAM file manipulation.
-Implementation plans TBD.
+Consolidate::
 
+    usage: dataset.py consolidate [-h] [--numFiles NUMFILES] [--noTmp]
+                                  infile datafile xmlfile
+
+    Consolidate the XML files
+
+    positional arguments:
+      infile               The XML file to consolidate
+      datafile             The resulting data file
+      xmlfile              The resulting XML file
+
+    optional arguments:
+      -h, --help           show this help message and exit
+      --numFiles NUMFILES  The number of data files to produce (1)
+      --noTmp              Don't copy to a tmp location to ensure local disk
+                           use
 
 Usage Examples
 =============================
 
+Filter Reads (CLI version)
+--------------------------
+
+In this scenario we have one or more bam files worth of subreads, aligned or
+otherwise, that we want to filter and put in a single bam file. This is
+possible using the CLI with the following steps, starting with a DataSet XML
+file::
+
+    # usage: dataset.py filter <in_fn.xml> <out_fn.xml> <filters>
+    dataset.py filter in_fn.subreadset.xml filtered_fn.subreadset.xml 'rq>0.85'
+
+    # usage: dataset.py consolidate <in_fn.xml> <out_data_fn.bam> <out_fn.xml>
+    dataset.py consolidate filtered_fn.subreadset.xml consolidate.subreads.bam out_fn.subreadset.xml
+
+The filtered DataSet and the consolidated DataSet should be read-for-read
+equivalent when used with SMRT Analysis software.
+
+Filter Reads (API version)
+--------------------------
+
+The API version of filtering allows for more advanced filtering criteria::
+
+    ss = SubreadSet('in_fn.subreadset.xml')
+    ss.filters.addRequirement(rname=[('=', 'E.faecalis.2'),
+                                     ('=', 'E.faecalis.2')],
+                              tStart=[('<', '99'),
+                                      ('<', '299')],
+                              tEnd=[('>', '0'),
+                                    ('>', '200')])
+
+This produces the following conditions for a read to be considered passing:
+
+(rname = E.faecalis.2 AND tstart < 99 AND tend > 0)
+OR
+(rname = E.faecalis.2 AND tstart < 299 AND tend > 200)
+
+You can add sets of filters by providing equal length lists of requirements for
+each filter.
+
+Additional requirements added singly will be added to all filters::
+
+    ss.filters.addRequirement(rq=[('>', '0.85')]) 
+
+(rname = E.faecalis.2 AND tstart < 99 AND tend > 0 AND rq > 0.85)
+OR
+(rname = E.faecalis.2 AND tstart < 299 AND tend > 200 AND rq > 0.85)
+
+Additional requirements added with multiple options will duplicate the
+previous requirements for each option::
+
+    ss.filters.addRequirement(length=[('>', 500), ('>', 1000)])
+
+(rname = E.faecalis.2 AND tstart < 99 AND tend > 0 AND rq > 0.85 AND length > 500)
+OR
+(rname = E.faecalis.2 AND tstart < 299 AND tend > 200 AND rq > 0.85 AND length > 500)
+OR
+(rname = E.faecalis.2 AND tstart < 99 AND tend > 0 AND rq > 0.85 AND length > 1000)
+OR
+(rname = E.faecalis.2 AND tstart < 299 AND tend > 200 AND rq > 0.85 AND length > 1000)
+
+Of course you can always wipe the filters and start over::
+
+    ss.filters = None
+
+Consolidation is more similar to the CLI version::
+
+    ss.consolidate('cons.bam')
+    ss.write('cons.xml')
+
 Resequencing Pipeline (CLI version)
 -----------------------------------
 
@@ -295,4 +381,3 @@ DataSetMembers module.
     :show-inheritance:
     :undoc-members:
 
-
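As an illustration of the documented workflow, here is a minimal sketch (file names are hypothetical, and it assumes SubreadSet is importable from pbcore.io as in the library docs) that performs the same filter-and-consolidate steps through the Python API instead of the CLI:

    from pbcore.io import SubreadSet

    ss = SubreadSet('in_fn.subreadset.xml')        # hypothetical input XML
    ss.filters.addRequirement(rq=[('>', '0.85')])  # same threshold as the CLI example
    ss.consolidate('consolidate.subreads.bam')     # write a single filtered BAM
    ss.write('out_fn.subreadset.xml')              # new XML pointing at the new BAM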
diff --git a/pbcore/__init__.py b/pbcore/__init__.py
index 466ec29..dd6477f 100644
--- a/pbcore/__init__.py
+++ b/pbcore/__init__.py
@@ -28,4 +28,4 @@
 # POSSIBILITY OF SUCH DAMAGE.
 #################################################################################
 
-__VERSION__ = "1.2.3"
+__VERSION__ = "1.2.4"
diff --git a/pbcore/data/datasets/__init__.py b/pbcore/data/datasets/__init__.py
index 843faae..2d4378a 100755
--- a/pbcore/data/datasets/__init__.py
+++ b/pbcore/data/datasets/__init__.py
@@ -60,3 +60,6 @@ def getSubreadSet():
 
 def getHdfSubreadSet():
     return _getAbsPath(XML_FILES[7])
+
+def getBarcodedBam():
+    return _getAbsPath(BAM_FILES[3])
diff --git a/pbcore/data/datasets/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.subreads.fake_bc.bam b/pbcore/data/datasets/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.subreads.fake_bc.bam
new file mode 100644
index 0000000..ff3e6e8
Binary files /dev/null and b/pbcore/data/datasets/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.subreads.fake_bc.bam differ
diff --git a/pbcore/data/datasets/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.subreads.fake_bc.bam.bai b/pbcore/data/datasets/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.subreads.fake_bc.bam.bai
new file mode 100644
index 0000000..79a6a43
Binary files /dev/null and b/pbcore/data/datasets/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.subreads.fake_bc.bam.bai differ
diff --git a/pbcore/data/datasets/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.subreads.fake_bc.bam.pbi b/pbcore/data/datasets/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.subreads.fake_bc.bam.pbi
new file mode 100644
index 0000000..4d2df12
Binary files /dev/null and b/pbcore/data/datasets/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.subreads.fake_bc.bam.pbi differ
diff --git a/pbcore/data/datasets/pbalchemysim0.pbalign.bam b/pbcore/data/datasets/pbalchemysim0.pbalign.bam
index b9d3997..5cd4251 100644
Binary files a/pbcore/data/datasets/pbalchemysim0.pbalign.bam and b/pbcore/data/datasets/pbalchemysim0.pbalign.bam differ
diff --git a/pbcore/data/datasets/pbalchemysim0.pbalign.bam.pbi b/pbcore/data/datasets/pbalchemysim0.pbalign.bam.pbi
index 1ae468c..144089a 100644
Binary files a/pbcore/data/datasets/pbalchemysim0.pbalign.bam.pbi and b/pbcore/data/datasets/pbalchemysim0.pbalign.bam.pbi differ
diff --git a/pbcore/data/datasets/pbalchemysim0.subreads.bam b/pbcore/data/datasets/pbalchemysim0.subreads.bam
index f209211..8ea5174 100644
Binary files a/pbcore/data/datasets/pbalchemysim0.subreads.bam and b/pbcore/data/datasets/pbalchemysim0.subreads.bam differ
diff --git a/pbcore/data/datasets/pbalchemysim0.subreads.bam.pbi b/pbcore/data/datasets/pbalchemysim0.subreads.bam.pbi
index 0930a2e..532236a 100644
Binary files a/pbcore/data/datasets/pbalchemysim0.subreads.bam.pbi and b/pbcore/data/datasets/pbalchemysim0.subreads.bam.pbi differ
diff --git a/pbcore/data/datasets/pbalchemysim1.pbalign.bam b/pbcore/data/datasets/pbalchemysim1.pbalign.bam
index e0ee9d2..e699726 100644
Binary files a/pbcore/data/datasets/pbalchemysim1.pbalign.bam and b/pbcore/data/datasets/pbalchemysim1.pbalign.bam differ
diff --git a/pbcore/data/datasets/pbalchemysim1.pbalign.bam.pbi b/pbcore/data/datasets/pbalchemysim1.pbalign.bam.pbi
index cf39b47..2477e13 100644
Binary files a/pbcore/data/datasets/pbalchemysim1.pbalign.bam.pbi and b/pbcore/data/datasets/pbalchemysim1.pbalign.bam.pbi differ
diff --git a/pbcore/data/datasets/pbalchemysim1.subreads.bam b/pbcore/data/datasets/pbalchemysim1.subreads.bam
index 627f5f5..e45f2fa 100644
Binary files a/pbcore/data/datasets/pbalchemysim1.subreads.bam and b/pbcore/data/datasets/pbalchemysim1.subreads.bam differ
diff --git a/pbcore/data/datasets/pbalchemysim1.subreads.bam.pbi b/pbcore/data/datasets/pbalchemysim1.subreads.bam.pbi
index d637f58..28f0f2f 100644
Binary files a/pbcore/data/datasets/pbalchemysim1.subreads.bam.pbi and b/pbcore/data/datasets/pbalchemysim1.subreads.bam.pbi differ
diff --git a/pbcore/data/empty.ccs.bam b/pbcore/data/empty.ccs.bam
index 704abbe..3454722 100644
Binary files a/pbcore/data/empty.ccs.bam and b/pbcore/data/empty.ccs.bam differ
diff --git a/pbcore/data/empty.ccs.bam.pbi b/pbcore/data/empty.ccs.bam.pbi
index 9e7fd4e..efbe492 100644
Binary files a/pbcore/data/empty.ccs.bam.pbi and b/pbcore/data/empty.ccs.bam.pbi differ
diff --git a/pbcore/data/m130727_114215_42211_c100569412550000001823090301191423_s1_p0.ccs.bam b/pbcore/data/m130727_114215_42211_c100569412550000001823090301191423_s1_p0.ccs.bam
index d927eac..c1e6df2 100644
Binary files a/pbcore/data/m130727_114215_42211_c100569412550000001823090301191423_s1_p0.ccs.bam and b/pbcore/data/m130727_114215_42211_c100569412550000001823090301191423_s1_p0.ccs.bam differ
diff --git a/pbcore/data/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.aligned_subreads.bam b/pbcore/data/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.aligned_subreads.bam
index 789865a..02f6299 100644
Binary files a/pbcore/data/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.aligned_subreads.bam and b/pbcore/data/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.aligned_subreads.bam differ
diff --git a/pbcore/data/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.aligned_subreads.bam.bai b/pbcore/data/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.aligned_subreads.bam.bai
index dea076b..9e1f72b 100644
Binary files a/pbcore/data/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.aligned_subreads.bam.bai and b/pbcore/data/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.aligned_subreads.bam.bai differ
diff --git a/pbcore/data/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.aligned_subreads.bam.pbi b/pbcore/data/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.aligned_subreads.bam.pbi
index 4a8ff48..2c82b65 100644
Binary files a/pbcore/data/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.aligned_subreads.bam.pbi and b/pbcore/data/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.aligned_subreads.bam.pbi differ
diff --git a/pbcore/data/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.scraps.bam b/pbcore/data/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.scraps.bam
new file mode 100644
index 0000000..cf034d2
Binary files /dev/null and b/pbcore/data/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.scraps.bam differ
diff --git a/pbcore/data/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.subreads.bam b/pbcore/data/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.subreads.bam
index 3514891..4809be5 100644
Binary files a/pbcore/data/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.subreads.bam and b/pbcore/data/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.subreads.bam differ
diff --git a/pbcore/data/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.subreads.bam.pbi b/pbcore/data/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.subreads.bam.pbi
index 0cb1c4b..0027fb6 100644
Binary files a/pbcore/data/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.subreads.bam.pbi and b/pbcore/data/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.subreads.bam.pbi differ
diff --git a/pbcore/io/BasH5IO.py b/pbcore/io/BasH5IO.py
index 8552bf6..b2ef3a5 100644
--- a/pbcore/io/BasH5IO.py
+++ b/pbcore/io/BasH5IO.py
@@ -411,6 +411,15 @@ class ZmwRead(CommonEqualityMixin):
 
     PulseIndex     = _makeQvAccessor("PulseIndex")
 
+    def StartFrame(self):
+        return self.EndFrame() - self.WidthInFrames()
+
+    def EndFrame(self):
+        fullPBF = arrayFromDataset(self._getBasecallsGroup()["PreBaseFrames"],
+                                   *self._getOffsets()[self.holeNumber])
+        fullWF  = arrayFromDataset(self._getBasecallsGroup()["WidthInFrames"],
+                                   *self._getOffsets()[self.holeNumber])
+        return np.cumsum(fullWF + fullPBF)[self.readStart:self.readEnd]
 
 class CCSZmwRead(ZmwRead):
     """
diff --git a/pbcore/io/FastaIO.py b/pbcore/io/FastaIO.py
index 5e2d7bf..e750117 100644
--- a/pbcore/io/FastaIO.py
+++ b/pbcore/io/FastaIO.py
@@ -252,6 +252,8 @@ class FastaWriter(WriterBase):
             raise ValueError
         if len(args) == 1:
             record = args[0]
+            if isinstance(record, IndexedFastaRecord):
+                record = FastaRecord(record.header, record.sequence)
             assert isinstance(record, FastaRecord)
         else:
             header, sequence = args
diff --git a/pbcore/io/GffIO.py b/pbcore/io/GffIO.py
index c3313f3..a08f0b7 100644
--- a/pbcore/io/GffIO.py
+++ b/pbcore/io/GffIO.py
@@ -42,7 +42,7 @@ __all__ = [ "Gff3Record",
             "GffWriter" ]
 
 from .base import ReaderBase, WriterBase
-from collections import OrderedDict
+from collections import OrderedDict, defaultdict
 from copy import copy as shallow_copy
 import tempfile
 import os.path
@@ -256,42 +256,85 @@ def sort_gff(file_name, output_file_name=None):
     return output_file_name
 
 
-def _merge_gffs(gff1, gff2, out):
-    out = GffWriter(out)
+def merge_gffs (gff_files, output_file):
+    """
+    Merge a sequence of GFF files, preserving unique headers (except for the
+    source commandline, which will usually vary).
+    """
+    headers, header_keys = _merge_gff_headers(gff_files)
     n_rec = 0
-    with GffReader(gff1) as f1:
-        map(out.writeHeader, f1.headers)
-        with GffReader(gff2) as f2:
-            rec1 = [ rec for rec in f1 ]
-            rec2 = [ rec for rec in f2 ]
-            i = j = 0
-            while i < len(rec1) and j < len(rec2):
-                if (rec1[i] > rec2[j]):
-                    out.writeRecord(rec2[j])
-                    j += 1
-                else:
-                    out.writeRecord(rec1[i])
+    with GffWriter(output_file) as writer:
+        for key in header_keys:
+            for value in headers[key]:
+                writer.writeHeader("##%s %s" % (key, value))
+        for gff_file in gff_files:
+            with GffReader(gff_file) as reader:
+                for rec in reader:
+                    writer.writeRecord(rec)
+                    n_rec += 1
+    return n_rec
+
+
+def _merge_gff_headers(gff_files):
+    ignore_fields = set(["source-commandline", "gff-version", "date"])
+    def _get_headers(f):
+        with GffReader(f) as reader:
+            for h in reader.headers:
+                fields = h[2:].strip().split(" ")
+                if not fields[0] in ignore_fields:
+                    yield fields[0], " ".join(fields[1:])
+    header_keys = []
+    for k, v in _get_headers(gff_files[0]):
+        if not k in header_keys:
+            header_keys.append(k)
+    headers = defaultdict(list)
+    for gff_file in gff_files:
+        for key, value in _get_headers(gff_file):
+            if not value in headers[key]:
+                headers[key].append(value)
+    return headers, header_keys
+
+
+def _merge_gffs_sorted(gff1, gff2, out_file):
+    import logging
+    logging.warn("Merging %s and %s" % (gff1, gff2))
+    with GffWriter(out_file) as out:
+        n_rec = 0
+        headers, header_keys = _merge_gff_headers([gff1, gff2])
+        with GffReader(gff1) as f1:
+            for key in header_keys:
+                for value in headers[key]:
+                    out.writeHeader("##%s %s" % (key, value))
+            with GffReader(gff2) as f2:
+                rec1 = [ rec for rec in f1 ]
+                rec2 = [ rec for rec in f2 ]
+                i = j = 0
+                while i < len(rec1) and j < len(rec2):
+                    if (rec1[i] > rec2[j]):
+                        out.writeRecord(rec2[j])
+                        j += 1
+                    else:
+                        out.writeRecord(rec1[i])
+                        i += 1
+                for rec in rec1[i:]:
+                    out.writeRecord(rec)
                     i += 1
-            for rec in rec1[i:]:
-                out.writeRecord(rec)
-                i += 1
-            for rec in rec2[j:]:
-                out.writeRecord(rec)
-                j += 2
-    return i + j
+                for rec in rec2[j:]:
+                    out.writeRecord(rec)
+                    j += 2
+        return i + j
 
 
-def merge_gffs(gff_files, output_file_name):
+def merge_gffs_sorted(gff_files, output_file_name):
     """
     Utility function to combine a set of N (>= 2) GFF files, with records
     ordered by genomic location.  (Assuming each input file is already sorted.)
     """
     if len(gff_files) == 1: return gff_files[0]
     while len(gff_files) > 2:
-        tmpout = tempfile.NamedTemporaryFile()
-        _merge_gffs(gff_files[0], gff_files[1], tmpout)
-        tmpout.seek(0)
+        tmpout = tempfile.NamedTemporaryFile(suffix=".gff").name
+        _merge_gffs_sorted(gff_files[0], gff_files[1], tmpout)
         gff_files = [tmpout] + gff_files[2:]
     with open(output_file_name, "w") as f:
-        _merge_gffs(gff_files[0], gff_files[1], f)
+        _merge_gffs_sorted(gff_files[0], gff_files[1], f)
     return output_file_name
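A minimal usage sketch of the gather-style merge added above (the chunk file names are hypothetical): merge_gffs concatenates records while deduplicating headers, and merge_gffs_sorted keeps position order for already-sorted inputs:

    from pbcore.io.GffIO import merge_gffs, merge_gffs_sorted

    n_rec = merge_gffs(["chunk1.gff", "chunk2.gff"], "gathered.gff")
    print("wrote %d records" % n_rec)

    # order-preserving variant for inputs that are already sorted
    merge_gffs_sorted(["chunk1.gff", "chunk2.gff"], "gathered.sorted.gff")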
diff --git a/pbcore/io/align/BamAlignment.py b/pbcore/io/align/BamAlignment.py
index 3ed36d7..14df751 100644
--- a/pbcore/io/align/BamAlignment.py
+++ b/pbcore/io/align/BamAlignment.py
@@ -253,7 +253,7 @@ class BamAlignment(AlignmentRecordMixin):
             use of this property can result in code that won't work on
             legacy data.
         """
-        return 0.001 * self.peer.opt("rq")
+        return self.peer.opt("rq")
 
     @property
     def readType(self):
@@ -592,7 +592,7 @@ class BamAlignment(AlignmentRecordMixin):
         if key in self.bam.pbi.columnNames:
             return self.bam.pbi[self.rowNumber][key]
         else:
-            raise AttributeError, "no such column in pbi index"
+            raise AttributeError("no such column '%s' in pbi index" % key)
 
     def __dir__(self):
         basicDir = self.__dict__.keys()
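For context on the readQual change above: under the 3.0.1 BAM spec the per-read quality is carried as a float "rq" tag in [0, 1], so it is used directly rather than rescaled. A hedged sketch of inspecting that tag (assumes pysam is installed; the BAM file name is made up):

    import pysam

    with pysam.AlignmentFile("movie.subreads.bam", "rb", check_sq=False) as bam:
        for rec in bam:
            print("%s rq=%s" % (rec.query_name, rec.get_tag("rq")))
            break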
diff --git a/pbcore/io/align/BamIO.py b/pbcore/io/align/BamIO.py
index 824674c..5a51bf8 100644
--- a/pbcore/io/align/BamIO.py
+++ b/pbcore/io/align/BamIO.py
@@ -168,16 +168,20 @@ class _BamReaderBase(ReaderBase):
 
     def _checkFileCompatibility(self):
         # Verify that this is a "pacbio" BAM file of version at least
-        # 3.0b7
+        # 3.0.1
         try:
             checkedVersion = self.version
             if "b" in checkedVersion:
-                version, beta = checkedVersion.split("b")
-                if int(beta) < 7: raise Exception()
+                raise Exception()
+            else:
+                major, minor, patch = checkedVersion.split('.')
+                assert major >= 3
+                assert minor >= 0
+                assert patch >= 1
         except:
             raise IncompatibleFile(
                 "This BAM file is incompatible with this API " +
-                "(only PacBio BAM files version >= 3.0b7 are supported)")
+                "(only PacBio BAM files version >= 3.0.1 are supported)")
 
     def __init__(self, fname, referenceFastaFname=None):
         self.filename = fname = abspath(expanduser(fname))
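The intent of the compatibility gate above is to accept PacBio BAM spec versions of at least 3.0.1 and refuse pre-release "b" versions. An illustrative stand-alone check (a sketch of the intent, not the library's code), comparing the components numerically:

    def is_compatible(version):
        # pre-release strings such as "3.0b7" are refused outright
        if "b" in version:
            return False
        major, minor, patch = (int(x) for x in version.split("."))
        return (major, minor, patch) >= (3, 0, 1)

    assert is_compatible("3.0.1")
    assert not is_compatible("3.0b7")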
diff --git a/pbcore/io/align/PacBioBamIndex.py b/pbcore/io/align/PacBioBamIndex.py
index 642593e..d14e1fa 100644
--- a/pbcore/io/align/PacBioBamIndex.py
+++ b/pbcore/io/align/PacBioBamIndex.py
@@ -47,7 +47,7 @@ PBI_HEADER_LEN              = 32
 PBI_FLAGS_BASIC             = 0
 PBI_FLAGS_MAPPED            = 1
 PBI_FLAGS_COORDINATE_SORTED = 2
-PBI_FLAGS_BARCODE_ADAPTER   = 4
+PBI_FLAGS_BARCODE           = 4
 
 
 class PacBioBamIndex(object):
@@ -62,23 +62,32 @@ class PacBioBamIndex(object):
         header = unpack("< 4s BBBx H I 18x", buf)
         (self.magic, self.vPatch, self.vMinor,
          self.vMajor, self.pbiFlags, self.nReads) = header
+        try:
+            assert self.vMajor >= 3
+            assert self.vMinor >= 0
+            assert self.vPatch >= 1
+        except:
+            raise IncompatibleFile(
+                "This PBI file is incompatible with this API "
+                "(only PacBio PBI files version >= 3.0.1 are supported)")
 
     @property
     def hasMappingInfo(self):
         return (self.pbiFlags & PBI_FLAGS_MAPPED)
 
     @property
-    def hasAdapterInfo(self):
-        return (self.pbiFlags & PBI_FLAGS_BARCODE_ADAPTER)
+    def hasBarcodeInfo(self):
+        return (self.pbiFlags & PBI_FLAGS_BARCODE)
 
     def _loadMainIndex(self, f):
-        # Main index holds subread and mapping info
-        SUBREADS_INDEX_DTYPE = [
+        # Main index holds basic, mapping, and barcode info
+        BASIC_INDEX_DTYPE = [
             ("qId"               , "i4"),
             ("qStart"            , "i4"),
             ("qEnd"              , "i4"),
             ("holeNumber"        , "i4"),
-            ("readQual"          , "u2"),
+            ("readQual"          , "f4"),
+            ("contextFlag"       , "u1"),
             ("virtualFileOffset" , "i8") ]
 
         MAPPING_INDEX_DTYPE = [
@@ -92,16 +101,24 @@ class PacBioBamIndex(object):
             ("nMM"               , "u4"),
             ("mapQV"             , "u1") ]
 
+        BARCODE_INDEX_DTYPE = [
+            ("bcForward"         , "u2"),
+            ("bcReverse"         , "u2"),
+            ("bcQual"            , "u1")]
+
         COMPUTED_COLUMNS_DTYPE = [
             ("nIns"              , "u4"),
             ("nDel"              , "u4") ]
 
-        JOINT_DTYPE = SUBREADS_INDEX_DTYPE + MAPPING_INDEX_DTYPE + COMPUTED_COLUMNS_DTYPE
+        joint_dtype = BASIC_INDEX_DTYPE[:]
 
         if self.hasMappingInfo:
-            tbl = np.zeros(shape=(self.nReads,), dtype=JOINT_DTYPE).view(np.recarray)
-        else:
-            tbl = np.zeros(shape=(self.nReads,), dtype=SUBREADS_INDEX_DTYPE).view(np.recarray)
+            joint_dtype += MAPPING_INDEX_DTYPE
+            joint_dtype += COMPUTED_COLUMNS_DTYPE
+        if self.hasBarcodeInfo:
+            joint_dtype += BARCODE_INDEX_DTYPE
+        tbl = np.zeros(shape=(self.nReads,),
+                       dtype=joint_dtype).view(np.recarray)
 
         def peek(type_, length):
             flavor, width = type_
@@ -109,7 +126,7 @@ class PacBioBamIndex(object):
 
         if True:
             # BASIC data always present
-            for columnName, columnType in SUBREADS_INDEX_DTYPE:
+            for columnName, columnType in BASIC_INDEX_DTYPE:
                 tbl[columnName] = peek(columnType, self.nReads)
 
         if self.hasMappingInfo:
@@ -120,6 +137,10 @@ class PacBioBamIndex(object):
             tbl.nIns = tbl.aEnd - tbl.aStart - tbl.nM - tbl.nMM
             tbl.nDel = tbl.tEnd - tbl.tStart - tbl.nM - tbl.nMM
 
+        if self.hasBarcodeInfo:
+            for columnName, columnType in BARCODE_INDEX_DTYPE:
+                tbl[columnName] = peek(columnType, self.nReads)
+
         self._tbl = tbl
         self._checkForBrokenColumns()
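The pbi header's flag word now advertises a barcode section alongside the basic and mapping sections. A toy sketch of how those bit flags gate which column groups are loaded (the flag value here is made up):

    PBI_FLAGS_BASIC             = 0
    PBI_FLAGS_MAPPED            = 1
    PBI_FLAGS_COORDINATE_SORTED = 2
    PBI_FLAGS_BARCODE           = 4

    pbiFlags = PBI_FLAGS_MAPPED | PBI_FLAGS_BARCODE    # hypothetical header value
    print(bool(pbiFlags & PBI_FLAGS_MAPPED))    # True -> load mapping columns
    print(bool(pbiFlags & PBI_FLAGS_BARCODE))   # True -> load bcForward/bcReverse/bcQual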
 
diff --git a/pbcore/io/dataset/DataSetIO.py b/pbcore/io/dataset/DataSetIO.py
index d0fe999..9c63cec 100755
--- a/pbcore/io/dataset/DataSetIO.py
+++ b/pbcore/io/dataset/DataSetIO.py
@@ -10,6 +10,7 @@ import copy
 import os
 import errno
 import logging
+import itertools
 import xml.dom.minidom
 import tempfile
 from functools import wraps
@@ -19,7 +20,9 @@ from pbcore.util.Process import backticks
 from pbcore.io.opener import (openAlignmentFile, openIndexedAlignmentFile,
                               FastaReader, IndexedFastaReader, CmpH5Reader,
                               IndexedBamReader)
+from pbcore.io.align.PacBioBamIndex import PBI_FLAGS_BARCODE
 from pbcore.io.FastaIO import splitFastaHeader, FastaWriter
+from pbcore.io.FastqIO import FastqReader, FastqWriter, qvsFromAscii
 from pbcore.io import BaxH5Reader
 from pbcore.io.align._BamSupport import UnavailableFeature
 from pbcore.io.dataset.DataSetReader import (parseStats, populateDataSet,
@@ -33,7 +36,8 @@ from pbcore.io.dataset.DataSetMembers import (DataSetMetadata,
                                               BarcodeSetMetadata,
                                               ExternalResources,
                                               ExternalResource, Filters)
-from pbcore.io.dataset.utils import consolidateBams, _infixFname
+from pbcore.io.dataset.utils import (consolidateBams, _infixFname, _pbindexBam,
+                                     _indexBam, _indexFasta)
 
 log = logging.getLogger(__name__)
 
@@ -53,41 +57,62 @@ def filtered(generator):
     return wrapper
 
 
-def _toDsId(x):
-    return "PacBio.DataSet.{x}".format(x=x)
+def _toDsId(name):
+    """Translate a class name into a MetaType/ID"""
+    return "PacBio.DataSet.{x}".format(x=name)
 
-def _dsIdToName(x):
-    if DataSetMetaTypes.isValid(x):
-        return x.split('.')[-1]
+def _dsIdToName(dsId):
+    """Translate a MetaType/ID into a class name"""
+    if DataSetMetaTypes.isValid(dsId):
+        return dsId.split('.')[-1]
+    else:
+        raise InvalidDataSetIOError("Invalid DataSet MetaType")
 
-def _dsIdToType(x):
-    if DataSetMetaTypes.isValid(x):
+def _dsIdToType(dsId):
+    """Translate a MetaType/ID into a type"""
+    if DataSetMetaTypes.isValid(dsId):
         types = DataSet.castableTypes()
-        return types[_dsIdToName(x)]
+        return types[_dsIdToName(dsId)]
+    else:
+        raise InvalidDataSetIOError("Invalid DataSet MetaType")
 
-def _dsIdToSuffix(x):
+def _dsIdToSuffix(dsId):
+    """Translate a MetaType/ID into a file suffix"""
     dsIds = DataSetMetaTypes.ALL
     suffixMap = {dsId: _dsIdToName(dsId) for dsId in dsIds}
     suffixMap[_toDsId("DataSet")] = 'DataSet'
-    if DataSetMetaTypes.isValid(x):
-        suffix = suffixMap[x]
+    if DataSetMetaTypes.isValid(dsId):
+        suffix = suffixMap[dsId]
         suffix = suffix.lower()
         suffix += '.xml'
         return suffix
+    else:
+        raise InvalidDataSetIOError("Invalid DataSet MetaType")
 
 def _typeDataSet(dset):
+    """Determine the type of a dataset from the xml file without opening it"""
     xml_rt = xmlRootType(dset)
     dsId = _toDsId(xml_rt)
     tbrType = _dsIdToType(dsId)
     return tbrType
 
+def isDataSet(xmlfile):
+    """Determine if a file is a DataSet before opening it"""
+    try:
+        _typeDataSet(xmlfile)
+        return True
+    except:
+        return False
+
 def openDataSet(*files, **kwargs):
-    if not files[0].endswith('xml'):
-        raise TypeError("openDataSet requires that the first file is an XML")
+    """Factory function for DataSet types as suggested by the first file"""
+    if not isDataSet(files[0]):
+        raise TypeError("openDataSet requires that the first file is a DS")
     tbrType = _typeDataSet(files[0])
     return tbrType(*files, **kwargs)
 
 def openDataFile(*files, **kwargs):
+    """Factory function for DataSet types determined by the first data file"""
     possibleTypes = [AlignmentSet, SubreadSet, ConsensusReadSet,
                      ConsensusAlignmentSet, ReferenceSet, HdfSubreadSet]
     origFiles = files
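A minimal usage sketch of the two factory helpers documented above (file names are hypothetical): openDataSet picks the DataSet subclass from the XML root tag, while openDataFile guesses it from a plain data file:

    from pbcore.io.dataset.DataSetIO import openDataSet, openDataFile

    ds = openDataSet("movie.subreadset.xml")    # type chosen from the XML MetaType
    reads = openDataFile("movie.subreads.bam")  # type guessed from the data file
    print("%s %s" % (type(ds).__name__, type(reads).__name__))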
@@ -126,10 +151,16 @@ def openDataFile(*files, **kwargs):
                     return SubreadSet(*origFiles, **kwargs)
 
 def _stackRecArrays(recArrays):
+    """Stack recarrays into a single larger recarray"""
     tbr = np.concatenate(recArrays)
     tbr = tbr.view(np.recarray)
     return tbr
 
+def _flatten(lol, times=1):
+    """ This wont do well with mixed nesting"""
+    for _ in range(times):
+        lol = np.concatenate(lol)
+    return lol
 
 class DataSetMetaTypes(object):
     """
@@ -231,6 +262,7 @@ class DataSet(object):
         """
         self._strict = kwargs.get('strict', False)
         self._skipCounts = kwargs.get('skipCounts', False)
+        _induceIndices = kwargs.get('generateIndices', False)
 
         # The metadata concerning the DataSet or subtype itself (e.g.
         # name, version, UniqueId)
@@ -326,6 +358,13 @@ class DataSet(object):
             elif self.totalLength <= 0 or self.numRecords <= 0:
                 self.updateCounts()
 
+        # generate indices if requested and needed
+        if _induceIndices:
+            self._induceIndices()
+
+    def _induceIndices(self):
+        raise NotImplementedError()
+
     def __repr__(self):
         """Represent the dataset with an informative string:
 
@@ -381,9 +420,7 @@ class DataSet(object):
                 self.__class__.__name__ == 'DataSet'):
             # determine whether or not this is the merge that is populating a
             # dataset for the first time
-            firstIn = False
-            if len(self.externalResources) == 0:
-                firstIn = True
+            firstIn = True if len(self.externalResources) == 0 else False
 
             # Block on filters?
             if (not firstIn and
@@ -407,6 +444,17 @@ class DataSet(object):
             else:
                 result = self
 
+            # If this dataset has no subsets representing it, add self as a
+            # subdataset to the result
+            # TODO: this is a stopgap to prevent spurious subdatasets when
+            # creating datasets from dataset xml files...
+            if not self.subdatasets and not firstIn:
+                result.addDatasets(self.copy())
+
+            # add subdatasets
+            if other.subdatasets or not firstIn:
+                result.addDatasets(other.copy())
+
             # add in the metadata (not to be confused with obj metadata)
             if firstIn:
                 result.metadata = other.metadata
@@ -421,20 +469,6 @@ class DataSet(object):
             # DataSets may only be merged if they have identical filters,
             # So there is nothing new to add.
 
-            # If this dataset has no subsets representing it, add self as a
-            # subdataset to the result
-            # TODO: this is a stopgap to prevent spurious subdatasets when
-            # creating datasets from dataset xml files...
-            if not self.subdatasets and not firstIn:
-                result.addDatasets(self.copy())
-
-            # add subdatasets
-            if other.subdatasets or not firstIn:
-                #if copyOnMerge:
-                result.addDatasets(other.copy())
-                #else:
-                    #result.addDatasets(other)
-
             return result
         else:
             raise TypeError('DataSets can only be merged with records of the '
@@ -623,7 +657,7 @@ class DataSet(object):
 
     def split(self, chunks=0, ignoreSubDatasets=False, contigs=False,
               maxChunks=0, breakContigs=False, targetSize=5000, zmws=False,
-              barcodes=False, byRecords=False, updateCounts=False):
+              barcodes=False, byRecords=False, updateCounts=True):
         """Deep copy the DataSet into a number of new DataSets containing
         roughly equal chunks of the ExternalResources or subdatasets.
 
@@ -655,6 +689,13 @@ class DataSet(object):
             breakContigs: Whether or not to break contigs when using maxChunks.
                           If True, something resembling an efficient number of
                           chunks for the dataset size will be produced.
+            targetSize: The target number of reads per chunk, when using
+                        byRecords
+            zmws: Split by zmws
+            barcodes: Split by barcodes
+            byRecords: Split contigs by mapped records, rather than reference
+                       length
+            updateCounts: Update the count metadata in each chunk
         Returns:
             A list of new DataSet objects (all other information deep copied).
 
@@ -837,9 +878,16 @@ class DataSet(object):
                 self.makePathsAbsolute()
         xml_string = toXml(self)
         if pretty:
-            xml_string = xml.dom.minidom.parseString(xml_string).toprettyxml()
-        if validate:
-            validateString(xml_string, relTo=outFile)
+            xml_string = xml.dom.minidom.parseString(xml_string).toprettyxml(
+                encoding="UTF-8")
+        # Disable for contigsets: no support for multiple contigs in XSD
+        # Disable for ccssets: no support for datasetmetadata in XSD
+        # XXX turn this off temporarily to aid in pbsmrtpipe fixing
+        if (validate and
+                not self.datasetType in [ContigSet.datasetType,
+                                         ConsensusReadSet.datasetType]):
+            #validateString(xml_string, relTo=os.path.dirname(outFile))
+            pass
         fileName = urlparse(outFile).path.strip()
         if self._strict and not isinstance(self.datasetType, tuple):
             if not fileName.endswith(_dsIdToSuffix(self.datasetType)):
@@ -1017,7 +1065,9 @@ class DataSet(object):
             resource.timeStampedName = tsName
 
     def _getTimeStampedName(self, mType):
-        time = datetime.datetime.utcnow().strftime("%y%m%d_%H%M%S")
+        mType = mType.lower()
+        mType = '_'.join(mType.split('.'))
+        time = datetime.datetime.utcnow().strftime("%y%m%d_%H%M%S%f")[:-3]
         return "{m}-{t}".format(m=mType, t=time)
 
     @staticmethod
@@ -1297,6 +1347,21 @@ class DataSet(object):
             ranges.append((movie, min(values), max(values)))
         return ranges
 
+    # FIXME this is a workaround for the lack of support for barcode chunking
+    # in pbbam, and should probably go away once that is available.
+    @property
+    def barcodes(self):
+        """Return the list of barcodes explicitly set by filters via
+        DataSet.split(barcodes=True).
+
+        """
+        barcodes = []
+        for filt in self._filters:
+            for param in filt:
+                if param.name == "bc":
+                    barcodes.append(param.value)
+        return barcodes
+
     def toFofn(self, outfn=None, uri=False, relative=False):
         """Return a list of resource filenames (and write to optional outfile)
 
@@ -1394,7 +1459,10 @@ class DataSet(object):
     @filters.setter
     def filters(self, value):
         """Limit setting to ensure cache hygiene and filter compatibility"""
-        self._filters = value
+        if value is None:
+            self._filters = Filters()
+        else:
+            self._filters = value
 
     def reFilter(self, light=True):
         """
@@ -1476,7 +1544,9 @@ class DataSet(object):
 
     @property
     def sequencingChemistry(self):
-        return self._checkIdentical('sequencingChemistry')
+        key = 'sequencingChemistry'
+        responses = self._pollResources(lambda x: getattr(x, key))
+        return _flatten(responses)
 
     @property
     def isEmpty(self):
@@ -1545,7 +1615,6 @@ class DataSet(object):
                     raise IOError(errno.EIO, "File not indexed", fname)
         return True
 
-
     def __getitem__(self, index):
         """Should respect filters for free, as _indexMap should only be
         populated by filtered reads. Only pbi filters considered, however."""
@@ -1580,6 +1649,30 @@ class ReadSet(DataSet):
         super(ReadSet, self).__init__(*files, **kwargs)
         self._metadata = SubreadSetMetadata(self._metadata)
 
+    def _induceIndices(self):
+        for res in self.externalResources:
+            fname = res.resourceId
+            if not res.pbi:
+                _pbindexBam(fname)
+                self.close()
+            if not res.bai:
+                _indexBam(fname)
+                self.close()
+
+    @property
+    def isIndexed(self):
+        if self.hasPbi:
+            return True
+        return False
+
+    def assertBarcoded(self):
+        """Test whether all resources are barcoded files"""
+        res = self._pollResources(
+            lambda x: x.index.pbiFlags & PBI_FLAGS_BARCODE)
+        if not self._unifyResponses(res):
+            raise RuntimeError("File not barcoded")
+
+
     def _openFiles(self):
         """Open the files (assert they exist, assert they are of the proper
         type before accessing any file)
@@ -1607,12 +1700,12 @@ class ReadSet(DataSet):
                         location, referenceFastaFname=refFile)
                 else:
                     raise
-            #if resource is None:
-                #assert not self._strict
-                #resource = openAlignmentFile(
-                    #location, referenceFastaFname=refFile)
-            if not resource.isEmpty:
-                self._openReaders.append(resource)
+            try:
+                if not resource.isEmpty:
+                    self._openReaders.append(resource)
+            except UnavailableFeature: # isEmpty requires bai
+                if list(itertools.islice(resource, 1)):
+                    self._openReaders.append(resource)
         if len(self._openReaders) == 0 and len(self.toExternalFiles()) != 0:
             raise IOError("No files were openable or reads found")
         log.debug("Done opening resources")
@@ -1645,10 +1738,12 @@ class ReadSet(DataSet):
 
         """
         # Find all possible barcodes and counts for each
-        # TODO: switch this over to the pbi when bc information is exposed
+        self.assertIndexed()
+        self.assertBarcoded()
         barcodes = defaultdict(int)
-        for read in self.records:
-            barcodes[tuple(read.peer.opt("bc"))] += 1
+        for bcTuple in itertools.izip(self.index.bcForward,
+                                      self.index.bcReverse):
+            barcodes[bcTuple] += 1
 
         log.debug("{i} barcodes found".format(i=len(barcodes.keys())))
 
@@ -1677,6 +1772,7 @@ class ReadSet(DataSet):
         log.debug("Generating new UUID")
         for result in results:
             result.newUuid()
+        # TODO: updateCounts
 
         # Update the basic metadata for the new DataSets from external
         # resources, or at least mark as dirty
@@ -1840,7 +1936,7 @@ class ReadSet(DataSet):
         # Pull generic values, kwargs, general treatment in super
         super(ReadSet, self).addMetadata(newMetadata, **kwargs)
 
-    def consolidate(self, dataFile, numFiles=1):
+    def consolidate(self, dataFile, numFiles=1, useTmp=True):
         """Consolidate a larger number of bam files to a smaller number of bam
         files (min 1)
 
@@ -1863,15 +1959,18 @@ class ReadSet(DataSet):
                 resLists.append(thisResList)
             fnames = [_infixFname(dataFile, str(i)) for i in range(numFiles)]
             for resList, fname in zip(resLists, fnames):
-                consolidateBams(resList, fname, filterDset=self)
+                consolidateBams(resList, fname, filterDset=self, useTmp=useTmp)
             log.debug("Replacing resources")
             self.externalResources = ExternalResources()
             self.addExternalResources(fnames)
         else:
-            consolidateBams(self.toExternalFiles(), dataFile, filterDset=self)
+            consolidateBams(self.toExternalFiles(), dataFile, filterDset=self,
+                            useTmp=useTmp)
+            # TODO: add as subdatasets
             log.debug("Replacing resources")
             self.externalResources = ExternalResources()
             self.addExternalResources([dataFile])
+        self._populateMetaTypes()
 
     def __len__(self):
         if self.numRecords == -1:
@@ -1919,6 +2018,13 @@ class HdfSubreadSet(ReadSet):
         self.objMetadata["TimeStampedName"] = self._getTimeStampedName(
             self.objMetadata["MetaType"])
 
+    def _induceIndices(self):
+        log.debug("Bax files already indexed")
+
+    @property
+    def isIndexed(self):
+        return True
+
     def _openFiles(self):
         """Open the files (assert they exist, assert they are of the proper
         type before accessing any file)
@@ -2055,6 +2161,19 @@ class AlignmentSet(ReadSet):
         else:
             return super(AlignmentSet, self).consolidate(*args, **kwargs)
 
+    def _induceIndices(self):
+        if self.isCmpH5:
+            log.debug("Cmp.h5 files already indexed")
+        else:
+            return super(AlignmentSet, self)._induceIndices()
+
+    @property
+    def isIndexed(self):
+        if self.isCmpH5:
+            return True
+        else:
+            return super(AlignmentSet, self).isIndexed
+
     def addReference(self, fname):
         if isinstance(fname, ReferenceSet):
             reference = fname.externalResources.resourceIds
@@ -2221,9 +2340,9 @@ class AlignmentSet(ReadSet):
                     winend = -1
                     for param in filt:
                         if param.name == 'tstart':
-                            winstart = param.value
-                        if param.name == 'tend':
                             winend = param.value
+                        if param.name == 'tend':
+                            winstart = param.value
                     # If the filter is just for rname, fill the window
                     # boundaries (pricey but worth the guaranteed behavior)
                     if winend == -1:
@@ -2365,7 +2484,7 @@ class AlignmentSet(ReadSet):
         return shiftedAtoms
 
     def _split_contigs(self, chunks, maxChunks=0, breakContigs=False,
-                       targetSize=5000, byRecords=True, updateCounts=True):
+                       targetSize=5000, byRecords=False, updateCounts=True):
         """Split a dataset into reference windows based on contigs.
 
         Args:
@@ -2396,21 +2515,22 @@ class AlignmentSet(ReadSet):
                          for rn in refNames
                          if len(self._indexReadsInReference(rn)) != 0]
             else:
-                atoms = [(rn, 0, 0) for rn in refNames if
+                atoms = [(rn, 0, refLens[rn]) for rn in refNames if
                          len(self._indexReadsInReference(rn)) != 0]
         else:
             log.debug("Skipping records for each reference check")
-            atoms = [(rn, 0, 0) for rn in refNames]
+            atoms = [(rn, 0, refLens[rn]) for rn in refNames]
             if byRecords:
                 log.debug("Counting records...")
                 # This is getting out of hand, but the number of references
                 # determines the best read counting algorithm:
                 if len(refNames) < 100:
-                    atoms = [(rn, 0, 0, self.countRecords(rn))
+                    atoms = [(rn, 0, refLens[rn], self.countRecords(rn))
                              for rn in refNames]
                 else:
                     counts = self._countMappedReads()
-                    atoms = [(rn, 0, 0, counts[rn]) for rn in refNames]
+                    atoms = [(rn, 0, refLens[rn], counts[rn])
+                             for rn in refNames]
         log.debug("{i} contigs found".format(i=len(atoms)))
 
         if byRecords:
@@ -2473,6 +2593,27 @@ class AlignmentSet(ReadSet):
                 sub_segments.append(tmp)
                 segments.extend(sub_segments)
             atoms = segments
+        elif breakContigs and not byRecords:
+            log.debug("Checking for oversized chunks")
+            # we may have chunks <= len(atoms). We wouldn't usually split up
+            # contigs, but some might be huge, resulting in some tasks running
+            # very long
+            # We are only doing this for refLength splits for now, as those are
+            # cheap (and quiver is linear in length not coverage)
+            dataSize = sum(refLens.values())
+            # target size per chunk:
+            target = dataSize/chunks
+            log.debug("Target chunk length: {t}".format(t=target))
+            newAtoms = []
+            for i, atom in enumerate(atoms):
+                testAtom = atom
+                while testAtom[2] - testAtom[1] > target:
+                    newAtom1 = (testAtom[0], testAtom[1], testAtom[1] + target)
+                    newAtom2 = (testAtom[0], testAtom[1] + target, testAtom[2])
+                    newAtoms.append(newAtom1)
+                    testAtom = newAtom2
+                newAtoms.append(testAtom)
+                atoms = newAtoms
 
         log.debug("Done defining {n} chunks".format(n=chunks))
         # duplicate
@@ -2489,7 +2630,8 @@ class AlignmentSet(ReadSet):
         log.debug("Distributing chunks")
         # if we didn't have to split atoms and are doing it byRecords, the
         # original counts are still valid:
-        # Now well have to count records again to recombine atoms
+        #
+        # Otherwise we'll now have to count records again to recombine atoms
         chunks = self._chunkList(atoms, chunks, balanceKey)
 
         log.debug("Done chunking")
@@ -2498,8 +2640,8 @@ class AlignmentSet(ReadSet):
             if atoms[0][2]:
                 result.filters.addRequirement(
                     rname=[('=', c[0]) for c in chunk],
-                    tStart=[('>', c[1]) for c in chunk],
-                    tEnd=[('<', c[2]) for c in chunk])
+                    tStart=[('<', c[2]) for c in chunk],
+                    tEnd=[('>', c[1]) for c in chunk])
             else:
                 result.filters.addRequirement(
                     rname=[('=', c[0]) for c in chunk])
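
The chunk filters above use the standard interval-overlap test: an alignment
belongs to a reference window (winStart, winEnd) when its tStart falls before
the window end and its tEnd falls after the window start. A minimal sketch of
the equivalent API call, assuming only the pbcore.io classes shown in this
patch; the dataset path, reference name and window are hypothetical:

    from pbcore.io import AlignmentSet

    aln = AlignmentSet("aligned.alignmentset.xml")   # hypothetical path
    winStart, winEnd = 100, 200
    aln.filters.addRequirement(rname=[('=', 'lambda_NEB3011')],
                               tStart=[('<', winEnd)],   # starts before window end
                               tEnd=[('>', winStart)])   # ends after window start
    aln.updateCounts()
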
@@ -3030,8 +3172,9 @@ class ContigSet(DataSet):
         super(ContigSet, self).__init__(*files, **kwargs)
         self._metadata = ContigSetMetadata(self._metadata)
         self._updateMetadata()
+        self._fastq = False
 
-    def consolidate(self, outfn=None, numFiles=1):
+    def consolidate(self, outfn=None, numFiles=1, useTmp=False):
         """Consolidation should be implemented for window text in names and
         for filters in ContigSets"""
 
@@ -3066,6 +3209,7 @@ class ContigSet(DataSet):
             writeTemp = True
         writeMatches = {}
         writeComments = {}
+        writeQualities = {}
         for name, match_list in matches.items():
             if len(match_list) > 1:
                 log.debug("Multiple matches found for {i}".format(i=name))
@@ -3086,7 +3230,11 @@ class ContigSet(DataSet):
 
                 # collapse matches
                 new_name = self._removeWindow(name)
-                new_seq = ''.join([match.sequence for match in match_list])
+                new_seq = ''.join([match.sequence[:] for match in match_list])
+                if self._fastq:
+                    new_qual = ''.join([match.qualityString for match in
+                                        match_list])
+                    writeQualities[new_name] = new_qual
 
                 # set to write
                 writeTemp = True
@@ -3094,8 +3242,10 @@ class ContigSet(DataSet):
                 writeComments[new_name] = match_list[0].comment
             else:
                 log.debug("One match found for {i}".format(i=name))
-                writeMatches[name] = match_list[0].sequence
+                writeMatches[name] = match_list[0].sequence[:]
                 writeComments[name] = match_list[0].comment
+                if self._fastq:
+                    writeQualities[name] = match_list[0].qualityString
         if writeTemp:
             log.debug("Writing a new file is necessary")
             if not outfn:
@@ -3103,12 +3253,18 @@ class ContigSet(DataSet):
                 outdir = tempfile.mkdtemp(suffix="consolidated-contigset")
                 outfn = os.path.join(outdir,
                                      'consolidated_contigs.fasta')
-            with FastaWriter(outfn) as outfile:
+            with self._writer(outfn) as outfile:
                 log.debug("Writing new resource {o}".format(o=outfn))
                 for name, seq in writeMatches.items():
                     if writeComments[name]:
                         name = ' '.join([name, writeComments[name]])
+                    if self._fastq:
+                        outfile.writeRecord(name, seq,
+                                            qvsFromAscii(writeQualities[name]))
+                        continue
                     outfile.writeRecord(name, seq)
+            if not self._fastq:
+                _indexFasta(outfn)
             # replace resources
             log.debug("Replacing resources")
             self.externalResources = ExternalResources()
@@ -3117,12 +3273,19 @@ class ContigSet(DataSet):
             log.debug("Replacing metadata")
             self._metadata.contigs = []
             self._populateContigMetadata()
+        self._populateMetaTypes()
+
+    @property
+    def _writer(self):
+        if self._fastq:
+            return FastqWriter
+        return FastaWriter
 
     def _popSuffix(self, name):
         """Chunking and quivering adds suffixes to contig names, after the
         normal ID and window. This complicates our dewindowing and
         consolidation, so we'll remove them for now"""
-        observedSuffixes = ['|quiver']
+        observedSuffixes = ['|quiver', '|plurality', '|arrow']
         for suff in observedSuffixes:
             if name.endswith(suff):
                 log.debug("Suffix found: {s}".format(s=suff))
@@ -3138,6 +3301,12 @@ class ContigSet(DataSet):
             return '_'.join(name.split('_')[:-2]) + suff
         return name
 
+    def _induceIndices(self):
+        if not self.isIndexed:
+            for fname in self.toExternalFiles():
+                _indexFasta(fname)
+            self.close()
+
     def _parseWindow(self, name):
         """Chunking and quivering appends a window to the contig ID, which
         allows us to consolidate the contig chunks."""
@@ -3230,6 +3399,10 @@ class ContigSet(DataSet):
         log.debug("Opening resources")
         for extRes in self.externalResources:
             location = urlparse(extRes.resourceId).path
+            if location.endswith("fastq"):
+                self._fastq = True
+                self._openReaders.append(FastqReader(location))
+                continue
             try:
                 resource = IndexedFastaReader(location)
             except IOError:
@@ -3314,6 +3487,7 @@ class ContigSet(DataSet):
     @staticmethod
     def _metaTypeMapping():
         return {'fasta':'PacBio.ContigFile.ContigFastaFile',
+                'fastq':'PacBio.ContigFile.ContigFastqFile',
                 'fa':'PacBio.ContigFile.ContigFastaFile',
                 'fas':'PacBio.ContigFile.ContigFastaFile',
                 'fai':'PacBio.Index.SamIndex',
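
With the FastqReader/FastqWriter handling added above, a ContigSet backed by
FASTQ files now carries its quality strings through consolidation. A minimal
usage sketch, assuming only the API shown in these hunks; the file names are
hypothetical:

    from pbcore.io import ContigSet

    cset = ContigSet("polished.chunk1.fastq", "polished.chunk2.fastq")
    cset.consolidate("consolidated_contigs.fastq")  # windows collapsed, qualities joined
    cset.write("consolidated.contigset.xml", validate=False)
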
diff --git a/pbcore/io/dataset/DataSetMembers.py b/pbcore/io/dataset/DataSetMembers.py
index fb79d29..3f36238 100755
--- a/pbcore/io/dataset/DataSetMembers.py
+++ b/pbcore/io/dataset/DataSetMembers.py
@@ -54,6 +54,7 @@ import operator as OP
 import numpy as np
 import logging
 from functools import partial as P
+from collections import Counter
 from pbcore.io.dataset.DataSetWriter import namespaces
 
 log = logging.getLogger(__name__)
@@ -427,10 +428,6 @@ class Filters(RecordWrapper):
 
     @property
     def _bamAccMap(self):
-        # tStart/tEnd is a hack for overlapping ranges. you're not testing
-        # whether the tStart/tEnd is within a range, you're testing if it
-        # overlaps with the tStart/tEnd in the filter (overlaps with a
-        # reference window)
         return {'rname': (lambda x: x.referenceName),
                 'length': (lambda x: int(x.readLength)),
                 'qname': (lambda x: x.qNameA),
@@ -442,38 +439,43 @@ class Filters(RecordWrapper):
                 'pos': (lambda x: int(x.tStart)),
                 'accuracy': (lambda x: float(x.identity)),
                 'readstart': (lambda x: int(x.aStart)),
-                'tstart': (lambda x: int(x.tEnd)), # see above
-                'tend': (lambda x: int(x.tStart)), # see above
+                'tstart': (lambda x: int(x.tStart)),
+                'tend': (lambda x: int(x.tEnd)),
                }
 
     def _pbiAccMap(self, tIdMap):
-        # tStart/tEnd is a hack for overlapping ranges. you're not testing
-        # whether the tStart/tEnd is within a range, you're testing if it
-        # overlaps with the tStart/tEnd in the filter (overlaps with a
-        # reference window)
         return {'rname': (lambda x, m=tIdMap: m[x.tId]),
                 'length': (lambda x: int(x.aEnd)-int(x.aStart)),
                 'qname': (lambda x: x.qId),
                 'zm': (lambda x: int(x.holeNumber)),
                 'pos': (lambda x: int(x.tStart)),
                 'readstart': (lambda x: int(x.aStart)),
-                'tstart': (lambda x: int(x.tEnd)), # see above
-                'tend': (lambda x: int(x.tStart)), # see above
+                'tstart': (lambda x: int(x.tStart)),
+                'tend': (lambda x: int(x.tEnd)),
                }
 
-    def _pbiVecAccMap(self, tIdMap):
-        # tStart/tEnd is a hack for overlapping ranges. you're not testing
-        # whether the tStart/tEnd is within a range, you're testing if it
-        # overlaps with the tStart/tEnd in the filter (overlaps with a
-        # reference window)
-        return {'rname': (lambda x, m=tIdMap: m[x.tId]),
+    def _pbiMappedVecAccMap(self, tIdMap):
+        plus = {'rname': (lambda x, m=tIdMap: m[x.tId]),
                 'length': (lambda x: x.aEnd - x.aStart),
-                'qname': (lambda x: x.qId),
-                'zm': (lambda x: x.holeNumber),
                 'pos': (lambda x: x.tStart),
                 'readstart': (lambda x: x.aStart),
-                'tstart': (lambda x: x.tEnd), # see above
-                'tend': (lambda x: x.tStart), # see above
+                'tstart': (lambda x: x.tStart),
+                'tend': (lambda x: x.tEnd),
+               }
+        base = self._pbiVecAccMap(tIdMap)
+        base.update(plus)
+        return base
+
+    def _pbiVecAccMap(self, tIdMap):
+        return {'length': (lambda x: x.qEnd - x.qStart),
+                'qstart': (lambda x: x.qStart),
+                'qend': (lambda x: x.qEnd),
+                'qname': (lambda x: x.qId),
+                'zm': (lambda x: x.holeNumber),
+                'rq': (lambda x: x.readQual),
+                'bcf': (lambda x: x.bcForward),
+                'bcr': (lambda x: x.bcForward),
+                'bcq': (lambda x: x.bcQual),
                }
 
     @property
@@ -528,7 +530,9 @@ class Filters(RecordWrapper):
     def filterIndexRecords(self, indexRecords, nameMap):
         typeMap = self._bamTypeMap
         accMap = self._pbiVecAccMap({})
-        accMap['rname'] = lambda x: x.tId
+        if 'aEnd' in indexRecords.dtype.names:
+            accMap = self._pbiMappedVecAccMap({})
+            accMap['rname'] = lambda x: x.tId
         filterLastResult = None
         for filt in self:
             lastResult = None
@@ -554,6 +558,10 @@ class Filters(RecordWrapper):
                 del lastResult
         return filterLastResult
 
+    def fromString(self, filterString):
+        filtDict = {}
+        self._runCallbacks()
+
     def addRequirement(self, **kwargs):
         """Use this to add requirements. Members of the list will be considered
         options for fulfilling this requirement, all other filters will be
@@ -705,6 +713,9 @@ class ExternalResources(RecordWrapper):
         super(self.__class__, self).__init__(record)
         self.record['tag'] = self.__class__.__name__
 
+        # state tracking. Not ideal, but needed here:
+        self._resourceIds = []
+
     def __eq__(self, other):
         for extRef in self:
             found = False
@@ -728,7 +739,14 @@ class ExternalResources(RecordWrapper):
 
     def merge(self, other):
         # make sure we don't add dupes
-        curIds = [res.resourceId for res in self]
+        curIds = self.resourceIds
+
+        # check to make sure ResourceIds in other are unique
+        otherIds = Counter([res.resourceId for res in other])
+        dupes = [c for c in otherIds if otherIds[c] > 1]
+        if dupes:
+            raise RuntimeError("Duplicate ResourceIds found: "
+                               "{f}".format(f=', '.join(dupes)))
 
         for newRes in other:
             # merge instead
@@ -752,6 +770,7 @@ class ExternalResources(RecordWrapper):
             resourceIds: a list of uris as strings
         """
         templist = []
+        self._resourceIds = []
         for res in resourceIds:
             toAdd = res
             if not isinstance(res, ExternalResource):
@@ -774,13 +793,16 @@ class ExternalResources(RecordWrapper):
         aren't in record form, but append will fix that for us automatically. A
         bit messy, but fairly concise.
         """
+        self._resourceIds = []
         self.record['children'] = []
         for res in resources:
             self.append(res)
 
     @property
     def resourceIds(self):
-        return [res.resourceId for res in self]
+        if not self._resourceIds:
+            self._resourceIds = [res.resourceId for res in self]
+        return self._resourceIds
 
 
 class ExternalResource(RecordWrapper):
@@ -1000,7 +1022,6 @@ class DataSetMetadata(RecordWrapper):
             else:
                 self.append(other.summaryStats)
         if not self.namespace:
-            print "HERE"
             self.namespace = other.namespace
             self.attrib.update(other.attrib)
 
diff --git a/pbcore/io/dataset/DataSetReader.py b/pbcore/io/dataset/DataSetReader.py
index 3c932db..eadf00a 100755
--- a/pbcore/io/dataset/DataSetReader.py
+++ b/pbcore/io/dataset/DataSetReader.py
@@ -57,6 +57,7 @@ def _addXmlFile(dset, path):
     root = tree.getroot()
     tmp = _parseXml(type(dset), root)
     tmp.makePathsAbsolute(curStart=os.path.dirname(path))
+    # copyOnMerge must be False: we're merging in a tmp dataset and keeping dset
     dset.merge(tmp, copyOnMerge=False)
 
 def openFofnFile(path):
diff --git a/pbcore/io/dataset/DataSetValidator.py b/pbcore/io/dataset/DataSetValidator.py
index d0f9980..d588198 100755
--- a/pbcore/io/dataset/DataSetValidator.py
+++ b/pbcore/io/dataset/DataSetValidator.py
@@ -16,7 +16,7 @@ def validateResources(xmlroot, relTo='.'):
     Args:
         xmlroot: The ET root of an xml tree
         relTo: ('.') The path relative to which resources may reside. This will
-               work poorly if relTo is not set to the location of the incoming
+               work poorly if relTo is not set to the dirname of the incoming
                XML file.
     """
     stack = [xmlroot]
@@ -28,8 +28,10 @@ def validateResources(xmlroot, relTo='.'):
             parsedId = urlparse(resId)
             rfn = urlparse(resId).path.strip()
             if not os.path.isfile(rfn):
-                if not os.path.isfile(os.path.join(os.path.dirname(relTo),
-                                                   rfn)):
+                if (not os.path.isfile(os.path.join(relTo,
+                                                    rfn)) and
+                        not os.path.isfile(os.path.join('.',
+                                                        rfn))):
                     raise IOError, "{f} not found".format(f=rfn)
 
 def validateLxml(xml_fn, xsd_fn):
@@ -49,13 +51,10 @@ def validateMiniXsv(xml_fn, xsd_fn):
     except ImportError:
         log.debug('minixsv not found, validation disabled')
 
-def validateXml(xmlroot, skipResources=False):
-    # pyxb precompiles and therefore does not need the original xsd file.
-    #if not os.path.isfile(XSD):
-        #raise SystemExit, "Validation xsd {s} not found".format(s=XSD)
+def validateXml(xmlroot, skipResources=False, relTo='.'):
 
-    if not skipResources: # XXX generally a bad idea, but useful for pbvalidate
-        validateResources(xmlroot)
+    if not skipResources:
+        validateResources(xmlroot, relTo)
 
     # Conceal the first characters of UniqueIds if they are legal numbers that
     # would for some odd reason be considered invalid. Let all illegal
@@ -75,14 +74,9 @@ def validateFile(xmlfn, skipResources=False):
     if ':' in xmlfn:
         xmlfn = urlparse(xmlfn).path.strip()
     with open(xmlfn, 'r') as xmlfile:
-        #root = etree.parse(xmlfile)
         root = ET.parse(xmlfile).getroot()
-        return validateXml(root, skipResources=skipResources)
-
-def validateString(xmlstring, relTo='.'):
-    #root = etree.parse(xmlfile)
-    root = ET.fromstring(xmlstring)
-    validateResources(root, relTo=relTo)
-    #return validateXml(root)
-
+        return validateXml(root, skipResources=skipResources,
+                           relTo=os.path.dirname(xmlfn))
 
+def validateString(xmlString, skipResources=False, relTo='.'):
+    validateXml(ET.fromstring(xmlString), skipResources, relTo)
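
validateResources now resolves relative ResourceIds against relTo (typically
the directory of the XML file) and against the current directory, so
validation no longer depends on the caller's working directory. A small
sketch, with a hypothetical dataset path:

    from pbcore.io.dataset.DataSetValidator import validateFile, validateString

    validateFile("/data/run01/movie.subreadset.xml")   # relTo becomes /data/run01
    with open("/data/run01/movie.subreadset.xml") as xml:
        validateString(xml.read(), relTo="/data/run01")
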
diff --git a/pbcore/io/dataset/DataSetWriter.py b/pbcore/io/dataset/DataSetWriter.py
index 976572d..6275b4f 100755
--- a/pbcore/io/dataset/DataSetWriter.py
+++ b/pbcore/io/dataset/DataSetWriter.py
@@ -158,9 +158,9 @@ def _toElementTree(dataSet, root=None, core=False):
         root = ET.Element(rootType, attribs)
 
     _addExternalResourcesElement(dataSet, root, core)
-    _addDataSetMetadataElement(dataSet, root)
     _addFiltersElement(dataSet, root)
     _addDataSetsElement(dataSet, root)
+    _addDataSetMetadataElement(dataSet, root)
     xsi = "{http://www.w3.org/2001/XMLSchema-instance}"
     # The ElementTree element dictionary doesn't quite work the same as a
     # regular dictionary, it seems, thus the convoluted get/set business
@@ -234,8 +234,15 @@ def _guessNs(tag):
 
 def _eleFromDictList(eleAsDict, core=False):
     """A last ditch capture method for uknown Elements"""
+    if eleAsDict['tag'] == 'DataSets':
+        print eleAsDict['namespace']
     if not eleAsDict['namespace']:
         eleAsDict['namespace'] = _guessNs(eleAsDict['tag'])
+    elif (eleAsDict['namespace'] ==
+            "http://pacificbiosciences.com/PacBioDataModel.xsd"):
+        newNamespace = _guessNs(eleAsDict['tag'])
+        if newNamespace != '':
+            eleAsDict['namespace'] = newNamespace
     if core:
         ele = ET.Element("{{{n}}}{t}".format(n=eleAsDict['namespace'],
                                              t=eleAsDict['tag']),
@@ -258,9 +265,7 @@ def _addFiltersElement(dataset, root, core=False):
         root: The root ElementTree object. Extended here using SubElement
     """
     if dataset.filters:
-        filters = ET.SubElement(root, 'Filters')
-        for child in dataset.filters.record['children']:
-            filters.append(_eleFromDictList(child, core))
+        root.append(_eleFromDictList(dataset.filters.record, core=core))
 
 def _addDataSetsElement(dataset, root):
     """Add DataSet Elements to root, which essentially nests ElementTrees.
@@ -269,9 +274,13 @@ def _addDataSetsElement(dataset, root):
         root: The root ElementTree object. Extended here using SubElement
     """
     if dataset.subdatasets:
-        dse = ET.SubElement(root, 'DataSets')
+        rootType = '{{{n}}}{t}'.format(n=namespaces()['pbds'],
+                                   t='DataSets')
+        dse = ET.SubElement(root, rootType)
         for subSet in dataset.subdatasets:
-            subSetRoot = ET.SubElement(dse, subSet.__class__.__name__,
+            rootType = '{{{n}}}{t}'.format(n=namespaces()['pbds'],
+                                           t=subSet.__class__.__name__)
+            subSetRoot = ET.SubElement(dse, rootType,
                                        subSet.objMetadata)
             _toElementTree(subSet, subSetRoot)
 
diff --git a/pbcore/io/dataset/EntryPoints.py b/pbcore/io/dataset/EntryPoints.py
index 47aede4..cdf02c7 100755
--- a/pbcore/io/dataset/EntryPoints.py
+++ b/pbcore/io/dataset/EntryPoints.py
@@ -2,7 +2,9 @@
 
 import os
 import argparse
+from collections import defaultdict
 from pbcore.io import DataSet, ContigSet, openDataSet
+from pbcore.io.dataset.DataSetMembers import Filters
 from pbcore.io.dataset.DataSetValidator import validateFile
 import logging
 
@@ -24,7 +26,13 @@ def summarize_options(parser):
 def createXml(args):
     dsTypes = DataSet.castableTypes()
     dset = dsTypes[args.dsType](*args.infile, strict=args.strict,
-                                skipCounts=args.skipCounts)
+                                skipCounts=args.skipCounts,
+                                generateIndices=args.generateIndices)
+    if args.generateIndices:
+        # we generated the indices with the last open, let's capture them with
+        # this one:
+        dset = dsTypes[args.dsType](*args.infile, strict=args.strict,
+                                    skipCounts=args.skipCounts)
     log.debug("Dataset created")
     dset.write(args.outfile, validate=args.novalidate, modPaths=True,
                relPaths=args.relative)
@@ -42,6 +50,8 @@ def create_options(parser):
                         help="The fofn or BAM file(s) to make into an XML")
     parser.add_argument("--type", type=str, default='DataSet',
                         dest='dsType', help="The type of XML to create")
+    parser.add_argument("--generateIndices", action='store_true',
+                        default=False, help="Generate indices for the input files")
     parser.add_argument("--novalidate", action='store_false', default=True,
                         help=("Don't validate the resulting XML, don't touch "
                               "paths"))
@@ -51,34 +61,38 @@ def create_options(parser):
     parser.set_defaults(func=createXml)
 
 def filterXml(args):
-    log.error("Adding filters via CLI is temporarily out of order")
-    exit(1)
     if args.infile.endswith('xml'):
         dataSet = openDataSet(args.infile, strict=args.strict)
-        filters = []
+        filters = defaultdict(list)
         separators = ['<=', '>=', '!=', '==', '>', '<', '=']
         for filt in args.filters:
             for sep in separators:
                 if sep in filt:
                     param, condition = filt.split(sep)
-                    condition = sep + condition
-                    filters[param] = condition
+                    condition = (sep, condition)
+                    filters[param].append(condition)
                     break
-        dataSet.addFilters([filters])
+        dataSet.filters.addRequirement(**filters)
+        dataSet.updateCounts()
         log.info("{i} filters added".format(i=len(filters)))
         dataSet.write(args.outfile)
     else:
         raise IOError("No files found/found to be compatible")
 
 def filter_options(parser):
-    parser.description = 'Add filters to an XML file'
+    pbiFilterOptions = set(Filters()._pbiMappedVecAccMap({}).keys())
+    bamFilterOptions = set(Filters()._bamAccMap.keys())
+    parser.description = ('Add filters to an XML file. Suggested fields: '
+                          '{f}. More expensive fields: {b}'.format(
+        f=sorted(list(pbiFilterOptions)),
+        b=sorted(list(bamFilterOptions - pbiFilterOptions))))
     #parser.add_argument("infile", type=validate_file,
     parser.add_argument("infile", type=str,
                         help="The xml file to filter")
     parser.add_argument("outfile", type=str, help="The resulting xml file")
     parser.add_argument("filters", type=str, nargs='+',
                         help="The values and thresholds to filter (e.g. "
-                        "rq>0.85)")
+                        "'rq>0.85')")
     parser.set_defaults(func=filterXml)
 
 def splitXml(args):
@@ -207,7 +221,8 @@ def consolidateXml(args):
     """Combine BAMs and apply the filters described in the XML file, producing
     one consolidated XML"""
     dset = openDataSet(args.infile)
-    dset.consolidate(args.datafile, numFiles=args.numFiles)
+    dset.consolidate(args.datafile, numFiles=args.numFiles, useTmp=(not
+                     args.noTmp))
     dset.write(args.xmlfile)
 
 def consolidate_options(parser):
@@ -215,6 +230,9 @@ def consolidate_options(parser):
     #parser.add_argument("infile", type=validate_file,
     parser.add_argument("--numFiles", type=int, default=1,
                         help="The number of data files to produce (1)")
+    parser.add_argument("--noTmp", default=False, action='store_true',
+                        help="Don't copy to a tmp location to ensure local"
+                             " disk use")
     parser.add_argument("infile", type=str,
                         help="The XML file to consolidate")
     parser.add_argument("datafile", type=str,
diff --git a/pbcore/io/dataset/_pbds.py b/pbcore/io/dataset/_pbds.py
index a027058..34d0e05 100755
--- a/pbcore/io/dataset/_pbds.py
+++ b/pbcore/io/dataset/_pbds.py
@@ -1632,7 +1632,7 @@ CTD_ANON_._Automaton = _BuildAutomaton_()
 
 
 
-DataSetMetadataType._AddElement(pyxb.binding.basis.element(pyxb.namespace.ExpandedName(Namespace, 'TotalLength'), pyxb.binding.datatypes.int, scope=DataSetMetadataType, location=pyxb.utils.utility.Location('/tmp/tmpoNuZaMxsds/PacBioDatasets.xsd', 52, 3)))
+DataSetMetadataType._AddElement(pyxb.binding.basis.element(pyxb.namespace.ExpandedName(Namespace, 'TotalLength'), pyxb.binding.datatypes.long, scope=DataSetMetadataType, location=pyxb.utils.utility.Location('/tmp/tmpoNuZaMxsds/PacBioDatasets.xsd', 52, 3)))
 
 DataSetMetadataType._AddElement(pyxb.binding.basis.element(pyxb.namespace.ExpandedName(Namespace, 'NumRecords'), pyxb.binding.datatypes.int, scope=DataSetMetadataType, location=pyxb.utils.utility.Location('/tmp/tmpoNuZaMxsds/PacBioDatasets.xsd', 53, 3)))
 
diff --git a/pbcore/io/dataset/utils.py b/pbcore/io/dataset/utils.py
index 05fccb4..acf131e 100755
--- a/pbcore/io/dataset/utils.py
+++ b/pbcore/io/dataset/utils.py
@@ -5,13 +5,28 @@ import os
 import tempfile
 import logging
 import json
+import shutil
+import pysam
 from pbcore.util.Process import backticks
 
 log = logging.getLogger(__name__)
 
-def consolidateBams(inFiles, outFile, filterDset=None):
+def consolidateBams(inFiles, outFile, filterDset=None, useTmp=True):
     """Take a list of infiles, an outFile to produce, and optionally a dataset
     (filters) to provide the definition and content of filtrations."""
+    # check space available
+    if useTmp:
+        tmpout = tempfile.mkdtemp(suffix="consolidation-filtration")
+        if (disk_free(os.path.split(outFile)[0]) >
+                sum(file_size(infn) for infn in inFiles)):
+            tmpInFiles = _tmpFiles(inFiles, tmpout)
+            origOutFile = outFile
+            origInFiles = inFiles[:]
+            inFiles = tmpInFiles
+            outFile = os.path.join(tmpout, "outfile.bam")
+        else:
+            useTmp = False
+
     if filterDset and filterDset.filters:
         finalOutfile = outFile
         outFile = _infixFname(outFile, "_unfiltered")
@@ -21,6 +36,29 @@ def consolidateBams(inFiles, outFile, filterDset=None):
         outFile = finalOutfile
     _indexBam(outFile)
     _pbindexBam(outFile)
+    if useTmp:
+        shutil.copy(outFile, origOutFile)
+        shutil.copy(outFile + ".bai", origOutFile + ".bai")
+        shutil.copy(outFile + ".pbi", origOutFile + ".pbi")
+
+def _tmpFiles(inFiles, tmpout=None):
+    tmpInFiles = []
+    if tmpout is None:
+        tmpout = tempfile.mkdtemp(suffix="consolidation-filtration")
+    for i, fname in enumerate(inFiles):
+        newfn = _infixFname(os.path.join(tmpout, os.path.basename(fname)), i)
+        shutil.copy(fname, newfn)
+        tmpInFiles.append(newfn)
+    return tmpInFiles
+
+def disk_free(path):
+    if path == '':
+        path = os.getcwd()
+    space = os.statvfs(path)
+    return space.f_bavail * space.f_frsize
+
+def file_size(path):
+    return os.stat(path).st_size
 
 def _pbindexBam(fname):
     cmd = "pbindex {i}".format(i=fname)
@@ -36,30 +74,13 @@ def _sortBam(fname):
     o, r, m = backticks(cmd)
     if r != 0:
         raise RuntimeError(m)
-    _mvFile(tmpname, fname)
-
-def _mvFile(inFname, outFname):
-    cmd = "mv {i} {o}".format(i=inFname,
-                              o=outFname)
-    log.info(cmd)
-    o, r, m = backticks(cmd)
-    if r != 0:
-        raise RuntimeError(m)
-
-def _cpFile(inFname, outFname):
-    cmd = "cp {i} {o}".format(i=inFname,
-                              o=outFname)
-    log.info(cmd)
-    o, r, m = backticks(cmd)
-    if r != 0:
-        raise RuntimeError(m)
+    shutil.move(tmpname, fname)
 
 def _indexBam(fname):
-    cmd = "samtools index {i}".format(i=fname)
-    log.info(cmd)
-    o, r, m = backticks(cmd)
-    if r != 0:
-        raise RuntimeError(m)
+    pysam.index(fname)
+
+def _indexFasta(fname):
+    pysam.faidx(fname)
 
 def _mergeBams(inFiles, outFile):
     if len(inFiles) > 1:
@@ -70,10 +91,7 @@ def _mergeBams(inFiles, outFile):
         if r != 0:
             raise RuntimeError(m)
     else:
-        _cpFile(inFiles[0], outFile)
-
-def _maxDigits(numItems):
-    return len(str(numItems))
+        shutil.copy(inFiles[0], outFile)
 
 def _filterBam(inFile, outFile, filterDset):
     tmpout = tempfile.mkdtemp(suffix="consolidation-filtration")
@@ -88,11 +106,12 @@ def _filterBam(inFile, outFile, filterDset):
 
 def _infixFname(fname, infix):
     prefix, extension = os.path.splitext(fname)
-    return prefix + infix + extension
+    return prefix + str(infix) + extension
 
 def _emitFilterScript(filterDset, filtScriptName):
-    bamtoolsFilters = ['reference', 'insertSize', 'mapQuality', 'name',
-                       'position', 'queryBases']
+    """Use the filter script feature of bamtools. Use with specific filters if
+    all that are needed are available, otherwise filter by readname (easy but
+    uselessly expensive)"""
     filterMap = {'rname': 'reference',
                  'pos': 'position',
                  'length': 'queryBases',
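
consolidateBams can now stage its inputs in a temporary directory when enough
disk space is available and copies the merged BAM together with its .bai and
.pbi back to the requested output path; BAM and FASTA indexing go through
pysam instead of shelling out. A minimal sketch, with hypothetical file names:

    from pbcore.io.dataset.utils import consolidateBams, disk_free

    free_bytes = disk_free("/scratch")   # f_bavail * f_frsize for that filesystem
    consolidateBams(["chunk0.bam", "chunk1.bam"], "merged.bam", useTmp=True)
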
diff --git a/tests/test_pbcore_extract_bam_sequence.py b/tests/test_pbcore_extract_bam_sequence.py
index cc516fd..ffd0399 100644
--- a/tests/test_pbcore_extract_bam_sequence.py
+++ b/tests/test_pbcore_extract_bam_sequence.py
@@ -14,11 +14,11 @@ import shutil
 import os
 
 sam_str_ = """\
-@HD	VN:1.5\tSO:coordinate\tpb:3.0b7
+@HD	VN:1.5\tSO:coordinate\tpb:3.0.1
 @SQ\tSN:genome\tLN:28\tM5:734d5f3b2859595f4bd87a2fe6b7389b
 @RG\tID:1\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;Ipd=ip;BASECALLERVERSION=2.0.1.0.123678;FRAMERATEHZ=75.000000;BINDINGKIT=foo;SEQUENCINGKIT=bar\tPU:movie1
 @PG\tID:bax2bam-0.0.2\tPN:bax2bam\tVN:0.0.2\tDS:bax2bam
-movie1/54130/0_10\t16\tgenome\t12\t30\t10=\t*\t0\t-28\tTCTCAGGAGA\t*\tRG:Z:1\tdq:Z:2222'$22'2\tdt:Z:NNNNAGNNGN\tip:B:C,255,2,0,10,22,34,0,2,3,0,16\tiq:Z:(+#1'$#*1&\tmq:Z:&1~51*5&~2\tnp:i:1\tqe:i:10\tqs:i:0\trq:i:854\tsn:B:f,-1,-1,-1,-1\tsq:Z:<32<4<<<<3\tzm:i:54130\tAS:i:-3020\tNM:i:134\tcx:i:2"""
+movie1/54130/0_10\t16\tgenome\t12\t30\t10=\t*\t0\t-28\tTCTCAGGAGA\t*\tRG:Z:1\tdq:Z:2222'$22'2\tdt:Z:NNNNAGNNGN\tip:B:C,255,2,0,10,22,34,0,2,3,0,16\tiq:Z:(+#1'$#*1&\tmq:Z:&1~51*5&~2\tnp:i:1\tqe:i:10\tqs:i:0\trq:f:0.854\tsn:B:f,-1,-1,-1,-1\tsq:Z:<32<4<<<<3\tzm:i:54130\tAS:i:-3020\tNM:i:134\tcx:i:2"""
 
 fasta_str = """\
 >genome
diff --git a/tests/test_pbcore_io_AlnFileReaders.py b/tests/test_pbcore_io_AlnFileReaders.py
index cac5d36..7d3bb97 100644
--- a/tests/test_pbcore_io_AlnFileReaders.py
+++ b/tests/test_pbcore_io_AlnFileReaders.py
@@ -2,9 +2,12 @@ from numpy.testing import (assert_array_almost_equal as ASIM,
                            assert_array_equal        as AEQ)
 from nose.tools import (nottest,
                         assert_raises,
-                        assert_equal as EQ)
+                        assert_equal as EQ,
+                        assert_almost_equal as EQISH)
 from nose import SkipTest
 
+import tempfile
+import pysam
 import numpy as np
 import bisect
 import h5py
@@ -407,10 +410,10 @@ class TestBasicBam(_BasicAlnFileReaderTests):
     CONSTRUCTOR_ARGS   = (data.getBamAndCmpH5()[0], data.getLambdaFasta())
 
     def testSpecVersion(self):
-        EQ("3.0b7",     self.f.version)
+        EQ("3.0.1",     self.f.version)
 
     def testReadScore(self):
-        EQ(0.904, self.fwdAln.readScore)
+        EQISH(0.904, self.fwdAln.readScore, 3)
 
 
 class TestIndexedBam(_IndexedAlnFileReaderTests):
@@ -439,3 +442,33 @@ class TestCCSBam(object):
                 aln.qStart
             with assert_raises(UnavailableFeature):
                 aln.qEnd
+
+
+class TestMultipleReadGroups(object):
+    """
+    Verify that BAM files with multiple read groups are handled sensibly - see
+    bug 26548.
+    """
+    SAM_IN = """\
+@HD\tVN:1.5\tSO:coordinate\tpb:3.0.1
+@SQ\tSN:ecoliK12_pbi_March2013_2955000_to_2980000\tLN:25000\tM5:734d5f3b2859595f4bd87a2fe6b7389b
+@RG\tID:3f58e5b8\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;Ipd:CodecV1=ip;BASECALLERVERSION=2.1;FRAMERATEHZ=75.000000;BINDINGKIT=100356300;SEQUENCINGKIT=100356200\tPU:movie1
+@RG\tID:b5482b33\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;Ipd:CodecV1=ip;BINDINGKIT=100356300;SEQUENCINGKIT=100356200;BASECALLERVERSION=2.1;FRAMERATEHZ=75.000000\tPU:m140906_231018_42161_c100676332550000001823129611271486_s1_p0
+movie1/54130/0_10\t2\tecoliK12_pbi_March2013_2955000_to_2980000\t2\t10\t10=\t*\t0\t0\tAATGAGGAGA\t*\tRG:Z:3f58e5b8\tdq:Z:2222'$22'2\tdt:Z:NNNNAGNNGN\tip:B:C,255,2,0,10,22,34,0,2,3,0,16\tiq:Z:(+#1'$#*1&\tmq:Z:&1~51*5&~2\tnp:i:1\tqe:i:10\tqs:i:0\trq:f:0.854\tsn:B:f,2,2,2,2\tsq:Z:<32<4<<<<3\tzm:i:54130\tAS:i:-3020\tNM:i:134\tcx:i:2
+m140906_231018_42161_c100676332550000001823129611271486_s1_p0/1/10_20\t2\tecoliK12_pbi_March2013_2955000_to_2980000\t12\t10\t10=\t*\t0\t0\tAATGAGGAGA\t*\tRG:Z:b5482b33\tdq:Z:2222'$22'2\tdt:Z:NNNNAGNNGN\tip:B:C,255,2,0,10,22,34,0,2,3,0,16\tiq:Z:(+#1'$#*1&\tmq:Z:&1~51*5&~2\tnp:i:1\tqe:i:20\tqs:i:10\trq:f:0.854\tsn:B:f,2,2,2,2\tsq:Z:<32<4<<<<3\tzm:i:54130\tAS:i:-3020\tNM:i:134\tcx:i:2"""
+
+    def test_retrieve_read_group_properties(self):
+        f1 = tempfile.NamedTemporaryFile(suffix=".sam").name
+        f2 = tempfile.NamedTemporaryFile(suffix=".bam").name
+        with open(f1, "w") as f:
+            f.write(self.SAM_IN)
+        with pysam.AlignmentFile(f1) as sam_in:
+            with pysam.AlignmentFile(f2, 'wb', template=sam_in) as bam_out:
+                for aln in sam_in:
+                    bam_out.write(aln)
+        movie_names = []
+        with BamReader(f2) as bam_in:
+            for aln in bam_in:
+                EQ(aln.sequencingChemistry, "P6-C4")
+                movie_names.append(aln.movieName)
+        EQ(movie_names, ['movie1', 'm140906_231018_42161_c100676332550000001823129611271486_s1_p0'])
diff --git a/tests/test_pbcore_io_GffIO.py b/tests/test_pbcore_io_GffIO.py
index 8144910..d2e86b1 100644
--- a/tests/test_pbcore_io_GffIO.py
+++ b/tests/test_pbcore_io_GffIO.py
@@ -6,7 +6,7 @@ import os.path
 from nose.tools import assert_equal, assert_raises
 
 from pbcore.io import GffWriter, Gff3Record, GffReader
-from pbcore.io.GffIO import merge_gffs, sort_gff
+from pbcore.io.GffIO import merge_gffs, merge_gffs_sorted, sort_gff
 from pbcore import data
 
 class TestGff3Record:
@@ -109,11 +109,16 @@ class TestGffWriter:
 class TestGffSorting(unittest.TestCase):
     gff_data = ["""\
 ##gff-version 3
-##some random comment here
+##source ipdSummary
+##source-commandline ipdSummary etc.
+##sequence-region lambda_NEB3011 1 48502
 chr1\tkinModCall\tmodified_base\t32580\t32580\t32\t-\t.\tcoverage=94;context=AATGGCATCGTTCCGGTGGTGGGCGTTGATGGCTGGTCCCG;IPDRatio=1.75
 chr1\tkinModCall\tmodified_base\t32766\t32766\t42\t-\t.\tcoverage=170;context=GCTGGGAAGCTGGCTGAACGTGTCGGCATGGATTCTGTCGA;IPDRatio=1.70
 chr1\tkinModCall\tmodified_base\t32773\t32773\t54\t-\t.\tcoverage=154;context=AACGCTGGCTGGGAAGCTGGCTGAACGTGTCGGCATGGATT;IPDRatio=2.65""", """\
 ##gff-version 3
+##source ipdSummary
+##source-commandline ipdSummary etc.
+##sequence-region lambda_NEB3011 1 48502
 chr2\tkinModCall\tmodified_base\t1200\t1200\t47\t-\t.\tcoverage=109;context=ACTTTTCACGGTAGTTTTTTGCCGCTTTACCGCCCAGGCAC;IPDRatio=1.89
 chr2\tkinModCall\tmodified_base\t1786\t1786\t36\t-\t.\tcoverage=153;context=TCCCACGTCTCACCGAGCGTGGTGTTTACGAAGGTTTTACG;IPDRatio=1.67
 chr2\tkinModCall\tmodified_base\t1953\t1953\t39\t+\t.\tcoverage=148;context=AATGCGCGTATGGGGATGGGGGCCGGGTGAGGAAAGCTGGC;IPDRatio=1.86""", """\
@@ -152,8 +157,24 @@ chr1\tkinModCall\tmodified_base\t16348\t16348\t42\t-\t.\tcoverage=115;context=CC
             os.remove(cls.combined)
 
     def test_merge_gffs(self):
-        gff_out = "tmp_pbcore_merged.gff"
+        gff_out = "tmp_pbcore_merge.gff"
         merge_gffs(self.files, gff_out)
+        n_rec = 0
+        for fn in self.files:
+            with GffReader(fn) as f:
+                n_rec += len([ rec for rec in f ])
+        with GffReader(gff_out) as f:
+            self.assertEqual(f.headers, [
+                "##gff-version 3",
+                "##source ipdSummary",
+                "##sequence-region lambda_NEB3011 1 48502",
+            ])
+            n_rec_merged = len([ rec for rec in f ])
+            self.assertEqual(n_rec, n_rec_merged)
+
+    def test_merge_gffs_sorted(self):
+        gff_out = "tmp_pbcore_merged_sorted.gff"
+        merge_gffs_sorted(self.files, gff_out)
         with GffReader(gff_out) as f:
             start = [ (rec.seqid, rec.start) for rec in f ]
             self.assertEqual(start, self.sorted_start)
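
Both merge helpers take a list of input GFF files and an output path;
merge_gffs_sorted additionally sorts the combined records, and the merged
output keeps a single deduplicated set of ## headers, as the tests above
check. A small sketch with hypothetical file names:

    from pbcore.io.GffIO import merge_gffs, merge_gffs_sorted

    merge_gffs(["chr1.gff", "chr2.gff"], "merged.gff")
    merge_gffs_sorted(["chr1.gff", "chr2.gff"], "merged_sorted.gff")
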
diff --git a/tests/test_pbdataset.py b/tests/test_pbdataset.py
index 66ad971..5457906 100644
--- a/tests/test_pbdataset.py
+++ b/tests/test_pbdataset.py
@@ -7,8 +7,10 @@ import tempfile
 
 import numpy as np
 import unittest
+import shutil
 from unittest.case import SkipTest
 
+from pbcore.io import PacBioBamIndex, IndexedBamReader
 from pbcore.io import openIndexedAlignmentFile
 from pbcore.io import (DataSet, SubreadSet, ReferenceSet, AlignmentSet,
                        openDataSet, DataSetMetaTypes, HdfSubreadSet)
@@ -21,6 +23,34 @@ import pbcore.data.datasets as data
 
 log = logging.getLogger(__name__)
 
+def _check_constools():
+    cmd = "dataset.py"
+    o, r, m = backticks(cmd)
+    if r != 2:
+        return False
+
+    cmd = "bamtools"
+    o, r, m = backticks(cmd)
+    if r != 0:
+        return False
+
+    cmd = "pbindex"
+    o, r, m = backticks(cmd)
+    if r != 1:
+        return False
+
+    cmd = "samtools"
+    o, r, m = backticks(cmd)
+    if r != 1:
+        return False
+    return True
+
+def _internal_data():
+    if os.path.exists("/mnt/secondary-siv/testdata"):
+        return True
+    return False
+
+
 class TestDataSet(unittest.TestCase):
     """Unit and integrationt tests for the DataSet class and \
     associated module functions"""
@@ -39,7 +69,9 @@ class TestDataSet(unittest.TestCase):
         # e.g. d.write("alignmentset.xml")
         outdir = tempfile.mkdtemp(suffix="dataset-unittest")
         outXml = os.path.join(outdir, 'tempfile.xml')
-        d.write(outXml)
+        log.debug(outXml)
+        # don't validate; the base DataSet type doesn't validate well
+        d.write(outXml, validate=False)
         # And then recover the same XML (or a different one):
         # e.g. d = DataSet("alignmentset.xml")
         d = DataSet(outXml)
@@ -79,6 +111,8 @@ class TestDataSet(unittest.TestCase):
             ds.externalResources[-1].indices[0].resourceId ==
             "IdontExist.bam.pbi")
 
+    @unittest.skipIf(not _check_constools(),
+                     "bamtools or pbindex not found, skipping")
     def test_split_cli(self):
         outdir = tempfile.mkdtemp(suffix="dataset-unittest")
         cmd = "dataset.py split --outdir {o} --contigs --chunks 2 {d}".format(
@@ -92,6 +126,71 @@ class TestDataSet(unittest.TestCase):
         self.assertTrue(os.path.exists(
             os.path.join(outdir, os.path.basename(data.getXml(16)))))
 
+    @unittest.skipIf(not _check_constools(),
+                     "bamtools or pbindex not found, skipping")
+    def test_filter_cli(self):
+        outdir = tempfile.mkdtemp(suffix="dataset-unittest")
+        outfn = os.path.join(outdir, "filtered8.xml")
+        log.debug(outfn)
+        cmd = "dataset.py filter {i} {o} {f}".format(
+            i=data.getXml(8),
+            o=outfn,
+            f="rname=E.faecalis.1")
+        log.debug(cmd)
+        o, r, m = backticks(cmd)
+        if r != 0:
+            log.debug(m)
+        self.assertEqual(r, 0)
+        self.assertTrue(os.path.exists(outfn))
+        aln = AlignmentSet(data.getXml(8))
+        aln.filters.addRequirement(rname=[('=', 'E.faecalis.1')])
+        aln.updateCounts()
+        dset = AlignmentSet(outfn)
+        self.assertEqual(str(aln.filters), str(dset.filters))
+        self.assertEqual(aln.totalLength, dset.totalLength)
+        self.assertEqual(aln.numRecords, dset.numRecords)
+
+    def test_merge_subdatasets(self):
+        # from data file
+        ds1 = AlignmentSet(data.getBam(0))
+        self.assertEqual(len(ds1.subdatasets), 0)
+        ds2 = AlignmentSet(data.getBam(1))
+        self.assertEqual(len(ds2.subdatasets), 0)
+        merged = ds1 + ds2
+        self.assertEqual(len(merged.subdatasets), 2)
+        self.assertEqual(merged.subdatasets[0].toExternalFiles(),
+                         AlignmentSet(data.getBam(0)).toExternalFiles())
+        self.assertEqual(len(merged.subdatasets[0].toExternalFiles()), 1)
+        self.assertEqual(merged.subdatasets[1].toExternalFiles(),
+                         AlignmentSet(data.getBam(1)).toExternalFiles())
+        self.assertEqual(len(merged.subdatasets[1].toExternalFiles()), 1)
+
+        # from data set
+        ds1 = AlignmentSet(data.getXml(8))
+        self.assertEqual(len(ds1.subdatasets), 0)
+        ds2 = AlignmentSet(data.getXml(11))
+        self.assertEqual(len(ds2.subdatasets), 0)
+        merged = ds1 + ds2
+        self.assertEqual(len(merged.subdatasets), 2)
+        self.assertEqual(merged.subdatasets[0].toExternalFiles(),
+                         AlignmentSet(data.getXml(8)).toExternalFiles())
+        self.assertEqual(len(merged.subdatasets[0].toExternalFiles()), 1)
+        self.assertEqual(merged.subdatasets[1].toExternalFiles(),
+                         AlignmentSet(data.getXml(11)).toExternalFiles())
+        self.assertEqual(len(merged.subdatasets[1].toExternalFiles()), 1)
+
+        # combined data set
+        merged = AlignmentSet(data.getXml(8), data.getXml(11))
+        self.assertEqual(len(merged.subdatasets), 2)
+        self.assertEqual(len(merged.subdatasets[0].toExternalFiles()), 1)
+        self.assertEqual(merged.subdatasets[0].toExternalFiles(),
+                         AlignmentSet(data.getXml(8)).toExternalFiles())
+        self.assertEqual(len(merged.subdatasets[1].toExternalFiles()), 1)
+        self.assertEqual(merged.subdatasets[1].toExternalFiles(),
+                         AlignmentSet(data.getXml(11)).toExternalFiles())
+
+    @unittest.skipIf(not _check_constools(),
+                     "bamtools or pbindex not found, skipping")
     def test_create_cli(self):
         log.debug("Absolute")
         outdir = tempfile.mkdtemp(suffix="dataset-unittest")
@@ -214,6 +313,25 @@ class TestDataSet(unittest.TestCase):
         self.assertEqual(aln.totalLength, -1)
         self.assertEqual(aln.numRecords, -1)
 
+    # TODO: replace this with a reproducible bam file and move test upstream
+    @unittest.skip("Skip until suitable barcoded files found and updated")
+    def test_barcode_accession(self):
+        testFile = data.getBarcodedBam()
+        # Test the pbi file:
+        bam = IndexedBamReader(testFile)
+        pbi = PacBioBamIndex(testFile + '.pbi')
+        for brec, prec in zip(bam, pbi):
+            brec_bc = brec.peer.opt("bc")
+            prec_bc = [prec.bcLeft, prec.bcRight]
+            self.assertEqual(brec_bc, prec_bc)
+
+        # Test split by barcode:
+        ss = SubreadSet(testFile)
+        sss = ss.split(chunks=2, barcodes=True)
+        self.assertEqual(len(sss), 2)
+        for sset in sss:
+            self.assertTrue(len(sset.barcodes) > 1)
+
     def test_attributes(self):
         aln = AlignmentSet(data.getBam(0))
         self.assertEqual(aln.sequencingChemistry, ['unknown'])
@@ -375,6 +493,15 @@ class TestDataSet(unittest.TestCase):
             [('m150404_101626_42267_c100807920800000001823174110291514_s1_p0',
               55, 1815)])
 
+    @unittest.skipUnless(os.path.isdir("/mnt/secondary-siv/testdata"),
+                         "Missing testdata directory")
+    def test_large_pbi(self):
+        pbiFn = ('/mnt/secondary-siv/testdata/SA3-DS/lambda/simulated'
+                 '/100Gb/alnsubreads/pbalchemy_100Gb_Seq_sim1_p0.'
+                 'aligned.bam.pbi')
+        pbi = PacBioBamIndex(pbiFn)
+        self.assertFalse(pbi.aStart is None)
+
     def test_copy(self):
         ds1 = DataSet(data.getXml(12))
         ds2 = ds1.copy()
@@ -597,8 +724,9 @@ class TestDataSet(unittest.TestCase):
             num += 1
         self.assertTrue(num > 2000)
 
-    @unittest.skipUnless(os.path.isdir("/mnt/secondary/Share/Quiver"),
-                         "Missing testadata directory")
+    @unittest.skip("Skip until /mnt/secondary/Share/Quiver/ updated")
+    #@unittest.skipUnless(os.path.isdir("/mnt/secondary/Share/Quiver"),
+    #                     "Missing testdata directory")
     def test_reads_in_range_order_large(self):
         window = ('Staphylococcus_aureus_subsp_aureus_USA300_TCH1516',
                   558500,
@@ -811,7 +939,7 @@ class TestDataSet(unittest.TestCase):
         dss = ds3.split(contigs=True, chunks=3, updateCounts=True)
         self.assertEqual(len(dss), 3)
         sizes = sorted([dset.numRecords for dset in dss])
-        self.assertListEqual(sizes, [22, 31, 39])
+        self.assertListEqual(sizes, [20, 24, 48])
         refWindows = sorted(reduce(lambda x, y: x + y,
                                    [ds.refWindows for ds in dss]))
         for ref in random_few:
@@ -933,25 +1061,24 @@ class TestDataSet(unittest.TestCase):
         log.debug(dss[0].filters)
         log.debug(dss[1].filters)
         self.assertTrue(
-            '( rname = E.faecalis.2 ) '
+            '( rname = E.faecalis.2 '
             in str(dss[0].filters)
             or
-            '( rname = E.faecalis.2 ) '
+            '( rname = E.faecalis.2 '
             in str(dss[1].filters))
         ds = AlignmentSet(data.getBam())
-        ds.filters.addRequirement(rname=[('=', 'lambda_NEB3011'),
-                                         ('=', 'lambda_NEB3011')],
-                                  tStart=[('<', '0'),
-                                          ('<', '100')],
-                                  tEnd=[('>', '99'),
-                                        ('>', '299')])
+        ds.filters.addRequirement(rname=[('=', 'E.faecalis.2'),
+                                         ('=', 'E.faecalis.2')],
+                                  tStart=[('<', '99'),
+                                          ('<', '299')],
+                                  tEnd=[('>', '0'),
+                                        ('>', '100')])
         self.assertEqual(str(ds.filters),
-                         '( rname = lambda_NEB3011 AND tstart '
-                         '< 0 AND tend > 99 ) OR ( rname = lambd'
-                         'a_NEB3011 AND tstart < 100 AND tend > 299 )')
-        # TODO: look into swapping around refWindows to make this work:
-        #self.assertEqual(ds.refWindows, [('lambda_NEB3011', 0, 99),
-                                         #('lambda_NEB3011', 100, 299)])
+                         '( rname = E.faecalis.2 AND tstart '
+                         '< 99 AND tend > 0 ) OR ( rname = '
+                         'E.faecalis.2 AND tstart < 299 AND tend > 100 )')
+        self.assertEqual(ds.refWindows, [('E.faecalis.2', 0, 99),
+                                         ('E.faecalis.2', 100, 299)])
 
 
     def test_refLengths(self):
@@ -1064,6 +1191,28 @@ class TestDataSet(unittest.TestCase):
             for i, item in enumerate(aln):
                 self.assertEqual(item, aln[i])
 
+    @unittest.skipIf(not _check_constools(),
+                     "bamtools or pbindex not found, skipping")
+    def test_induce_indices(self):
+        # all of our test files are indexed. Copy just the main files to a temp
+        # location, open as dataset, assert unindexed, open with
+        # generateIndices=True, assert indexed
+        toTest = [8, 9, 10, 11, 12, 15, 16]
+        for fileNo in toTest:
+            outdir = tempfile.mkdtemp(suffix="dataset-unittest")
+            orig_dset = openDataSet(data.getXml(fileNo))
+            resfnames = orig_dset.toExternalFiles()
+            new_resfnames = []
+            for fname in resfnames:
+                newfname = os.path.join(outdir, os.path.basename(fname))
+                shutil.copy(fname, newfname)
+                new_resfnames.append(newfname)
+            dset = type(orig_dset)(*new_resfnames)
+            self.assertFalse(dset.isIndexed)
+            dset = type(orig_dset)(*new_resfnames, generateIndices=True)
+            self.assertTrue(dset.isIndexed)
+
+
     def test_reads_in_reference(self):
         ds = AlignmentSet(data.getBam())
         refNames = ds.refNames
diff --git a/tests/test_pbdataset_subtypes.py b/tests/test_pbdataset_subtypes.py
index 71d2869..3500d66 100644
--- a/tests/test_pbdataset_subtypes.py
+++ b/tests/test_pbdataset_subtypes.py
@@ -12,7 +12,7 @@ from pbcore.io import (DataSet, SubreadSet, ConsensusReadSet,
                        ReferenceSet, ContigSet, AlignmentSet,
                        FastaReader, FastaWriter, IndexedFastaReader,
                        HdfSubreadSet, ConsensusAlignmentSet,
-                       openDataFile)
+                       openDataFile, FastaWriter)
 import pbcore.data as upstreamData
 import pbcore.data.datasets as data
 from pbcore.io.dataset.DataSetValidator import validateXml
@@ -21,6 +21,11 @@ import xml.etree.ElementTree as ET
 log = logging.getLogger(__name__)
 
 def _check_constools():
+    cmd = "dataset.py"
+    o, r, m = backticks(cmd)
+    if r != 2:
+        return False
+
     cmd = "bamtools"
     o, r, m = backticks(cmd)
     if r != 0:
@@ -137,6 +142,22 @@ class TestDataSet(unittest.TestCase):
         self.assertEquals(type(ds2).__name__, 'ConsensusReadSet')
         self.assertEquals(type(ds2._metadata).__name__, 'SubreadSetMetadata')
 
+    def test_ccsset_from_bam(self):
+        # DONE bug 28698
+        ds1 = ConsensusReadSet(upstreamData.getCCSBAM(), strict=False)
+        fn = tempfile.NamedTemporaryFile(suffix=".consensusreadset.xml").name
+        log.debug(fn)
+        ds1.write(fn, validate=False)
+        ds1.write(fn)
+
+    def test_subreadset_from_bam(self):
+        # DONE control experiment for bug 28698
+        bam = upstreamData.getUnalignedBam()
+        ds1 = SubreadSet(bam, strict=False)
+        fn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
+        log.debug(fn)
+        ds1.write(fn)
+
     def test_ccsalignment_build(self):
         ds1 = ConsensusAlignmentSet(data.getXml(20), strict=False)
         self.assertEquals(type(ds1).__name__, 'ConsensusAlignmentSet')
@@ -146,6 +167,22 @@ class TestDataSet(unittest.TestCase):
         #self.assertEquals(type(ds2).__name__, 'ConsensusAlignmentSet')
         #self.assertEquals(type(ds2._metadata).__name__, 'SubreadSetMetadata')
 
+    def test_contigset_write(self):
+        fasta = upstreamData.getLambdaFasta()
+        ds = ContigSet(fasta)
+        self.assertTrue(isinstance(ds.resourceReaders()[0],
+                                   IndexedFastaReader))
+        outdir = tempfile.mkdtemp(suffix="dataset-unittest")
+        outfn = os.path.join(outdir, 'test.fasta')
+        w = FastaWriter(outfn)
+        for rec in ds:
+            w.writeRecord(rec)
+        w.close()
+        fas = FastaReader(outfn)
+        for rec in fas:
+            # make sure a __repr__ didn't slip through:
+            self.assertFalse(rec.sequence.startswith('<'))
+
     def test_file_factory(self):
         # TODO: add ConsensusReadSet, cmp.h5 alignmentSet
         types = [AlignmentSet(data.getXml(8)),
@@ -209,6 +246,13 @@ class TestDataSet(unittest.TestCase):
             self.assertEqual(read1, read2)
         self.assertEqual(len(aln), len(nonCons))
 
+        # Test that it is a valid xml:
+        outdir = tempfile.mkdtemp(suffix="dataset-unittest")
+        datafile = os.path.join(outdir, "apimerged.bam")
+        xmlfile = os.path.join(outdir, "apimerged.xml")
+        log.debug(xmlfile)
+        aln.write(xmlfile)
+
         log.debug("Test with cheap filter")
         aln = AlignmentSet(data.getXml(12))
         self.assertEqual(len(list(aln)), 177)
@@ -286,7 +330,7 @@ class TestDataSet(unittest.TestCase):
                     "7920800000001823174110291514_s1_p0."
                     "all.alignmentset.xml")
         aln = AlignmentSet(testFile)
-        nonCons= AlignmentSet(testFile)
+        nonCons = AlignmentSet(testFile)
         self.assertEqual(len(aln.toExternalFiles()), 3)
         outdir = tempfile.mkdtemp(suffix="dataset-unittest")
         outfn = os.path.join(outdir, 'merged.bam')
@@ -381,9 +425,39 @@ class TestDataSet(unittest.TestCase):
 
         # open acc and compare to exp
         for name, seq in zip(singletons, exp_single_seqs):
-            self.assertEqual(acc_file.get_contig(name).sequence, seq)
-        self.assertEqual(acc_file.get_contig(double).sequence, exp_double_seq)
+            self.assertEqual(acc_file.get_contig(name).sequence[:], seq)
+        self.assertEqual(acc_file.get_contig(double).sequence[:],
+                         exp_double_seq)
 
+    def test_contigset_consolidate_genomic_consensus(self):
+        """
+        Verify that the contigs output by GenomicConsensus (e.g. quiver) can
+        be consolidated.
+        """
+        FASTA1 = ("lambda_NEB3011_0_60",
+            "GGGCGGCGACCTCGCGGGTTTTCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCG")
+        FASTA2 = ("lambda_NEB3011_120_180",
+            "CACTGAATCATGGCTTTATGACGTAACATCCGTTTGGGATGCGACTGCCACGGCCCCGTG")
+        FASTA3 = ("lambda_NEB3011_60_120",
+            "GTGGACTCGGAGCAGTTCGGCAGCCAGCAGGTGAGCCGTAATTATCATCTGCGCGGGCGT")
+        files = []
+        for i, (header, seq) in enumerate([FASTA1, FASTA2, FASTA3]):
+            _files = []
+            for suffix in ["", "|quiver", "|plurality", "|arrow"]:
+                tmpfile = tempfile.NamedTemporaryFile(suffix=".fasta").name
+                with open(tmpfile, "w") as f:
+                    f.write(">{h}{s}\n{q}".format(h=header, s=suffix, q=seq))
+                _files.append(tmpfile)
+            files.append(_files)
+        for i in range(3):
+            ds = ContigSet(*[ f[i] for f in files ])
+            out1 = tempfile.NamedTemporaryFile(suffix=".contigset.xml").name
+            fa1 = tempfile.NamedTemporaryFile(suffix=".fasta").name
+            ds.consolidate(fa1)
+            ds.write(out1)
+            with ContigSet(out1) as ds_new:
+                self.assertEqual(len([ rec for rec in ds_new ]), 1,
+                                 "failed on %d" % i)
 
     def test_split_hdfsubreadset(self):
         hdfds = HdfSubreadSet(*upstreamData.getBaxH5_v23())

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/python-pbcore.git


