[med-svn] [python-pbh5tools] 01/02: Imported Upstream version 0.8.0

Afif Elghraoui afif-guest at moszumanska.debian.org
Sun Jun 7 10:16:33 UTC 2015


This is an automated email from the git hooks/post-receive script.

afif-guest pushed a commit to branch master
in repository python-pbh5tools.

commit 448056df96b586e654b4e754f9205c0b95a01848
Author: Afif Elghraoui <afif at ghraoui.name>
Date:   Sat Jun 6 18:51:46 2015 -0700

    Imported Upstream version 0.8.0
---
 .gitignore                                 |   5 +
 LICENSES.txt                               |  32 ++
 Makefile                                   |  52 +++
 README.md                                  |   6 +
 bin/bash5tools.py                          | 166 +++++++++
 bin/cmph5tools.py                          | 241 ++++++++++++
 doc/Makefile                               | 153 ++++++++
 doc/_static/.placeholder                   |   0
 doc/_templates/.placeholder                |   0
 doc/cmph5tools-examples.rst                | 127 +++++++
 doc/conf.py                                | 245 +++++++++++++
 doc/example-slides.html                    | 141 +++++++
 doc/examples.md                            | 209 +++++++++++
 doc/examples.t                             | 265 ++++++++++++++
 doc/index.rst                              | 147 ++++++++
 doc/pacbio-theme/static/headerGradient.jpg | Bin 0 -> 7099 bytes
 doc/pacbio-theme/static/pacbio.css         | 238 ++++++++++++
 doc/pacbio-theme/static/pacbioLogo.png     | Bin 0 -> 3128 bytes
 doc/pacbio-theme/static/pygments.css       |  55 +++
 doc/pacbio-theme/theme.conf                |   4 +
 etc/aligned_reads_ss.cmp.h5                | Bin 0 -> 9862067 bytes
 etc/grouped.csv                            |  35 ++
 pbh5tools/CmpH5Compare.py                  |  94 +++++
 pbh5tools/CmpH5Format.py                   |  76 ++++
 pbh5tools/CmpH5Merge.py                    | 338 +++++++++++++++++
 pbh5tools/CmpH5Select.py                   | 167 +++++++++
 pbh5tools/CmpH5Sort.py                     | 518 ++++++++++++++++++++++++++
 pbh5tools/CmpH5Stats.py                    |  79 ++++
 pbh5tools/CmpH5Utils.py                    |  85 +++++
 pbh5tools/Indexer.py                       |  64 ++++
 pbh5tools/Metrics.py                       | 567 +++++++++++++++++++++++++++++
 pbh5tools/PBH5ToolsException.py            |  37 ++
 pbh5tools/__init__.py                      |   0
 pbh5tools/_version.py                      |  31 ++
 pbh5tools/cbook.py                         |  86 +++++
 pbh5tools/ci.c                             | 112 ++++++
 pbh5tools/mlab.py                          | 390 ++++++++++++++++++++
 setup.py                                   |  37 ++
 tests/cram/bash5tools.t                    | 312 ++++++++++++++++
 tests/cram/groupcsv.t                      |  47 +++
 tests/cram/merge.t                         |  27 ++
 tests/cram/portability.sh                  |   4 +
 tests/cram/select.t                        |  49 +++
 tests/cram/sort-extended.t                 |  29 ++
 tests/cram/sort.t                          |  21 ++
 tests/cram/stats.t                         | 221 +++++++++++
 tests/cram/valid.t                         |   2 +
 tests/test_cmph5lib_CmpH5Merge.py          | 106 ++++++
 tests/test_cmph5lib_CmpH5Sort.py           | 122 +++++++
 49 files changed, 5742 insertions(+)

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..894b708
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,5 @@
+build/
+dist/
+doc/_build/
+*.egg-info
+*.pyc
diff --git a/LICENSES.txt b/LICENSES.txt
new file mode 100644
index 0000000..ccdee9e
--- /dev/null
+++ b/LICENSES.txt
@@ -0,0 +1,32 @@
+Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright
+  notice, this list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright
+  notice, this list of conditions and the following disclaimer in the
+  documentation and/or other materials provided with the distribution.
+
+* Neither the name of Pacific Biosciences nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+GRANTED BY THIS LICENSE.  THIS SOFTWARE IS PROVIDED BY PACIFIC
+BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS CONTRIBUTORS
+BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
+IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..c8f0ed5
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,52 @@
+.PHONY: doc
+SHELL = /bin/bash -e
+
+all: build install
+
+build:
+	python setup.py build --executable="/usr/bin/env python"
+
+bdist:
+	python setup.py build --executable="/usr/bin/env python"
+	python setup.py bdist --formats=egg
+
+install:
+	python setup.py install
+
+develop:
+	python setup.py develop
+
+test: examples
+	find tests -name "*.py" | xargs nosetests
+	find tests/cram -name "*.t" | xargs cram
+
+doc:
+	sphinx-apidoc -T -f -o doc pbh5tools/ && cd doc && make html
+
+examples:
+	cram doc/examples.t
+	sed 's/^  /    /' doc/examples.t > doc/examples.md
+
+example-rendered: examples
+	pandoc doc/examples.md -o doc/examples.html
+	pandoc -s -S -i -t slidy --mathjax doc/examples.md -o doc/example-slides.html
+
+doc-clean:
+	cd doc && rm -rf modules.rst pbtools.* bash5lib.* cmph5tools.* \
+	bash5tools.* _templates _static _build searchindex.js objects.inv
+
+clean: doc-clean
+	rm -rf build/;\
+	find . -name "*.egg-info" | xargs rm -rf;\
+	find . -name "*.pyc" | xargs rm -rf;\
+	rm -rf dist/
+	rm -f nosetests.xml
+
+pip-install:
+	@which pip > /dev/null
+	@pip freeze|grep 'pbh5tools=='>/dev/null \
+      && ( pip uninstall -y pbh5tools \
+        || pip uninstall -y pbtools.pbh5tools ) \
+      || true
+	@pip install --no-index \
+          --install-option="--install-scripts=$(PREFIX)/bin" \
+          ./
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..5177e2b
--- /dev/null
+++ b/README.md
@@ -0,0 +1,6 @@
+
+``pbh5tools`` -- tools for manipulating HDF5 files produced by Pacific
+Biosciences. Specifically, this package provides functionality for
+manipulating and extracting data from "cmp.h5" and "bas.h5" files.
+
+[Detailed documentation here](doc/index.rst).
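+
+A minimal usage sketch (hypothetical filenames; flags as defined by the
+bundled scripts):
+
+    # extract subreads from a bas.h5 file as FASTQ
+    bash5tools.py --readType subreads --outType fastq movie.bas.h5
+
+    # merge two cmp.h5 files, then deep-sort the result
+    cmph5tools.py merge --outFile merged.cmp.h5 a.cmp.h5 b.cmp.h5
+    cmph5tools.py sort --deep merged.cmp.h5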
diff --git a/bin/bash5tools.py b/bin/bash5tools.py
new file mode 100644
index 0000000..befe33b
--- /dev/null
+++ b/bin/bash5tools.py
@@ -0,0 +1,166 @@
+#!/usr/bin/env python
+#################################################################################
+# Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+#   notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+#   notice, this list of conditions and the following disclaimer in the
+#   documentation and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+# THIS LICENSE.  THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#################################################################################
+import os, os.path, sys, argparse, logging
+
+from pbcore.util.ToolRunner import PBToolRunner
+from pbcore.io import (BasH5Reader,
+                       FastaWriter,
+                       FastqWriter)
+
+from pbh5tools._version import __version__
+
+def _fileType(arg):
+    """
+    Canonicalize the given filetype argument
+    """
+    if   arg in ["fa", "fasta", "FASTA"]: return "fasta"
+    elif arg in ["fq", "fastq", "FASTQ"]: return "fastq"
+    else: raise ValueError("Unsupported output file format")
+
+
+class FastaEmitter(object):
+    def __init__(self, filename):
+        self.writer = FastaWriter(filename)
+
+    def emit(self, zmwRead):
+        self.writer.writeRecord(zmwRead.readName,
+                                zmwRead.basecalls())
+
+class FastqEmitter(object):
+    def __init__(self, filename):
+        self.writer = FastqWriter(filename)
+
+    def emit(self, zmwRead):
+        self.writer.writeRecord(zmwRead.readName,
+                                zmwRead.basecalls(),
+                                zmwRead.QualityValue())
+
+class BasH5ToolsRunner(PBToolRunner):
+
+    def __init__(self):
+        desc = "Tool for extracting data from .bas.h5 files"
+        super(BasH5ToolsRunner, self).__init__(desc)
+
+        self.parser.add_argument(
+            "inFile", metavar="input.bas.h5",
+            help="input .bas.h5 filename")
+        self.parser.add_argument(
+            "--outFilePrefix", dest="outFilePrefix", default=None,
+            help="output filename prefix [%(default)s]")
+        self.parser.add_argument(
+            "--readType", dest="readType", default="",
+            choices=["ccs", "subreads", "unrolled"],
+            help="read type (ccs, subreads, or unrolled) [%(default)s]")
+        self.parser.add_argument(
+            "--outType", dest="outType", default="fasta", type=_fileType,
+            help="output file type (fasta, fastq) [%(default)s]")
+
+        groupFilt = self.parser.add_argument_group("Read filtering arguments")
+        groupFilt.add_argument(
+            "--minLength", type=int, dest="minLength", default=0,
+            help="min read length [%(default)s]")
+        groupFilt.add_argument(
+            "--minReadScore", type=float, dest="minReadScore", default=0,
+            help="min read score, valid only with --readType={unrolled,subreads}  [%(default)s]")
+        groupFilt.add_argument(
+            "--minPasses", type=int, dest="minPasses", default=0,
+            help="min number of CCS passes, valid only with --readType=ccs [%(default)s]")
+
+    def getVersion(self):
+        return __version__
+
+    def validateArgs(self):
+        if not os.path.isfile(self.args.inFile):
+            self.parser.error("File %s does not exist!" % self.args.inFile)
+        if self.args.minReadScore > 1.0 or self.args.minReadScore < 0.0:
+            self.parser.error("Minimum read score needs to be > 0.0 and < 1.0")
+
+    def zmwReads(self, inBasH5, readType):
+        """
+        Extract all reads of the appropriate read type
+        """
+        for zmw in inBasH5:
+            if readType == "ccs":
+                r = zmw.ccsRead
+                if r: yield r
+            elif readType == "unrolled":
+                yield zmw.read()
+            else:
+                for r in zmw.subreads:
+                    yield r
+
+    def run(self):
+        inBasH5 = BasH5Reader(self.args.inFile)
+        
+        if not inBasH5.hasConsensusBasecalls and self.args.readType == "ccs":
+            print "Input file %s contains no CCS reads." % self.args.inFile
+            sys.exit(-1)
+        
+        if not inBasH5.hasRawBasecalls and self.args.readType in ["unrolled", "subreads"]:
+            print "Input file %s contains no %s reads" % (self.args.inFile,
+                                                          self.args.readType)
+            sys.exit(-1)
+
+        movieName = inBasH5.movieName
+        outFilePrefix = self.args.outFilePrefix or movieName
+        outFilename = "%s.%s" % (outFilePrefix, self.args.outType)
+
+        if self.args.outType == "fasta":
+            sink = FastaEmitter(outFilename)
+        elif self.args.outType == "fastq":
+            sink = FastqEmitter(outFilename)
+
+        if self.args.readType == '':
+            # choose based on file.
+            if inBasH5.hasRawBasecalls:
+                readType = 'subreads' 
+            elif inBasH5.hasConsensusBasecalls:
+                readType = 'ccs'
+            else:
+                print "Input bas.h5 file has neither CCS nor subread data"
+                sys.exit(-1)
+        else:
+            readType = self.args.readType
+
+        for zmwRead in self.zmwReads(inBasH5, readType):
+            zmw = zmwRead.zmw
+            #
+            # Emit read if filters pass
+            #
+            if ((readType != "ccs" or zmw.numPasses >= self.args.minPasses)    and
+                (readType == "ccs" or zmw.readScore >= self.args.minReadScore) and
+                (len(zmwRead) >= self.args.minLength)):
+
+                sink.emit(zmwRead)
+
+
+if __name__ == "__main__":
+    sys.exit(BasH5ToolsRunner().start())
diff --git a/bin/cmph5tools.py b/bin/cmph5tools.py
new file mode 100644
index 0000000..f62168d
--- /dev/null
+++ b/bin/cmph5tools.py
@@ -0,0 +1,241 @@
+#!/usr/bin/env python
+#################################################################################
+# Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+#   notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+#   notice, this list of conditions and the following disclaimer in the
+#   documentation and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+# THIS LICENSE.  THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#################################################################################
+
+import os
+import sys
+import argparse
+import logging
+import tempfile
+import shutil
+import pkg_resources
+
+from pbcore.util.ToolRunner import PBMultiToolRunner
+from pbh5tools.PBH5ToolsException import PBH5ToolsException
+
+from pbh5tools.CmpH5Select import cmpH5Select
+from pbh5tools.CmpH5Merge import cmpH5Merge
+from pbh5tools.CmpH5Sort import cmpH5Sort
+from pbh5tools.CmpH5Stats import cmpH5Stats
+from pbh5tools.CmpH5Compare import cmpH5Equal, cmpH5Summarize, cmpH5Validate
+from pbh5tools.Metrics import DocumentedMetric,DocumentedStatistic
+from pbh5tools._version import __version__
+
+class CmpH5ToolsRunner(PBMultiToolRunner):
+    def __init__(self):
+        desc = ['Toolkit for command-line tools associated with cmp.h5 file processing.',
+                'Notes: For all command-line arguments, default values are listed in [].']
+        super(CmpH5ToolsRunner, self).__init__('\n'.join(desc))
+        subparsers = self.subParsers
+
+        # select
+        desc = ['Create a new cmp.h5 file by selecting alignments.',
+                'Users can specify indices using the idx argument to select',
+                'particular alignments.',
+                'Alternatively, users can specify a where expression that selects',
+                'the alignments for which the predicate is true.',
+                'If a groupBy expression is specified, then multiple cmp.h5 files are',
+                'generated according to the expression. For instance, to generate a',
+                'cmp.h5 file for each reference sequence, specify --groupBy=Reference.']
+
+        parser = subparsers.add_parser('select', help = 'Create new cmp.h5 files from selections of input.cmp.h5',
+                                       description = '\n'.join(desc))
+
+
+        parser.add_argument('inCmp', metavar='input.cmp.h5')
+        parser.add_argument('--outFile',
+                            default = "out.cmp.h5",
+                            dest='outCmp', metavar='out.cmp.h5',
+                            help = "Either a pattern string or a filename")
+        parser.add_argument('--idxs', metavar='N', type=int, nargs='+',
+                            help='indices to select')
+        parser.add_argument('--groupBy', metavar='groupBy-expression',
+                            type = str, help='groupBy expression, e.g., Movie*Barcode')
+        parser.add_argument('--groupByCsv', metavar='groupByCsv',
+                            type = str, help='groupByCsv file, e.g. Group,Movie,Barcode\\ngroupname,movie,barcode')
+        parser.add_argument('--where', metavar='where-expression',
+                            type = str, help='where expression, e.g., ReadLength > 500')
+        parser.add_argument('--outDir', metavar='outputDir',
+                            type = str, default = ".")
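+
+        # Illustrative select invocations (hypothetical filenames):
+        #   cmph5tools.py select --idxs 1 2 3 --outFile subset.cmp.h5 input.cmp.h5
+        #   cmph5tools.py select --where "ReadLength > 500" \
+        #       --groupBy Reference --outDir byRef input.cmp.h5
+        # The latter writes one cmp.h5 per reference sequence into byRef/.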
+
+        # merge
+        desc = ['Merge two or more cmp.h5 files. The input.cmp.h5 files must have',
+                'been aligned to the same reference sequences.']
+        parser = subparsers.add_parser('merge',
+                                       help = 'Merge input.cmp.h5 files into out.cmp.h5',
+                                       description='\n'.join(desc))
+        parser.add_argument('--outFile',
+                            dest='outCmp', default='out.cmp.h5',
+                            help='output filename [%(default)s]')
+        parser.add_argument("--referencesFile", '-W',
+                            default=None)
+        parser.add_argument('inCmps', metavar='input.cmp.h5', nargs='+',
+                            help='input filenames')
+
+        # sort
+        desc = ['Sort cmp.h5 files. If the output file is unspecified, the input file is',
+                'overwritten.']
+        parser= subparsers.add_parser('sort',
+                                         help='Sort input.cmp.h5 file',
+                                         description='\n'.join(desc))
+        parser.add_argument('inCmp', metavar='input.cmp.h5',
+                              help='input filename')
+        parser.add_argument('--outFile', dest='outCmp',
+                               help='output filename')
+        parser.add_argument('--deep', dest='deepsort', action='store_true',
+                              help='whether to perform a deep sort, i.e., also sort the ' +
+                              'AlignmentArrays [%(default)s]')
+        parser.add_argument('--tmpDir', dest='tmpdir', default='/tmp',
+                              help='temporary directory to use when sorting in-place [%(default)s]')
+        parser.add_argument('--usePythonIndexer', dest='usePythonIndexer', default = False,
+                              action = 'store_true',
+                              help='Use the pure-Python indexer instead of the native one [%(default)s].')
+        parser.add_argument('--inPlace', dest='inPlace', default = False, action = 'store_true',
+                              help = 'Whether to sort the cmp.h5 file in place, without first' +
+                              ' making a temporary copy.')
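+
+        # Illustrative sort invocations (hypothetical filenames):
+        #   cmph5tools.py sort --outFile sorted.cmp.h5 input.cmp.h5
+        #   cmph5tools.py sort --deep --tmpDir /scratch input.cmp.h5  # overwrites input
+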
+        # equal
+        desc = ['Compare two cmp.h5 files for equivalence.']
+        parser = subparsers.add_parser('equal',
+                                         help='Compare two cmp.h5 files for equivalence',
+                                         description='\n'.join(desc))
+        parser.add_argument('inCmp1', metavar='cmp.h5.1', help='filename 1')
+        parser.add_argument('inCmp2', metavar='cmp.h5.2', help='filename 2')
+
+        # summarize
+        desc = ['Summarize cmp.h5 files.']
+        parser = subparsers.add_parser('summarize',
+                                         help='Summarize contents of cmp.h5 files',
+                                         description='\n'.join(desc))
+        parser.add_argument('inCmps', metavar='input.cmp.h5', nargs='+',
+                            help='cmp.h5 files to summarize')
+
+        # stats
+        desc = ['Emit statistics from a cmp.h5 file.']
+        parser = subparsers.add_parser('stats',
+                                       help='Compute statistics from input.cmp.h5',
+                                       description='\n'.join(desc))
+        parser.add_argument('--outFile', dest='outCsv',
+                            help='output csv filename', default = None)
+        parser.add_argument('--what', metavar = 'what-expression',
+                            default = None)
+        parser.add_argument('--where', metavar = 'where-expression',
+                            default = None)
+        parser.add_argument('--groupBy', metavar='groupBy-expression',
+                            default = None)
+        parser.add_argument('--sortBy', metavar='sortBy-expression',
+                            default = None)
+        parser.add_argument('--limit', metavar='maximum-records', type=int,
+                            default = None)
+        parser.add_argument('inCmp', metavar='input.cmp.h5', help='input filename')
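+
+        # Illustrative stats invocation (hypothetical filename), in the spirit of
+        # the examples in doc/cmph5tools-examples.rst:
+        #   cmph5tools.py stats --what "Tbl(n = Count(Reference), m = Median(Accuracy))" \
+        #       --groupBy Movie --outFile stats.csv input.cmp.h5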
+
+        # listMetrics
+        desc = ["List available metrics and statistics for selection and stats."]
+        parser = subparsers.add_parser('listMetrics',
+                                       help = "List available metrics",
+                                       description = '\n'.join(desc))
+        parser.add_argument('--json', default = False, action = 'store_true',
+                            help = 'Should output be in JSON format')
+
+        # validate
+        desc = ['Validate a cmp.h5 file']
+        parser = subparsers.add_parser('validate',
+                                       help = 'Validate input.cmp.h5',
+                                       description = '\n'.join(desc))
+        parser.add_argument('inCmp', metavar = 'input.cmp.h5',
+                            help = 'input filename')
+
+
+    def getVersion(self):
+        return __version__
+
+    def run(self):
+        cmd = self.args.subCommand
+        try:
+            if cmd == 'merge':
+                cmpH5Merge(self.args.inCmps, self.args.outCmp, self.args.referencesFile)
+
+            elif cmd == 'sort':
+                cmpH5Sort(self.args.inCmp, self.args.outCmp, self.args.tmpdir,
+                          deep = self.args.deepsort,
+                          useNative = not self.args.usePythonIndexer,
+                          inPlace = self.args.inPlace)
+
+            elif cmd == 'select':
+                cmpH5Select(self.args.inCmp, self.args.outCmp,
+                            idxs = self.args.idxs, whereStr = self.args.where,
+                            groupByStr = self.args.groupBy,
+                            groupByCsv = self.args.groupByCsv,
+                            outDir = self.args.outDir)
+
+            elif cmd == 'stats':
+                cmpH5Stats(self.args.inCmp, 
+                           whatStr = self.args.what, 
+                           whereStr = self.args.where,
+                           groupByStr = self.args.groupBy, 
+                           sortByStr = self.args.sortBy,
+                           limit     = self.args.limit,
+                           outFile =  self.args.outCsv)
+
+            elif cmd == 'listMetrics':
+                print '--- Metrics:'
+                print "\t\n".join(DocumentedMetric.list())
+                print '\n--- Statistics:'
+                print "\t\n".join(DocumentedStatistic.list())
+
+            elif cmd == 'equal':
+                ret = cmpH5Equal(self.args.inCmp1, self.args.inCmp2)
+                if not ret[0]:
+                    print >> sys.stderr, ret[1]
+                    return 1
+                else:
+                    return 0
+
+            elif cmd == 'summarize':
+                for inCmp in self.args.inCmps:
+                    print "".join(["-"] * 40)
+                    print cmpH5Summarize(inCmp)
+
+            elif cmd == 'validate':
+                if cmpH5Validate(self.args.inCmp):
+                    return 0
+                else:
+                    return 1
+            else:
+                raise PBH5ToolsException("", "Unknown command passed to cmph5tools.py: " +
+                                         self.args.subCommand)
+            return 0
+
+        except PBH5ToolsException as pbe:
+            logging.exception(pbe)
+            return 1
+
+if __name__ == '__main__':
+    sys.exit(CmpH5ToolsRunner().start())
+
diff --git a/doc/Makefile b/doc/Makefile
new file mode 100644
index 0000000..f8541ff
--- /dev/null
+++ b/doc/Makefile
@@ -0,0 +1,153 @@
+# Makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line.
+SPHINXOPTS    =
+SPHINXBUILD   = sphinx-build
+PAPER         =
+BUILDDIR      = _build
+
+# Internal variables.
+PAPEROPT_a4     = -D latex_paper_size=a4
+PAPEROPT_letter = -D latex_paper_size=letter
+ALLSPHINXOPTS   = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
+# the i18n builder cannot share the environment and doctrees with the others
+I18NSPHINXOPTS  = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
+
+.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext
+
+help:
+	@echo "Please use \`make <target>' where <target> is one of"
+	@echo "  html       to make standalone HTML files"
+	@echo "  dirhtml    to make HTML files named index.html in directories"
+	@echo "  singlehtml to make a single large HTML file"
+	@echo "  pickle     to make pickle files"
+	@echo "  json       to make JSON files"
+	@echo "  htmlhelp   to make HTML files and a HTML help project"
+	@echo "  qthelp     to make HTML files and a qthelp project"
+	@echo "  devhelp    to make HTML files and a Devhelp project"
+	@echo "  epub       to make an epub"
+	@echo "  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
+	@echo "  latexpdf   to make LaTeX files and run them through pdflatex"
+	@echo "  text       to make text files"
+	@echo "  man        to make manual pages"
+	@echo "  texinfo    to make Texinfo files"
+	@echo "  info       to make Texinfo files and run them through makeinfo"
+	@echo "  gettext    to make PO message catalogs"
+	@echo "  changes    to make an overview of all changed/added/deprecated items"
+	@echo "  linkcheck  to check all external links for integrity"
+	@echo "  doctest    to run all doctests embedded in the documentation (if enabled)"
+
+clean:
+	-rm -rf $(BUILDDIR)/*
+
+html:
+	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
+	@echo
+	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
+
+dirhtml:
+	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
+	@echo
+	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
+
+singlehtml:
+	$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
+	@echo
+	@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
+
+pickle:
+	$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
+	@echo
+	@echo "Build finished; now you can process the pickle files."
+
+json:
+	$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
+	@echo
+	@echo "Build finished; now you can process the JSON files."
+
+htmlhelp:
+	$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
+	@echo
+	@echo "Build finished; now you can run HTML Help Workshop with the" \
+	      ".hhp project file in $(BUILDDIR)/htmlhelp."
+
+qthelp:
+	$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
+	@echo
+	@echo "Build finished; now you can run "qcollectiongenerator" with the" \
+	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
+	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/pbh5tools.qhcp"
+	@echo "To view the help file:"
+	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/pbh5tools.qhc"
+
+devhelp:
+	$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
+	@echo
+	@echo "Build finished."
+	@echo "To view the help file:"
+	@echo "# mkdir -p $$HOME/.local/share/devhelp/pbh5tools"
+	@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/pbh5tools"
+	@echo "# devhelp"
+
+epub:
+	$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
+	@echo
+	@echo "Build finished. The epub file is in $(BUILDDIR)/epub."
+
+latex:
+	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+	@echo
+	@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
+	@echo "Run \`make' in that directory to run these through (pdf)latex" \
+	      "(use \`make latexpdf' here to do that automatically)."
+
+latexpdf:
+	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+	@echo "Running LaTeX files through pdflatex..."
+	$(MAKE) -C $(BUILDDIR)/latex all-pdf
+	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
+
+text:
+	$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
+	@echo
+	@echo "Build finished. The text files are in $(BUILDDIR)/text."
+
+man:
+	$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
+	@echo
+	@echo "Build finished. The manual pages are in $(BUILDDIR)/man."
+
+texinfo:
+	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
+	@echo
+	@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
+	@echo "Run \`make' in that directory to run these through makeinfo" \
+	      "(use \`make info' here to do that automatically)."
+
+info:
+	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
+	@echo "Running Texinfo files through makeinfo..."
+	make -C $(BUILDDIR)/texinfo info
+	@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
+
+gettext:
+	$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
+	@echo
+	@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
+
+changes:
+	$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
+	@echo
+	@echo "The overview file is in $(BUILDDIR)/changes."
+
+linkcheck:
+	$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
+	@echo
+	@echo "Link check complete; look for any errors in the above output " \
+	      "or in $(BUILDDIR)/linkcheck/output.txt."
+
+doctest:
+	$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
+	@echo "Testing of doctests in the sources finished, look at the " \
+	      "results in $(BUILDDIR)/doctest/output.txt."
diff --git a/doc/_static/.placeholder b/doc/_static/.placeholder
new file mode 100644
index 0000000..e69de29
diff --git a/doc/_templates/.placeholder b/doc/_templates/.placeholder
new file mode 100644
index 0000000..e69de29
diff --git a/doc/cmph5tools-examples.rst b/doc/cmph5tools-examples.rst
new file mode 100644
index 0000000..9311a75
--- /dev/null
+++ b/doc/cmph5tools-examples.rst
@@ -0,0 +1,127 @@
+############################
+cmph5tools.py Query Examples
+############################
+
+This interface is used to produce both data tables and new cmp.h5 files. The interface is meant to be somewhat similar to SQL.
+At the heart of the new tools is a small query language for extracting alignments and computing statistics over those alignments. 
+The three relevant clauses are: ``what``, ``where``, and ``groupBy``. 
+
+--------------------------------------------
+Example 1: Produce a sub-sampled cmp.h5 file
+--------------------------------------------
+
+Take 50% of the reads: ::
+
+    $ cmph5tools.py select --where "SubSample(rate=.5)" \
+    > --outFile ss.cmp.h5 aligned_reads.cmp.h5
+    
+-------------------------------------------
+Example 2: Produce per-Barcode cmp.h5 files
+-------------------------------------------
+
+Filter by AverageBarcodeScore: ::
+
+    $ cmph5tools.py select --where "AverageBarcodeScore >= 30" \
+    > --groupBy Barcode aligned_reads.cmp.h5
+   
+--------------------------------------------------
+Example 3: Produce tabular data from a cmp.h5 file
+--------------------------------------------------
+
+Grouped Statistics: :: 
+
+    $ cmph5tools.py stats --what "Tbl(q = Percentile(ReadLength, 90), m = Median(Accuracy))" \
+    > --groupBy Barcode aligned_reads.cmp.h5 | tail
+    bc_88--bc_88      486.40          0.91
+    bc_89--bc_89      561.00          0.91
+    bc_9--bc_9        479.80          0.90 
+    bc_90--bc_90      563.60          0.89
+    bc_91--bc_91      554.60          0.91
+    bc_92--bc_92      523.00          0.90
+    bc_93--bc_93      542.00          0.90
+    bc_94--bc_94      518.00          0.90
+    bc_95--bc_95      512.20          0.91
+    bc_96--bc_96      609.60          0.92
+
+------------------------------------------------------------------------------
+Example 4: Query the package to determine the available metrics and statistics
+------------------------------------------------------------------------------
+
+Metrics and Statistics: ::
+
+    $ cmph5tools.py listMetrics
+    --- Metrics:
+    ByFactor[metric, factor, statistic]	
+    _MoleculeReadStart	
+    _MinSubreadLength	
+    _MaxSubreadLength	
+    _UnrolledReadLength	
+    DefaultWhere	
+    DefaultGroupBy	
+    TemplateSpan
+	The number of template bases covered by the read	
+    ReadLength	
+    NErrors	
+    ReadDuration	
+    FrameRate	
+    IPD	
+    PulseWidth	
+    Movie	
+    Reference	
+    RefIdentifier	
+    HoleNumber	
+    ReadStart	 
+    ReadEnd	
+    TemplateStart	
+    TemplateEnd	
+    MoleculeId	
+    MoleculeName	
+    Strand	
+    AlignmentIdx	
+    Barcode	
+    AverageBarcodeScore	
+    MapQV	
+    WhiteList	
+    SubSample[rate, n]
+       	boolean vector with true occuring at rate rate or nreads = n
+ 
+    --- Statistics:
+    Min	
+    Max	
+    Sum	
+    Mean	
+    Median	
+    Count	
+    Percentile[metric, ptile]	
+    Round[metric, digits]
+
+-----------------------------------
+Example 5: Familiar SQL-like syntax
+-----------------------------------
+
+Filter by barcode and group by reference: ::
+
+    $ cmph5tools.py stats --what "Tbl(a=Accuracy,b=Barcode)" \
+    > --where "Barcode == 'bc_78--bc_78'"  \
+    > --groupBy Reference aligned_reads.cmp.h5
+    Group                a          b
+    MET_600_t2_2      0.96          bc_78--bc_78          
+    MET_600_t2_2      0.82          bc_78--bc_78          
+    MET_600_t2_2      0.85          bc_78--bc_78          
+    MET_600_t2_2      0.89          bc_78--bc_78          
+    MET_600_t2_2      0.87          bc_78--bc_78          
+    MET_600_t2_2      0.90          bc_78--bc_78          
+    MET_600_t2_2      0.90          bc_78--bc_78          
+    MET_600_t2_2      0.94          bc_78--bc_78
+
+--------------------------------------
+Example 6: Familiar SQL-like functions
+--------------------------------------
+
+Count alignments: ::
+
+    $ cmph5tools.py stats --what "Count(Reference)" \
+    > --where "Barcode == 'bc_78--bc_78'" \
+    > --groupBy Reference aligned_reads.cmp.h5
+    Group             Count(Reference)
+    MET_600_t2_2                     8
diff --git a/doc/conf.py b/doc/conf.py
new file mode 100755
index 0000000..81d5f26
--- /dev/null
+++ b/doc/conf.py
@@ -0,0 +1,245 @@
+# -*- coding: utf-8 -*-
+#
+# pbh5tools documentation build configuration file, created by
+# sphinx-quickstart on Thu Nov 10 17:09:22 2011.
+#
+# This file is execfile()d with the current directory set to its containing dir.
+#
+# Note that not all possible configuration values are present in this
+# autogenerated file.
+#
+# All configuration values have a default; values that are commented out
+# serve to show the default.
+
+import sys, os
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#sys.path.insert(0, os.path.abspath('.'))
+
+# -- General configuration -----------------------------------------------------
+
+# If your documentation needs a minimal Sphinx version, state it here.
+#needs_sphinx = '1.0'
+
+# Add any Sphinx extension module names here, as strings. They can be extensions
+# coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
+extensions = ['sphinx.ext.autodoc', 'sphinx.ext.intersphinx', 'sphinx.ext.todo', 'sphinx.ext.coverage', 'sphinx.ext.viewcode']
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# The suffix of source filenames.
+source_suffix = '.rst'
+
+# The encoding of source files.
+#source_encoding = 'utf-8-sig'
+
+# The master toctree document.
+master_doc = 'index'
+
+# General information about the project.
+project = u'pbh5tools'
+copyright = u'2011, devnet at pacificbiosciences.com'
+
+# The version info for the project you're documenting, acts as replacement for
+# |version| and |release|, also used in various other places throughout the
+# built documents.
+#
+# The short X.Y version.
+version = '0.1.0'
+# The full version, including alpha/beta/rc tags.
+release = '0.1.0'
+
+# The language for content autogenerated by Sphinx. Refer to documentation
+# for a list of supported languages.
+#language = None
+
+# There are two options for replacing |today|: either, you set today to some
+# non-false value, then it is used:
+#today = ''
+# Else, today_fmt is used as the format for a strftime call.
+#today_fmt = '%B %d, %Y'
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+exclude_patterns = ['_build']
+
+# The reST default role (used for this markup: `text`) to use for all documents.
+#default_role = None
+
+# If true, '()' will be appended to :func: etc. cross-reference text.
+#add_function_parentheses = True
+
+# If true, the current module name will be prepended to all description
+# unit titles (such as .. function::).
+#add_module_names = True
+
+# If true, sectionauthor and moduleauthor directives will be shown in the
+# output. They are ignored by default.
+#show_authors = False
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = 'sphinx'
+
+# A list of ignored prefixes for module index sorting.
+#modindex_common_prefix = []
+
+
+# -- Options for HTML output ---------------------------------------------------
+
+# The theme to use for HTML and HTML Help pages.  See the documentation for
+# a list of builtin themes.
+html_theme = 'pacbio-theme'
+
+# Theme options are theme-specific and customize the look and feel of a theme
+# further.  For a list of options available for each theme, see the
+# documentation.
+#html_theme_options = {}
+
+# Add any paths that contain custom themes here, relative to this directory.
+html_theme_path = ["../../../doc/theme/", "./"]
+
+# The name for this set of Sphinx documents.  If None, it defaults to
+# "<project> v<release> documentation".
+#html_title = None
+
+# A shorter title for the navigation bar.  Default is the same as html_title.
+#html_short_title = None
+
+# The name of an image file (relative to this directory) to place at the top
+# of the sidebar.
+#html_logo = None
+
+# The name of an image file (within the static path) to use as favicon of the
+# docs.  This file should be a Windows icon file (.ico) being 16x16 or 32x32
+# pixels large.
+#html_favicon = None
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['_static']
+
+# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
+# using the given strftime format.
+#html_last_updated_fmt = '%b %d, %Y'
+
+# If true, SmartyPants will be used to convert quotes and dashes to
+# typographically correct entities.
+#html_use_smartypants = True
+
+# Custom sidebar templates, maps document names to template names.
+#html_sidebars = {}
+
+# Additional templates that should be rendered to pages, maps page names to
+# template names.
+#html_additional_pages = {}
+
+# If false, no module index is generated.
+#html_domain_indices = True
+
+# If false, no index is generated.
+#html_use_index = True
+
+# If true, the index is split into individual pages for each letter.
+#html_split_index = False
+
+# If true, links to the reST sources are added to the pages.
+#html_show_sourcelink = True
+
+# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
+#html_show_sphinx = True
+
+# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
+#html_show_copyright = True
+
+# If true, an OpenSearch description file will be output, and all pages will
+# contain a <link> tag referring to it.  The value of this option must be the
+# base URL from which the finished HTML is served.
+#html_use_opensearch = ''
+
+# This is the file name suffix for HTML files (e.g. ".xhtml").
+#html_file_suffix = None
+
+# Output file base name for HTML help builder.
+htmlhelp_basename = 'pbh5toolsdoc'
+
+
+# -- Options for LaTeX output --------------------------------------------------
+
+latex_elements = {
+# The paper size ('letterpaper' or 'a4paper').
+#'papersize': 'letterpaper',
+
+# The font size ('10pt', '11pt' or '12pt').
+#'pointsize': '10pt',
+
+# Additional stuff for the LaTeX preamble.
+#'preamble': '',
+}
+
+# Grouping the document tree into LaTeX files. List of tuples
+# (source start file, target name, title, author, documentclass [howto/manual]).
+latex_documents = [
+  ('index', 'pbh5tools.tex', u'pbh5tools Documentation',
+   u'devnet at pacificbiosciences.com', 'manual'),
+]
+
+# The name of an image file (relative to this directory) to place at the top of
+# the title page.
+#latex_logo = None
+
+# For "manual" documents, if this is true, then toplevel headings are parts,
+# not chapters.
+#latex_use_parts = False
+
+# If true, show page references after internal links.
+#latex_show_pagerefs = False
+
+# If true, show URL addresses after external links.
+#latex_show_urls = False
+
+# Documents to append as an appendix to all manuals.
+#latex_appendices = []
+
+# If false, no module index is generated.
+#latex_domain_indices = True
+
+
+# -- Options for manual page output --------------------------------------------
+
+# One entry per manual page. List of tuples
+# (source start file, name, description, authors, manual section).
+man_pages = [
+    ('index', 'pbh5tools', u'pbh5tools Documentation',
+     [u'devnet at pacificbiosciences.com'], 1)
+]
+
+# If true, show URL addresses after external links.
+#man_show_urls = False
+
+
+# -- Options for Texinfo output ------------------------------------------------
+
+# Grouping the document tree into Texinfo files. List of tuples
+# (source start file, target name, title, author,
+#  dir menu entry, description, category)
+texinfo_documents = [
+  ('index', 'pbh5tools', u'pbh5tools Documentation', u'devnet at pacificbiosciences.com',
+   'pbh5tools', 'One line description of project.', 'Miscellaneous'),
+]
+
+# Documents to append as an appendix to all manuals.
+#texinfo_appendices = []
+
+# If false, no module index is generated.
+#texinfo_domain_indices = True
+
+# How to display URL addresses: 'footnote', 'no', or 'inline'.
+#texinfo_show_urls = 'footnote'
+
+
+# Example configuration for intersphinx: refer to the Python standard library.
+intersphinx_mapping = {'http://docs.python.org/': None}
diff --git a/doc/example-slides.html b/doc/example-slides.html
new file mode 100644
index 0000000..828fbe9
--- /dev/null
+++ b/doc/example-slides.html
@@ -0,0 +1,141 @@
+<?xml version="1.0" encoding="utf-8"?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en">
+<head>
+  <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
+  <meta http-equiv="Content-Style-Type" content="text/css" />
+  <meta name="generator" content="pandoc" />
+  <title></title>
+  <link rel="stylesheet" type="text/css" media="screen, projection, print"
+    href="http://www.w3.org/Talks/Tools/Slidy2/styles/slidy.css" />
+  <script src="http://www.w3.org/Talks/Tools/Slidy2/scripts/slidy.js.gz"
+    charset="utf-8" type="text/javascript"></script>
+</head>
+<body>
+<div class="section slide level2" id="overview">
+<h1 id="overview">Overview</h1>
+<p>This document describes the cmph5tools.py query interface. This interface is used to produce both data tables and new cmp.h5 files. The interface is meant to be somewhat similar to SQL.</p>
+<p>At the heart of the new tools is a small query language for extracting alignments and computing statistics over those alignments. The three relevant clauses are: <code>what</code>, <code>where</code>, and <code>groupBy</code>.</p>
+</div>
+<div class="section slide level2" id="example-1-produce-a-sub-sampled-cmp.h5-file">
+<h1 id="example-1-produce-a-sub-sampled-cmp.h5-file">Example 1: produce a sub-sampled cmp.h5 file</h1>
+<pre><code>$ export INCMP=$HOME/projects/software/bioinformatics/tools/pbh5tools/etc/aligned_reads_ss.cmp.h5</code></pre>
+<p>take 50% of the reads</p>
+<pre><code>$ cmph5tools.py select --where "SubSample(.5)" $INCMP --outFile ss.cmp.h5
+$ test -f ss.cmp.h5</code></pre>
+</div>
+<div class="section slide level2" id="example-2-produce-tabular-data-from-a-cmp.h5-file">
+<h1 id="example-2-produce-tabular-data-from-a-cmp.h5-file">Example 2: produce tabular data from a cmp.h5 file</h1>
+<pre><code>$ cmph5tools.py stats --what "Tbl(q = Percentile(ReadLength, 90), m = Median(Accuracy))" \
+> --groupBy Barcode $INCMP | tail
+bc_88--bc_88      486.40          0.91
+bc_89--bc_89      561.00          0.91
+bc_9--bc_9        479.80          0.90
+bc_90--bc_90      563.60          0.89
+bc_91--bc_91      554.60          0.91
+bc_92--bc_92      523.00          0.90
+bc_93--bc_93      542.00          0.90
+bc_94--bc_94      518.00          0.90
+bc_95--bc_95      512.20          0.91
+bc_96--bc_96      609.60          0.92</code></pre>
+</div>
+<div class="section slide level2" id="example-3-query-the-package-to-determine-the-available-metrics-and-statistics">
+<h1 id="example-3-query-the-package-to-determine-the-available-metrics-and-statistics">Example 3: query the package to determine the available metrics and statistics</h1>
+<pre><code>$ cmph5tools.py metrics
+\t\t--- Metrics --- (esc)
+TemplateSpan
+\tThe number of template bases covered by the read (esc)
+ReadLength
+NErrors
+ReadDuration
+FrameRate
+IPD
+PulseWidth
+Movie
+Reference
+RefIdentifier
+HoleNumber
+ReadStart
+ReadEnd
+TemplateStart
+TemplateEnd
+MoleculeIdx
+Strand
+AlignmentIdx
+Barcode
+SubSample[rate]
+\tboolean vector with true occuring at rate rate (esc)
+
+\t\t--- Statistics --- (esc)
+Mean
+Median
+Count
+Percentile[ptile]
+Round[digits]</code></pre>
+</div>
+<div class="section slide level2" id="example-4-familiar-sql-like-syntax">
+<h1 id="example-4-familiar-sql-like-syntax">Example 4: Familiar SQL-like syntax</h1>
+<pre><code>$ cmph5tools.py stats --what "Tbl(a=Accuracy,b=Barcode)" \
+> --where "Barcode == 'bc_78--bc_78'" \
+> --groupBy Reference $INCMP
+Group                a          b
+MET_600_t2_2      0.96          bc_78--bc_78          
+MET_600_t2_2      0.82          bc_78--bc_78          
+MET_600_t2_2      0.85          bc_78--bc_78          
+MET_600_t2_2      0.89          bc_78--bc_78          
+MET_600_t2_2      0.87          bc_78--bc_78          
+MET_600_t2_2      0.90          bc_78--bc_78          
+MET_600_t2_2      0.90          bc_78--bc_78          
+MET_600_t2_2      0.94          bc_78--bc_78          </code></pre>
+</div>
+<div class="section slide level2" id="example-5-familiar-sql-like-functions">
+<h1 id="example-5-familiar-sql-like-functions">Example 5: Familiar SQL-like functions</h1>
+<pre><code>$ cmph5tools.py stats --what "Count(Reference)" \
+> --where "Barcode == 'bc_78--bc_78'" \
+> --groupBy Reference $INCMP
+Group             Count(Reference)
+MET_600_t2_2                     8</code></pre>
+</div>
+<div class="section slide level2" id="handling-ipd-pulsewidth-and-base-level-metrics">
+<h1 id="handling-ipd-pulsewidth-and-base-level-metrics">Handling IPD, PulseWidth, and Base-level Metrics</h1>
+<p>A constant <em>hurdle</em> with cmp.h5 is dealing with the differently sized data, i.e., base-level data and alignment-level data. The stats tool provides convenience functions for dealing with this:</p>
+<pre><code>$ cmph5tools.py stats --where "(Barcode == 'bc_78--bc_78') & (Accuracy > .95)" \
+> --what "Tbl(idx = AlignmentIdx, ipd = Median(IPD), pw = Median(PulseWidth))" \
+> $INCMP
+           ipd             pw           idx
+          9.00          12.00          1683</code></pre>
+<p><strong>NOTE:</strong> The parentheses surrounding the clauses in the where predicate are required.</p>
+</div>
+<div class="section slide level2" id="selection">
+<h1 id="selection">Selection</h1>
+<p>Some of our internal tools work on an entire cmp.h5 file. In particular, a good deal of the Milhouse plotting code treats an input cmp.h5 file as a single condition. This <em>pushes</em> the logic of plotting down into the individual plotting functions, e.g., <code>gAccuracyByReferenceSequence</code>. Instead, we can use tools like selection to produce plots on very arbitrary splits.</p>
+<pre><code>$ cmph5tools.py select --where "Accuracy > .95" $INCMP
+$ cmph5tools.py summarize out.cmp.h5
+----------------------------------------
+filename: out.cmp.h5
+version:  1.3.1.PB
+n reads:  91
+n refs:   40
+n movies: 4
+n bases:  15315
+avg rl:   168
+avg acc:  0.9658</code></pre>
+</div>
+<div class="section slide level2" id="selection-1">
+<h1 id="selection-1">Selection</h1>
+<pre><code>$ cmph5tools.py select --groupBy Barcode \
+> --where "(Barcode == 'bc_42--bc_42') | (Barcode == 'bc_28--bc_28')" $INCMP
+$ cmph5tools.py merge --outFile merged.cmp.h5 bc_42--bc_42.cmp.h5 bc_28--bc_28.cmp.h5
+$ cmph5tools.py stats --what "Count(Reference)" --groupBy Barcode merged.cmp.h5
+Group             Count(Reference)
+bc_28--bc_28                     6
+bc_42--bc_42                    65</code></pre>
+<p><strong>NOTE:</strong> One of the reasons that the cmp.h5 file format is appealing is that you don’t need to split it to access particular strata. In practice, we probably don’t want to be splitting files up; a <em>View</em>-type design pattern might be more appropriate.</p>
+</div>
+<div class="section slide level2" id="implementation">
+<h1 id="implementation">Implementation</h1>
+<p>The <em>Stats</em> interface is defined in <code>pbtools.pbh5tools.Metrics</code>. Implementing new metrics and statistics is fairly straightforward. In fact, all of the heavy lifting is performed by D. Alexander’s <code>CmpH5Reader</code>.</p>
+</div>
+</body>
+</html>
diff --git a/doc/examples.md b/doc/examples.md
new file mode 100644
index 0000000..0e05538
--- /dev/null
+++ b/doc/examples.md
@@ -0,0 +1,209 @@
+## Overview
+This document describes the cmph5tools.py query interface. This
+interface is used to produce both data tables and new cmp.h5
+files. The interface is meant to be somewhat similar to SQL.
+
+At the heart of the new tools is a small query language for extracting
+alignments and computing statistics over those alignments. The three
+relevant clauses are: `what`, `where`, and `groupBy`. 
+
+## Example 1: produce a sub-sampled cmp.h5 file
+
+    $ export INCMP=$TESTDIR/../etc/aligned_reads_ss.cmp.h5
+
+take 50% of the reads 
+
+    $ cmph5tools.py select --where "SubSample(.5)" $INCMP --outFile ss.cmp.h5
+    $ test -f ss.cmp.h5
+
+## Example 2: produce tabular data from a cmp.h5 file
+
+    $ cmph5tools.py stats --what "Tbl(q = Percentile(ReadLength, 90), m = Median(Accuracy))" \
+    > --groupBy Barcode $INCMP | tail
+                    bc_88--bc_88                    486.40                    0.91
+                    bc_89--bc_89                    561.00                    0.91
+                      bc_9--bc_9                    479.80                    0.90
+                    bc_90--bc_90                    563.60                    0.89
+                    bc_91--bc_91                    554.60                    0.91
+                    bc_92--bc_92                    523.00                    0.90
+                    bc_93--bc_93                    542.00                    0.90
+                    bc_94--bc_94                    518.00                    0.90
+                    bc_95--bc_95                    512.20                    0.91
+                    bc_96--bc_96                    609.60                    0.92
+
+
+## Example 3: query the package to determine the available metrics and statistics
+
+    $ cmph5tools.py listMetrics
+    --- Metrics:
+    ByFactor[metric, factor, statistic]\t (esc)
+    _MoleculeReadStart\t (esc)
+    _MinSubreadLength\t (esc)
+    _MaxSubreadLength\t (esc)
+    _UnrolledReadLength\t (esc)
+    DefaultWhere\t (esc)
+    DefaultGroupBy\t (esc)
+    TemplateSpan
+    \tThe number of template bases covered by the read\t (esc)
+    ReadLength\t (esc)
+    NErrors\t (esc)
+    ReadDuration\t (esc)
+    FrameRate\t (esc)
+    IPD\t (esc)
+    PulseWidth\t (esc)
+    Movie\t (esc)
+    Reference\t (esc)
+    RefIdentifier\t (esc)
+    HoleNumber\t (esc)
+    ReadStart\t (esc)
+    ReadEnd\t (esc)
+    TemplateStart\t (esc)
+    TemplateEnd\t (esc)
+    MoleculeId\t (esc)
+    MoleculeName\t (esc)
+    Strand\t (esc)
+    AlignmentIdx\t (esc)
+    Barcode\t (esc)
+    AverageBarcodeScore\t (esc)
+    MapQV\t (esc)
+    WhiteList\t (esc)
+    SubSample[rate, n]
+    \tboolean vector with true occuring at rate rate or nreads = n (esc)
+    
+    --- Statistics:
+    Min\t (esc)
+    Max\t (esc)
+    Sum\t (esc)
+    Mean\t (esc)
+    Median\t (esc)
+    Count\t (esc)
+    Percentile[metric, ptile]\t (esc)
+    Round[metric, digits]
+
+## Example 4: Familiar SQL-like syntax
+
+    $ cmph5tools.py stats --what "Tbl(a=Accuracy,b=Barcode)" \
+    > --where "Barcode == 'bc_78--bc_78'" \
+    > --groupBy Reference $INCMP
+                           Group                       a                               b
+                    MET_600_t2_2                    0.94                    bc_78--bc_78
+                    MET_600_t2_2                    0.90                    bc_78--bc_78
+                    MET_600_t2_2                    0.90                    bc_78--bc_78
+                    MET_600_t2_2                    0.87                    bc_78--bc_78
+                    MET_600_t2_2                    0.89                    bc_78--bc_78
+                    MET_600_t2_2                    0.85                    bc_78--bc_78
+                    MET_600_t2_2                    0.82                    bc_78--bc_78
+                    MET_600_t2_2                    0.96                    bc_78--bc_78
+
+## Example 5: Familiar SQL-like functions
+
+    $ cmph5tools.py stats --what "Count(Reference)" \
+    > --where "Barcode == 'bc_78--bc_78'" \
+    > --groupBy Reference $INCMP
+                           Group                    Count(Reference)
+                    MET_600_t2_2                                   8
+
+## Handling IPD, PulseWidth, and Base-level Metrics
+
+A constant _hurdle_ with cmp.h5 files is dealing with differently sized
+data, i.e., base-level versus alignment-level data. The stats tool
+provides convenience functions for dealing with this:
+
+    $ cmph5tools.py stats --where "(Barcode == 'bc_78--bc_78') & (Accuracy > .95)" \
+    > --what "Tbl(idx = AlignmentIdx, ipd = Median(IPD), pw = Median(PulseWidth))" \
+    > $INCMP
+                         ipd                       pw                     idx
+                        9.00                    12.00                    1683
+
+**NOTE:** The parentheses surrounding the clauses in the where
+predicate are required when conditions are combined with `&` or `|`.
+
+## Selection
+Some of our internal tools work on an entire cmp.h5 file. In
+particular, a good deal of the Milhouse plotting code treats an input
+cmp.h5 file as a single condition. This _pushes_ the logic of plotting
+down into the individual plotting functions, e.g.,
+`gAccuracyByReferenceSequence`. Instead, we can use tools like
+`select` to produce plots on arbitrary splits.
+
+    $ cmph5tools.py select --where "Accuracy > .95" $INCMP
+    $ cmph5tools.py summarize out.cmp.h5
+    ----------------------------------------
+    filename: out.cmp.h5
+    version:  1.3.1.PB
+    n reads:  91
+    n refs:   40
+    n movies: 4
+    n bases:  15315
+    avg rl:   168
+    avg acc:  0.9658
+    
+    \t Movie Summary: (esc)
+            Group     nBases     avgAccuracy     avgReadLength
+    m121005_190843_42142_c100376662550000001523029810101285_s1_p0     4770               1.0             170.4
+    m121005_190843_42142_c100376662550000001523029810101285_s2_p0     2259               1.0             173.8
+    m121005_210848_42142_c100376662550000001523029810101286_s1_p0     6167               1.0             192.7
+    m121005_210848_42142_c100376662550000001523029810101286_s2_p0     2119               1.0             117.7
+    
+    \t Reference Summary: (esc)
+            Group     nBases     avgAccuracy     avgReadLength
+    EGFR_600_t11_1     1396               1.0             232.7
+    EGFR_600_t13_1     83                 1.0              41.5
+    EGFR_600_t15_1     252                1.0             126.0
+    EGFR_600_t16_1     50                 1.0              25.0
+    EGFR_600_t17_1     531                1.0             531.0
+    EGFR_600_t19_1     705                1.0             235.0
+    EGFR_600_t20_1     184                1.0             184.0
+    EGFR_600_t21_1     122                1.0             122.0
+    EGFR_600_t23_1     553                1.0             276.5
+    EGFR_600_t24_1     32                 1.0              32.0
+    EGFR_600_t25_1     531                1.0             265.5
+    EGFR_600_t26_1     55                 1.0              27.5
+    EGFR_600_t27_1     148                1.0             148.0
+    EGFR_600_t28_1     91                 1.0              30.3
+    EGFR_600_t2_1     239                1.0              59.8
+    EGFR_600_t3_1     462                1.0             462.0
+    EGFR_600_t4_1     701                1.0             350.5
+    EGFR_600_t5_1     147                1.0              49.0
+    EGFR_600_t6_1     201                1.0              50.2
+    EGFR_600_t7_1     873                1.0             436.5
+    EGFR_600_t9_1     562                1.0             112.4
+     MET_600_t2_1     42                 1.0              42.0
+    MET_600_t2_11     681                1.0             227.0
+    MET_600_t2_13     448                1.0             448.0
+    MET_600_t2_14     223                1.0             223.0
+    MET_600_t2_15     43                 1.0              43.0
+    MET_600_t2_16     671                1.0             167.8
+    MET_600_t2_17     44                 1.0              44.0
+    MET_600_t2_18     701                1.0             175.2
+    MET_600_t2_19     546                1.0             182.0
+     MET_600_t2_2     186                1.0              93.0
+    MET_600_t2_20     394                1.0              98.5
+    MET_600_t2_21     71                 1.0              35.5
+     MET_600_t2_3     655                1.0             218.3
+     MET_600_t2_4     1321               1.0             330.2
+     MET_600_t2_5     500                1.0             500.0
+     MET_600_t2_6     120                1.0             120.0
+     MET_600_t2_7     660                1.0             220.0
+     MET_600_t2_8     24                 1.0              24.0
+     MET_600_t2_9     67                 1.0              67.0
+
+## Selection and merging
+    $ cmph5tools.py select --groupBy Barcode \
+    > --where "(Barcode == 'bc_42--bc_42') | (Barcode == 'bc_28--bc_28')" $INCMP
+    $ cmph5tools.py merge --outFile merged.cmp.h5 bc_42--bc_42.cmp.h5 bc_28--bc_28.cmp.h5
+    $ cmph5tools.py stats --what "Count(Reference)" --groupBy Barcode merged.cmp.h5
+                           Group                    Count(Reference)
+                    bc_28--bc_28                                   6
+                    bc_42--bc_42                                  65
+
+
+**NOTE:** One of the reasons that the cmp.h5 file format is appealing
+is that you don't need to split it to access particular strata. In
+practice, we probably don't want to be splitting files up; a
+_View_-type design pattern might be more appropriate.
+
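+As a rough sketch of that view idea (illustration only, using
+`pbcore.io.CmpH5Reader` directly; the mask-style "view" below is not a
+pbh5tools API):
+
+    from pbcore.io import CmpH5Reader
+
+    reader = CmpH5Reader("aligned_reads_ss.cmp.h5")
+    # A "view" is just the subset of alignments satisfying a predicate,
+    # held in memory instead of being written out as a new cmp.h5 file.
+    view = [aln for aln in reader if aln.accuracy > 0.95]
+    meanAccuracy = sum(a.accuracy for a in view) / float(len(view))
+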
+## Implementation
+The _Stats_ interface is defined in
+`pbtools.pbh5tools.Metrics`. Implementing new metrics and statistics
+is fairly straightforward. In fact, all of the heavy lifting is
+performed by D. Alexander's `CmpH5Reader`.
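+
+As a purely hypothetical illustration (the base class name and method
+signature below are assumptions about the `Metrics` API, not its actual
+interface), a new metric might look roughly like:
+
+    class TemplateSpanSquared(Metric):
+        """Square of the number of template bases covered by the read."""
+        def produce(self, cmpH5, idx):
+            span = cmpH5.tEnd[idx] - cmpH5.tStart[idx]  # assumed accessors
+            return span * span
+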
diff --git a/doc/examples.t b/doc/examples.t
new file mode 100644
index 0000000..c13ce21
--- /dev/null
+++ b/doc/examples.t
@@ -0,0 +1,265 @@
+## Overview
+This document describes the cmph5tools.py query interface. This
+interface is used to produce both data tables and new cmp.h5
+files. The interface is meant to be somewhat similar to SQL.
+
+At the heart of the new tools is a small query language for extracting
+alignments and computing statistics over those alignments. The three
+relevant clauses are: `what`, `where`, and `groupBy`. 
+
+## Example 1: produce a sub-sampled cmp.h5 file
+
+  $ export INCMP=$TESTDIR/../etc/aligned_reads_ss.cmp.h5
+
+Take 50% of the reads:
+
+  $ cmph5tools.py select --where "SubSample(.5)" $INCMP --outFile ss.cmp.h5
+  $ test -f ss.cmp.h5
+
+## Example 2: produce tabular data from a cmp.h5 file
+
+  $ cmph5tools.py stats --what "Tbl(q = Percentile(ReadLength, 90), m = Median(Accuracy))" \
+  > --groupBy Barcode $INCMP | tail
+                    F_44--R_44                    516.80                    0.90
+                    F_45--R_45                    534.60                    0.90
+                    F_46--R_46                    517.80                    0.90
+                    F_47--R_47                    515.80                    0.91
+                    F_48--R_48                    624.20                    0.91
+                      F_5--R_5                    571.60                    0.90
+                      F_6--R_6                    507.90                    0.90
+                      F_7--R_7                    572.40                    0.90
+                      F_8--R_8                    574.40                    0.91
+                      F_9--R_9                    485.60                    0.90
+
+
+## Example 3: query the package to determine the available metrics and statistics
+
+  $ cmph5tools.py listMetrics
+  --- Metrics:
+  ByFactor[metric, factor, statistic]\t (esc)
+  _MoleculeReadStart\t (esc)
+  _MinSubreadLength\t (esc)
+  _MaxSubreadLength\t (esc)
+  _UnrolledReadLength\t (esc)
+  DefaultWhere\t (esc)
+  DefaultGroupBy\t (esc)
+  TemplateSpan
+  \tThe number of template bases covered by the read\t (esc)
+  ReadLength\t (esc)
+  NErrors\t (esc)
+  ReadDuration\t (esc)
+  FrameRate\t (esc)
+  IPD\t (esc)
+  PulseWidth\t (esc)
+  Movie\t (esc)
+  Reference\t (esc)
+  RefIdentifier\t (esc)
+  HoleNumber\t (esc)
+  ReadStart\t (esc)
+  ReadEnd\t (esc)
+  TemplateStart\t (esc)
+  TemplateEnd\t (esc)
+  MoleculeId\t (esc)
+  MoleculeName\t (esc)
+  Strand\t (esc)
+  AlignmentIdx\t (esc)
+  Barcode\t (esc)
+  AverageBarcodeScore\t (esc)
+  MapQV\t (esc)
+  WhiteList\t (esc)
+  SubSample[rate, n]
+  \tboolean vector with true occuring at rate rate or nreads = n (esc)
+  
+  --- Statistics:
+  Min\t (esc)
+  Max\t (esc)
+  Sum\t (esc)
+  Mean\t (esc)
+  Median\t (esc)
+  Count\t (esc)
+  Percentile[metric, ptile]\t (esc)
+  Round[metric, digits]
+
+## Example 4: Familiar SQL-like syntax
+
+  $ cmph5tools.py stats --what "Tbl(a=Accuracy,b=Barcode)" \
+  > --where "Barcode == 'F_28--R_28'" \
+  > --groupBy Reference $INCMP
+                         Group                       a                             b
+                EGFR_600_t29_1                    0.91                    F_28--R_28
+                EGFR_600_t29_1                    0.91                    F_28--R_28
+                EGFR_600_t29_1                    0.95                    F_28--R_28
+                EGFR_600_t29_1                    0.93                    F_28--R_28
+                EGFR_600_t29_1                    0.91                    F_28--R_28
+                EGFR_600_t29_1                    0.90                    F_28--R_28
+                EGFR_600_t29_1                    0.94                    F_28--R_28
+                EGFR_600_t29_1                    0.93                    F_28--R_28
+                 EGFR_600_t9_1                    0.86                    F_28--R_28
+
+## Example 5: Familiar SQL-like functions
+
+  $ cmph5tools.py stats --what "Count(Reference)" \
+  > --where "Barcode == 'F_10--R_10'" \
+  > --groupBy Reference $INCMP
+                         Group                    Count(Reference)
+                EGFR_600_t11_1                                  62
+
+## Handling IPD, PulseWidth, and Base-level Metrics
+
+A constant _hurdle_ with cmp.h5 files is dealing with differently sized
+data, i.e., base-level versus alignment-level data. The stats tool
+provides convenience functions for dealing with this:
+
+  $ cmph5tools.py stats --where "(Barcode == 'F_10--R_10') & (Accuracy > .95)" \
+  > --what "Tbl(idx = AlignmentIdx, ipd = Median(IPD), pw = Median(PulseWidth))" \
+  > $INCMP
+                        ipd                       pw                    idx
+                      15.50                    11.50                    562
+                       9.00                    10.00                    563
+                      11.00                    10.00                    589
+                      13.00                    11.00                    603
+                      14.00                    10.50                    626
+                       8.00                    13.00                    627
+
+**NOTE:** The parentheses surrounding the clauses in the where
+predicate are required when conditions are combined with `&` or `|`.
+
+## Selection
+Some of our internal tools work on an entire cmp.h5 file. In
+particular, a good deal of the Milhouse plotting code treats an input
+cmp.h5 file as a single condition. This _pushes_ the logic of plotting
+down into the individual plotting functions, e.g.,
+`gAccuracyByReferenceSequence`. Instead, we can use tools like
+`select` to produce plots on arbitrary splits.
+
+  $ cmph5tools.py select --where "Accuracy > .95" $INCMP
+  $ cmph5tools.py summarize out.cmp.h5
+  ----------------------------------------
+  filename: out.cmp.h5
+  version:  1.3.1.PB
+  n reads:  91
+  n refs:   40
+  n movies: 4
+  n bases:  15315
+  avg rl:   168
+  avg acc:  0.9658
+  
+  \t Movie Summary: (esc)
+          Group     nBases     avgAccuracy     avgReadLength
+  m121005_190843_42142_c100376662550000001523029810101285_s1_p0     4770               1.0             170.4
+  m121005_190843_42142_c100376662550000001523029810101285_s2_p0     2259               1.0             173.8
+  m121005_210848_42142_c100376662550000001523029810101286_s1_p0     6167               1.0             192.7
+  m121005_210848_42142_c100376662550000001523029810101286_s2_p0     2119               1.0             117.7
+  
+  \t Reference Summary: (esc)
+          Group     nBases     avgAccuracy     avgReadLength
+  EGFR_600_t11_1     1396               1.0             232.7
+  EGFR_600_t13_1     83                 1.0              41.5
+  EGFR_600_t15_1     252                1.0             126.0
+  EGFR_600_t16_1     50                 1.0              25.0
+  EGFR_600_t17_1     531                1.0             531.0
+  EGFR_600_t19_1     705                1.0             235.0
+  EGFR_600_t20_1     184                1.0             184.0
+  EGFR_600_t21_1     122                1.0             122.0
+  EGFR_600_t23_1     553                1.0             276.5
+  EGFR_600_t24_1     32                 1.0              32.0
+  EGFR_600_t25_1     531                1.0             265.5
+  EGFR_600_t26_1     55                 1.0              27.5
+  EGFR_600_t27_1     148                1.0             148.0
+  EGFR_600_t28_1     91                 1.0              30.3
+  EGFR_600_t2_1     239                1.0              59.8
+  EGFR_600_t3_1     462                1.0             462.0
+  EGFR_600_t4_1     701                1.0             350.5
+  EGFR_600_t5_1     147                1.0              49.0
+  EGFR_600_t6_1     201                1.0              50.2
+  EGFR_600_t7_1     873                1.0             436.5
+  EGFR_600_t9_1     562                1.0             112.4
+   MET_600_t2_1     42                 1.0              42.0
+  MET_600_t2_11     681                1.0             227.0
+  MET_600_t2_13     448                1.0             448.0
+  MET_600_t2_14     223                1.0             223.0
+  MET_600_t2_15     43                 1.0              43.0
+  MET_600_t2_16     671                1.0             167.8
+  MET_600_t2_17     44                 1.0              44.0
+  MET_600_t2_18     701                1.0             175.2
+  MET_600_t2_19     546                1.0             182.0
+   MET_600_t2_2     186                1.0              93.0
+  MET_600_t2_20     394                1.0              98.5
+  MET_600_t2_21     71                 1.0              35.5
+   MET_600_t2_3     655                1.0             218.3
+   MET_600_t2_4     1321               1.0             330.2
+   MET_600_t2_5     500                1.0             500.0
+   MET_600_t2_6     120                1.0             120.0
+   MET_600_t2_7     660                1.0             220.0
+   MET_600_t2_8     24                 1.0              24.0
+   MET_600_t2_9     67                 1.0              67.0
+
+## Selection and merging
+  $ cmph5tools.py select --groupBy Barcode \
+  > --where "(Barcode == 'F_42--R_42') | (Barcode == 'F_10--R_10')" $INCMP
+  $ cmph5tools.py merge --outFile merged.cmp.h5 F_42--R_42.cmp.h5 F_10--R_10.cmp.h5
+  $ cmph5tools.py stats --what "Count(Reference)" --groupBy Barcode merged.cmp.h5
+                         Group                    Count(Reference)
+                    F_10--R_10                                  62
+                    F_42--R_42                                  76
+
+
+cmp.h5 files can also be split using a pre-defined csv file with three
+columns. The csv must contain a header row: column 1 holds the group
+names, and the headers of columns 2 and 3 must be metric names from
+'cmph5tools.py listMetrics'.
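+For example, the first rows of such a csv (taken from etc/grouped.csv)
+look like:
+
+Group,Movie,HoleNumber
+grpD,m121005_190843_42142_c100376662550000001523029810101285_s1_p0,57863
+grpC,m121005_190843_42142_c100376662550000001523029810101285_s1_p0,27852
+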
+  $ export INCSV=$TESTDIR/../etc/grouped.csv
+  $ cmph5tools.py select --groupByCsv $INCSV $INCMP
+
+  $ cmph5tools.py stats grpA.cmp.h5
+  readLength                    accuracy
+  426                               0.90
+  538                               0.92
+  552                               0.92
+  589                               0.89
+  538                               0.92
+  129                               0.91
+  118                               0.83
+  126                               0.85
+  239                               0.92
+  $ cmph5tools.py stats grpB.cmp.h5
+  readLength                    accuracy
+  156                               0.92
+  347                               0.86
+  531                               0.92
+  550                               0.93
+  122                               0.80
+  86                                0.93
+  132                               0.90
+  $ cmph5tools.py stats grpC.cmp.h5
+  readLength                    accuracy
+  151                               0.90
+  193                               0.90
+  439                               0.90
+  546                               0.90
+  563                               0.90
+  565                               0.92
+  568                               0.89
+  148                               0.95
+  59                                0.88
+  $ cmph5tools.py stats grpD.cmp.h5
+  readLength                    accuracy
+  95                                0.94
+  304                               0.88
+  455                               0.95
+  489                               0.86
+  584                               0.87
+  558                               0.90
+  567                               0.87
+  543                               0.90
+  170                               0.98
+  171                               0.82
+
+**NOTE:** One of the reasons that the cmp.h5 file format is appealing
+is that you don't need to split it to access particular strata. In
+practice, we probably don't want to be splitting files up; a
+_View_-type design pattern might be more appropriate.
+
+## Implementation
+The _Stats_ interface is defined in
+`pbtools.pbh5tools.Metrics`. Implementing new metrics and statistics
+is fairly straightforward. In fact, all of the heavy lifting is
+performed by D. Alexander's `CmpH5Reader`.
diff --git a/doc/index.rst b/doc/index.rst
new file mode 100644
index 0000000..aca93b0
--- /dev/null
+++ b/doc/index.rst
@@ -0,0 +1,147 @@
+.. pbh5tools documentation master file, created by
+   sphinx-quickstart on Thu Nov 10 17:09:22 2011.
+   You can adapt this file completely to your liking, but it should at least
+   contain the root `toctree` directive.
+
+=========
+pbh5tools
+=========
+
+``pbh5tools`` is a collection of tools that can manipulate the content or extract data from 
+two types of h5 files:
+
+* ``cmp.h5``: files that contain alignment information.
+* ``bas.h5`` and ``pls.h5``: files that contain base-call information.
+
+``pbh5tools`` comprises two executables: ``cmph5tools.py`` and
+``bash5tools.py``. At the moment, the ``cmph5tools.py`` program
+provides a rich set of tools to manipulate and analyze the data in a
+``cmp.h5`` file, while ``bash5tools.py`` extracts
+basecall information from ``bas.h5`` files.
+
+
+############
+Installation
+############
+
+To install ``pbh5tools``, run the following command from the ``pbh5tools`` root directory: ::
+
+   python setup.py install
+
+####################
+Tool: bash5tools.py
+####################
+
+``bash5tools.py`` can extract read sequences and quality values for
+both raw and circular consensus sequencing (CCS) read types and can
+create ``fastq`` and ``fasta`` files.
+
+
+-----
+Usage
+-----
+::
+
+    usage: bash5tools.py [-h] [--verbose] [--version] [--profile] [--debug]
+                         [--outFilePrefix OUTFILEPREFIX]
+                         [--readType {ccs,subreads,unrolled}] [--outType OUTTYPE]
+                         [--minLength MINLENGTH] [--minReadScore MINREADSCORE]
+                         [--minPasses MINPASSES]
+                         input.bas.h5
+
+    Tool for extracting data from .bas.h5 files
+
+    positional arguments:
+      input.bas.h5          input .bas.h5 filename
+
+    optional arguments:
+      -h, --help            show this help message and exit
+      --verbose, -v         Set the verbosity level (default: None)
+      --version             show program's version number and exit
+      --profile             Print runtime profile at exit (default: False)
+      --debug               Run within a debugger session (default: False)
+      --outFilePrefix OUTFILEPREFIX
+                            output filename prefix [None]
+      --readType {ccs,subreads,unrolled}
+                            read type (ccs, subreads, or unrolled) []
+      --outType OUTTYPE     output file type (fasta, fastq) [fasta]
+
+    Read filtering arguments:
+      --minLength MINLENGTH
+                            min read length [0]
+      --minReadScore MINREADSCORE
+                            min read score, valid only with
+                            --readType={unrolled,subreads} [0]
+      --minPasses MINPASSES
+                            min number of CCS passes, valid only with
+                            --readType=ccs [0]
+
+--------
+Examples
+--------
+
+Extracting all raw reads (``--readType subreads``) from ``input.bas.h5``
+without any filtering and exporting to FASTA (``myreads.fasta``): ::
+
+    python bash5tools.py input.bas.h5 --outFilePrefix myreads --outType fasta --readType subreads
+
+Extracting all CCS reads from ``input.bas.h5`` that have read lengths
+larger than 100 and exporting to FASTQ (``myreads.fastq``): ::
+
+    python bash5tools.py input.bas.h5 --outFilePrefix myreads --outType fastq --readType ccs --minLength 100
+
+
+####################
+Tool: cmph5tools.py
+####################
+
+``cmph5tools.py`` is a multi-command tool that provides access to
+the following subtools:
+
+1. **merge**: Merge multiple ``cmp.h5`` files into a single file.
+
+2. **sort**: Sort a ``cmp.h5`` file.
+
+3. **select**: Create a new file from a ``cmp.h5`` file by specifying
+which reads to include.
+
+4. **equal**: Compare the contents of two ``cmp.h5`` files for
+equivalence.
+
+5. **summarize**: Summarize the contents of a ``cmp.h5`` file in a
+verbose, human readable format.
+
+6. **stats**: Extract summary metrics from a ``cmp.h5`` file into a
+``csv`` file.
+
+7. **valid**: Determine whether a ``cmp.h5`` file is valid.
+
+8. **listMetrics**: Emit the available metrics and statistics for use
+in the ``select`` and ``stats`` subcommands.
+
+To list all available subtools provided by ``cmph5tools.py``, simply
+run: ::
+
+    cmph5tools.py --help
+
+Each subtool has its own usage information which can be generated by
+running: ::
+
+    cmph5tools.py <toolname> --help
+
+When running any subtool, it is suggested to use the ``--info``
+command-line argument, which prints progress information to stdout
+while the script is running: ::
+
+    cmph5tools.py <toolname> --info <other arguments>
+
+.. toctree::
+   :maxdepth: 2
+
+   cmph5tools-examples
+   
+   
+* :ref:`genindex`
+* :ref:`modindex`
+* :ref:`search`
+
diff --git a/doc/pacbio-theme/static/headerGradient.jpg b/doc/pacbio-theme/static/headerGradient.jpg
new file mode 100644
index 0000000..883f147
Binary files /dev/null and b/doc/pacbio-theme/static/headerGradient.jpg differ
diff --git a/doc/pacbio-theme/static/pacbio.css b/doc/pacbio-theme/static/pacbio.css
new file mode 100644
index 0000000..b4ab87f
--- /dev/null
+++ b/doc/pacbio-theme/static/pacbio.css
@@ -0,0 +1,238 @@
+/**
+ * Sphinx stylesheet -- default theme
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ */
+ 
+@import url("basic.css");
+ 
+/* -- page layout ----------------------------------------------------------- */
+ 
+body {
+    font-family: Arial, sans-serif;
+    font-size: 100%;
+    background-color: #555;
+    color: #555;
+    margin: 0;
+    padding: 0;
+    min-width: 500px;
+    max-width: 956px;
+    margin: 0 auto;
+}
+
+div.documentwrapper {
+    float: left;
+    width: 100%;
+}
+
+div.bodywrapper {
+    margin: 0 0 0 230px;
+}
+
+hr{
+    border: 1px solid #B1B4B6;
+    
+}
+ 
+div.document {
+    background-color: #eee;
+}
+ 
+div.body {
+    background-color: #ffffff;
+    color: #3E4349;
+    padding: 30px 30px 30px 30px;
+    font-size: 0.8em;
+}
+ 
+div.footer {
+    color: #555;
+	background-color: #fff;
+    padding: 13px 0;
+    text-align: center;
+    font-size: 75%;
+
+}
+div.footer a {
+    color: #444;
+    text-decoration: underline;
+}
+ 
+div.related {
+    background: #fff url(headerGradient.jpg);
+    line-height: 80px;
+    color: #fff;
+    font-size: 0.80em;
+    height: 79px;
+    z-index: -1;
+}
+
+div.related ul {
+    background: url(pacbioLogo.png) 10px no-repeat;
+    padding: 0 0 0 200px;
+}
+ 
+div.related a {
+    color: #E2F3CC;
+}
+ 
+div.sphinxsidebar {
+    font-size: 0.75em;
+    line-height: 1.5em;
+}
+
+div.sphinxsidebarwrapper{
+    padding: 20px 0;
+}
+ 
+div.sphinxsidebar h3,
+div.sphinxsidebar h4 {
+    font-family: Arial, sans-serif;
+    color: #222;
+    font-size: 1.2em;
+    font-weight: bold;
+    margin: 0;
+    padding: 5px 10px 0 10px;
+}
+
+div.sphinxsidebar h4{
+    font-size: 1.1em;
+}
+ 
+div.sphinxsidebar h3 a {
+    color: #444;
+}
+ 
+ 
+div.sphinxsidebar p {
+    color: #888;
+    padding: 0px 20px;
+	margin-top: 5px;
+}
+ 
+div.sphinxsidebar p.topless {
+}
+ 
+div.sphinxsidebar ul {
+    margin: 5px 20px 10px 20px;
+    padding: 0;
+    color: #000;
+}
+ 
+div.sphinxsidebar a {
+    color: #444;
+}
+ 
+div.sphinxsidebar input {
+    border: 1px solid #ccc;
+    font-family: sans-serif;
+    font-size: 1em;
+}
+
+div.sphinxsidebar input[type=text]{
+    margin-left: 20px;
+}
+ 
+/* -- body styles ----------------------------------------------------------- */
+ 
+a {
+    color: #005B81;
+    text-decoration: none;
+}
+ 
+a:hover {
+    color: #E32E00;
+    text-decoration: underline;
+}
+ 
+div.body h1,
+div.body h2,
+div.body h3,
+div.body h4,
+div.body h5,
+div.body h6 {
+    font-family: Arial, sans-serif;
+    font-weight: bold;
+    color: #264868;
+    margin: 30px 0px 10px 0px;
+    padding: 5px 0 5px 0px;
+}
+ 
+div.body h1 { border-top: 20px solid white; margin-top: 0; font-size: 180%; font-weight: normal; }
+div.body h2 { font-size: 125%; }
+div.body h3 { font-size: 110%; }
+div.body h4 { font-size: 100%; }
+div.body h5 { font-size: 100%; }
+div.body h6 { font-size: 100%; }
+ 
+a.headerlink {
+    color: #c60f0f;
+    font-size: 0.8em;
+    padding: 0 4px 0 4px;
+    text-decoration: none;
+}
+ 
+a.headerlink:hover {
+    background-color: #c60f0f;
+    color: white;
+}
+ 
+div.body p, div.body dd, div.body li {
+    line-height: 1.5em;
+    font-size: 1em;
+}
+ 
+div.admonition p.admonition-title + p {
+    display: inline;
+}
+
+div.highlight{
+    background-color: white;
+}
+
+div.note {
+    background-color: #eee;
+    border: 1px solid #ccc;
+}
+ 
+div.seealso {
+    background-color: #ffc;
+    border: 1px solid #ff6;
+}
+ 
+div.topic {
+    background-color: #eee;
+}
+ 
+div.warning {
+    background-color: #ffe4e4;
+    border: 1px solid #f66;
+}
+ 
+p.admonition-title {
+    display: inline;
+}
+ 
+p.admonition-title:after {
+    content: ":";
+}
+ 
+pre {
+    padding: 10px;
+    background-color: White;
+    color: #222;
+    line-height: 1.2em;
+    border: 1px solid #C6C9CB;
+    font-size: 1.2em;
+    margin: 1.5em 0 1.5em 0;
+    -webkit-box-shadow: 1px 1px 1px #d8d8d8;
+    -moz-box-shadow: 1px 1px 1px #d8d8d8;
+}
+ 
+tt {
+    background-color: #ecf0f3;
+    color: #222;
+    padding: 1px 2px;
+    font-size: 1.2em;
+    font-family: monospace;
+}
+
diff --git a/doc/pacbio-theme/static/pacbioLogo.png b/doc/pacbio-theme/static/pacbioLogo.png
new file mode 100644
index 0000000..b2e4887
Binary files /dev/null and b/doc/pacbio-theme/static/pacbioLogo.png differ
diff --git a/doc/pacbio-theme/static/pygments.css b/doc/pacbio-theme/static/pygments.css
new file mode 100644
index 0000000..4588cde
--- /dev/null
+++ b/doc/pacbio-theme/static/pygments.css
@@ -0,0 +1,55 @@
+.c { color: #999988; font-style: italic } /* Comment */
+.k { font-weight: bold } /* Keyword */
+.o { font-weight: bold } /* Operator */
+.cm { color: #999988; font-style: italic } /* Comment.Multiline */
+.cp { color: #999999; font-weight: bold } /* Comment.preproc */
+.c1 { color: #999988; font-style: italic } /* Comment.Single */
+.gd { color: #000000; background-color: #ffdddd } /* Generic.Deleted */
+.ge { font-style: italic } /* Generic.Emph */
+.gr { color: #aa0000 } /* Generic.Error */
+.gh { color: #999999 } /* Generic.Heading */
+.gi { color: #000000; background-color: #ddffdd } /* Generic.Inserted */
+.go { color: #111 } /* Generic.Output */
+.gp { color: #555555 } /* Generic.Prompt */
+.gs { font-weight: bold } /* Generic.Strong */
+.gu { color: #aaaaaa } /* Generic.Subheading */
+.gt { color: #aa0000 } /* Generic.Traceback */
+.kc { font-weight: bold } /* Keyword.Constant */
+.kd { font-weight: bold } /* Keyword.Declaration */
+.kp { font-weight: bold } /* Keyword.Pseudo */
+.kr { font-weight: bold } /* Keyword.Reserved */
+.kt { color: #445588; font-weight: bold } /* Keyword.Type */
+.m { color: #009999 } /* Literal.Number */
+.s { color: #bb8844 } /* Literal.String */
+.na { color: #008080 } /* Name.Attribute */
+.nb { color: #999999 } /* Name.Builtin */
+.nc { color: #445588; font-weight: bold } /* Name.Class */
+.no { color: #ff99ff } /* Name.Constant */
+.ni { color: #800080 } /* Name.Entity */
+.ne { color: #990000; font-weight: bold } /* Name.Exception */
+.nf { color: #990000; font-weight: bold } /* Name.Function */
+.nn { color: #555555 } /* Name.Namespace */
+.nt { color: #000080 } /* Name.Tag */
+.nv { color: purple } /* Name.Variable */
+.ow { font-weight: bold } /* Operator.Word */
+.mf { color: #009999 } /* Literal.Number.Float */
+.mh { color: #009999 } /* Literal.Number.Hex */
+.mi { color: #009999 } /* Literal.Number.Integer */
+.mo { color: #009999 } /* Literal.Number.Oct */
+.sb { color: #bb8844 } /* Literal.String.Backtick */
+.sc { color: #bb8844 } /* Literal.String.Char */
+.sd { color: #bb8844 } /* Literal.String.Doc */
+.s2 { color: #bb8844 } /* Literal.String.Double */
+.se { color: #bb8844 } /* Literal.String.Escape */
+.sh { color: #bb8844 } /* Literal.String.Heredoc */
+.si { color: #bb8844 } /* Literal.String.Interpol */
+.sx { color: #bb8844 } /* Literal.String.Other */
+.sr { color: #808000 } /* Literal.String.Regex */
+.s1 { color: #bb8844 } /* Literal.String.Single */
+.ss { color: #bb8844 } /* Literal.String.Symbol */
+.bp { color: #999999 } /* Name.Builtin.Pseudo */
+.vc { color: #ff99ff } /* Name.Variable.Class */
+.vg { color: #ff99ff } /* Name.Variable.Global */
+.vi { color: #ff99ff } /* Name.Variable.Instance */
+.il { color: #009999 } /* Literal.Number.Integer.Long */
+
diff --git a/doc/pacbio-theme/theme.conf b/doc/pacbio-theme/theme.conf
new file mode 100644
index 0000000..dd24a1a
--- /dev/null
+++ b/doc/pacbio-theme/theme.conf
@@ -0,0 +1,4 @@
+[theme]
+inherit = default 
+stylesheet = pacbio.css
+pygments_style = tango
diff --git a/etc/aligned_reads_ss.cmp.h5 b/etc/aligned_reads_ss.cmp.h5
new file mode 100644
index 0000000..649a3e5
Binary files /dev/null and b/etc/aligned_reads_ss.cmp.h5 differ
diff --git a/etc/grouped.csv b/etc/grouped.csv
new file mode 100644
index 0000000..980cd60
--- /dev/null
+++ b/etc/grouped.csv
@@ -0,0 +1,35 @@
+Group,Movie,HoleNumber
+grpD,m121005_190843_42142_c100376662550000001523029810101285_s1_p0,57863
+grpC,m121005_190843_42142_c100376662550000001523029810101285_s1_p0,27852
+grpB,m121005_210848_42142_c100376662550000001523029810101286_s1_p0,59859
+grpC,m121005_190843_42142_c100376662550000001523029810101285_s2_p0,13786
+grpD,m121005_190843_42142_c100376662550000001523029810101285_s2_p0,81217
+grpB,m121005_210848_42142_c100376662550000001523029810101286_s2_p0,45041
+grpA,m121005_210848_42142_c100376662550000001523029810101286_s1_p0,68098
+grpC,m121005_190843_42142_c100376662550000001523029810101285_s1_p0,28276
+grpD,m121005_210848_42142_c100376662550000001523029810101286_s1_p0,5427
+grpD,m121005_210848_42142_c100376662550000001523029810101286_s2_p0,17227
+grpA,m121005_210848_42142_c100376662550000001523029810101286_s1_p0,52328
+grpB,m121005_210848_42142_c100376662550000001523029810101286_s2_p0,79655
+grpA,m121005_210848_42142_c100376662550000001523029810101286_s1_p0,80168
+grpC,m121005_190843_42142_c100376662550000001523029810101285_s2_p0,42546
+grpC,m121005_190843_42142_c100376662550000001523029810101285_s2_p0,30276
+grpC,m121005_190843_42142_c100376662550000001523029810101285_s2_p0,37436
+grpD,m121005_210848_42142_c100376662550000001523029810101286_s1_p0,47817
+grpA,m121005_190843_42142_c100376662550000001523029810101285_s1_p0,7444
+grpA,m121005_190843_42142_c100376662550000001523029810101285_s1_p0,58620
+grpC,m121005_190843_42142_c100376662550000001523029810101285_s1_p0,33946
+grpB,m121005_190843_42142_c100376662550000001523029810101285_s2_p0,30195
+grpD,m121005_190843_42142_c100376662550000001523029810101285_s2_p0,42437
+grpD,m121005_190843_42142_c100376662550000001523029810101285_s2_p0,15557
+grpD,m121005_190843_42142_c100376662550000001523029810101285_s2_p0,75537
+grpD,m121005_190843_42142_c100376662550000001523029810101285_s1_p0,54693
+grpD,m121005_210848_42142_c100376662550000001523029810101286_s1_p0,7823
+grpA,m121005_210848_42142_c100376662550000001523029810101286_s1_p0,57338
+grpB,m121005_210848_42142_c100376662550000001523029810101286_s2_p0,60875
+grpB,m121005_210848_42142_c100376662550000001523029810101286_s1_p0,62509
+grpC,m121005_210848_42142_c100376662550000001523029810101286_s2_p0,15216
+grpA,m121005_190843_42142_c100376662550000001523029810101285_s1_p0,15690
+grpA,m121005_190843_42142_c100376662550000001523029810101285_s2_p0,38570
+grpB,m121005_210848_42142_c100376662550000001523029810101286_s2_p0,46509
+grpA,m121005_210848_42142_c100376662550000001523029810101286_s1_p0,80924
diff --git a/pbh5tools/CmpH5Compare.py b/pbh5tools/CmpH5Compare.py
new file mode 100644
index 0000000..34c5d5b
--- /dev/null
+++ b/pbh5tools/CmpH5Compare.py
@@ -0,0 +1,94 @@
+#################################################################################
+# Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+#   notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+#   notice, this list of conditions and the following disclaimer in the
+#   documentation and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+# THIS LICENSE.  THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#################################################################################
+
+
+import logging
+import os
+import numpy as NP
+import h5py as H5
+
+from pbcore.io import CmpH5Reader
+from pbh5tools.PBH5ToolsException import PBH5ToolsException
+from pbh5tools.Metrics import *
+from mlab import rec2csv, rec2txt
+
+def cmpH5Equal(inCmp1, inCmp2):
+    """Compare two cmp.h5 files for equality. Here equality means the
+    alignments are the same and they are in the same
+    order. Additionally, the reference information in the files has to
+    be the same."""
+    cmp1 = CmpH5Reader(inCmp1)
+    cmp2 = CmpH5Reader(inCmp2)
+
+    if not len(cmp1) == len(cmp2):
+        return (False, "cmp.h5 files differ in length (%d, %d)" %
+                (len(cmp1), len(cmp2)))
+
+    aeq = [ a1 == a2 for a1,a2 in zip(cmp1, cmp2) ]
+    if not all(aeq):
+        return (False, "%d alignments differ" %  (len(aeq)-sum(aeq)))
+
+    return (True, )
+
+def cmpH5Summarize(inCmp, movieSummary = True, refSummary = True):
+    """Summarize a cmp.h5 file"""
+    reader = CmpH5Reader(inCmp)
+    tstr   = "filename: %s\nversion:  %s\nn reads:  %d\nn refs:   " + \
+        "%d\nn movies: %d\nn bases:  %d\navg rl:   %d\navg acc:  %g"
+    
+    rl,acc,mov = zip(*[(r.readLength,r.accuracy,r.movieInfo[0]) for r in reader ])
+    
+    summaryStr = (tstr % (os.path.basename(reader.file.filename), reader.version, len(reader),
+                          len(reader.referenceInfoTable), len(set(mov)), NP.sum(rl),
+                          NP.round(NP.mean(rl)), NP.round(NP.mean(acc), 4)))
+    eTbl = Tbl(nBases = Sum(ReadLength), avgReadLength = Mean(ReadLength), 
+               avgAccuracy = Mean(Accuracy))
+    
+    movieSummaryTxt = rec2txt(toRecArray(query(reader, what = eTbl, groupBy = Movie)),
+                              padding = 5, precision = 1)
+    
+    refSummaryTxt = rec2txt(toRecArray(query(reader, what = eTbl, groupBy = Reference)),
+                            padding = 5, precision = 1)
+   
+    return (summaryStr + 
+            ("\n\n\t Movie Summary:\n" + (movieSummaryTxt if movieSummary else "\n")) + 
+            ("\n\n\t Reference Summary:\n" + (refSummaryTxt if refSummary else "\n")))
+    
+
+def cmpH5Validate(inCmp):
+    """Validate a cmp.h5 file by checking that it can be opened."""
+    try:
+        CmpH5Reader(inCmp)
+        return True
+    except Exception:
+        return False
+
+
+
diff --git a/pbh5tools/CmpH5Format.py b/pbh5tools/CmpH5Format.py
new file mode 100755
index 0000000..9aa53a2
--- /dev/null
+++ b/pbh5tools/CmpH5Format.py
@@ -0,0 +1,76 @@
+#################################################################################
+# Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+#   notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+#   notice, this list of conditions and the following disclaimer in the
+#   documentation and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+# THIS LICENSE.  THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#################################################################################
+
+class CmpH5Format:
+    def __init__(self, cmpH5):
+        if ('Version' in cmpH5.attrs):
+            self.VERSION = cmpH5.attrs['Version']
+
+        self.ALN_INFO             = 'AlnInfo'
+        self.REF_INFO             = 'RefInfo'
+        self.MOVIE_INFO           = 'MovieInfo'
+        self.REF_GROUP            = 'RefGroup'
+        self.ALN_GROUP            = 'AlnGroup'
+        self.ALN_INDEX_NAME       = 'AlnIndex'
+        self.FILE_LOG             = 'FileLog'
+        self.BARCODE_INFO         = 'BarcodeInfo'
+
+        self.ALN_INDEX            = '/'.join([self.ALN_INFO, self.ALN_INDEX_NAME])
+        self.REF_GROUP_ID         = '/'.join([self.REF_GROUP, 'ID'])
+        self.REF_GROUP_PATH       = '/'.join([self.REF_GROUP, 'Path'])
+        self.REF_GROUP_INFO_ID    = '/'.join([self.REF_GROUP, 'RefInfoID'])
+
+        self.REF_OFFSET_TABLE     = '/'.join([self.REF_GROUP, 'OffsetTable'])
+        self.ALN_GROUP_ID         = '/'.join([self.ALN_GROUP, 'ID'])
+        self.ALN_GROUP_PATH       = '/'.join([self.ALN_GROUP, 'Path'])
+
+        # Movie Info
+        self.MOVIE_INFO_ID         = '/'.join([self.MOVIE_INFO, 'ID'])
+        self.MOVIE_INFO_NAME       = '/'.join([self.MOVIE_INFO, 'Name'])
+        self.MOVIE_INFO_EXP        = '/'.join([self.MOVIE_INFO, 'Exp'])
+        self.MOVIE_INFO_FRAME_RATE = '/'.join([self.MOVIE_INFO, 'FrameRate'])
+        self.MOVIE_INFO_RUN        = '/'.join([self.MOVIE_INFO, 'Run'])
+        self.MOVIE_INFO_BINDING_KIT      = '/'.join([self.MOVIE_INFO, 'BindingKit'])
+        self.MOVIE_INFO_SEQUENCING_KIT   = '/'.join([self.MOVIE_INFO, 'SequencingKit'])
+        self.MOVIE_INFO_SOFTWARE_VERSION = '/'.join([self.MOVIE_INFO, 'SoftwareVersion'])
+
+        (self.ID, self.ALN_ID, self.MOVIE_ID, self.REF_ID, self.TARGET_START,
+         self.TARGET_END, self.RC_REF_STRAND, self.HOLE_NUMBER, self.SET_NUMBER,
+         self.STROBE_NUMBER, self.MOLECULE_ID, self.READ_START, self.READ_END,
+         self.MAP_QV, self.N_MATCHES, self.N_MISMATCHES, self.N_INSERTIONS,
+         self.N_DELETIONS, self.OFFSET_BEGIN, self.OFFSET_END, self.N_BACK,
+         self.N_OVERLAP) = range(0, 22)
+
+        self.extraTables = ['/'.join([self.ALN_INFO, x]) for x in
+                            cmpH5[self.ALN_INFO].keys()
+                            if not x == self.ALN_INDEX_NAME]
+        # sorting
+        self.INDEX_ATTR = "Index"
+        self.INDEX_ELTS = ['REF_ID', 'TARGET_START', 'TARGET_END']
diff --git a/pbh5tools/CmpH5Merge.py b/pbh5tools/CmpH5Merge.py
new file mode 100755
index 0000000..bc953d1
--- /dev/null
+++ b/pbh5tools/CmpH5Merge.py
@@ -0,0 +1,338 @@
+#################################################################################
+# Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+#   notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+#   notice, this list of conditions and the following disclaimer in the
+#   documentation and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+# THIS LICENSE.  THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#################################################################################
+
+import os
+import logging
+
+import h5py as H5
+import numpy as NP
+
+from pbh5tools.PBH5ToolsException import PBH5ToolsException
+from pbh5tools.CmpH5Format import CmpH5Format
+from pbh5tools.CmpH5Utils import copyAttributes, deleteAttrIfExists
+
+def makeRefName(rID):
+    return "ref%06d" % rID
+
+def allEqual(v):
+    def f(e):
+        return e.all() if type(e) == NP.ndarray else e
+    return all(f(v[0] == a) for a in v)
+
+def empty(cmpH5):
+    return cmpH5[CmpH5Format(cmpH5).ALN_INDEX].shape[0] <= 0
+
+def processMovies(outCmp, inCmps, fmt):
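+    # Build the single MovieInfo table for the output file: movies are
+    # deduplicated by name across all input files and assigned new
+    # sequential IDs, which cmpH5Merge later uses to remap MOVIE_ID.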
+    umovies = []
+    newMovieID = 1
+
+    def l(i, nm):
+        return cmpH5[nm][i] if nm in cmpH5 else None
+    def w(i, nm):
+        if any([z[i] is None for z in umovies]):
+            logging.info("No dataset: %s" % nm)
+        else:
+            # take dtype from previous dataset.
+            outCmp.create_dataset(nm, data = NP.array([z[i] for z in umovies]),
+                                  dtype = inCmps[0][nm].dtype)
+    for cmpH5 in inCmps:
+        for i,n in enumerate(cmpH5[fmt.MOVIE_INFO_NAME]):
+            if not n in [u[1] for u in umovies]:
+                umovies.append((newMovieID,
+                                l(i, fmt.MOVIE_INFO_NAME),
+                                l(i, fmt.MOVIE_INFO_FRAME_RATE),
+                                l(i, fmt.MOVIE_INFO_RUN),
+                                l(i, fmt.MOVIE_INFO_EXP),
+                                l(i, fmt.MOVIE_INFO_BINDING_KIT),
+                                l(i, fmt.MOVIE_INFO_SEQUENCING_KIT),
+                                l(i, fmt.MOVIE_INFO_SOFTWARE_VERSION)))
+                newMovieID += 1
+    # write the information to the new file.
+    w(0, fmt.MOVIE_INFO_ID)
+    w(1, fmt.MOVIE_INFO_NAME)
+    w(2, fmt.MOVIE_INFO_FRAME_RATE)
+    w(3, fmt.MOVIE_INFO_RUN)
+    w(4, fmt.MOVIE_INFO_EXP)
+    w(5, fmt.MOVIE_INFO_BINDING_KIT)
+    w(6, fmt.MOVIE_INFO_SEQUENCING_KIT)
+    w(7, fmt.MOVIE_INFO_SOFTWARE_VERSION)
+
+    return umovies
+
+def makeOrAppend(outCmp, dsName, newDta, chunks = True):
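+    # Create dsName as a chunked, unlimited-size dataset on first use;
+    # on subsequent calls grow it along the first axis and append newDta.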
+    if not dsName in outCmp:
+        outCmp.create_dataset(dsName, data = newDta, chunks = chunks,
+                              maxshape = tuple([None for x in newDta.shape]))
+    else:
+        d = outCmp[dsName]
+        e = d.shape[0]
+        if len(d.shape) == 2:
+            d.resize((e + newDta.shape[0], d.shape[1]))
+            d[e:(e + newDta.shape[0]), ] = newDta
+        else:
+            d.resize((e + newDta.shape[0],))
+            d[e:(e + newDta.shape[0])] = newDta
+
+
+def _fileExists(fileName):
+    if os.path.isfile(fileName):
+        return os.path.abspath(fileName)
+    else:
+        raise IOError("Unable to find {f}".format(f=fileName))
+
+
+def cmpH5Merge(inFiles, outFile, referencesFile=None):
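+    """Merge the alignments from inFiles (cmp.h5 paths and/or .fofn files
+    listing them) into a single cmp.h5 file at outFile. If referencesFile
+    is given, keep only alignments to the references named in it."""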
+    
+    # expand any fofns in inFiles
+    expandedInFiles = []
+    for fileName in inFiles:
+        if fileName.endswith(".fofn"):
+            expandedInFiles.extend([k.strip() for k in open(fileName).readlines()])
+        else:
+            expandedInFiles.append(fileName)
+
+    # input validation. This is kinda clunky
+    inps = {_fileExists(f) for f in expandedInFiles}
+
+    # Not sure if this is the expected behavior
+    if os.path.isabs(outFile):
+        outp = outFile
+    else:
+        outp = os.path.join(os.getcwd(), outFile)
+
+    if outp in inps:
+        raise ValueError("Outfile {f} was provided as an input file.".format(f=outp))
+    
+    if referencesFile is not None:
+        selectedReferences = [k.strip() for k in open(referencesFile).readlines()]
+    else:
+        selectedReferences = None
+
+    # start the analysis
+    try:
+        logging.debug("Processing:\n\t" + "\t\n".join(expandedInFiles))
+        logging.debug("Writing to:" + str(outFile))
+
+        inCmps = [H5.File(z, 'r') for z in expandedInFiles]
+        outCmp = H5.File(outFile, 'w')
+
+        logging.debug("Loaded input and output h5 files.")
+
+        if not allEqual([CmpH5Format(z).VERSION for z in inCmps]):
+            raise PBH5ToolsException("merge", "Different cmp.h5 versions.")
+
+        fmt = CmpH5Format(inCmps[0])
+
+        if not allEqual([z[fmt.REF_INFO]['MD5'].value for z in inCmps]):
+            raise PBH5ToolsException("merge", "Different reference sequences.")
+
+        # Remove cmp.h5 files which have no alignment
+        inNonEmptyCmps = []
+        for f in inCmps:
+            alnNum = 0
+            try:
+                alnNum = f['/AlnInfo/AlnIndex'].shape[0]
+                if alnNum > 0:
+                    inNonEmptyCmps.append(f)
+                else:
+                    logging.warn("Skipping emtpy file: %s" % f.filename)
+            except Exception:
+                logging.warn("Skipping emtpy file: %s" % f.filename)
+
+        inCmps = inNonEmptyCmps
+
+        if not len(inCmps):
+            raise PBH5ToolsException("merge", "No non-empty files to merge.")
+
+        # check for consistency of things like barcode and edna/z score
+        # datasets.
+        hasBarcode = all([ fmt.BARCODE_INFO in z for z in inCmps ])
+        extraDatasets = [set(filter(lambda x : not x == fmt.ALN_INDEX_NAME,
+                                    z[fmt.ALN_INFO].keys())) for z in inCmps ]
+        extraDatasets = reduce(set.intersection, extraDatasets)
+
+        def filterPrint(x):
+            if empty(x):
+                logging.warn("Skipping emtpy file: %s" % x.filename)
+                return False
+            else:
+                return True
+        inCmps = filter(filterPrint, inCmps)
+
+        if not len(inCmps):
+            raise PBH5ToolsException("merge", "No non-empty files to merge.")
+
+        # copy REF_INFO, FILE_LOG, and BARCODE_INFO if its there.
+        outCmp.copy(inCmps[0][fmt.REF_INFO], fmt.REF_INFO)
+        outCmp.copy(inCmps[0][fmt.FILE_LOG], fmt.FILE_LOG)
+
+        if hasBarcode:
+            outCmp.copy(inCmps[0][fmt.BARCODE_INFO], fmt.BARCODE_INFO)
+
+        # top-level attributes.
+        copyAttributes(inCmps[0], outCmp)
+        deleteAttrIfExists(outCmp, fmt.INDEX_ATTR)
+
+        # go through by REF_INFO_ID and select the relevant bits from each file.
+        refInfoIDs = outCmp[fmt.REF_INFO]['ID'].value
+
+        # process the movies upfront.
+        umovies = processMovies(outCmp, inCmps, fmt)
+
+        # an increment for new ALN_GROUP/ID values
+        alnIDBegin = 1
+
+        # either way you structure the loops annoyances arise.
+        for cmpH5 in inCmps:
+            logging.debug("Processing: %s" % cmpH5.filename)
+
+            # we are going to map the ref ids into the globally unique
+            # refInfoIDs.
+            refIDMap = dict(zip(cmpH5[fmt.REF_GROUP_ID].value,
+                                cmpH5[fmt.REF_GROUP_INFO_ID].value))
+            
+            refPathMap = dict(zip(cmpH5[fmt.REF_GROUP_INFO_ID].value,
+                                  [os.path.basename(k) for k in cmpH5[fmt.REF_GROUP_PATH]]))
+
+            # make a map from this cmpH5's movies to the new movie ID.
+            movieMap = {}
+            for oid,nm in zip(cmpH5[fmt.MOVIE_INFO_ID], cmpH5[fmt.MOVIE_INFO_NAME]):
+                newID = [z[0] for z in umovies if z[1] == nm]
+                if len(newID) == 1:
+                    movieMap[oid] = newID[0]
+                else:
+                    raise PBH5ToolsException("merge", "Error processing movies.")
+
+            for rID in refInfoIDs:
+                if rID not in refIDMap.values():
+                    logging.info("Skipping reference with no reads.")
+                    continue
+                if selectedReferences is not None:
+                    if refPathMap[rID] not in selectedReferences:
+                        continue
+
+
+                # compute new reference ID.
+                aIdx    = cmpH5[fmt.ALN_INDEX].value
+                refID   = {x:y for y,x in refIDMap.iteritems()}[rID]
+                refName = makeRefName(rID)
+
+                # which reads go to this reference.
+                whichReads = aIdx[:,fmt.REF_ID] == refID
+                if not any(whichReads):
+                    # this should be covered by the test at the top,
+                    # but it is not really perfectly defined by the
+                    # spec as to whether something in the ref group
+                    # *has* to have alignments.
+                    continue
+                aIdx = aIdx[whichReads, ]
+                aIdx[:,fmt.REF_ID] = rID
+
+                # make a map between old and new IDs
+                uAlnIDs  = NP.unique(aIdx[:,fmt.ALN_ID])
+                alnIDMap = dict(zip(uAlnIDs, NP.array(range(0, len(uAlnIDs))) +
+                                    alnIDBegin))
+                alnGroup = {k:v for k,v in zip(cmpH5[fmt.ALN_GROUP_ID].value,
+                                               cmpH5[fmt.ALN_GROUP_PATH].value) if \
+                                k in uAlnIDs}
+                newAlnGroup = [(alnIDMap[k],
+                                "/%s/%s-%d" % (refName, os.path.basename(alnGroup[k]),
+                                               alnIDMap[k]),
+                                alnGroup[k]) for k in alnGroup.keys()]
+
+                # Set the new ALN_ID vals in the ALN_INDEX.
+                aIdx[:,fmt.ALN_ID] = NP.array([alnIDMap[aIdx[i,fmt.ALN_ID]] for i in
+                                               range(0, aIdx.shape[0])])
+                # Set the new MOVIE_ID vals.
+                aIdx[:,fmt.MOVIE_ID] = NP.array([movieMap[aIdx[i,fmt.MOVIE_ID]] for i in
+                                                 range(0, aIdx.shape[0])])
+
+                # copy the array data.
+                for (nid,newGroup,oldGroup) in newAlnGroup:
+                    logging.debug("Copying: \nfrom: %s \ninto: %s" % \
+                                      (oldGroup, newGroup))
+                    if not os.path.dirname(newGroup) in outCmp:
+                        outCmp.create_group(refName)
+                    outCmp.copy(cmpH5[oldGroup], outCmp[refName],
+                                name = os.path.basename(newGroup))
+
+                # increment the ALN_GROUP id offset.
+                alnIDBegin = alnIDBegin + len(uAlnIDs)
+
+                # write the adjusted alignment information.
+                makeOrAppend(outCmp, fmt.ALN_INDEX, aIdx)
+
+                # write extra datasets in the ALN_INFO group
+                for extra in extraDatasets:
+                    pth = '/'.join([fmt.ALN_INFO, extra])
+                    logging.info("Processing extra dataset: %s" % pth)
+                    makeOrAppend(outCmp, pth, cmpH5[pth].value[whichReads,])
+
+                # write the ALN_GROUP.
+                makeOrAppend(outCmp, fmt.ALN_GROUP_ID,
+                             NP.array([nid for nid,a,b in newAlnGroup],
+                                      dtype = cmpH5[fmt.ALN_GROUP_ID].dtype))
+                makeOrAppend(outCmp, fmt.ALN_GROUP_PATH,
+                             NP.array([npth for a,npth,b in newAlnGroup],
+                                      dtype = cmpH5[fmt.ALN_GROUP_PATH].dtype))
+
+        # now depending on what references had alignments we'll make the
+        # new REF_GROUP.
+        uRefsWithAlignments = NP.unique(outCmp[fmt.ALN_INDEX][:,fmt.REF_ID])
+        outCmp.create_dataset(fmt.REF_GROUP_ID, data = uRefsWithAlignments,
+                              dtype = inCmps[0][fmt.REF_GROUP_ID].dtype)
+        outCmp.create_dataset(fmt.REF_GROUP_PATH,
+                              data = NP.array([('/' + makeRefName(z)) for z in
+                                               uRefsWithAlignments]),
+                              dtype = inCmps[0][fmt.REF_GROUP_PATH].dtype)
+        outCmp.create_dataset(fmt.REF_GROUP_INFO_ID, data = uRefsWithAlignments,
+                              dtype = inCmps[0][fmt.REF_GROUP_INFO_ID].dtype)
+
+        # reset the IDs
+        outCmp[fmt.ALN_INDEX][:,fmt.ID] = range(1, outCmp[fmt.ALN_INDEX].shape[0] + 1)
+        # reset the molecule IDs
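+        # each (movie, hole) pair gets its own ID: hole numbers are offset
+        # into per-movie blocks sized by the largest input molecule ID.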
+        outCmp[fmt.ALN_INDEX][:,fmt.MOLECULE_ID] = \
+            ((NP.max(outCmp[fmt.ALN_INDEX][:,fmt.MOLECULE_ID]) *
+              (outCmp[fmt.ALN_INDEX][:,fmt.MOVIE_ID] - 1)) +
+             outCmp[fmt.ALN_INDEX][:,fmt.HOLE_NUMBER] + 1)
+
+        # close the sucker.
+        outCmp.close()
+
+    except Exception, e:
+        try:
+            # remove the file as it won't be correct
+            if os.path.exists(outFile):
+                os.remove(outFile)
+        except:
+            pass
+        raise
diff --git a/pbh5tools/CmpH5Select.py b/pbh5tools/CmpH5Select.py
new file mode 100755
index 0000000..9d9d104
--- /dev/null
+++ b/pbh5tools/CmpH5Select.py
@@ -0,0 +1,167 @@
+#################################################################################
+# Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+#   notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+#   notice, this list of conditions and the following disclaimer in the
+#   documentation and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+# THIS LICENSE.  THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#################################################################################
+
+import os
+import sys
+import shutil
+import datetime
+import logging
+import tempfile
+
+import h5py as H5
+import numpy as NP
+
+from pbcore.io import CmpH5Reader
+
+from pbh5tools.PBH5ToolsException import PBH5ToolsException
+from pbh5tools.CmpH5Format import CmpH5Format
+from pbh5tools.CmpH5Utils import *
+from pbh5tools.Metrics import *
+
+def cmpH5Select(inCmpFile, outCmp, idxs = None,
+                groupByStr = None, groupByCsv = None, 
+                whereStr = None, outDir = "."):
+    """Take a vector of indices or a where expression and select a set
+    of alignments. If a groupBy is specified, then produce a cmp.h5
+    file for each distinct member of the grouping."""
+    if idxs:
+        doSelect(inCmpFile, outCmp, idxs)
+    else:
+        where   = DefaultWhere if whereStr is None else eval(whereStr)
+        groupBy = DefaultGroupBy if groupByStr is None else eval(groupByStr)
+        idxVecs = query(CmpH5Reader(inCmpFile),
+                        what = AlignmentIdx,
+                        where = where,
+                        groupBy = groupBy,
+                        groupByCsv = groupByCsv )
+        keys = idxVecs.keys()
+
+        ## XXX: Should the resultant files be sorted? 
+        if len(keys) == 1:
+            doSelect(inCmpFile, outCmp, idxVecs[keys[0]])
+        else:
+            for k in keys:
+                # For groupByCsv, skip groups of indices not identified in the CSV
+                if k == NOTINCSV_LABEL:
+                    continue
+                logging.debug("Processing output for %s" % str(k))
+                doSelect(inCmpFile, "/".join([outDir, "%s.cmp.h5" % str(k)]), idxVecs[k])
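+
+# A minimal usage sketch (hypothetical filenames; Movie and the rest of the
+# metric DSL come from pbh5tools.Metrics, imported above):
+#
+#   cmpH5Select("in.cmp.h5", "out.cmp.h5",
+#               whereStr   = "ReadLength > 1000",
+#               groupByStr = "Movie")
+#
+# With a groupBy, one <group>.cmp.h5 is written to outDir per distinct value.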
+
+def doSelect(inCmpFile, outCmpFile, idxs):
+    """Take an input cmp.h5 file and a vector of indices into the
+    AlnIndex and create a new cmp.h5 file from those alignments."""
+    def trimDataset(groupName, alnIdxID, inCmp, outCmp, fmt, idName = 'ID'):
+        ids = outCmp[fmt.ALN_INDEX][:,alnIdxID]
+        nds = '/'.join([groupName, idName])
+        msk = NP.array([x in ids for x in inCmp[nds].value]) # must be an NP.array to act as a selection mask
+        for dsName in inCmp[groupName].keys():
+            copyDataset('/'.join([groupName, dsName]), inCmp, outCmp,
+                        msk, fmt)
+
+    def copyGroup(groupName, inCmp, outCmp):
+        if groupName in inCmp:
+            outCmp.copy(inCmp[groupName], groupName)
+
+    try:
+        inCmp  = H5.File(inCmpFile, 'r')
+        outCmp = H5.File(outCmpFile, 'w') # fail if it exists.
+        idxs   = NP.array(idxs)
+        fmt    = CmpH5Format(inCmp)
+
+        if not (NP.max(idxs) < inCmp[fmt.ALN_INDEX].shape[0] and
+                NP.min(idxs) >= 0):
+            raise PBH5ToolsException("Invalid idxs specified, must be within [0, %d)" %
+                                     inCmp[fmt.ALN_INDEX].shape[0])
+
+        # copy over the AlnIndex and other AlnInfo elements
+        # corresponding to idxs to the new file.
+        for dsName in inCmp[fmt.ALN_INFO].keys():
+            copyDataset('/'.join([fmt.ALN_INFO, dsName]), inCmp, outCmp, idxs, fmt)
+
+        # reset the ALN_ID
+        outCmp[fmt.ALN_INDEX][:,fmt.ID] = \
+            NP.array(range(1, outCmp[fmt.ALN_INDEX].shape[0] + 1))
+
+        # trim the other datasets
+        trimDataset(fmt.ALN_GROUP, fmt.ALN_ID, inCmp, outCmp, fmt)
+        # trimDataset(fmt.REF_GROUP, fmt.REF_ID, inCmp, outCmp, fmt)
+        # trimDataset(fmt.MOVIE_INFO, fmt.MOVIE_ID, inCmp, outCmp, fmt)
+        # copy Ref,Movie dataset whole
+        for groupName in [fmt.REF_GROUP,fmt.MOVIE_INFO]:
+            for dsName in inCmp[groupName].keys():
+                copyDataset('/'.join([groupName,dsName]), inCmp, outCmp, None, fmt)
+
+        # other groups will go over whole hog
+        copyGroup(fmt.FILE_LOG, inCmp, outCmp)
+        copyGroup(fmt.REF_INFO, inCmp, outCmp)
+        copyGroup(fmt.BARCODE_INFO, inCmp, outCmp)
+
+        # now we copy over the actual data
+        for i in xrange(0, outCmp[fmt.ALN_GROUP_ID].shape[0]):
+            # figure out what reads are in this group.
+            agID = outCmp[fmt.ALN_GROUP_ID][i]
+            agPT = outCmp[fmt.ALN_GROUP_PATH][i]
+            alnIdx = outCmp[fmt.ALN_INDEX].value
+            whReads = NP.where(agID == alnIdx[:,fmt.ALN_ID])[0]
+            offBegin = alnIdx[whReads, fmt.OFFSET_BEGIN]
+            offEnd = alnIdx[whReads, fmt.OFFSET_END]
+            totalSize = NP.sum((offEnd - offBegin) + 1) # 0 in between
+
+            for dsName in inCmp[agPT].keys():
+                fullPath = '/'.join([agPT, dsName])
+                newDs = outCmp.create_dataset(fullPath, shape = (totalSize,),
+                                              dtype = inCmp[fullPath].dtype)
+                origDs = inCmp[fullPath]
+                cs = 0
+                for j in xrange(0, len(whReads)):
+                    newEnd = cs + offEnd[j] - offBegin[j]
+                    newDs[cs:newEnd] = origDs[offBegin[j]:offEnd[j]]
+                    outCmp[fmt.ALN_INDEX][whReads[j],fmt.OFFSET_BEGIN] = cs
+                    outCmp[fmt.ALN_INDEX][whReads[j],fmt.OFFSET_END] = newEnd
+                    cs = newEnd
+
+        # copy over the top-level attributes
+        copyAttributes(inCmp, outCmp)
+
+        # remove the offset table
+        deleteIfExists(outCmp, fmt.REF_OFFSET_TABLE)
+        deleteAttrIfExists(outCmp, fmt.INDEX_ATTR)
+
+        # close the sucker
+        logging.debug("Closing output cmp.h5 file.")
+        outCmp.close()
+
+    except Exception, e:
+        logging.exception(e)
+        try:
+            os.remove(outCmpFile)
+        except:
+            pass
+        raise e
diff --git a/pbh5tools/CmpH5Sort.py b/pbh5tools/CmpH5Sort.py
new file mode 100644
index 0000000..c1ff5af
--- /dev/null
+++ b/pbh5tools/CmpH5Sort.py
@@ -0,0 +1,518 @@
+#################################################################################
+# Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+#   notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+#   notice, this list of conditions and the following disclaimer in the
+#   documentation and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+# THIS LICENSE.  THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#################################################################################
+
+import os
+import sys
+import shutil
+import datetime
+import logging
+import tempfile
+
+import h5py as H5
+import numpy as NP
+
+from pbh5tools.PBH5ToolsException import PBH5ToolsException
+from pbh5tools.CmpH5Format import CmpH5Format
+
+import pbcore.io.rangeQueries as RQ
+
+def numberWithinRange(s, e, vec):
+    """
+    Compute the number of elements in vec (where vec is sorted), such
+    that each element, e obeys the following constraint: start <= e <
+    end.
+    """
+    lI = RQ.leftmostBinSearch(vec, s)
+    rI = RQ.rightmostBinSearch(vec, e)
+    return(len(filter(lambda x : s <= x < e, vec[lI:rI])))
+
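+# For example, since vec must be sorted:
+#   numberWithinRange(2, 5, NP.array([1, 2, 3, 5, 7])) == 2   # elements 2 and 3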
+
+def computeIndices(tStart, tEnd):
+    """
+    Given a sorted (tStart, tEnd) compute a two-column matrix with
+    columns nBack and nOverlap.
+
+    nBack is defined as follows: given a position j, nBack is the
+    offset to the smallest i such that tEnd[i] > tStart[j], i.e.,
+    j - nBack == i.
+
+    nOverlap is defined as follows: given a position j, nOverlap is
+    the number of positions i < j such that tEnd[i] > tStart[j].
+    """
+    res = NP.zeros(2 * len(tStart), dtype = "int32").reshape(len(tStart), 2)
+
+    for i in range(len(tStart) - 1, 0, -1):
+        j = i - 1
+        nBack = 0
+        nOver = 0
+
+        while (j >= 0):
+            if (tEnd[j] > tStart[i]):
+                nOver += 1
+                nBack = (i - j)
+            j -= 1
+
+        res[i, 0] = nBack
+        res[i, 1] = nOver
+    return(res)
+
+
+def computeIndicesDP(tStart, tEnd):
+    """
+    Given a sorted tStart, tEnd compute a two-column matrix with
+    columns nBack and nOverlap.
+
+    nBack is defined as follows: given a position j, nBack is the
+    offset to the smallest i such that tEnd[i] > tStart[j], i.e.,
+    j - nBack == i.
+
+    nOverlap is defined as follows: given a position j, nOverlap is
+    the number of positions i < j such that tEnd[i] > tStart[j].
+
+    This is a dynamic programming implementation and is *substantially*
+    faster than computeIndices.
+    """
+    res = NP.zeros(2 * len(tStart), dtype = "int32").reshape(len(tStart), 2)
+    sortedEnds = NP.sort(tEnd)
+
+    for i in range(1, len(tStart)):
+        nEnding = numberWithinRange(tStart[i - 1], tStart[i], sortedEnds - 1)
+
+        if (nEnding == 0):
+            res[i, 0] = res[i - 1, 0] + 1
+            res[i, 1] = res[i - 1, 1] + 1
+        else:
+            res[i, 1] = res[i - 1, 1] - nEnding + 1
+
+            advance = 0
+            for k in range(i - 1 - res[i - 1, 0], i):
+                if (tEnd[k] > tStart[i]):
+                    break
+                advance += 1
+            res[i, 0] = res[i - 1, 0] - advance + 1
+
+    return(res)
+
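+# A small worked example covering both implementations (rows sorted by
+# tStart):
+#   tStart = NP.array([0, 2, 4]); tEnd = NP.array([5, 3, 6])
+#   computeIndicesDP(tStart, tEnd)  ->  [[0, 0], [1, 1], [2, 1]]
+# Row 2: the earliest alignment still overlapping tStart[2] = 4 sits two
+# rows back (nBack = 2), and exactly one earlier alignment overlaps it
+# (nOverlap = 1).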
+
+def computeRefIndexTable(refIDVector):
+    """
+    Compute a table of offsets for refIDVector; refIDVector must be
+    `blocked`, i.e., 1,1,2 and 2,1,1 are okay, but 1,2,1 is not. A
+    sorted refIDVector accomplishes this.
+    """
+    offsets = NP.zeros(shape = (len(NP.unique(refIDVector)), 3),
+                       dtype = "uint32")
+    row = 0
+    offsets[row,] = refIDVector[0], 0, 1
+    for ID in refIDVector[1:]:
+        if ID == offsets[row, 0]:
+            offsets[row, 2] += 1
+        else:
+            row += 1
+            offsets[row, 0] = ID
+            offsets[row, 1] = offsets[row - 1, 2]
+            offsets[row, 2] = offsets[row - 1, 2] + 1
+    return offsets
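+
+# e.g., computeRefIndexTable(NP.array([1, 1, 2, 2, 2, 3])) returns rows of
+# (refID, firstRow, lastRowExclusive): [[1, 0, 2], [2, 2, 5], [3, 5, 6]].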
+
+def __pathExists(h5, path):
+    try:
+        h5[path]
+        return True
+    except Exception, E:
+        return False
+
+def __repackDataArrays(cH5, format, fixedMem = False, maxDatasetSize = 2**31 - 1):
+    """
+    Flatten read groups according to an indexed cmp.h5 file.
+    """
+    alnGroups = [x for x in cH5[format.ALN_GROUP_PATH]]
+    pulseDatasets = [cH5[x].keys() for x in alnGroups]
+    uPulseDatasets = sorted(list(reduce(lambda x,y: set.union(set(x), set(y)), pulseDatasets)))
+
+    # check to make sure that all aln groups have the same datasets -
+    # a set.equal operation would help
+    for pd in pulseDatasets:
+         error = False
+         spd = sorted(pd)
+         if len(spd) == len(uPulseDatasets):
+              if any([ (x != y) for (x,y) in zip(spd, uPulseDatasets) ]):
+                   error = True
+         else:
+              error = True
+         if error:
+              raise PBH5ToolsException("sort", "Datasets must agree:\n" + ",".join(spd) +
+                                       "\nvs\n" + ",".join(uPulseDatasets))
+
+    readGroupPaths = dict(zip(cH5[format.ALN_GROUP_ID],
+                              [x for x in cH5[format.ALN_GROUP_PATH]]))
+    refGroupPaths  = dict(zip(cH5[format.REF_GROUP_ID],
+                              [x for x in cH5[format.REF_GROUP_PATH]]))
+    uPDAndType     = dict(zip(uPulseDatasets,
+                              [cH5[readGroupPaths.values()[0]][z].dtype
+                               for z in uPulseDatasets]))
+
+    ## XXX : this needs to be augmented with some safety against
+    ## loading too much data - set a bound on the number of elements
+    ## in the cache.
+    pdsCache = {}
+
+    def getData(read, ds, start, end):
+        key = "/".join((readGroupPaths[read[format.ALN_ID]], ds))
+        if not pdsCache.has_key(key):
+            logging.debug("Cacheing: %s" % key)
+            h5DS = cH5[key]
+            smallEnoughToFitInMemory = ((h5DS.len() * h5DS.dtype.itemsize)/1024**3) < 12
+            pdsCache[key] = h5DS
+            if not fixedMem and smallEnoughToFitInMemory:
+                logging.debug("Reading %d G." % ((h5DS.len() * h5DS.dtype.itemsize)/1024**3))
+                pdsCache[key] = pdsCache[key][:]
+        return(pdsCache[key][start:end])
+
+    def getRefGroup(gID):
+         return(cH5[refGroupPaths[gID]])
+
+    def chooseAlnGroupNames(gID, readBlocks, start = 1):
+         rGroup = cH5[refGroupPaths[gID]]
+         currentGroups = rGroup.keys()
+         pref = 'rg' + str(start) + '-'
+         newNames = [ pref + str(i) for i,j in enumerate(readBlocks) ]
+         if any([ nn in currentGroups for nn in newNames ]):
+              return chooseAlnGroupNames(gID, readBlocks, start = start + 1)
+         else:
+              return newNames
+
+    offsets = cH5[format.REF_OFFSET_TABLE].value
+    sAI = cH5[format.ALN_INDEX]
+    orderedRefPaths = [""] * offsets.shape[0]
+    maxDataSize = int(maxDatasetSize)
+
+    ## These are updated in the main loop.
+    currentAlnID = 1
+    refGroupAlnGroups = []
+
+    for row in xrange(0, offsets.shape[0]):
+        logging.info("Processing reference: %d of %d" %
+                     (row + 1, offsets.shape[0]))
+
+        groupID = offsets[row, 0]
+        fRow = offsets[row, 1]
+        lRow = offsets[row, 2]
+        if (lRow == fRow):
+            continue
+
+        reads = sAI[fRow:lRow,:]
+        alens = reads[:,format.OFFSET_END] - reads[:,format.OFFSET_BEGIN]
+        clens = NP.concatenate((NP.array([0], "uint64"), NP.cumsum(alens + 1)))
+        tSize = clens[len(clens) - 1]
+
+        readBlocks = []
+        if tSize >= maxDatasetSize:
+             lastStart  = 0
+             for i in xrange(0, len(clens)):
+                  if clens[i]-clens[lastStart] >= maxDatasetSize:
+                       readBlocks.append((lastStart, i))
+                       lastStart = i
+             if lastStart < reads.shape[0]:
+                  readBlocks.append((lastStart, reads.shape[0]))
+        else:
+             readBlocks = [(0, reads.shape[0])]
+
+        # choose all the names at once for a particular reference. It
+        # is important to ensure that sorting the same file again and
+        # again works.
+        newGroupNames = chooseAlnGroupNames(groupID, readBlocks)
+
+        for i,readBlock in enumerate(readBlocks):
+             logging.debug("Processing readBlock (%d, %d)" % readBlock)
+             dsLength = clens[readBlock[1]] - clens[readBlock[0]]
+             newGroup = getRefGroup(groupID).create_group(newGroupNames[i])
+
+             for pulseDataset in uPulseDatasets:
+                  logging.debug("Processing dataset: %s" % pulseDataset)
+                  newDS = NP.zeros(dsLength, dtype = uPDAndType[pulseDataset])
+
+                  currentStart = 0
+                  for readIdx in xrange(readBlock[0], readBlock[1]):
+                       read = reads[readIdx, ]
+                       gStart, gEnd = currentStart, currentStart + alens[readIdx]
+                       newDS[gStart:gEnd] = getData(read, pulseDataset,
+                                                    read[format.OFFSET_BEGIN],
+                                                    read[format.OFFSET_END])
+                       currentStart = gEnd + 1
+
+                  ## XXX : the tuples are necessary.
+                  newGroup.create_dataset(pulseDataset, data = newDS,
+                                          dtype = uPDAndType[pulseDataset],
+                                          maxshape = (None, ), chunks = (16384,))
+                  logging.debug("flushing:" + ",".join(pdsCache.keys()))
+                  pdsCache = {}
+
+
+
+             ## once you have moved all data for a readBlock you can change
+             ## the offsets.
+             currentStart = 0
+             for readIdx in xrange(readBlock[0], readBlock[1]):
+                  read = reads[readIdx, ]
+                  gStart, gEnd = currentStart, currentStart + alens[readIdx]
+                  reads[readIdx, format.OFFSET_BEGIN] = gStart
+                  reads[readIdx, format.OFFSET_END] = gEnd
+                  reads[readIdx, format.ALN_ID] = currentAlnID
+                  currentStart = gEnd + 1
+
+             ## now write it back; have to map back into the global coord system.
+             sAI[(fRow + readBlock[0]):(fRow + readBlock[1]),] = \
+                 reads[readBlock[0]:readBlock[1],]
+
+             ## increment the currentAlnID
+             currentAlnID = currentAlnID + 1
+
+        ## add the new group names to the list.
+        for ngn in newGroupNames:
+             refGroupAlnGroups.append("/".join((refGroupPaths[groupID], ngn)))
+
+    ## re-identify them.
+    sAI[:,format.ID] = range(1, sAI.shape[0] + 1)
+    assert(len(refGroupAlnGroups) == currentAlnID - 1)
+
+    logging.info("Writing new AlnGroupPath values.")
+    del(cH5[format.ALN_GROUP_PATH])
+    del(cH5[format.ALN_GROUP_ID])
+    cH5.create_dataset(format.ALN_GROUP_PATH, data = map(str, refGroupAlnGroups),
+                       ## XXX : unicode.
+                       dtype = H5.special_dtype(vlen = str), maxshape = (None,),
+                       chunks = (256,))
+    cH5.create_dataset(format.ALN_GROUP_ID, data = range(1, currentAlnID),
+                       dtype = "int32", maxshape = (None,), chunks = (256,))
+    logging.info("Wrote new AlnGroupPath values.")
+
+    for rg in readGroupPaths.values():
+         ## this should never be false; however, MC has produced
+         ## files in the past where there are duplicate paths with
+         ## different IDs, and deleting blindly would error here
+         ## (probably out of spec)
+         if rg in cH5:
+              del(cH5[rg])
+         else:
+              logging.warn("Input cmp.h5 file is out of spec, duplicate " +
+                           "alignment group paths with different IDs (sorting " +
+                           "is unaffected)")
+
+
+def cmpH5Sort(inFile, outFile, tmpDir, deep = True, useNative = True,
+              inPlace = False):
+    """
+    This routine takes a cmp.h5 file and sorts the AlignmentIndex
+    table adding two additional columns for fast access. In addition,
+    a new top-level attribute is added to indicate that the file
+    has been sorted, as well as a table to indicate the blocks of the
+    alignment index associated with each reference group.
+    """
+    success = False
+    fout    = None # use fout as a flag to indicate that we need to copy
+                   # something over after success.
+
+    if outFile:
+         # process the copied version, original should always be fine.
+         shutil.copyfile(inFile, outFile)
+         _inFile = outFile
+    elif inPlace:
+         # process the input directly, no copies. failure in the
+         # middle means corrupt file.
+         _inFile = inFile
+         outFile = _inFile
+    elif tmpDir:
+         # make copy, work there, copy back - safer, but slower.
+         fout = tempfile.NamedTemporaryFile(suffix = '.cmp.h5', dir = tmpDir)
+         _inFile = fout.name
+         shutil.copyfile(inFile, _inFile)
+         outFile = _inFile
+    else:
+         raise PBH5ToolsException("sort", "Improper call, must specify outFile," +
+                                  "tmpDir, or inPlace must be True.")
+
+    logging.info("Processing inFile: %s saving in outFile: %s" %
+                 (_inFile, outFile))
+
+
+    ## Setup the indexer.
+    if (useNative):
+        from pbh5tools import Indexer
+        myIndexer = Indexer.Indexer().compute
+
+    def computeIndices(tStart, tEnd):
+        if (not useNative):
+            return computeIndicesDP(tStart, tEnd)
+        else:
+            return myIndexer(tStart, tEnd)
+
+    try:
+        cH5 = H5.File(_inFile, 'a')
+        format = CmpH5Format(cH5)
+        logging.info("Read cmp.h5 with version %s" % format.VERSION)
+
+        aI = cH5[format.ALN_INDEX]
+        originalAttrs = aI.attrs.items()
+
+        ## empty is a special case. In general, h5py handles
+        ## zero-length slices poorly and therefore I don't want to
+        ## make them; the empty case is handled by returning early
+        ## here. This makes some code less pleasing, e.g., computing
+        ## the reference index data structure.
+        if (aI.shape[0] == 0):
+            logging.debug("Warning: %s empty!" % _inFile)
+            success = True
+            return True
+
+
+        # bug 22557: REF_ID isn't a `stable` identifier within the
+        # scope of a cmp.h5 file. We need to map format.REF_ID to its
+        # REF_INFO_ID, which is stable over merge/sort/split
+        # operations.
+        refIdToInfoId = dict(zip(cH5[format.REF_GROUP_ID].value,
+                                 cH5[format.REF_GROUP_INFO_ID]))
+        refInfoIds = NP.array([refIdToInfoId[i] for i in aI[:,format.REF_ID]])
+        aord = NP.lexsort([aI[:,format.TARGET_END], aI[:,format.TARGET_START],
+                           refInfoIds])
+
+        assert(len(aord) == aI.shape[0])
+
+        sAI = aI.value[aord,:]
+        del aI
+        logging.info("Sorted AlignmentIndex.")
+
+        # construct reference offset datastructure.
+        refSeqIDs = cH5[format.REF_GROUP_ID]
+        offsets = computeRefIndexTable(sAI[:,format.REF_ID])
+        logging.info("Constructed offset datastructure.")
+
+        # check that the offset data structure and the index are consistent.
+        assert(all([all(offsets[i,0] == sAI[offsets[i,1]:offsets[i,2],
+                                            format.REF_ID])
+                    for i in range(0, offsets.shape[0])]))
+
+        # fill overlap and back columns.
+        for row in range(0, offsets.shape[0]):
+            fRow = int(offsets[row, 1])
+            lRow = int(offsets[row, 2])
+            if (lRow - fRow <= 0):
+                continue
+            sAI[fRow:lRow, (format.N_BACK, format.N_OVERLAP)] = \
+                computeIndices(sAI[fRow:lRow, format.TARGET_START],
+                               sAI[fRow:lRow, format.TARGET_END])
+
+        logging.info("Constructed indices.")
+
+        # modify the cmp.h5 file.
+        # We want to keep the chunking info on the dataset.
+        del(cH5[format.ALN_INDEX])
+        cH5.create_dataset(format.ALN_INDEX, data = sAI,
+                           dtype = H5.h5t.NATIVE_UINT32,
+                           maxshape = (None, None))
+
+        ## If the file is already sorted there's no harm in resorting.
+        if (__pathExists(cH5, format.REF_OFFSET_TABLE)):
+            logging.info(format.REF_OFFSET_TABLE + " already exists, deleting.")
+            del(cH5[format.REF_OFFSET_TABLE])
+
+        ## create the offset datastructure in the file.
+        cH5.create_dataset(format.REF_OFFSET_TABLE, data = offsets,
+                           dtype = H5.h5t.NATIVE_UINT32, maxshape = (None, None))
+
+        ## add the index attribute.
+        cH5['/'].attrs.create(format.INDEX_ATTR,
+                              format.INDEX_ELTS)
+
+        ## fixup attributes.
+        for oA in originalAttrs:
+            cH5[format.ALN_INDEX].attrs.create(oA[0], oA[1])
+
+        ## deep repacking.
+        if (deep):
+            logging.info("Repacking alignment arrays.")
+            __repackDataArrays(cH5, format)
+
+        ## memory free.
+        del sAI
+
+        ## manage any extra datasets.
+        for extraTable in format.extraTables:
+            if (__pathExists(cH5, extraTable)):
+                logging.info("Sorting dataset: %s" % extraTable)
+                # need .value for permutation to work.
+                eTable = cH5[extraTable].value
+                if (len(eTable.shape) == 1):
+                    eTable = eTable[aord]
+                else:
+                    eTable = eTable[aord,:]
+
+                # save attributes, if any for re-writing below.
+                originalAttrs = cH5[extraTable].attrs
+                originalDtype = cH5[extraTable].dtype
+                del(cH5[extraTable])
+                cH5.create_dataset(extraTable, data = eTable,
+                                   dtype = originalDtype,
+                                   maxshape = tuple([None for x in eTable.shape]))
+                logging.info("Sorted dataset: %s" % extraTable)
+                logging.info("Writing attributes")
+                for k in originalAttrs.keys():
+                     # this block is necessary because I need to
+                     # convert the dataset from object to a string
+                     # dataset, due to an apparent h5py limitation.
+                     if originalAttrs[k].dtype == 'object':
+                          newDtype = H5.special_dtype(vlen = str)
+                     else:
+                          newDtype = originalAttrs[k].dtype
+
+                     cH5[extraTable].attrs.create(k, originalAttrs[k],
+                                                  dtype = newDtype)
+
+                logging.info("Finished processing dataset: %s" % extraTable)
+
+        ## set this flag so the finally block knows the sort succeeded.
+        success = True
+
+    except Exception, E:
+         logging.error(E)
+         logging.exception(E)
+
+    finally:
+        try:
+            cH5.close()
+            if success and fout:
+                logging.info("Overwriting input cmpH5 file.")
+                shutil.copyfile(_inFile, inFile)
+                fout.close()
+        except Exception as e:
+             raise PBH5ToolsException("sort", str(e))
+
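+# Usage sketch (hypothetical filenames):
+#   cmpH5Sort("aln.cmp.h5", "sorted.cmp.h5", None)       # sort into a copy
+#   cmpH5Sort("aln.cmp.h5", None, None, inPlace = True)  # sort in place
+#   cmpH5Sort("aln.cmp.h5", None, "/tmp")                # sort in a temp copy,
+#                                                        # then overwrite input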
+
+
+
diff --git a/pbh5tools/CmpH5Stats.py b/pbh5tools/CmpH5Stats.py
new file mode 100644
index 0000000..7683ade
--- /dev/null
+++ b/pbh5tools/CmpH5Stats.py
@@ -0,0 +1,79 @@
+#################################################################################
+# Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+#   notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+#   notice, this list of conditions and the following disclaimer in the
+#   documentation and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+# THIS LICENSE.  THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#################################################################################
+
+import numpy as NP
+import sys
+
+from mlab import rec2csv, rec2txt
+from pbh5tools.Metrics import *
+
+def prettyPrint(res):
+    print rec2txt(res, padding = 20, precision = 2)
+
+def makeTblFromInput(exprStr, defaultValue):
+    if not exprStr:
+        return defaultValue
+    else:
+        tbl = eval(exprStr)
+        if not isinstance(tbl, Tbl):
+          if hasEval(tbl): 
+              ## just a naked expression, wrap it. 
+              ntbl = Tbl()
+              ntbl.cols[exprStr] = tbl
+              tbl = ntbl  
+          elif isinstance(tbl, tuple):
+              ## wrap the tuple in a table.
+              ntbl = Tbl()
+              for i,e in enumerate(tbl):
+                  ntbl.cols[str(e)] = e
+              tbl = ntbl
+          else:
+              raise PBH5ToolsException("stats", "Invalid expression specified: must be" +
+                                       " a table, expression, or tuple.")
+        return tbl
+            
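+# e.g., makeTblFromInput("ReadLength", DefaultWhat) evaluates the string and,
+# since the result is a naked expression, wraps it in a one-column Tbl keyed
+# by the expression string.
+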
+def cmpH5Stats(cmpH5Filename, whatStr = None, whereStr = None,
+               groupByStr = None, sortByStr = None, limit = None, outFile = None):
+
+    reader     = CmpH5Reader(cmpH5Filename)
+    where      = DefaultWhere if whereStr is None else eval(whereStr)
+    groupBy    = DefaultGroupBy if groupByStr is None else eval(groupByStr)
+    groupByCsv = None
+    what       = makeTblFromInput(whatStr, DefaultWhat)
+    sortBy     = makeTblFromInput(sortByStr, DefaultSortBy)
+
+    res = query(reader, what, where, groupBy, groupByCsv, sortBy, limit)
+    res = toRecArray(res)
+    if not outFile:
+        prettyPrint(res)
+    else:
+        rec2csv(res, outFile)
+
+
diff --git a/pbh5tools/CmpH5Utils.py b/pbh5tools/CmpH5Utils.py
new file mode 100755
index 0000000..f343b1b
--- /dev/null
+++ b/pbh5tools/CmpH5Utils.py
@@ -0,0 +1,85 @@
+#################################################################################
+# Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+#   notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+#   notice, this list of conditions and the following disclaimer in the
+#   documentation and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+# THIS LICENSE.  THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#################################################################################
+
+import os
+import sys
+import shutil
+import datetime
+import logging
+import tempfile
+
+import h5py as H5
+import numpy as NP
+
+def deleteAttrIfExists(ds, nm):
+    if nm in ds.attrs:
+        del ds.attrs[nm]
+
+def deleteIfExists(ds, nm):
+    if nm in ds:
+        del ds[nm]
+
+def copyAttributes(inDs, outDs):
+    for k in inDs.attrs.keys():
+        logging.debug("copying attribute: %s" % k)
+        elt = inDs.attrs[k]
+        if isinstance(elt, basestring):
+            # h5py wants to simplify things down, so I think that this
+            # is a possibility.
+            # preserve numpy string type if possible
+            # the h5.special_dtype defined below will lose dtype.
+            newDtype = elt.dtype if hasattr(elt,'dtype') else H5.special_dtype(vlen = str)
+        elif elt.dtype == 'object':
+            # this has to do with a numpy problem.
+            newDtype = H5.special_dtype(vlen = str)
+        else:
+            newDtype = elt.dtype
+            
+        outDs.attrs.create(k, inDs.attrs[k], dtype = newDtype)
+
+def copyDataset(absDsName, inCmp, outCmp, selection = None,
+                copyAttrs = True):
+
+    inDs = inCmp[absDsName]
+    if selection is not None:
+        if len(inDs.shape) <= 1:
+            ndta = inDs.value[selection]
+        else:
+            ndta = inDs.value[selection,:]
+    else:
+        ndta = inDs.value
+
+    outDs = outCmp.create_dataset(absDsName, data = ndta,
+                                  dtype = inDs.dtype, chunks = True)
+    if copyAttrs:
+        copyAttributes(inDs, outDs)
+
+    return outDs
+
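+# Sketch: subset a one-dimensional dataset with a boolean mask, much as
+# CmpH5Select does ('keepIds' is a hypothetical set of IDs to retain):
+#   msk = NP.array([x in keepIds for x in inCmp['/AlnGroup/ID'].value])
+#   copyDataset('/AlnGroup/ID', inCmp, outCmp, selection = msk)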
diff --git a/pbh5tools/Indexer.py b/pbh5tools/Indexer.py
new file mode 100755
index 0000000..36a54ae
--- /dev/null
+++ b/pbh5tools/Indexer.py
@@ -0,0 +1,64 @@
+#################################################################################
+# Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+#   notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+#   notice, this list of conditions and the following disclaimer in the
+#   documentation and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+# THIS LICENSE.  THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#################################################################################
+
+import ctypes
+import os
+import numpy
+import pkg_resources
+
+class Indexer(object):
+    def __init__(self):
+        self.SW_DLL_PATH = os.path.dirname(os.path.abspath(__file__)) + os.path.sep + "ci.so"
+        self._dll        = ctypes.CDLL(self.SW_DLL_PATH)
+
+        self._dll.compute_indices.argtypes = [numpy.ctypeslib.ndpointer(dtype = numpy.intc),
+                                              numpy.ctypeslib.ndpointer(dtype = numpy.intc),
+                                              numpy.ctypeslib.ndpointer(dtype = numpy.intc),
+                                              numpy.ctypeslib.ndpointer(dtype = numpy.intc),
+                                              ctypes.c_int]
+
+        self._dll.left_bin_search.argtypes = [ctypes.c_int, numpy.ctypeslib.ndpointer(dtype = numpy.intc), ctypes.c_int]
+        self._dll.right_bin_search.argtypes = [ctypes.c_int, numpy.ctypeslib.ndpointer(dtype = numpy.intc), ctypes.c_int]
+        self._dll.number_within_range.argtypes = [numpy.ctypeslib.ndpointer(dtype = numpy.intc),
+                                                  ctypes.c_int, ctypes.c_int, ctypes.c_int]
+
+    def compute(self, tStart, tEnd):
+        # XXX : this needs work.
+        tS = numpy.asarray(tStart, dtype = numpy.intc)
+        tE = numpy.asarray(tEnd, dtype = numpy.intc)
+        results = numpy.zeros(2 * len(tS), dtype = numpy.intc)
+        sortedEnds = numpy.asarray(numpy.sort(tE) - 1, dtype = numpy.intc)
+
+        self._dll.compute_indices(tS, tE, sortedEnds, results, len(tS))
+        return results.reshape((len(tS), 2))
+
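+    # Sketch (requires the compiled ci.so next to this module); the output
+    # matches CmpH5Sort.computeIndicesDP, e.g.:
+    #   Indexer().compute([0, 2, 4], [5, 3, 6])  ->  [[0, 0], [1, 1], [2, 1]]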
+
+    def nwithin(self, vec, s, e):
+        return self._dll.number_within_range(numpy.asarray(vec, dtype = numpy.intc), s, e, len(vec))
diff --git a/pbh5tools/Metrics.py b/pbh5tools/Metrics.py
new file mode 100755
index 0000000..56a2bc1
--- /dev/null
+++ b/pbh5tools/Metrics.py
@@ -0,0 +1,567 @@
+#################################################################################
+# Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+#   notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+#   notice, this list of conditions and the following disclaimer in the
+#   documentation and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+# THIS LICENSE.  THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#################################################################################
+
+import sys
+import os
+import h5py
+import numpy as NP
+import inspect
+import re
+
+from pbcore.io import CmpH5Reader
+from pbh5tools.PBH5ToolsException import PBH5ToolsException
+
+NOTINCSV_LABEL = 'NotInCsv'
+
+def hasEval(thing):
+    return 'eval' in dir(thing)
+
+def procMe(thing):
+    if hasEval(thing):
+        return lambda cmpH5, idx : thing.eval(cmpH5, idx)
+    else:
+        return lambda cmpH5, idx : thing
+
+class Expr(object):
+    def eval(self, cmpH5, idx):
+        pass
+
+    def __add__(self, other):
+        return BinOp(self, other, '+')
+    def __div__(self, other):
+        return BinOp(self, other, '/')
+    def __sub__(self, other):
+        return BinOp(self, other, '-')
+    def __mul__(self, other):
+        return BinOp(self, other, '*')
+    def __radd__(self, other):
+        return BinOp(other, self, '+')
+    def __rdiv__(self, other):
+        return BinOp(other, self, '/')
+    def __rsub__(self, other):
+        return BinOp(other, self, '-')
+    def __rmul__(self, other):
+        return BinOp(other, self, '*')
+    def __eq__(self, other):
+        return BinOp(self, other, '=')
+    def __ne__(self, other):
+        return BinOp(self, other, '!=')
+    def __lt__(self, other):
+        return BinOp(self, other, '<')
+    def __le__(self, other):
+        return BinOp(self, other, '<=')
+    def __gt__(self, other):
+        return BinOp(self, other, '>')
+    def __ge__(self, other):
+        return BinOp(self, other, '>=')
+
+    def __and__(self, other):
+        return BinOp(self, other, '&')
+    def __or__(self, other):
+        return BinOp(self, other, '|')
+
+class BinOp(Expr):
+    def __init__(self, ll, rr, op):
+        self.ll = ll
+        self.rr = rr
+        self.l  = procMe(ll)
+        self.r  = procMe(rr)
+        self.op = op
+
+    def __str__(self):
+        return str(self.ll) + str(self.op) + str(self.rr)
+
+    def eval(self, cmpH5, idx):
+        if self.op == '+':
+            return self.l(cmpH5, idx) + self.r(cmpH5, idx)
+        elif self.op == '-':
+            return self.l(cmpH5, idx) - self.r(cmpH5, idx)
+        elif self.op == '/':
+            return self.l(cmpH5, idx) / self.r(cmpH5, idx)
+        elif self.op == '*':
+            return self.l(cmpH5, idx) * self.r(cmpH5, idx)
+        elif self.op == '=':
+            return self.l(cmpH5, idx) == self.r(cmpH5, idx)
+        elif self.op == '!=':
+            return self.l(cmpH5, idx) != self.r(cmpH5, idx)
+        elif self.op == '<':
+            return self.l(cmpH5, idx) < self.r(cmpH5, idx)
+        elif self.op == '<=':
+            return self.l(cmpH5, idx) <= self.r(cmpH5, idx)
+        elif self.op == '>':
+            return self.l(cmpH5, idx) > self.r(cmpH5, idx)
+        elif self.op == '>=':
+            return self.l(cmpH5, idx) >= self.r(cmpH5, idx)
+        elif self.op == '|':
+            return self.l(cmpH5, idx) | self.r(cmpH5, idx)
+        elif self.op == '&':
+            return self.l(cmpH5, idx) & self.r(cmpH5, idx)
+        elif self.op == ':':
+            return [(str(x) + ":" + str(y)) for x,y in zip(self.l(cmpH5, idx),
+                                                           self.r(cmpH5, idx))]
+        else:
+            raise Exception("Undefined operation:" + self.op)
+
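+# Metric expressions compose lazily into BinOp trees; nothing touches the
+# cmp.h5 file until eval is called. E.g., with the metrics defined at the
+# bottom of this module:
+#   expr = (ReadLength >= 1000) & (Accuracy > 0.85)
+#   mask = expr.eval(cmpH5, idx)   # elementwise boolean NP array over idx
+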
+class Flatten(Expr):
+    def __init__(self, expr):
+        self.expr = expr
+
+    def eval(self, cmpH5, idx):
+        r = self.expr.eval(cmpH5, idx)
+        if isinstance(r, (list, tuple)):
+            return NP.concatenate(r)
+        else:
+            return r
+
+def processClass(cls, name, bases, dct):
+    ignoreRes = ['^Default', '^Metric$', '^Statistic$', '^Factor$',
+                 '^FactorStatistic']
+
+    if not any(map(lambda x : re.match(x, name), ignoreRes)):
+        if '__init__' in dct:
+            # if it has an init it takes arguments which define the
+            # metric.
+            f = dct['__init__']
+            a = inspect.getargspec(f)
+            if len(a.args) > 1:
+                argspec = '[' + ", ".join(a.args[1:]) + ']'
+            else:
+                argspec = ''
+            myName  = name
+        else:
+            myName  = re.sub('^_', '', name)
+            argspec = ''
+
+        if '__doc__' in dct:
+            docstr = dct['__doc__']
+        else:
+            docstr = ''
+
+        return myName + argspec + ('\n\t' + docstr if
+                                   docstr else docstr)
+    else:
+        return None
+
+
+class DocumentedMetric(type):
+    Metrics = []
+    def __new__(cls, name, bases, dct):
+        DocumentedMetric.Metrics.append(processClass(cls, name,
+                                                     bases, dct))
+        return type.__new__(cls, name, bases, dct)
+
+    @staticmethod
+    def list():
+        return filter(lambda x : x, DocumentedMetric.Metrics)
+
+class DocumentedStatistic(type):
+    Statistics = []
+    def __new__(cls, name, bases, dct):
+        DocumentedStatistic.Statistics.append(processClass(cls, name,
+                                                           bases, dct))
+        return type.__new__(cls, name, bases, dct)
+
+    @staticmethod
+    def list():
+        return filter(lambda x : x, DocumentedStatistic.Statistics)
+
+class Statistic(Expr):
+    __metaclass__ = DocumentedStatistic
+    def __init__(self, metric):
+        self.metric = metric
+
+    def eval(self, cmpH5, idx):
+        r = self.metric.eval(cmpH5, idx)
+        if isinstance(r, (list, tuple)):
+            return NP.array([self.f(rr) for rr in r])
+        else:
+            e = self.f(r)
+            return e if isinstance(e, NP.ndarray) else NP.array([e])
+
+    def __str__(self):
+        return self.__class__.__name__ + '(' + str(self.metric) + ')'
+
+class Metric(Expr):
+    __metaclass__ = DocumentedMetric
+
+    def eval(self, cmpH5, idx):
+        return self.produce(cmpH5, idx)
+
+    def __str__(self):
+        return re.sub('^_', '', self.__class__.__name__)
+
+
+class Factor(Metric):
+    def __rmul__(self, other):
+        return BinOp(other, self, ':')
+    def __mul__(self, other):
+        return BinOp(self, other, ':')
+
+class Tbl(object):
+    """The Tbl object provides a grouping construct for columns."""
+    def __init__(self, **args):
+        self.cols = args
+    def __iter__(self):
+        for a in self.cols:
+            yield (a, self.cols[a])
+    def eval(self, cmpH5, idx):
+        return [(a, self.cols[a].eval(cmpH5, idx)) for a in self.cols.keys()]
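+
+# e.g., DefaultWhat (defined below) is Tbl(readLength = ReadLength,
+# accuracy = Accuracy); Tbl.eval returns one (name, values) pair per column.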
+
+def split(x, f):
+    # Preallocating the NP arrays should be faster than doing
+    # repeated appends.
+    assert(len(x) == len(f))
+    levels  = NP.unique(f)
+    counts  = {k:0 for k in levels}
+    for i in xrange(0, len(x)):
+        counts[f[i]] += 1
+    results = { k:NP.zeros(v, dtype = int) for k,v in counts.items() }
+    for i in xrange(0, len(x)):
+        k = f[i]
+        results[k][counts[k] - 1] = x[i]
+        counts[k] -= 1
+    # reverse it.
+    return { k:v[::-1] for k,v in results.items() }
+
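+# e.g., split(NP.array([10, 11, 12, 13]), NP.array(['a', 'b', 'a', 'b']))
+#   -> {'a': array([10, 12]), 'b': array([11, 13])}   # original order kept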
+
+def toRecArray(res):
+    ## XXX : this ain't beautiful.
+    def myDtype(x):
+        if 'dtype' in dir(x):
+            return x.dtype
+        else:
+            return type(x)
+    def myLen(x):
+        if isinstance(x, NP.ndarray):
+            return len(x)
+        else:
+            return 1
+
+    def expand(groupName, seq):
+        return NP.array([groupName]*myLen(seq))
+
+    def convertToRecArray(elt, groupName = None):
+        ## recArrays don't like things other than strings for names.
+        nat = [(str(n[0]), myDtype(n[1])) for n in elt]
+        dta = [n[1] for n in elt]
+        if groupName:
+            nat.insert(0, ('Group', object))
+            dta.insert(0, expand(groupName, dta[0]))
+
+        return NP.rec.array(dta, dtype = nat)
+
+    if DefaultGroupBy.word() in res:
+        return convertToRecArray(res[DefaultGroupBy.word()])
+    else:
+        recArrays = []
+        for k in sorted(res.keys()):
+            recArrays.append(convertToRecArray(res[k], k))
+        return NP.hstack(recArrays)
+
+def groupCsv(csvFile, idxs, reader):
+    # csvFile is a CSV text file with a header; the first column is used
+    # as the group name, the other columns are metrics from 'listMetrics'
+    mapValToGrp = {} 
+    with open( csvFile ) as ofile:
+        header = ofile.readline().rstrip('\r\n')
+        # eval the header after replacing ',' with '*' to conform to the
+        # groupByStr format; do this here, before walking through the
+        # file, in case of errors in the heading
+        groupBy = eval('*'.join(header.split(',')[1:]))
+        for line in ofile.readlines():
+            columns = line.rstrip('\r\n').split(',')
+            mapValToGrp[ ':'.join(columns[1:]) ] = columns[0] 
+    return [ mapValToGrp.get(val,NOTINCSV_LABEL) for val in groupBy.eval(reader,idxs) ]
+        
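+# Sketch of the expected CSV layout (hypothetical values; the header names
+# after the first column must be metrics defined in this module):
+#   Group,Movie
+#   control,m120101_ctrl_movie
+#   case,m120102_case_movie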
+
+# Stats
+class Min(Statistic):
+    def f(self, x):
+        return NP.min(x[~NP.isnan(x)])
+
+class Max(Statistic):
+    def f(self, x):
+        return NP.max(x[~NP.isnan(x)])
+
+class Sum(Statistic):
+    def f(self, x):
+        return NP.sum(x[~NP.isnan(x)])
+
+class Mean(Statistic):
+    def f(self, x):
+        return NP.mean(x[~NP.isnan(x)])
+
+class Median(Statistic):
+    def f(self, x):
+        return NP.median(x[~NP.isnan(x)])
+
+class Count(Statistic):
+    def f(self, x):
+        return len(x)
+
+class Percentile(Statistic):
+    def __init__(self, metric, ptile = 95.0):
+        super(Percentile, self).__init__(metric)
+        self.ptile = ptile
+
+    def f(self, x):
+        return NP.percentile(x[~NP.isnan(x)], self.ptile)
+
+class Round(Statistic):
+    def __init__(self, metric, digits = 0):
+        super(Round, self).__init__(metric)
+        self.digits = digits
+    def f(self, x):
+        return NP.around(x, self.digits)
+
+
+##
+## XXX : Not sure that this is correct. This will work, but it begs
+## the question of whether we need some new category, like
+## 'Operator', to encompass particular ways of tabulating.
+##
+## Additionally, FactorStatistics can be computed using a group-by;
+## it is only inside a where clause that you need this new concept.
+class ByFactor(Metric):
+    __metaclass__ = DocumentedMetric
+
+    def __init__(self, metric, factor, statistic):
+        self.metric     = metric
+        self.factor     = factor
+        self.statistic  = statistic(metric)
+
+    def produce(self, cmpH5, idx):
+        r   = self.metric.eval(cmpH5, idx)
+        fr  = split(range(len(idx)), self.factor.eval(cmpH5, idx))
+        res = NP.zeros(len(idx), dtype = NP.int)
+        for v in fr.values():
+            res[v] = self.statistic.f(r[v])
+        return res
+
+class _MoleculeReadStart(ByFactor):
+    def __init__(self):
+        super(_MoleculeReadStart, self).__init__(ReadStart, MoleculeName, Min)
+
+class _MinSubreadLength(ByFactor):
+    def __init__(self):
+        super(_MinSubreadLength, self).__init__(ReadLength, MoleculeName, Min)
+
+class _MaxSubreadLength(ByFactor):
+    def __init__(self):
+        super(_MaxSubreadLength, self).__init__(ReadLength, MoleculeName, Max)
+
+class _UnrolledReadLength(ByFactor):
+    def __init__(self):
+        super(_UnrolledReadLength, self).__init__(ReadLength, MoleculeName, Sum)
+
+# Metrics
+class _DefaultWhere(Metric):
+    def produce(self, cmpH5, idx):
+        return NP.ones(len(idx), dtype = bool)
+DefaultWhere = _DefaultWhere()
+
+class _DefaultGroupBy(Metric):
+    @staticmethod
+    def word():
+        return 'DefaultGroupBy'
+
+    def produce(self, cmpH5, idx):
+        return NP.array([DefaultGroupBy.word()] * len(idx))
+DefaultGroupBy = _DefaultGroupBy()
+
+
+class _TemplateSpan(Metric):
+    """The number of template bases covered by the read"""
+    def produce(self, cmpH5, idx):
+        return (cmpH5.tEnd[idx] - cmpH5.tStart[idx])
+
+class _ReadLength(Metric):
+    def produce(self, cmpH5, idx):
+        return (cmpH5.rEnd[idx] - cmpH5.rStart[idx])
+
+class _NErrors(Metric):
+    def produce(self, cmpH5, idx):
+        return (cmpH5.nMM[idx] + cmpH5.nIns[idx] + cmpH5.nDel[idx])
+
+class _ReadDuration(Metric):
+    def produce(self, cmpH5, idx):
+        return NP.array([ sum(cmpH5[i].IPD() + cmpH5[i].PulseWidth())
+                          for i in idx ])
+class _FrameRate(Metric):
+    def produce(self, cmpH5, idx):
+        return NP.array([ cmpH5[i].movieInfo.FrameRate for i in idx ])
+
+class _IPD(Metric):
+    def produce(self, cmpH5, idx):
+        return [ cmpH5[i].IPD() for i in idx ]
+
+class _PulseWidth(Metric):
+    def produce(self, cmpH5, idx):
+        return [ cmpH5[i].PulseWidth() for i in idx ]
+
+class _Movie(Factor):
+    def produce(self, cmpH5, idx):
+        mtb = cmpH5.movieInfoTable
+        mapping = NP.zeros((NP.max([ i.ID for i in mtb]) + 1, ), dtype = object)
+        mapping[NP.array([i.ID for i in mtb])] = \
+            NP.array([i.Name for i in mtb])
+        return mapping[cmpH5.alignmentIndex.MovieID[idx]]
+
+class _Reference(Factor):
+    def produce(self, cmpH5, idx):
+        return NP.array([cmpH5[i].referenceInfo['FullName'] for i in idx])
+
+class _RefIdentifier(Factor):
+    def produce(self, cmpH5, idx):
+        return NP.array([cmpH5[i].referenceInfo['Name'] for i in idx])
+
+class _HoleNumber(Factor):
+    def produce(self, cmpH5, idx):
+        return cmpH5.alignmentIndex['HoleNumber'][idx]
+
+class _ReadStart(Metric):
+    def produce(self, cmpH5, idx):
+        return cmpH5.alignmentIndex['rStart'][idx]
+
+class _ReadEnd(Metric):
+    def produce(self, cmpH5, idx):
+        return cmpH5.alignmentIndex['rEnd'][idx]
+
+class _TemplateStart(Metric):
+    def produce(self, cmpH5, idx):
+        return cmpH5.alignmentIndex['tStart'][idx]
+
+class _TemplateEnd(Metric):
+    def produce(self, cmpH5, idx):
+        return cmpH5.alignmentIndex['tEnd'][idx]
+
+class _MoleculeId(Factor):
+    def produce(self, cmpH5, idx):
+        return cmpH5.alignmentIndex['MoleculeID'][idx]
+
+class _MoleculeName(Factor):
+    def produce(self, cmpH5, idx):
+        molecules = zip(cmpH5.alignmentIndex['MovieID'][idx],
+                        cmpH5.alignmentIndex['HoleNumber'][idx])
+        return NP.array(['%s_%s' % (m,h) for m,h in molecules])
+
+class _Strand(Factor):
+    def produce(self, cmpH5, idx):
+        return cmpH5.alignmentIndex['RCRefStrand'][idx]
+
+class _AlignmentIdx(Factor):
+    def produce(self, cmpH5, idx):
+        return idx
+
+class _Barcode(Factor):
+    def produce(self, cmpH5, idx):
+        return NP.array([cmpH5[i].barcodeName for i in idx])
+         
+class _AverageBarcodeScore(Metric):
+    def produce(self, cmpH5, idx):
+        bestScore = cmpH5.file[ '/AlnInfo/Barcode' ][idx,2].astype(float)
+        nScored   = cmpH5.file[ '/AlnInfo/Barcode' ][idx,0]
+        return bestScore / nScored
+
+class _MapQV(Metric):
+    def produce(self, cmpH5, idx):
+        return cmpH5.alignmentIndex.MapQV[idx]
+
+class _WhiteList(Factor):
+    def produce(self, cmpH5, idx):
+        return NP.array( [ '%s/%i' % ( cmpH5[i].movieInfo[1], cmpH5[i].HoleNumber ) for i in idx ] )
+
+class SubSample(Metric):
+    """boolean vector with true occuring at rate rate or nreads = n"""
+    def __init__(self, rate = 1, n = None):
+        self.rate = rate
+        self.n    = n
+
+    def produce(self, cmpH5, idx):
+        if self.n is not None:
+            return NP.in1d(idx, NP.floor(NP.random.uniform(0, len(idx), self.n)))
+        else:
+            return NP.array(NP.random.binomial(1, self.rate, len(idx)), dtype = bool)
+            
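+# Used from the command line as, e.g.:
+#   cmph5tools.py select --where "SubSample(.05)" input.cmp.h5
+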
+###############################################################################
+##
+## Define the core metrics, try to define all metrics in terms of some
+## basic metrics.
+##
+###############################################################################
+ReadLength          = _ReadLength()
+TemplateSpan        = _TemplateSpan()
+NErrors             = _NErrors()
+ReadFrames          = _ReadDuration() * 1.0
+FrameRate           = _FrameRate()
+IPD                 = _IPD()
+PulseWidth          = _PulseWidth()
+Accuracy            = 1.0 - NErrors/(ReadLength * 1.0)
+PolRate             = TemplateSpan/(ReadFrames/(FrameRate * 1.0))
+Movie               = _Movie()
+DefaultWhat         = Tbl(readLength = ReadLength, accuracy = Accuracy)
+Reference           = _Reference()
+RefIdentifier       = _RefIdentifier()
+HoleNumber          = _HoleNumber()
+AlignmentIdx        = _AlignmentIdx()
+Strand              = _Strand()
+MoleculeId          = _MoleculeId()
+MoleculeName        = _MoleculeName()
+TemplateEnd         = _TemplateEnd()
+TemplateStart       = _TemplateStart()
+ReadEnd             = _ReadEnd()
+ReadStart           = _ReadStart()
+Barcode             = _Barcode()
+MapQV               = _MapQV()
+WhiteList           = _WhiteList()
+AverageBarcodeScore = _AverageBarcodeScore()
+
+MoleculeReadStart   = _MoleculeReadStart()
+MinSubreadLength    = _MinSubreadLength()
+MaxSubreadLength    = _MaxSubreadLength()
+UnrolledReadLength  = _UnrolledReadLength()
+
+DefaultSortBy       = Tbl(alignmentIdx = AlignmentIdx)
+
+def query(reader, what = DefaultWhat, where = DefaultWhere,
+          groupBy = DefaultGroupBy, groupByCsv = None, 
+          sortBy = DefaultSortBy, limit = None):
+    idxs = NP.where(where.eval(reader, range(0, len(reader))))[0]
+    if groupByCsv:
+        groupBy = groupCsv(groupByCsv, idxs, reader)
+    else:
+        groupBy = groupBy.eval(reader, idxs)
+    results = {}
+    
+    for k,v in split(idxs, groupBy).items():
+        sortVals = sortBy.eval(reader, v)
+        sortIdxs = v[NP.lexsort(map(lambda z : z[1], sortVals)[::-1])][:limit]
+        results[k] = what.eval(reader, sortIdxs)
+    return results
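+
+# Illustrative only (not part of the upstream module): a minimal sketch of
+# how the metric DSL above composes with query(), assuming `reader` is a
+# pbcore.io.CmpH5Reader and assuming the DSL's overloaded comparison
+# operators for the where clause:
+#
+#   from pbcore.io import CmpH5Reader
+#   reader  = CmpH5Reader('aligned_reads.cmp.h5')
+#   results = query(reader,
+#                   what    = Tbl(readLength = ReadLength, accuracy = Accuracy),
+#                   where   = ReadLength > 500,
+#                   groupBy = Movie,
+#                   limit   = 10)
+#   for group, table in results.items():
+#       print group, table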
diff --git a/pbh5tools/PBH5ToolsException.py b/pbh5tools/PBH5ToolsException.py
new file mode 100755
index 0000000..4424276
--- /dev/null
+++ b/pbh5tools/PBH5ToolsException.py
@@ -0,0 +1,37 @@
+#################################################################################
+# Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+#   notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+#   notice, this list of conditions and the following disclaimer in the
+#   documentation and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+# THIS LICENSE.  THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#################################################################################
+
+class PBH5ToolsException(Exception):
+    def __init__(self, command, msg):
+        self.command = command
+        self.msg     = msg
+
+    def __str__(self):
+        return "command: " + self.command + " produced the following error: " + self.msg
diff --git a/pbh5tools/__init__.py b/pbh5tools/__init__.py
new file mode 100755
index 0000000..e69de29
diff --git a/pbh5tools/_version.py b/pbh5tools/_version.py
new file mode 100755
index 0000000..6b911a9
--- /dev/null
+++ b/pbh5tools/_version.py
@@ -0,0 +1,31 @@
+#################################################################################
+# Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+#   notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+#   notice, this list of conditions and the following disclaimer in the
+#   documentation and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+# THIS LICENSE.  THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#################################################################################
+
+__version__ = "0.8.0"
diff --git a/pbh5tools/cbook.py b/pbh5tools/cbook.py
new file mode 100755
index 0000000..3884303
--- /dev/null
+++ b/pbh5tools/cbook.py
@@ -0,0 +1,86 @@
+#################################################################################
+# Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+#   notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+#   notice, this list of conditions and the following disclaimer in the
+#   documentation and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+# THIS LICENSE.  THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#################################################################################
+"""
+A collection of utility functions and classes.  Many (but not all)
+from the Python Cookbook -- hence the name cbook.
+"""
+import numpy.ma as ma
+
+def is_string_like(obj):
+    'Return True if *obj* looks like a string'
+    if isinstance(obj, (str, unicode)):
+        return True
+    # numpy strings are subclass of str, ma strings are not
+    if ma.isMaskedArray(obj):
+        if obj.ndim == 0 and obj.dtype.kind in 'SU':
+            return True
+        else:
+            return False
+    try:
+        obj + ''
+    except:
+        return False
+    return True
+
+
+def to_filehandle(fname, flag='rU', return_opened=False):
+    """
+    *fname* can be a filename or a file handle.  Support for gzipped
+    files is automatic, if the filename ends in .gz.  *flag* is a
+    read/write flag for :func:`file`
+    """
+    if is_string_like(fname):
+        if fname.endswith('.gz'):
+            import gzip
+            # get rid of 'U' in flag for gzipped files.
+            flag = flag.replace('U','')
+            fh = gzip.open(fname, flag)
+        elif fname.endswith('.bz2'):
+            # get rid of 'U' in flag for bz2 files
+            flag = flag.replace('U','')
+            import bz2
+            fh = bz2.BZ2File(fname, flag)
+        else:
+            fh = file(fname, flag)
+        opened = True
+    elif hasattr(fname, 'seek'):
+        fh = fname
+        opened = False
+    else:
+        raise ValueError('fname must be a string or file handle')
+    if return_opened:
+        return fh, opened
+    return fh
+
+def is_numlike(obj):
+    'return true if *obj* looks like a number'
+    try: obj+1
+    except: return False
+    else: return True
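+
+# Illustrative only: to_filehandle() takes either a path (with transparent
+# gzip/bz2 handling keyed off the extension) or an already-open file
+# object; the filename here is made up.
+#
+#   fh, opened = to_filehandle('metrics.csv.gz', 'r', return_opened=True)
+#   for line in fh:
+#       pass          # ... process each line ...
+#   if opened:
+#       fh.close()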
diff --git a/pbh5tools/ci.c b/pbh5tools/ci.c
new file mode 100644
index 0000000..d51154f
--- /dev/null
+++ b/pbh5tools/ci.c
@@ -0,0 +1,112 @@
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+
+int _bisect(int val, int* vec,  int l,  int r) {
+    if ((r - l) <= 3) {
+	// a cheat :)
+	while (((r - 1) >= 0) && (vec[r-1] >= val)) {
+	    r--;
+	}
+	return r;
+    }
+    else {
+	int i = (l + r)/2;
+
+	if (vec[i] > val) {
+	    return _bisect(val, vec, l, i);
+	} else {
+	    return _bisect(val, vec, i, r);
+	}
+    }
+}
+
+int left_bin_search(int val, int* vec, int l) {
+    int s = _bisect(val, vec, 0, l);
+    int v = -1;
+    
+    if (s == 0) {
+	return s;
+    }
+    else if (s == l) {
+	v = vec[--s];
+    } 
+    else {
+	v = vec[s];
+    }
+
+    if (v > val) {
+	s--;
+    }
+    while (s > 0 && vec[s-1] == vec[s])	s--;
+
+    return s;
+}
+
+int right_bin_search(int val, int* vec,  int l) {
+    int s = _bisect(val, vec, 0, l);
+    
+    if (s == l)
+	return(s);
+    while (s + 1 < l && vec[s + 1] == val)
+	s++;
+    return s;
+}
+
+int number_within_range(int* vec, int s, int e, int vlen) {
+    int i,t = 0;
+    int lI = left_bin_search(s, vec, vlen);
+    int rI = right_bin_search(e, vec, vlen);
+    
+    for (i = lI; i < rI; i++) {
+	if ((s <= vec[i]) && (vec[i] < e)) t++;
+    }
+    return t;
+}
+
+
+#define _2COL_ME_(i,j) (((i)*2) + (j))
+
+void print_matrix(int* mat, int n, int m) {
+    int i,j;
+    for (i = 0; i < n; i++) {
+	for (j = 0; j < m; j++) {
+	    printf("%d ", mat[_2COL_ME_(i,j)]);
+	}
+	printf("\n");
+    }
+    printf("----");
+}
+
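+/*
+ * compute_indices: given n alignments sorted by t_start (with their t_end
+ * values, plus a sorted copy in sorted_t_ends), fill the n x 2 `results`
+ * matrix.  Reading the loop, results[i][0] appears to track how far back a
+ * scan for alignments overlapping t_start[i] must start, and results[i][1]
+ * the running count of alignments still open there.  The pure-Python
+ * equivalent can be selected with `cmph5tools.py sort --usePythonIndexer`.
+ */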
+void compute_indices(int* t_start, int* t_end, int* sorted_t_ends, int* results, int n) {
+    int i,k,n_ending,advance = 0;
+
+    // print_matrix(results, n, 2);
+
+    for (i = 1; i < n; i++) {
+	// print_matrix(results, n, 2);
+
+	n_ending = number_within_range(sorted_t_ends, t_start[i-1], t_start[i], n);
+	// printf("n_ending: %d\n", n_ending);
+	if (n_ending == 0) { 
+            results[_2COL_ME_(i, 0)] = results[_2COL_ME_(i - 1, 0)] + 1;
+	    results[_2COL_ME_(i, 1)] = results[_2COL_ME_(i - 1, 1)] + 1;
+	} 
+	else {
+	    results[_2COL_ME_(i, 1)] = results[_2COL_ME_(i - 1, 1)] - n_ending + 1;
+	    
+	    advance = 0;
+            for (k = i - 1 - results[_2COL_ME_(i - 1, 0)]; k < i; k++) {
+                if (t_end[k] > t_start[i]) {
+                    break;
+		}
+		advance++;
+	    }
+	    
+	    results[_2COL_ME_(i, 0)] = results[_2COL_ME_(i - 1, 0)] - advance + 1;
+	}
+    }
+}
+
+
+
diff --git a/pbh5tools/mlab.py b/pbh5tools/mlab.py
new file mode 100755
index 0000000..9b7efac
--- /dev/null
+++ b/pbh5tools/mlab.py
@@ -0,0 +1,390 @@
+#################################################################################
+# Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+#   notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+#   notice, this list of conditions and the following disclaimer in the
+#   documentation and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+# THIS LICENSE.  THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#################################################################################
+
+"""
+
+Numerical python functions written for compatibility with MATLAB
+commands with the same names.
+
+record array helper functions
+-------------------------------
+
+A collection of helper functions for numpy record arrays
+
+:func:`rec2txt`
+    pretty print a record array
+
+:func:`rec2csv`
+    store record array in CSV file
+
+"""
+
+import csv, os, copy
+import cbook
+import numpy as np
+
+# a series of classes for describing the format intentions of various rec views
+class FormatObj:
+    def tostr(self, x):
+        return self.toval(x)
+
+    def toval(self, x):
+        return str(x)
+
+    def fromstr(self, s):
+        return s
+
+
+    def __hash__(self):
+        """
+        override the hash function of any of the formatters, so that we don't
+        create duplicate excel format styles
+        """
+        return hash(self.__class__)
+
+class FormatString(FormatObj):
+    def tostr(self, x):
+        val = repr(x)
+        return val[1:-1]
+
+
+class FormatFormatStr(FormatObj):
+    def __init__(self, fmt):
+        self.fmt = fmt
+
+    def tostr(self, x):
+        if x is None: return 'None'
+        return self.fmt%self.toval(x)
+
+
+
+
+class FormatFloat(FormatFormatStr):
+    def __init__(self, precision=4, scale=1.):
+        FormatFormatStr.__init__(self, '%%1.%df'%precision)
+        self.precision = precision
+        self.scale = scale
+
+    def __hash__(self):
+        return hash((self.__class__, self.precision, self.scale))
+
+    def toval(self, x):
+        if x is not None:
+            x = x * self.scale
+        return x
+
+    def fromstr(self, s):
+        return float(s)/self.scale
+
+
+class FormatInt(FormatObj):
+
+    def tostr(self, x):
+        return '%d'%int(x)
+
+    def toval(self, x):
+        return int(x)
+
+    def fromstr(self, s):
+        return int(s)
+
+class FormatBool(FormatObj):
+
+
+    def toval(self, x):
+        return str(x)
+
+    def fromstr(self, s):
+        # bool('False') is True, so compare against the literal string
+        return s == 'True'
+
+class FormatPercent(FormatFloat):
+    def __init__(self, precision=4):
+        FormatFloat.__init__(self, precision, scale=100.)
+
+class FormatThousands(FormatFloat):
+    def __init__(self, precision=4):
+        FormatFloat.__init__(self, precision, scale=1e-3)
+
+
+class FormatMillions(FormatFloat):
+    def __init__(self, precision=4):
+        FormatFloat.__init__(self, precision, scale=1e-6)
+
+
+class FormatDate(FormatObj):
+    def __init__(self, fmt):
+        self.fmt = fmt
+
+    def __hash__(self):
+        return hash((self.__class__, self.fmt))
+
+
+    def toval(self, x):
+        if x is None: return 'None'
+        return x.strftime(self.fmt)
+
+    def fromstr(self, x):
+        import dateutil.parser
+        return dateutil.parser.parse(x).date()
+
+class FormatDatetime(FormatDate):
+    def __init__(self, fmt='%Y-%m-%d %H:%M:%S'):
+        FormatDate.__init__(self, fmt)
+
+    def fromstr(self, x):
+        import dateutil.parser
+        return dateutil.parser.parse(x)
+
+
+
+
+defaultformatd = {
+    np.bool_ : FormatBool(),
+    np.int16 : FormatInt(),
+    np.int32 : FormatInt(),
+    np.int64 : FormatInt(),
+    np.float32 : FormatFloat(),
+    np.float64 : FormatFloat(),
+    np.object_ : FormatObj(),
+    np.string_ : FormatString(),
+    }
+
+def get_formatd(r, formatd=None):
+    'build a formatd guaranteed to have a key for every dtype name'
+    if formatd is None:
+        formatd = dict()
+
+    for i, name in enumerate(r.dtype.names):
+        dt = r.dtype[name]
+        format = formatd.get(name)
+        if format is None:
+            format = defaultformatd.get(dt.type, FormatObj())
+        formatd[name] = format
+    return formatd
+
+def csvformat_factory(format):
+    format = copy.deepcopy(format)
+    if isinstance(format, FormatFloat):
+        format.scale = 1. # override scaling for storage
+    #    format.fmt = '%r'
+    return format
+
+
+def rec2csv(r, fname, delimiter=',', formatd=None, missing='',
+            missingd=None, withheader=True):
+    """
+    Save the data from numpy recarray *r* into a
+    comma-/space-/tab-delimited file.  The record array dtype names
+    will be used for column headers.
+
+    *fname*: can be a filename or a file handle.  Support for gzipped
+      files is automatic, if the filename ends in '.gz'
+
+    *withheader*: if withheader is False, do not write the attribute
+      names in the first row
+
+    For *formatd* entries of type FormatFloat, the scale is overridden
+      (set to 1) when writing CSV, but the precision is unchanged.
+
+
+
+    .. seealso::
+
+        :func:`csv2rec`
+            For information about *missing* and *missingd*, which can
+            be used to fill in masked values into your CSV file.
+    """
+
+    if missingd is None:
+        missingd = dict()
+
+    def with_mask(func):
+        def newfunc(val, mask, mval):
+            if mask:
+                return mval
+            else:
+                return func(val)
+        return newfunc
+
+    if r.ndim != 1:
+        raise ValueError('rec2csv only operates on 1 dimensional recarrays')
+
+    formatd = get_formatd(r, formatd)
+    funcs = []
+    for i, name in enumerate(r.dtype.names):
+        funcs.append(with_mask(csvformat_factory(formatd[name]).tostr))
+
+    fh, opened = cbook.to_filehandle(fname, 'wb', return_opened=True)
+    writer = csv.writer(fh, delimiter=delimiter)
+    header = r.dtype.names
+    if withheader:
+        writer.writerow(header)
+
+    # Our list of specials for missing values
+    mvals = []
+    for name in header:
+        mvals.append(missingd.get(name, missing))
+
+    ismasked = False
+    if len(r):
+        row = r[0]
+        ismasked = hasattr(row, '_fieldmask')
+
+    for row in r:
+        if ismasked:
+            row, rowmask = row.item(), row._fieldmask.item()
+        else:
+            rowmask = [False] * len(row)
+        writer.writerow([func(val, mask, mval) for func, val, mask, mval
+                         in zip(funcs, row, rowmask, mvals)])
+    if opened:
+        fh.close()
+
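+def rec_keep_fields(rec, names):
+    """
+    Return a new numpy record array with only fields listed in *names*.
+    (Restored from matplotlib.mlab, which this module appears to be
+    adapted from; rec2txt below calls it when *fields* is given.)
+    """
+    if cbook.is_string_like(names):
+        names = names.split(',')
+
+    arrays = []
+    for name in names:
+        arrays.append(rec[name])
+
+    return np.rec.fromarrays(arrays, names=names)
+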
+def rec2txt(r, header=None, padding=3, precision=3, fields=None):
+    """
+    Returns a textual representation of a record array.
+
+    *r*: numpy recarray
+
+    *header*: list of column headers
+
+    *padding*: space between each column
+
+    *precision*: number of decimal places to use for floats.
+        Set to an integer to apply to all floats.  Set to a
+        list of integers to apply precision individually.
+        Precision for non-floats is simply ignored.
+
+    *fields* : if not None, a list of field names to print.  fields
+    can be a list of strings like ['field1', 'field2'] or a single
+    comma separated string like 'field1,field2'
+
+    Example::
+
+      precision=[0,2,3]
+
+    Output::
+
+      ID    Price   Return
+      ABC   12.54    0.234
+      XYZ    6.32   -0.076
+    """
+
+    if fields is not None:
+        r = rec_keep_fields(r, fields)
+
+    if cbook.is_numlike(precision):
+        precision = [precision]*len(r.dtype)
+
+    def get_type(item,atype=int):
+        tdict = {None:int, int:float, float:str}
+        try: atype(str(item))
+        except: return get_type(item,tdict[atype])
+        return atype
+
+    def get_justify(colname, column, precision):
+        ntype = type(column[0])
+
+        if ntype==np.int or ntype==np.int16 or ntype==np.int32 or ntype==np.int64 or ntype==np.int8 or ntype==np.int_:
+            length = max(len(colname),np.max(map(len,map(str,column))))
+            return 1, length+padding, "%d" # right justify
+
+        # JDH: my powerbook does not have np.float96 using np 1.3.0
+        """
+        In [2]: np.__version__
+        Out[2]: '1.3.0.dev5948'
+
+        In [3]: !uname -a
+        Darwin Macintosh-5.local 9.4.0 Darwin Kernel Version 9.4.0: Mon Jun  9 19:30:53 PDT 2008; root:xnu-1228.5.20~1/RELEASE_I386 i386 i386
+
+        In [4]: np.float96
+        ---------------------------------------------------------------------------
+        AttributeError                            Traceback (most recent call la
+        """
+        if ntype==np.float or ntype==np.float32 or ntype==np.float64 or (hasattr(np, 'float96') and (ntype==np.float96)) or ntype==np.float_:
+            fmt = "%." + str(precision) + "f"
+            length = max(len(colname),np.max(map(len,map(lambda x:fmt%x,column))))
+            return 1, length+padding, fmt   # right justify
+
+
+        if ntype==np.str or ntype==np.str_ or ntype==np.string0 or ntype==np.string_:
+            length = max(len(colname),column.itemsize)
+            return 1, length+padding, "%s" # left justify // JHB changed the 0 to a 1
+
+        return 0, max(len(colname),np.max(map(len,map(str,column))))+padding, "%s"
+
+    if header is None:
+        header = r.dtype.names
+
+    justify_pad_prec = [get_justify(header[i],r.__getitem__(colname),precision[i]) for i, colname in enumerate(r.dtype.names)]
+
+    justify_pad_prec_spacer = []
+    for i in range(len(justify_pad_prec)):
+        just,pad,prec = justify_pad_prec[i]
+        if i == 0:
+            justify_pad_prec_spacer.append((just,pad,prec,0))
+        else:
+            pjust,ppad,pprec = justify_pad_prec[i-1]
+            if pjust == 0 and just == 1:
+                justify_pad_prec_spacer.append((just,pad-padding,prec,0))
+            elif pjust == 1 and just == 0:
+                justify_pad_prec_spacer.append((just,pad,prec,padding))
+            else:
+                justify_pad_prec_spacer.append((just,pad,prec,0))
+
+    def format(item, just_pad_prec_spacer):
+        just, pad, prec, spacer = just_pad_prec_spacer
+        if just == 0:
+            return spacer*' ' + str(item).ljust(pad)
+        else:
+            if get_type(item) == float:
+                item = (prec%float(item))
+            elif get_type(item) == int:
+                item = (prec%int(item))
+
+            return item.rjust(pad)
+
+    textl = []
+    textl.append(''.join([format(colitem,justify_pad_prec_spacer[j]) for j, colitem in enumerate(header)]))
+    for i, row in enumerate(r):
+        textl.append(''.join([format(colitem,justify_pad_prec_spacer[j]) for j, colitem in enumerate(row)]))
+        if i==0:
+            textl[0] = textl[0].rstrip()
+
+    text = os.linesep.join(textl)
+    return text
+
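+# Illustrative only: a minimal round trip through rec2txt/rec2csv above,
+# using a small hand-built record array (names and values are made up).
+#
+#   import numpy as np
+#   r = np.rec.fromrecords([('ABC', 12.54, 0.234), ('XYZ', 6.32, -0.076)],
+#                          names='ID,Price,Return')
+#   print rec2txt(r, precision=[0, 2, 3])
+#   rec2csv(r, 'prices.csv')
+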
diff --git a/setup.py b/setup.py
new file mode 100755
index 0000000..e57400f
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,37 @@
+from setuptools import setup, Extension, find_packages
+
+import os
+import sys
+
+vFile = 'pbh5tools/_version.py'
+
+_ReadVersion = '0.0.0'
+if os.path.exists(vFile):
+    lines = open(vFile, 'r').read().splitlines()
+    for line in lines:
+        elts = [e.strip() for e in line.split('=')]
+        if len(elts) == 2 and elts[0] == '__version__':
+            _ReadVersion = elts[1].replace('\'', '').replace('"', '')
+            break
+
+setup(
+    name = 'pbh5tools',
+    version=_ReadVersion,
+    author='Pacific Biosciences',
+    author_email='devnet@pacificbiosciences.com',
+    license=open('LICENSES.txt').read(),
+    scripts = ['bin/bash5tools.py',
+               'bin/cmph5tools.py'],
+    packages = find_packages("."),
+    package_dir = {'':'.'},
+    ext_modules=[Extension('pbh5tools/ci', ['pbh5tools/ci.c'],
+                           extra_compile_args=["-O3","-shared"])],
+    zip_safe = False,
+    install_requires=[
+        'pbcore >= 0.8.0',
+        'numpy >= 1.6.0',
+        'h5py >= 1.3.0'
+        ]
+    )
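+
+# Typical developer usage, via the standard setuptools commands:
+#
+#   python setup.py build_ext --inplace   # compiles pbh5tools/ci.c
+#   python setup.py install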
diff --git a/tests/cram/bash5tools.t b/tests/cram/bash5tools.t
new file mode 100644
index 0000000..52807bb
--- /dev/null
+++ b/tests/cram/bash5tools.t
@@ -0,0 +1,312 @@
+
+  $ . $TESTDIR/portability.sh
+
+Set up some vars ...
+
+  $ INH5=`python -c "from pbcore import data; print data.getBasH5s()[0]"`
+  $ MOVIENAME=$(basename `python -c "from pbcore import data; print data.getBasH5s()[0][:-7]"`)
+  $ CMD="bash5tools.py $INH5"
+
+  $ $CMD --readType=ccs
+  $ head ${MOVIENAME}.fasta
+  >m110818_075520_42141_c100129202555500000315043109121112_s1_p0/8/ccs
+  ACGGCAGGATGCCCGTTCTGCGAGGCGGTGGCCAGTGTAATGAGGTGCTTTATGACTCTG
+  CCGCCGTCATAAAATGGTATGCCGAAAAGGGATGCTGAAATTGAGAACGAAAGCCTGCGC
+  CGGGAGGTTGAGAACTGCGGCAGGCCGACGGCAGATCTCCAGCCAGAGAACTATTGAGTA
+  CGAGCACCCATCGACCTACGCGTGCCGCAGGCCGACGCACAGGAACTGACAGAATGCCAG
+  AGACTCCCGCTGAAGGTGGTGGAAACCGCCATCTGTACTGTTTTCGTGGCTGTCGCGGAT
+  CGCAGGGTGAAATTGCCAGTATTCTCGACGGGGGGCCCCCTGTCGGGGCAGCGGCGTTTT
+  >m110818_075520_42141_c100129202555500000315043109121112_s1_p0/9/ccs
+  ATGTTGCCATCCGTGGCAATCAATGCTGCTAACGTGTGACCGCATTCAAAAATGTTGTCT
+  GCGATTGACTCTTCTTTGTGGCATTGCACCACCAGAAGCGTCATACACGGCTTAAACAAG
+
+  $ $CMD --readType=unrolled
+  $ head ${MOVIENAME}.fasta
+  >m110818_075520_42141_c100129202555500000315043109121112_s1_p0/7/0_1578
+  TGAAGCGGAAGTGAATCAACAGGTTCTGGCGTTAGATTTCTCTACGTTGCCCCCATGCAG
+  TTGTTTAACGTAACCTCCGAGCCACACCGGCAACTCAACAACAAAGGGTGAGTGGACATT
+  TCCCCAGACTTTCTTCCGGCGGGGGTTTTGCCCAAAATCAACTTTGTAACCCGAAAGCGG
+  TGATACGCCCGAGCGTAATTGGCCACGCATATCCCCTGTTCGAACGCTCTCACTCGCTCC
+  GGTACGCGGAGAAATGGTCCGGTGCATGCTCCCACCTCGCCGGGCTCGTTCAGGAACAAA
+  GCTTTAACACCAAGCCAACAAGAGGAGGGAAAAGAGAGAGGATTAAAACCCTTGGGCCTG
+  AAACAGAAACCCCGAGGCCAGAAGTGGGAGCATGACAACCGGCCCATTCTGCAGCGTACG
+  TGATCGCGGAAGCTGCGAAACAGGGGGATTGCGGACACAAATTACGGGCTCGGACGTATC
+  ACGCTTCAGATCCAACGACTAGCCAAAACCCGCCCGGAAAAGAAGTGCCCCTATGAAAAT
+
+  $ $CMD --readType=subreads
+  $ head ${MOVIENAME}.fasta
+  >m110818_075520_42141_c100129202555500000315043109121112_s1_p0/7/0_299
+  TGAAGCGGAAGTGAATCAACAGGTTCTGGCGTTAGATTTCTCTACGTTGCCCCCATGCAG
+  TTGTTTAACGTAACCTCCGAGCCACACCGGCAACTCAACAACAAAGGGTGAGTGGACATT
+  TCCCCAGACTTTCTTCCGGCGGGGGTTTTGCCCAAAATCAACTTTGTAACCCGAAAGCGG
+  TGATACGCCCGAGCGTAATTGGCCACGCATATCCCCTGTTCGAACGCTCTCACTCGCTCC
+  GGTACGCGGAGAAATGGTCCGGTGCATGCTCCCACCTCGCCGGGCTCGTTCAGGAACAA
+  >m110818_075520_42141_c100129202555500000315043109121112_s1_p0/7/343_991
+  TAAAACCCTTGGGCCTGAAACAGAAACCCCGAGGCCAGAAGTGGGAGCATGACAACCGGC
+  CCATTCTGCAGCGTACGTGATCGCGGAAGCTGCGAAACAGGGGGATTGCGGACACAAATT
+  ACGGGCTCGGACGTATCACGCTTCAGATCCAACGACTAGCCAAAACCCGCCCGGAAAAGA
+
+  $ $CMD --readType=ccs --outType=fastq
+  $ fold -80 ${MOVIENAME}.fastq
+  @m110818_075520_42141_c100129202555500000315043109121112_s1_p0/8/ccs
+  ACGGCAGGATGCCCGTTCTGCGAGGCGGTGGCCAGTGTAATGAGGTGCTTTATGACTCTGCCGCCGTCATAAAATGGTAT
+  GCCGAAAAGGGATGCTGAAATTGAGAACGAAAGCCTGCGCCGGGAGGTTGAGAACTGCGGCAGGCCGACGGCAGATCTCC
+  AGCCAGAGAACTATTGAGTACGAGCACCCATCGACCTACGCGTGCCGCAGGCCGACGCACAGGAACTGACAGAATGCCAG
+  AGACTCCCGCTGAAGGTGGTGGAAACCGCCATCTGTACTGTTTTCGTGGCTGTCGCGGATCGCAGGGTGAAATTGCCAGT
+  ATTCTCGACGGGGGGCCCCCTGTCGGGGCAGCGGCGTTTT
+  +
+  #$'0'&../1.*+*+*))02)'1&&,//2..%#'%#&&11692,-/..&&&26:1,+292*)&%)(.02:,,,,2//,,&
+  )676&''$%%%%.343(&,+-1*(+$$$9&&&&))-(&-''-.-$$,&88*),.<:2/.00522,$!"!'((3'#%)7.-
+  /:$$$.%114*-$#,/0'&'20)(9*&/10..0.($,,;5$$2/-.3)((%(101))230))/,/(/,%%1/***0,--3
+  +*(%%$%!&089&)(')+,-78,,,+++$'2/.0&&123&"$$#(((&&002*&'%#''3,'##',*''/66475481&0
+  .---9@66?,,,*+'$$$$$%?55++%%,,.&&//&"%)*
+  @m110818_075520_42141_c100129202555500000315043109121112_s1_p0/9/ccs
+  ATGTTGCCATCCGTGGCAATCAATGCTGCTAACGTGTGACCGCATTCAAAAATGTTGTCTGCGATTGACTCTTCTTTGTG
+  GCATTGCACCACCAGAAGCGTCATACACGGCTTAAACAAGTGCGTGACCAGGTGGGTTGGGGTAAGGTTTTGGGATTAGC
+  ATCGTCACAGCGCGATATGCTGCGCTTGCCTGGCATCCTTGAATAGCCGACGCCCTTTGCATCTTCCGCACTCTTTCTCG
+  ACAACTCTCCCCACAGCTCTGGTTTTGGCAATATCAACCGCAACGGCCTGTACCATGGCAATCTCTGCAATCTTGCCCCC
+  GGCGTCGCGCACTACGGCCAATAATCCGCATAAGCGGAATGTGCGAGCACTTGAGTACCTTTGCCTTAGTATTTCCTTCA
+  AGCTTTGCCACACCCACGGTAATTTCCCCGATACCTTGTGTGCAAATTGCATCAGATAGTTGATAGCCTTTTGTTTGTCG
+  TCTGGCTGGAGTTCGTGCTTACCGCAGAATGCAGCCATACCGAATCCGGCTTGTGATGCGCCATCCCCATAGCAGCCATC
+  ACATCATACCGGAAAGAGAGGTCAGAGCCGTGGCCCCGTGGTGAGTCGCTCATCATCGGGCTTTTTGCGATGAAATTTAG
+  CTA
+  +
+  $#$,,.//.267,+/02++,4''0/))333+,,.*+-.,+2+),()6&&&&&/3++).22+-++124+*4,)+++,+')0
+  1+,+14(!%&,,-23''/,,,)),()-)&&((1&'',&&1.%*),/''.1*+3+*+**'(((+),&'$'''(((*&&4+4
+  4,,444344+*())234,,,,3((2++,&&-01,)5,,')(+134+*/3410&'&&'',34,,''&)),/./3()))244
+  4202,,3+#$$$$11123,%'')**)+,,124,,++1''.,&'5++022,,31242-..''+)+133,%&,++,,*****
+  /144332+("$+)14++'&++)(++*++*+,&(3)%(10++,+343/24400+&%/3''-../$$$,222401002124&
+  &(4++*,+,,,+###$,),-'''''****22+51212+,+())*21,,)),+,444434+++()*000(((((..+)*2.
+  $$2((&(&&11(',),,(01,&'&%0,+220)()$%&34011%%")*%&++,,%%-%%-*%%&-%%%%%-+%%,)&)(++
+  ++--%"!&%+%$"%%%%"**,--,%"$)*),%%$***+-*++,'$%++,%#%-,---%%%+%%%%%""$%%-+,+%%%%-
+  %$$
+  @m110818_075520_42141_c100129202555500000315043109121112_s1_p0/1000/ccs
+  CACGAAGGTCGATGCGGCGACCAAATCGTTGTAAATCCCCGTAAAGGCAGATGCGCGCCATGCCGGTGACGCCAGAGGGA
+  GTGTGTGCGTCGCTGCCATTTGTCGGTGTACCTCTCTCGTTTGCTCAGTTGTTCAGGAATATGTGCAGCAGCTCGCCGTC
+  GCCGCAGTAAATTGCGCGTGATTTCGGCACTGATGAACCAAAAACAGCACAGCCAGCACATCCGCCCGGCTGTGCCGCTG
+  ACAACGGCACCTGATACAGCCCCGTCGCCTCCAGATTATCCTGATAATCTGGAGGCGACGGGGCTGTATCAGGTGCCGTT
+  GTCAGCGGCACAGCCGGGCGATGTTGGCTGCTGTGCTGTTTTTTGGTTCATCAGTGGAATCACGCCGCAATTTACTGCGG
+  CGACGGCGAGCTGCTGCACCATATTCCTGAACAACTGAGCAAACGAGAGGAGGTAACGACAAATGGCAGCGACGCACCAC
+  TCCCTCTGCAGCACCCATGGGAATTCATTTAACATTTAAGACGATTTGGTCGCGCATCGACTT
+  +
+  '',.-..45./0662.0128&&+,+,''5670087('/000%%11,---/978))4''..9'$''18;;:*)5///,//8
+  1038225520*+873..))11222002600///82/222+++022((2/11'''-/1(((.-%%((;--::22%%77:;6
+  4))+,.3$$11%%28'&007;&&&)'),27621/-)*..&'(((*-,+/''.+,8:9710(&)/%%%$'.02281110%#
+  '-,,3,-2'$'/0;('5;7////5222()1,-,/&&)*'&/$&&,%&#'/)*+12111'&...1/0.)(')**,''0200
+  14783,%&4..4+*3,,,.1-.-$%$#+092166**/,%&&&&&/1020029*15("''&+210-.$%11)('0,.-&))
+  ).-2/0/.+.&../6/.4**633,,&'3+&*$%%#(/.-3)))+.+/+$$1)),(%!#/%'''&''.''.')/%*&&#!,
+  '#&%'&.!%%&$&"#"&%(&#'$+&#$***((.*''',+!%',-&((++1,)!!!,00$/($&
+  @m110818_075520_42141_c100129202555500000315043109121112_s1_p0/1006/ccs
+  CCATACCTCCCCTTATAGGAACACGCTTTGCCAAGGCTTTTCACCGCTGTCGCCAGCGCTCGGAATTAGAAAGCCCACCG
+  ACCGGTTATCCAGCCTTCTCCTTGCCACGCGCTTTTGAAGGACACGGAACCAGCCGCCCCAGCTTGTCGGATGCGAAACA
+  CGATGATATTCACCACCCCGCGCCTGGCGTCCTCAGCGCGTAGGCCTTCGATATCGTCGGTGGGTCATACGTGGACTGTC
+  CGCTTGCTCACTCCGTGCCGCCGGACGCGTGATGTTATCTCCTCACTGCGGCCCATATCCACCTCAACCGGATCGAAGCT
+  TTCACCGGGTCCGGTGTATTTGCCCTTAAGCACGGCAGAA
+  +
+  *()+)%!(++,+&$'-+&&!#($"$-+,,-+,$!'*#)++*%#$%!!$$)$!"$$%%"$%%$%$%$%%%$$$!$%$$$%%
+  %$$"##%%%$%%%$$##"#$%$%%##$%%"%%$%%$%$$###"%%#$#$##$%#$%$#"$%%!"%"%$#$%%%%%$%$%%
+  %$%%%%#%$%%%"$#"$"$$!$$$#$%%##""%%%!%%%%%%$%""$$%%%%%%&)--)*)(+)-%%,*,&!&&+-#"%)
+  $$"$*%%-%$--+,--*(*+*+%%#$%+,%%%%%$)'#$-+,-+,--,-'$''()&--'&()++++,*+%$!$%%+)$$%
+  "")-('!!!"$"!"!%--'"'($%"$##),--%%%)%!!$
+  @m110818_075520_42141_c100129202555500000315043109121112_s1_p0/1007/ccs
+  AAAGTAATAATCATGGCCCGGTTGATATTGATACTGGCGGTATCCAGTACAGCGCCGTACCAAGATAACGCGTGCTGGTT
+  TCAACCTGTCTGATATCCGCAATCTGCTTTTCCGAGAACCAGAACTCAAACTGTACCGTCGGGTCATAAACGGCAAGATG
+  CGCGGTGGCGGGTTATCTGAAAATAGCCGGCGTCAGCTCAATCCCTCGACGGTGGCTGCCGGTGCCGGCAATCCGGAACG
+  ATACCGACGCCGGATCCGCCCTGCTGCCCACGCATTTACCGCCCCGGACCTGTCAGCTGTAGTTCCCCAGCGCCAGTTGC
+  GTGGAAGCGGTATGTGGTTTTCCCGTCGTCCGGCCGTGCTGACCAGCCGCTCAACTGGCCGTCGTCCGCTGTTACGGTCA
+  GACGGAGCAGGAAAACTCACGCCCTTCACCACCTTCGGTGTGTCCCATTCCGCGCCAGCACCTGATATTCCCCGCTGTCT
+  GCAGATGACTTCCTGCGGTCAGGTGCTGCCCGCTGGCGGCGTGACACCATTCACCGGTGGCCACCTCTGTTCGCCGTCAA
+  +
+  ///13-.900922;12())23'''58522/-<;9911)''%22+++65228;:+')-32//..12977/-'892*+(('/
+  /.-/6899;:8(%'99/075476564-()))(2,+.+++21,))*,'&+*+,,,402443./.0444$$$()+,11+(,3
+  +'(&(1$$,%%%.2220+)&'''/21,,&(1+,003-.4214'''+2++2-..$$44401-./4'&((&''$'',./2-(
+  )3,--.1($(0()03''.())),)*1$$$$43,*/002002''&#*,1011+++,,#"$$(212++++,,,3./0/)**,
+  '))).0'*(*/4441+,&''''''1++00/.$$$2,))01,,+,)*%%0(33$$)3''0012444((5440%%&,+11((
+  ,+0+,,2,,./''&&32++,+'(()123(),1222+&(.))44'''*%%))532/123%%++++,)+(()***+,4,,22
+  4,,2'/-(''.++242++,//-,/2,,,'#))+,*+-''103,)+,)+,**+--1((3'&&&)&&5444++),%%&3,)+
+  @m110818_075520_42141_c100129202555500000315043109121112_s1_p0/2001/ccs
+  GAACTTTTATGAAAACCACGTTGAGCCGACTATTCGTGATATTCCGTCGTGCTGGCCGCTGGCCCCCGTGGTATGGCAAA
+  AAGCACCGTGGATACACGCCACCATGAAGCGTTCACTAATGGCGTGGCTTCTGGTGCCTGGGCGGTAAAGCGGCAAAAAA
+  CTACCGTGAAAAAGTCGGTGGATGTGGCGGGTTATGATGAACTTGGCTGCTTTGATTGATGATATTGAACAGAGGCTCTC
+  CGAGTCCTGGGTGAAACAAGCGTATTGAAGCTCGGTCCTGGCCCAAAGTCCATCCGTGGCTTTCCACCGCCAAAAGTGAG
+  AGGCACCTAGTCAGACTTGAACGCGG
+  +
+  -)++%%%%"%%%,,+%%%-%%)%""$*)*,-$"%%%$%%,,++**+"++##%%"%#)-++*+%%%%"#$)*)*-%%""%%
+  %$%--&%#!%*+)$$--$!##*+),,++'(+%"%-%++,-(()#$&&*+++$!&+()*-)***()*+,,(%%,-$%%%%%
+  +%%)+,-,$#$$$)%#"%%*+-((-+,-*+++,,,----+,,+,+,+*,,%%%%-$$,,++-*-++$!$+-$"$*+++-+
+  '%!%%$)!%$$$%$%%%$%%%$$%$%#$%%%%%###$%%$%#%$$%%%"$$%%$%%%#$$#$$#$$$%%$$##"$#%$"#
+  $$%$$$%%$$%%%%$!#%%$$!%%$%
+  @m110818_075520_42141_c100129202555500000315043109121112_s1_p0/2003/ccs
+  CAGTGACGACTGCGTTTGGTCTGTCCAGACTAAAGTTGAATGATTAGCAGTATGGTGATCAGTCAACCACCAGGGAATAT
+  CCTTCATATTTTATCGGCTTCACCAACGCTGCCTCATGCTCTGAAGCTCAGAACACTTATGTTCTATACATGCATTAAAC
+  AAGGGTAACTCATAGAATGGTGCTTTAACATTTTTTACACGATCAGATCCGGAGGTCCATCAGCCAGATGTTCTTATTCT
+  TTGTCTCCATGCGTTGCTCTTCATTAGCGGTTTATTACATTTTCGTATGGATTTGAGCACGTGGGCCTTACATACATCGT
+  CGTTGTATTCCCTCCAGAATGGCCAGGCAGGCGCACTTTTTACGACCAT
+  +
+  -7;44''%-?;772$!!*,&217>,'13*''&,,2--.68759&&'/4*+))3++43@80062-(*44<)),'((,-.+/
+  76&&'**<%%%%8))8)45>78-.3446862)*/9$"21&#%>--(0*%&</%%2**2+*.%&&258;?.4.-+00$##0
+  *#(*)+(*71;/71)--$--2-+*$+.-,'$$$$$$2%$.2(+,200#+))-8(()!&78:23%$.6.&*,,'$(-11.)
+  --)%##%&*01.)"#%$39)&*/'&0*2./%%#&&&'!!'/0020.02'%2010-#$0'*&$%++,.12,.6+%3/;.%.
+  4&%-/1/(($''&%&'857:**-,)&&*(53--.//%&%$%,4).-.++
+  @m110818_075520_42141_c100129202555500000315043109121112_s1_p0/2007/ccs
+  TGACGCTGGCATTCGCATCAAAGGAGAGTGAGATCGGTTTTGTAAAAGATAACGCTTGTGAAAATGCTGAATTTCGCGTC
+  GTCTTCACAGCGATGCCAGAGTCTGTAGTGTCAGATGATGACCGTACTCAAACATCGGTTGAGTATTATCTTACTGTTTC
+  TTTACATAAACATTGCTGATACCGTTTAGCTGAAACGACATACATTGCAAGGAGTTTATAAATGAGTATCAATGGTTA
+  +
+  7LEA.*B555;...=>9:D9:900<9=A776C5;B88////::&&&&5=?)))00=CFAF8999>71143<)))+DIH?4
+  <<>33<@KE@77:;<:<8??-98<44:;3222K=<E*,G@;5686;>6;1119KEC))&+2.5:/66*=4445<E;8::;
+  444:IK>...4I5566BAC?6''96667CFAB555629<F@HJGAB55>?66*:3444HCCCLIIIC?=F<<C9%11=
+  @m110818_075520_42141_c100129202555500000315043109121112_s1_p0/2008/ccs
+  CGGCAAATGTCATCGACGTTTTATCCGGAAACTGCTGTCTGGCTTTTTGATTCAGAATAGCCTGACGGCAGCTGGCGAAG
+  GGCGTTTCCTGCTGAGGGTCATTGAACAGTCCCATGTCGGCAGCATAGCACACAGAATATGAAGCCCGCTGCCAGAAAAA
+  TGCATTCCGTGGTTGTCATACCGGTTCTCTCATCTGCTTCTGGCTTCGCCACCATCATTCCAGCTGTGAAAGGGATGCGG
+  CAACGATGAAAATCCTTCGGTCTGTTTCTACTGGTATTGCAAAACCTGATCCAATTTGAGCAAGGCTAT
+  +
+  969C(((G,,B8+KA7:8.....=6666'''@D;00@D7722@)))))+<---=>))0-6..3@?+**0/$03%%-&)((
+  (()*&*)+:-1-0<=+*&-=H33F?>/'(3++*0C==?564,-10E),22-1'&1*+4;<<&&+%%%(103010=,,,,,
+  2:4+66332<4412:<))A?5453$$$'210/88.-/44G;32,'))6..A22).51..'(899%$,2455022%>@=00
+  ;.883*/>%%%"%#)**=))332-(+*/,><(%&&?33'%+%,,:<9715((11%&&6.:2$#23'30>
+  @m110818_075520_42141_c100129202555500000315043109121112_s1_p0/3006/ccs
+  AGCGTGTTATCCCGGTGCTTTTTGCCATACCACGGGCCAGCGCCAGCAGCGACGGAATATCACGAATTAGTCGGCTCACG
+  TGGGGTTTTCATAAGTTCTTCGGGCATCCCATTCCGTCGGCAACCAGATAAGGGGTGTTGCGGCTGCTTATGCTTATAAA
+  GTAGGCATACACCCAGCAGCATTTTGGAATAACCGACCACCGGGCAGACTTCACCACATTCAACCTCACGGATGTAGTCG
+  CTGGCCCATTCCATTCATTGAATGGCCCGCTGAAAGGGCAGTGTTTTCCCAGCCGCCCTTCCTGGTATGCGGATTCTTTC
+  GGGAGATAGTAATAGCATCCGCCTTCAACCGGCGGTCTGGCTCCGGCCCTGAACAGTGAGCGAGCCCGGCGCGGACAAAA
+  TGCCGCAGCCTGTTAACCCGACGTCTCGGAATTATCTCACCTCCAGCCACCCCCCCGGTATCAGTTCATCCAGGCGCGGG
+  CTGCGTGTCAGGCTTGATGATATCCCGTTCAGGAAACAACATGTCGGTTTTCCAGTTCCGGAACGCCCGCTGCACCGACA
+  GGGAGCCCGTCGAGAACGATTCACCTGGATCCGGACACCGAAAAGTACAGAATGCGGTTCCACCACTTCAGCGGATCCTG
+  GCATTCTTCAG
+  +
+  3/1''5**43(((#&793$$$$$*&('4)*+04%&%&62,&+,-,21./22-7,,+33-0268,''((;75-////0+))
+  ("$##%''')77-,*,45014''&-3,%#'+&&+,)++$%&//+,,33,),(***/++2.*#$20,+/1,42#"#.3&&&
+  '*%-.*,2$#*(++2/*),+*++++.0'''&(012'$$.&%&&&(+0*4*+33''&'))+(%'-,+,2,--.2,)''*.)
+  '5%%(((,$$)&+-+"((')$&2..&'&.*+0***#$#'&*,($''&$'&(-**,&''.0*+211+*+/((11++,+++)
+  (**.))2/1.//$"-4+-.,(')#%$+0&$()))(('%()%'--))&&'-1'&1&(((('24(()))-.)%'(0.1))))
+  +**+))($')+(""%%'$%.0-#')/+$$'%*),0*((*%&-(()3%##"$$$$$$!%.('022((+*++++%&**/"#"
+  $'(%!$$(((!%+$"#$)-+$+#"%$%$%%+++$%##$,,++,+-+,+,,,+,,-%%%+%%$*)%!##'##+++*,,,--
+  %%%%&%**-,---+$#!!"!!$*''###,,)(%%+*,%%,&&&%'$%%%-**)$%+,%##!"#((*$$%-,+)(%#$!%)
+  %-((,'),-)%
+  @m110818_075520_42141_c100129202555500000315043109121112_s1_p0/3008/ccs
+  ATTTGCATATACCCATTGCTCACGAAAAAAATGTCCTTGTCGATATAGGATGAATCGCTTGGTGTACCTCATCTACTGCG
+  AAAACTTGACCTTTCTCTCCCATATTGCAGTCGCGGCACGATGGAACTAAATTAATAGGCATCACCGAAAATTCAGGATA
+  ATGTGCAATAGGAAGAAAATGATCTATATTTTTTGTCTGTCCTATATCACCACAAAATGGACATTTTTCACCTGATGAAA
+  CAAGCATGTCATCGTAATATGTTCTAGCGGGTTTGTTTTTATCTCGGAGATTATTTTCATAAAGCTTTTCTAA
+  +
+  `RRRCCS]YPQBBBCWWURUGGUR66666667U\MMFFFOFFa````%%%Q]>>XYaeLLNR\QQZIIK]XZO4[RONQ]
+  ////0LLRTRRGGGSNJJ)))Rd_LLTNOQOCCNJLFFMYY`NNMTNMFFF%%55OO<<::[]UGHL7777BBDYZZVT=
+  =]`WCCMNNVAANNN7777G^aa]ZWY_;;;;;;<^VV``;;XLLYHHUGGG>,,,,ZNNNSZ:::::OROOYTMNNEEE
+  Y@@RLM[RR`_ZX]`??EEHKCCADBBXHHHEEEF33333?PHI\NNXY_OOR----<c[888?V>>>>EP,,
+  @m110818_075520_42141_c100129202555500000315043109121112_s1_p0/4004/ccs
+  AAATGCTTCCTTCGCGCTTGGAAGGCAAACCCGAAAAGAGGCTGAACGGCACGAAATTCCCAGTTACAATGCGCAAACAC
+  AAAGCGAACAGTTGGCCGTCCAAACGGACGACAATAACCGGACGAGGGACGGGAGGGCGATAACGGGCAAACATGCAGGA
+  GGGAAGAATTTTTTGGCAAAAAAAATGTTTTTTTTTTTTTTTTCGTCTTTTTTTTGCTTTTTCCTATTTTCAAAATTTTT
+  TTGGGAATTTTTTTTTTTTTTTTCAAAACGTTTTTTGGTATGAATTATTTCCAGATATTTAATTTTTTCTCGTAGAACAA
+  ATAACCGGCCCATCTAATGAAAAAAAATATAAAACCGTTGTTTCAAATAACAGCGGAAAAAAAGTGTTAATTAGATTAAA
+  AAACAAGCTTGTATGACAAAAAAAACCCCCCTCTGAAGTGTTCTTCTGACGGTCGTCAGGAGAAGTGGTGCTGTGGGCCC
+  TGAGACGCCTATAGGGACACGAGAACATGCGAGGAAAAAAAATTTCAGCAGCAGCGCCGGGATTGGTTGGAAGGCGGGAG
+  TCAGTTCGCGGTACTTGAGGAGGCGCGCAACGTCGCCAGCTGTCTGCACA
+  +
+  """"()%%''*+#(')1$$/.(("$&!##!#$1&&$"&2,,$,(("$./-1$!&'($'+%$##!"(*$$'+*%$"$$*+.
+  &"$*+&('(")&($#'$#',-$$$"%%.+*'!##&$%)+$$*'/("##!("$#!!"$"$"%##$$##"#$&&)$"!$$#"
+  "'%#%)##$*)&((#&!$$$$$$$$##!$$$#"#$$$$$$$#""$%$&&%$###$(($%%%$"$&"#&%&%"$$##%&%$
+  %&'&"#$!$$$$$$$$#$$#"$"##$#!!#!$"!#$&(%$#%'$&&$&&(**++&!"$$$"$&&&%&&&$'"#"$'*!"#
+  !%$$##"##$#!$&#$)%&"######"$"&%%$&')#"$##*'%('%!"%$%$%#"''#%%''%&%$%!#$%%$)'###"
+  #$#%%%%%$%#$"$)$!!!!!!!!!"%##$#!!#"%'$""'&&%%&%$$%$%$*%!(,$##$""'&&%"*#)($%%&!$#
+  #""%&'$"$'%%)&"$'%#$!!$*'&&&%!""#$"#$$$$$#''$*+$#'&#),,'(($#$($$"'**)**+()*$$!%(
+  ++%%%#%#-%%$(+%$%#$%&+,+%"*)##!,('%""",)%%,,*+,)##
+  @m110818_075520_42141_c100129202555500000315043109121112_s1_p0/4005/ccs
+  GCTGCAACTGAGCACATACGCAATCTGCTCCGCCGACACGTTATGGAAGCTGCGAGCCATCGCGTCAGCCCCACGTCGGC
+  GTTCTGTGGTCAGCTCCCGAAGGCTTTCAGCGACCTTGTCCACTCCACGCCGATGCAGAGGAGAAACGGCGCCACACTCT
+  GGCCTGATGGACGCAATCTGAGCCTCACCGCTTACCCCGCCCTTAACCAGTGCGCTGAGTGACTCTTGGTCTGGTTAAAC
+  GTCAGCCTGCCCGCCTGCCGCCTCTGGACGAGGACCGCCATACGTCTGCCGTTCAGTCCGCCTGATTGCCGAAAGGACCC
+  AGGCGTTTTGTTGAAATCGGAC
+  +
+  ..21,)+$0++,'&+44/3*'(0024212/0..001+,/21+44*)((%-/$$-02./1232%%+),2&&&$#%&&''')
+  $#&*1/0..&+%,-&#'&%!%++0&&&4++&&301+++211*$#(0+*301*)*'(1%$)-*/&&%'&)-'$%#*+,,)+
+  /1$"&()4++**,/+.//.++3-,%%1002,&($!!"!#"$$+/&"+++---002-&"%*3,'###'&'*1.1212)'(*
+  --%%*$%%)"$#"&()+$%"!#(,-$#$%'***#%)#$%+(%++&-%$%%%$$+)*+%%$%%%%$')+++&!##(*,$$%
+  *$%'%"%%##%,-+,,--))*+
+  @m110818_075520_42141_c100129202555500000315043109121112_s1_p0/4006/ccs
+  GGCGCACGGAGGAGCAAGCGTGACAGTCCCACGTCATGCCCGCCGACGATATCGAGCTCGCGCTCACCGCCAGGGTGTGA
+  AGTGAATTCACGGTGCCGCCGAAAGCTGGGCCGGCTTTCGTTCCTTCGCCGGTCAGGAGAAGGCGGACCCCGTCGTGGGC
+  CATTCCGAGCCTGGAGACAGCGGTCGAAAAAGCCTTCGCCAAGCCGGTGGCCAAATGGTCGGCCAGCGAGAATCCGTGC
+  +
+  *2&%$(-(*%'$(95.-41',.66',5$$$",%#(534(((5/0/-610'#()2.&&+-4-693#'%-37760,/1041-
+  -.44''+.6*#..21*+2*52(*(&#%//.%&$$-,'-*,44////&$/-014253912+++-'&.'#()+4-%&+))).
+  -**-22$%0//*%&%%((')!!.3(*(-,*-0'*&&"3$",&..0($*'%-,&&!!!#!!!-%&''""&'$'/%)'&+&
+  @m110818_075520_42141_c100129202555500000315043109121112_s1_p0/4007/ccs
+  TAATAAATCTCGGAGAACTAACAACTTTCCGCGGACGAAGACAGCAAGGTATGTCTATATGGAAAGGCAATCCCAATAAC
+  GGCACGAGAGCCAAAGAACACAAGGCACCCAGCGTACTACTCCGTAGCCGCCTCTGCTTTTTTTGCTTGGCTGCATAATG
+  TGATGCCAGACGCGCAAAAGTAGAGCCGCACTGAGAGCGTCCTCAACAGCCCAGACGGAGAGCACACTAACGACCAGATG
+  TTGATCAGGATCTGCGTGATTAGTCGCGCACCTGAACTGTTTGGCGCCCGGCGGCCACATATATGTCACACTTCAGCTTG
+  TCTCCACCCTGACCTCCACAGGAGGCAGGCTCCGGCCTCGGTGCCACCGCGAGCCCACTCACGCGCTCGTATAGCAGTCT
+  CACCAGCGCATTCTAAGTTCCTCACCTGGCCGGTCTTGACCGCCATCTACCGGATGATCAGTCGTCCTTTTGATCTCTGC
+  TGACAGCTCAATTAGGCTACCGCACAACAATTCCCACGACGCAAATCCATGCGGTGGCTGAAGCGCGCGAAGCCGTCAAC
+  TACCTGGAGACACCACGCTCCGGCGCGATAGGTAGCAGGAACGACCGGTGGTCTGCTATGAAGCCAGGCATCCTTGTTGT
+  ATCCGTGAATGCTATGGTTGTCTCTCGCTGGGTGTGGTCTGGTCGATGTCGCAAGCACTCTGACATGGGGCACGGAAACA
+  CGACTGACAGCTCAACCGAATAACATACTCCACTTCGCACATGCAACAATCATACTGCCAGGGAGAGCGAACGTAAAAAT
+  AGATGAGATTTGCCGTTATCAGACGCATCACCGCGCGCGAAGAAGTACGAGTGGTAGAGGACGGTGACACGCTCTCGCTC
+  ATGAATCCTCTGCAACGTATCGCGAGCGCAAGGCTTCATGTCCGTCGCGCGGTCGAGTGTGTCTATGACCGCCTGTTTTC
+  GTTCCAGGCGACAGGACACGCGCTCGTAACCCGTATATTTGCTAGCAGACATACCTGGCACGAACAGTGAGCCAAGGAGA
+  GAGGCGACGCGATACGACTCACGTGTAAAGCCAGCACAGACTATGGGACCAAGACTATCAAGTGCGGCCAGTGCACCATC
+  TGCGGCTAACATTTTGCCAGGATCCAGGCAGTAGGATATATGGTTTCTGGCGCCGCCAGCGGTTGCAGAGGAACAGGCTT
+  CGCGATATACCAGACACCTATGACTGATGCACCTACAGTTATCACGAAGAGCAGTGGTCGCGCTCTTCTGCAGCCACTCG
+  CCATCTCGTGGCGCTGACGTAGCACCGAGCGCGCGATCGTGACAGCTTATTCATTGACAGGGCGACACGATCGCAAGAGA
+  GAGCAAGACGGCACTGGGCTCGATTGTGCACAATGAGTCATTAGAACAGCACGGCGCGCCGAAGTCGCACTTCCGGTCAT
+  TCTCCTCGTCACGTTTGCGCGAGTGGCTTACACGATCGGGACGACATGCCGCACCACGCAGCGTCTAACGAAGGAGAGTA
+  GGAGGATAAATCACCGAAGCACAGATAGCGCAACGGGCAGGCGACAGACCCACCACGCACAAGCACGCATGGTAGCACGC
+  GCAGCAAGACAAAAGCGAGCCAACAGCGAGAAGCTCAACGAATCTGCTGCACTTCACCCCACGGATCACGCCGAACCACA
+  TCCACCAATACGGGTTTGTCAGCGAACAGGCCTTGTGAAGTTCCGATT
+  +
+  #$$!#$#!$%"#%%$$#$""#$$%$#$"$%$$""%$"$$$!#"!"$$$$!!##!$%#"%#""#%%"!$##"#$"#$##$#
+  ##""$$%#"$$$"#!#""#!"$$!#$$##$$##$!!"#$####%%##!!"##!$""#!!!"""!####$$%$#$#$"#"#
+  "$$###$"$%#"#$%$%$#$"$""#"""#####$%%$$!!#$""$%%$"!#"$$####%$$#$#!##$$%%#%$%#"$!"
+  "#$$$$""$##$$$"#"#""%%"%#$$!##"$#%$$##$#$$$%$$!##"$%$%$%"$$!"##$"#$"#"$"#%""$$%$
+  ###"$##"$%$#$%%##$$"$$$""""##"$#$#%"""##%$"#$"#"###$##"$%$%#"""##!"#%%#$"#$$###$
+  "!""$"$#$$###$#%$$%$$%#"""%$%#$#"#$$$"!!"##$$%%$##%$%!!$$%%$!#!"##$$$""$%$""#$$"
+  $$##!$#"$#$#%$#$"##$%"#!$#$#"$""#%$%$$##"%"!#!"%#$"""""#$!!"!#"###$$$$%%$%$%$#""
+  $##"$$$!##""#$"##!########$$$$$$%$$%#"""$!$"$%$%%$%$#%%$%$#%$%$$%#$%%%%"%$%#$%%"
+  !#"""$$!"#######$#$##"!"!#$$$$%%%%$$%$#$$%%$$#%#%$%$"%%$"!!##$#"#$$%%%"%%$%$%%#"
+  "#$##$$$##$%$"#$%%$%%##"$"%$!!##$"$$#"$###$%$%%$%#%%"$#$%$$"$#$$#"!"!"""!$$$$%$"
+  "$%%$$$%""!""##$$#%$#""!"#$##!####$#"#$#$"$%#$#$#$"$##!"#$###""""$$"#""#!""###$#
+  !!!#$"$%##"$###$##%$!##!$!#"!##$%$$%$!#$#$$%$#"$$%#$"#!$"$"#$#"$$$##"###$$$#"##$
+  $"""!$#$!##""!!"!$!$#!#$#$%#$"!##$#$#"#"$#$##"#$$$%$"#$#$%$$$#!!###"$$###$$"#!"#
+  ####$$###$"""#!""!!"$$$$%$"###""%"#"#$$!#!$%##"!$$""#$"$$###$$$%#""##!"!#####%##
+  $#"!#!#$%$$#$%$%!"##$#$$$"#%#$##!"$#$##%%##$$%$$!##$$%$###$"!#""##!$$$%##"$$%$##
+  "#$$#"#$$$%$#"!#"$!%%$#!#!!#!%$!""##$$""##""#$$$$"##$"""#"#!!!"$####$#$%$#""##""
+  #"!"#"$$$####"%%"#$%$#$%#$$$####$###$"!""!#""#"%$#%$%""!#"!###"$#""######$"!#!#$
+  $$"#"""#""!!!##"#"$$!$##%$"!$""$$%#""$$$$$"$""""#"#!#!####""!""$"#!!""###$""""""
+  %"###"#!!###$#"#!!!$!""$"!"""""##!""#"##"""!#""#""$####$##"$$#"#"$#$#$"!$$#"$$"!
+  $$!##"!"#""#"!#$$#"!!"!##$$$$$$$%%$%$$###"$"$#$$#"%%#$%#"#"##$"#"#!!$$$$!!##$"!"
+  ###"$""#$#"###"$$#"""#$#$#$"$$!$$!!!"""#"!%$!"##!!!#!"$!!!!!"!!""!!"!!!!"!!##""$
+  !"$!!"!"!"#!#!!!$"!##""!!"$#""!!!#"#!#%"!"!"$#"$
+
+  $ $CMD --readType=unrolled --outType=fastq
+
+  $ $CMD --readType=subreads --outType=fastq
+
+Test out some filters.  For example, there are no subreads > 1000
+bases, but there are unrolled reads that long.
+
+  $ $CMD --readType=subreads --outType=fasta --minLength=1000
+  $ linecount ${MOVIENAME}.fasta
+  0
+
+  $ $CMD --readType=unrolled --outType=fasta --minLength=1000
+  $ linecount ${MOVIENAME}.fasta
+  395
+
+There are no reads with readScore >= 0.95,
+
+  $ $CMD --readType=subreads --outType=fasta --minReadScore=0.95
+  $ linecount ${MOVIENAME}.fasta
+  0
+
+but ZMW 9 has readScore > 0.85:
+
+  $ $CMD --readType=subreads --outType=fasta --minReadScore=0.85
+  $ grep ">" ${MOVIENAME}.fasta
+  >m110818_075520_42141_c100129202555500000315043109121112_s1_p0/9/0_18
+  >m110818_075520_42141_c100129202555500000315043109121112_s1_p0/9/69_769
+  >m110818_075520_42141_c100129202555500000315043109121112_s1_p0/9/815_1498
+  >m110818_075520_42141_c100129202555500000315043109121112_s1_p0/9/1549_2080
diff --git a/tests/cram/groupcsv.t b/tests/cram/groupcsv.t
new file mode 100644
index 0000000..bb6b8a6
--- /dev/null
+++ b/tests/cram/groupcsv.t
@@ -0,0 +1,47 @@
+  $ export INCMP=$TESTDIR/../../etc/aligned_reads_ss.cmp.h5
+  $ export INCSV=$TESTDIR/../../etc/grouped.csv
+  $ cmph5tools.py select --groupByCsv $INCSV $INCMP
+
+  $ cmph5tools.py stats grpA.cmp.h5
+  readLength                    accuracy
+  426                               0.90
+  538                               0.92
+  552                               0.92
+  589                               0.89
+  538                               0.92
+  129                               0.91
+  118                               0.83
+  126                               0.85
+  239                               0.92
+  $ cmph5tools.py stats grpB.cmp.h5
+  readLength                    accuracy
+  156                               0.92
+  347                               0.86
+  531                               0.92
+  550                               0.93
+  122                               0.80
+  86                                0.93
+  132                               0.90
+  $ cmph5tools.py stats grpC.cmp.h5
+  readLength                    accuracy
+  151                               0.90
+  193                               0.90
+  439                               0.90
+  546                               0.90
+  563                               0.90
+  565                               0.92
+  568                               0.89
+  148                               0.95
+  59                                0.88
+  $ cmph5tools.py stats grpD.cmp.h5
+  readLength                    accuracy
+  95                                0.94
+  304                               0.88
+  455                               0.95
+  489                               0.86
+  584                               0.87
+  558                               0.90
+  567                               0.87
+  543                               0.90
+  170                               0.98
+  171                               0.82
diff --git a/tests/cram/merge.t b/tests/cram/merge.t
new file mode 100644
index 0000000..d15c128
--- /dev/null
+++ b/tests/cram/merge.t
@@ -0,0 +1,27 @@
+  $ export INH5=`python -c "from pbcore import data ; print data.getCmpH5()"`
+  $ cmph5tools.py select $INH5 --outFile left.cmp.h5 --idx 0 1 2 3
+  $ echo $?  
+  0
+  $ cmph5tools.py select $INH5 --outFile right.cmp.h5 --idx 4 5 6 7
+  $ echo $?  
+  0
+  $ cmph5tools.py merge --outFile merged.cmp.h5 left.cmp.h5 right.cmp.h5
+  $ echo $?
+  0
+  $ cmph5tools.py select $INH5 --outFile tot.cmp.h5 --idx 0 1 2 3 4 5 6 7
+  $ echo $?
+  0
+  $ cmph5tools.py sort merged.cmp.h5
+  $ cmph5tools.py sort tot.cmp.h5
+  $ cmph5tools.py equal tot.cmp.h5 merged.cmp.h5
+  $ echo $?
+  0
+  $ cmph5tools.py merge --outFile merged1.cmp.h5 $INH5
+  $ echo $?
+  0
+  $ cmph5tools.py sort --inPlace merged1.cmp.h5
+  $ echo $?
+  0
+  $ cmph5tools.py equal merged1.cmp.h5 $INH5
+  $ echo $?       
+  0
diff --git a/tests/cram/portability.sh b/tests/cram/portability.sh
new file mode 100644
index 0000000..cea9fdf
--- /dev/null
+++ b/tests/cram/portability.sh
@@ -0,0 +1,4 @@
+#
+# This is a portable alternative to "wc", which differs b/w GNU and BSD
+#
+alias linecount="awk 'END{print NR}'"
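+#
+# Example: `linecount file` (or `cmd | linecount`) prints just the line
+# count, like "wc -l" without BSD wc's leading whitespace.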
diff --git a/tests/cram/select.t b/tests/cram/select.t
new file mode 100644
index 0000000..b2e980a
--- /dev/null
+++ b/tests/cram/select.t
@@ -0,0 +1,49 @@
+  $ . $TESTDIR/portability.sh
+
+Set up basic commands
+  $ INH5=`python -c "from pbcore import data ; print data.getCmpH5()"`
+  $ CMD="cmph5tools.py stats $INH5"
+Test basic output
+  $ $CMD --what "ReadStart" --limit 5
+  ReadStart
+  3                            
+  353                          
+  3580                         
+  3253                         
+  0                            
+Test basic output (2): a by-molecule metric
+  $ $CMD --what "MoleculeReadStart"  --limit 5
+                      MoleculeReadStart
+                                      3
+                                      3
+                                   3253
+                                   3253
+                                      0
+Boolean output vector
+  $ $CMD --what "MoleculeReadStart < 20"  --limit 5
+  MoleculeReadStart < 20
+  True                                      
+  True                                      
+  False                                     
+  False                                     
+  True                                      
+Test boolean output for a by-molecule metric
+  $ $CMD --what "MaxSubreadLength > 100" --limit 10
+  MaxSubreadLength > 100
+  True                                      
+  True                                      
+  True                                      
+  True                                      
+  True                                      
+  True                                      
+  True                                      
+  True                                      
+  True                                      
+  True                                      
+Test that stdout output matches the CSV output
+  $ $CMD --what "UnrolledReadLength" | linecount
+  85
+
+  $ $CMD --what "UnrolledReadLength" --outFile out.csv
+  $ linecount out.csv
+  85
diff --git a/tests/cram/sort-extended.t b/tests/cram/sort-extended.t
new file mode 100644
index 0000000..f694eba
--- /dev/null
+++ b/tests/cram/sort-extended.t
@@ -0,0 +1,29 @@
+  $ export INCMP=$TESTDIR/../../etc/aligned_reads_ss.cmp.h5
+  $ cmph5tools.py sort $INCMP --outFile out.cmp.h5
+
+  $ cmph5tools.py select --groupBy Barcode \
+  > --where "(Barcode == 'F_42--R_42') | (Barcode == 'F_28--R_28')" $INCMP
+  $ cmph5tools.py sort --inPlace F_42--R_42.cmp.h5
+  $ cmph5tools.py merge --outFile m1.cmp.h5 F_42--R_42.cmp.h5 \
+  > F_28--R_28.cmp.h5
+
+  $ cmph5tools.py merge --outFile m2.cmp.h5 F_28--R_28.cmp.h5 \
+  > F_42--R_42.cmp.h5
+  $ cmph5tools.py equal m1.cmp.h5 m2.cmp.h5
+  * alignments differ (glob)
+  [1]
+  $ cmph5tools.py sort --inPlace m1.cmp.h5
+  $ cmph5tools.py sort --inPlace m2.cmp.h5
+  $ cmph5tools.py equal m1.cmp.h5 m2.cmp.h5
+
+  $ cmph5tools.py select --where "SubSample(.05)" $INCMP \
+  > --outFile ss1.cmp.h5
+  $ cmph5tools.py select --where "SubSample(.10)" $INCMP \
+  > --outFile ss2.cmp.h5
+  $ cmph5tools.py equal ss1.cmp.h5 ss2.cmp.h5
+  cmp.h5 files differ in length* (glob)
+  [1]
+  $ cmph5tools.py validate ss1.cmp.h5
+  $ cmph5tools.py validate ss2.cmp.h5
+  $ cmph5tools.py validate m1.cmp.h5      
+  $ cmph5tools.py validate m2.cmp.h5      
diff --git a/tests/cram/sort.t b/tests/cram/sort.t
new file mode 100644
index 0000000..67efa39
--- /dev/null
+++ b/tests/cram/sort.t
@@ -0,0 +1,21 @@
+  $ export INH5=`python -c "from pbcore import data ; print data.getCmpH5()"`
+  $ cmph5tools.py sort --deep --outFile tmp.cmp.h5 $INH5
+  $ echo $?  
+  0
+  $ cmph5tools.py sort --deep --inPlace tmp.cmp.h5
+  $ echo $?  
+  0
+  $ cmph5tools.py sort --deep tmp.cmp.h5
+  $ echo $?
+  0
+  $ cmph5tools.py sort --outFile ftmp.cmp.h5 $INH5
+  $ echo $?
+  0
+  $ python -c "from pbcore.io import CmpH5Reader; a = CmpH5Reader('tmp.cmp.h5'); b = CmpH5Reader('ftmp.cmp.h5'); print(all([a[i] == b[i] for i in xrange(len(a))]));"
+  True
+  $ cmph5tools.py sort --outFile ptmp.cmp.h5 --deep --usePythonIndexer $INH5
+  $ echo $?
+  0
+  $ cmph5tools.py equal tmp.cmp.h5 ptmp.cmp.h5
+  $ echo $?
+  0
diff --git a/tests/cram/stats.t b/tests/cram/stats.t
new file mode 100644
index 0000000..8afac75
--- /dev/null
+++ b/tests/cram/stats.t
@@ -0,0 +1,221 @@
+  $ . $TESTDIR/portability.sh
+
+Set up inputs and basic command string.
+  $ INH5=`python -c "from pbcore import data ; print data.getCmpH5()"`
+  $ CMD="cmph5tools.py stats $INH5"
+
+Print ReadLength to stdout
+  $ $CMD --what "ReadLength" --limit 5
+  ReadLength
+  301                           
+  404                           
+  342                           
+  254                           
+  267                           
+Print multiple columns                        
+  $ $CMD --what "Tbl(readlength = ReadLength, accuracy = Accuracy)" --limit 5
+  readlength                    accuracy
+  301                               0.84
+  404                               0.82
+  342                               0.87
+  254                               0.82
+  267                               0.84
+Aggregate statistics on entire dataset
+  $ $CMD --what "Tbl(mrl = Percentile(ReadLength, 90), macc = Mean(Accuracy))"
+                         mrl                    macc
+                      481.80                    0.83
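+For orientation, these aggregates behave like the usual numpy reductions;
+a sketch over the five ReadLength values printed above (the Percentile
+metric's interpolation may differ from numpy.percentile in detail):
+
+import numpy
+readLength = numpy.array([301, 404, 342, 254, 267])
+print numpy.percentile(readLength, 90)  # cf. Percentile(ReadLength, 90)
+print numpy.mean(readLength)            # cf. Mean(ReadLength)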
+Access movie name for each alignment
+  $ $CMD --what "Movie" --limit 5
+                         Movie
+  m110818_075520_42141_c100129202555500000315043109121112_s2_p0
+  m110818_075520_42141_c100129202555500000315043109121112_s2_p0
+  m110818_075520_42141_c100129202555500000315043109121112_s1_p0
+  m110818_075520_42141_c100129202555500000315043109121112_s1_p0
+  m110818_075520_42141_c100129202555500000315043109121112_s1_p0
+Aggregate statistics for a single movie
+  $ $CMD --what "Tbl(mrl = Percentile(ReadLength, 90), macc = Mean(Accuracy))" --where "Movie == 'm110818_075520_42141_c100129202555500000315043109121112_s1_p0'"
+                         mrl                    macc
+                      547.80                    0.84
+Aggregate statistics grouped by movie                      
+  $ $CMD --what "Tbl(mrl = Percentile(ReadLength, 90), macc = Mean(Accuracy))" --groupBy "Movie"
+                         Group                       mrl                    macc
+  m110818_075520_42141_c100129202555500000315043109121112_s1_p0                    547.80                    0.84
+  m110818_075520_42141_c100129202555500000315043109121112_s2_p0                    445.20                    0.83
+Aggregate statistics grouped by multiple factors
+  $ $CMD --what "Tbl(mrl = Percentile(ReadLength, 90), macc = Mean(Accuracy))" --groupBy "Movie * Reference"
+                         Group                       mrl                    macc
+  m110818_075520_42141_c100129202555500000315043109121112_s1_p0:lambda_NEB3011                    547.80                    0.84
+  m110818_075520_42141_c100129202555500000315043109121112_s2_p0:lambda_NEB3011                    445.20                    0.83
+Per-alignment metrics grouped by multiple factors
+  $ $CMD --what "Tbl(readlength = ReadLength, errorRate = 1 - Accuracy, ipd = Mean(IPD))" --groupBy "Movie * Reference" --limit 2
+                         Group                    readlength                    errorRate                     ipd
+  m110818_075520_42141_c100129202555500000315043109121112_s1_p0:lambda_NEB3011                    342                                0.13                    0.18
+  m110818_075520_42141_c100129202555500000315043109121112_s1_p0:lambda_NEB3011                    254                                0.18                    0.25
+  m110818_075520_42141_c100129202555500000315043109121112_s2_p0:lambda_NEB3011                    301                                0.16                    0.21
+  m110818_075520_42141_c100129202555500000315043109121112_s2_p0:lambda_NEB3011                    404                                0.18                    0.28
+Per-alignment metrics grouped and filtered 
+  $ $CMD --what "Tbl(readlength = ReadLength, errorRate = 1 - Accuracy, ipd = Mean(IPD), holeNumber = HoleNumber)" --groupBy "Movie * Reference" --where "HoleNumber != 9" --limit 2
+                         Group                    readlength                    errorRate                     ipd                    holeNumber
+  m110818_075520_42141_c100129202555500000315043109121112_s1_p0:lambda_NEB3011                    342                                0.13                    0.18                    2001                          
+  m110818_075520_42141_c100129202555500000315043109121112_s1_p0:lambda_NEB3011                    254                                0.18                    0.25                    2001                          
+  m110818_075520_42141_c100129202555500000315043109121112_s2_p0:lambda_NEB3011                    301                                0.16                    0.21                    3008                          
+  m110818_075520_42141_c100129202555500000315043109121112_s2_p0:lambda_NEB3011                    404                                0.18                    0.28                    3008                          
+Check MapQV output
+  $ $CMD --what "Tbl(readlength = ReadLength, mapqv = MapQV)" --where "(ReadLength > 400) & (MapQV > 0)" --limit 5
+  readlength                    mapqv
+  404                           254                      
+  684                           254                      
+  674                           254                      
+  486                           254                      
+  571                           254                      
+Generate a whitelist for other pipelines. **ZMWs are not unique; output includes a header**
+  $ $CMD --what WhiteList --where "ReadLength > 400" --outFile out.csv
+  $ tail -n +2 out.csv | uniq
+  m110818_075520_42141_c100129202555500000315043109121112_s2_p0/3008
+  m110818_075520_42141_c100129202555500000315043109121112_s2_p0/1000
+  m110818_075520_42141_c100129202555500000315043109121112_s2_p0/2006
+  m110818_075520_42141_c100129202555500000315043109121112_s1_p0/1007
+  m110818_075520_42141_c100129202555500000315043109121112_s2_p0/4007
+  m110818_075520_42141_c100129202555500000315043109121112_s2_p0/1002
+  m110818_075520_42141_c100129202555500000315043109121112_s2_p0/9
+  m110818_075520_42141_c100129202555500000315043109121112_s1_p0/9
+  m110818_075520_42141_c100129202555500000315043109121112_s1_p0/2003
+Test 'select' subtool commands
+  $ SCMD="cmph5tools.py select $INH5"
+Generate new cmp.h5 files per movie with length/accuracy filtering
+  $ $SCMD --where "(ReadLength > 100) & (Accuracy < .8)" --groupBy "Movie"
+  $ cmph5tools.py summarize *.cmp.h5
+  ----------------------------------------
+  filename: m110818_075520_42141_c100129202555500000315043109121112_s1_p0.cmp.h5
+  version:  1.2.0.SF
+  n reads:  3
+  n refs:   1
+  n movies: 1
+  n bases:  666
+  avg rl:   222
+  avg acc:  0.7942
+  
+  \t Movie Summary: (esc)
+          Group     nBases     avgAccuracy     avgReadLength
+  m110818_075520_42141_c100129202555500000315043109121112_s1_p0     666                0.8             222.0
+  
+  \t Reference Summary: (esc)
+          Group     nBases     avgAccuracy     avgReadLength
+  lambda_NEB3011     666                0.8             222.0
+  ----------------------------------------
+  filename: m110818_075520_42141_c100129202555500000315043109121112_s2_p0.cmp.h5
+  version:  1.2.0.SF
+  n reads:  3
+  n refs:   1
+  n movies: 1
+  n bases:  676
+  avg rl:   225
+  avg acc:  0.7943
+  
+  \t Movie Summary: (esc)
+          Group     nBases     avgAccuracy     avgReadLength
+  m110818_075520_42141_c100129202555500000315043109121112_s2_p0     676                0.8             225.3
+  
+  \t Reference Summary: (esc)
+          Group     nBases     avgAccuracy     avgReadLength
+  lambda_NEB3011     676                0.8             225.3
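+The same per-movie split can be driven from Python; a minimal sketch,
+assuming the cmpH5Select entry point used by the unit tests in this
+commit (one cmp.h5 per group value is written to outDir; the input path
+below is hypothetical, and outFile is required but unused when grouping):
+
+from pbh5tools.CmpH5Select import cmpH5Select
+inCmp = "aligned_reads.cmp.h5"  # hypothetical input path
+cmpH5Select(inCmp, "unused.cmp.h5", groupByStr="Movie", whereStr="(ReadLength > 100) & (Accuracy < .8)", outDir=".")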
+Make new cmp.h5 with subset of indexed alignments
+  $ $SCMD --idx 1 2 3 4 --outFile 1234.cmp.h5
+  $ cmph5tools.py summarize 1234.cmp.h5
+  ----------------------------------------
+  filename: 1234.cmp.h5
+  version:  1.2.0.SF
+  n reads:  4
+  n refs:   1
+  n movies: 2
+  n bases:  1267
+  avg rl:   317
+  avg acc:  0.8391
+  
+  \t Movie Summary: (esc)
+          Group     nBases     avgAccuracy     avgReadLength
+  m110818_075520_42141_c100129202555500000315043109121112_s1_p0     863                0.8             287.7
+  m110818_075520_42141_c100129202555500000315043109121112_s2_p0     404                0.8             404.0
+  
+  \t Reference Summary: (esc)
+          Group     nBases     avgAccuracy     avgReadLength
+  lambda_NEB3011     1267               0.8             316.8
+Check that default 'what' clause prints readlength and accuracy (all reads)
+  $ cmph5tools.py stats $INH5 --limit 5
+  readLength                    accuracy
+  301                               0.84
+  404                               0.82
+  342                               0.87
+  254                               0.82
+  267                               0.84
+Export to csv file 
+  $ $CMD --what "Tbl(readlength = ReadLength, errorRate = 1 - Accuracy, ipd = Mean(IPD))" --groupBy "Movie * Reference" --outFile out.csv
+  $ linecount out.csv
+  85
+  $ tail -n 1 out.csv
+  m110818_075520_42141_c100129202555500000315043109121112_s2_p0:lambda_NEB3011,182,0.1538,0.3418
+Export to csv file (2)
+  $ $CMD --what "Tbl(readlength = ReadLength, errorRate = 1 - Accuracy, ipd = Mean(IPD))" --groupBy "Movie * Reference" --where "Accuracy > .85" --outFile out.csv
+  $ linecount out.csv
+  26
+Check the sortBy clause -- note that the order of the sort metrics is not deterministic
+  $ $CMD --what "(Accuracy,ReadLength)" --sortBy "Round(Accuracy, 2), ReadLength" --where "Movie == 'm110818_075520_42141_c100129202555500000315043109121112_s1_p0'" --limit 10
+  ReadLength                    1.0-NErrors/ReadLength*1.0
+  116                                                 0.84
+  128                                                 0.80
+  153                                                 0.84
+  188                                                 0.84
+  195                                                 0.86
+  204                                                 0.79
+  211                                                 0.83
+  250                                                 0.80
+  254                                                 0.82
+  257                                                 0.84
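+The expanded header above also documents the metric definition: Accuracy
+is computed as 1.0 - NErrors/ReadLength. As a worked (hypothetical)
+instance, a 300-base alignment with 48 errors gives 1.0 - 48/300.0 = 0.84.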
+Check sortBy and grouping (no aggregate)
+  $ $CMD --what "(Accuracy, )" --sortBy ReadLength --groupBy Movie --limit 2
+                         Group                    1.0-NErrors/ReadLength*1.0
+  m110818_075520_42141_c100129202555500000315043109121112_s1_p0                                          0.84
+  m110818_075520_42141_c100129202555500000315043109121112_s1_p0                                          0.80
+  m110818_075520_42141_c100129202555500000315043109121112_s2_p0                                          0.79
+  m110818_075520_42141_c100129202555500000315043109121112_s2_p0                                          0.81
+Check sortBy and grouping (with aggregate)
+  $ $CMD --what "(Mean(Accuracy), )" --sortBy ReadLength --groupBy Movie
+                         Group                    Mean(1.0-NErrors/ReadLength*1.0)
+  m110818_075520_42141_c100129202555500000315043109121112_s1_p0                                                0.84
+  m110818_075520_42141_c100129202555500000315043109121112_s2_p0                                                0.83
+Check arithmetic in what clause
+  $ $CMD --what "(Mean(100* Accuracy)/Sum(Accuracy), )" --sortBy ReadLength --groupBy Movie
+                         Group                    Mean(100*1.0-NErrors/ReadLength*1.0)/Sum(1.0-NErrors/ReadLength*1.0)
+  m110818_075520_42141_c100129202555500000315043109121112_s1_p0                                                                                    2.56
+  m110818_075520_42141_c100129202555500000315043109121112_s2_p0                                                                                    2.22
+Output columns from tuple and group by multiple factors (sorting within group)
+  $ $CMD --what "(Sum(ReadLength), Mean(Accuracy))" --sortBy ReadLength --groupBy HoleNumber*Movie --limit 1
+                         Group                    Mean(1.0-NErrors/ReadLength*1.0)                    Sum(ReadLength)
+  1000:m110818_075520_42141_c100129202555500000315043109121112_s1_p0                                                0.84                    188                                
+  1000:m110818_075520_42141_c100129202555500000315043109121112_s2_p0                                                0.82                    674                                
+  1002:m110818_075520_42141_c100129202555500000315043109121112_s2_p0                                                0.87                    117                                
+  1004:m110818_075520_42141_c100129202555500000315043109121112_s2_p0                                                0.84                    197                                
+  1006:m110818_075520_42141_c100129202555500000315043109121112_s1_p0                                                0.80                    128                                
+  1007:m110818_075520_42141_c100129202555500000315043109121112_s1_p0                                                0.82                    542                                
+  1008:m110818_075520_42141_c100129202555500000315043109121112_s2_p0                                                0.80                    133                                
+  1009:m110818_075520_42141_c100129202555500000315043109121112_s2_p0                                                0.81                    244                                
+  2000:m110818_075520_42141_c100129202555500000315043109121112_s2_p0                                                0.81                    159                                
+  2001:m110818_075520_42141_c100129202555500000315043109121112_s1_p0                                                0.82                    254                                
+  2002:m110818_075520_42141_c100129202555500000315043109121112_s2_p0                                                0.88                    132                                
+  2003:m110818_075520_42141_c100129202555500000315043109121112_s1_p0                                                0.84                    153                                
+  2004:m110818_075520_42141_c100129202555500000315043109121112_s2_p0                                                0.89                    277                                
+  2006:m110818_075520_42141_c100129202555500000315043109121112_s2_p0                                                0.79                    111                                
+  2007:m110818_075520_42141_c100129202555500000315043109121112_s1_p0                                                0.80                    250                                
+  2008:m110818_075520_42141_c100129202555500000315043109121112_s1_p0                                                0.84                    116                                
+  2009:m110818_075520_42141_c100129202555500000315043109121112_s2_p0                                                0.82                    170                                
+  3002:m110818_075520_42141_c100129202555500000315043109121112_s2_p0                                                0.81                    112                                
+  3006:m110818_075520_42141_c100129202555500000315043109121112_s2_p0                                                0.80                    265                                
+  3008:m110818_075520_42141_c100129202555500000315043109121112_s1_p0                                                0.82                    300                                
+  3008:m110818_075520_42141_c100129202555500000315043109121112_s2_p0                                                0.84                    301                                
+  4004:m110818_075520_42141_c100129202555500000315043109121112_s2_p0                                                0.89                    252                                
+  4006:m110818_075520_42141_c100129202555500000315043109121112_s1_p0                                                0.86                    195                                
+  4007:m110818_075520_42141_c100129202555500000315043109121112_s2_p0                                                0.81                    446                                
+  4009:m110818_075520_42141_c100129202555500000315043109121112_s1_p0                                                0.84                    267                                
+  8:m110818_075520_42141_c100129202555500000315043109121112_s2_p0                                                0.88                    174                                
+  9:m110818_075520_42141_c100129202555500000315043109121112_s1_p0                                                0.85                    516                                
+  9:m110818_075520_42141_c100129202555500000315043109121112_s2_p0                                                0.84                    148                                
diff --git a/tests/cram/valid.t b/tests/cram/valid.t
new file mode 100644
index 0000000..8e78012
--- /dev/null
+++ b/tests/cram/valid.t
@@ -0,0 +1,2 @@
+  $ export INH5=`python -c "from pbcore import data ; print data.getCmpH5()"`
+  $ cmph5tools.py validate $INH5
diff --git a/tests/test_cmph5lib_CmpH5Merge.py b/tests/test_cmph5lib_CmpH5Merge.py
new file mode 100755
index 0000000..0f28ade
--- /dev/null
+++ b/tests/test_cmph5lib_CmpH5Merge.py
@@ -0,0 +1,106 @@
+import os
+import sys
+import tempfile
+import logging
+import unittest
+import shutil
+
+from pbcore.io.CmpH5IO import CmpH5Reader
+
+from pbh5tools.CmpH5Select import cmpH5Select
+from pbh5tools.CmpH5Merge import cmpH5Merge
+
+_DATA_DIR = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'etc')
+_CMP_H5 = os.path.join(_DATA_DIR, 'aligned_reads_ss.cmp.h5')
+
+log = logging.getLogger(__name__)
+
+
+class _TestBase(unittest.TestCase):
+
+    # Overwrite in subclasses to disable deleting tempdir
+    DELETE_TEMP_DIR = True
+
+    @classmethod
+    def setUpClass(cls):
+        cls.dirName = tempfile.mkdtemp()
+
+    @classmethod
+    def tearDownClass(cls):
+        if hasattr(cls, 'dirName'):
+            if os.path.exists(cls.dirName):
+                if cls.DELETE_TEMP_DIR:
+                    shutil.rmtree(cls.dirName)
+                else:
+                    log.debug("running in debug mode. Not deleting temp dir {d}".format(d=cls.dirName))
+
+    def _getTempFile(self, suffix=None):
+        """Reserve a temp file name under the shared temp dir.
+
+        NamedTemporaryFile deletes the file as soon as the handle is
+        garbage collected (delete=True by default), so effectively only
+        the name is returned; the tool under test creates the real file.
+        """
+        f = tempfile.NamedTemporaryFile(dir=self.dirName, suffix=suffix)
+        return f.name
+
+
+def _getNAlignments(fileName):
+    """Util func to extract the total number of alignments from a cmp.h5 file
+
+    :param fileName: Path to cmp.h5 file
+
+    :type fileName: str
+    :rtype: int
+    """
+    if not os.path.exists(fileName):
+        msg = "Unable to find {f} to get number of alignments".format(f=fileName)
+        log.error(msg)
+        sys.stderr.write(msg + "\n")
+        return 0
+
+    nalignments = 0
+    with CmpH5Reader(fileName) as r:
+        for alignment in r:
+            nalignments += 1
+
+    return nalignments
+
+
+class TestCmpH5Merge(_TestBase):
+    def test_basic(self):
+        inputFiles = [_CMP_H5]
+        outputFile = self._getTempFile(suffix="_merge_basic.cmp.h5")
+        cmpH5Merge(inputFiles, outputFile)
+
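+        # merging a single input is effectively a copy, so the alignment
+        # count should match the source file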
+        n = _getNAlignments(outputFile)
+        correctNAlignments = 2764
+        self.assertEqual(n, correctNAlignments)
+
+
+class TestMergeException(_TestBase):
+    def test_output_file_in_inputs(self):
+        inputFiles = [_CMP_H5]
+        outputFile = _CMP_H5
+        with self.assertRaises(ValueError) as e:
+            cmpH5Merge(inputFiles, outputFile)
+
+
+class TestCmpH5Select(_TestBase):
+    def test_basic(self):
+
+        b1 = 'F_42--R_42'
+        b2 = 'F_28--R_28'
+
+        where = "(Barcode == '{b1}') | (Barcode == '{b2}')".format(b1=b1, b2=b2)
+        outFile = self._getTempFile(suffix="_.cmp.h5")
+        # this interface doesn't really make sense. The outFile is required,
+        # but it is never used.
+        cmpH5Select(_CMP_H5, outFile, groupByStr="Barcode",
+                    whereStr=where, outDir=self.dirName)
+
+        #n = _getNAlignments(outFile)
+        #self.assertEqual(n, 85)
+
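+        # expected per-barcode alignment counts for the sample cmp.h5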
+        nalignmentsTuple = [(b1, 76), (b2, 9)]
+        for b, nalignments in nalignmentsTuple:
+            p = os.path.join(self.dirName, b + ".cmp.h5")
+            self.assertTrue(os.path.exists(p), "Unable to find {b}".format(b=p))
+            n = _getNAlignments(p)
+            _d = dict(n=n, m=nalignments, p=p)
+            msg = "Incorrect alignments (got {n} expected {m} ) from {p}".format(**_d)
+            self.assertEqual(n, nalignments, msg)
diff --git a/tests/test_cmph5lib_CmpH5Sort.py b/tests/test_cmph5lib_CmpH5Sort.py
new file mode 100644
index 0000000..bef3184
--- /dev/null
+++ b/tests/test_cmph5lib_CmpH5Sort.py
@@ -0,0 +1,122 @@
+from nose.tools import assert_equal
+from nose import SkipTest
+
+import bisect
+import h5py as h
+from numpy import *
+
+from pbcore import data
+import pbh5tools.CmpH5Sort as CS
+import pbcore.io.rangeQueries as RQ
+from pbcore.io import CmpH5Reader
+
+def brute_force_number_in_range(s, e, vec):
+    return(len(filter(lambda x : s <= x < e, vec)))
+
+def generate_positions(size, coverage, lScale = 50):
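+    # draw NN random (start, end) intervals with exponential lengths and
+    # sort them by (start, end), mimicking the target-coordinate order of
+    # a sorted cmp.h5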
+    NN = size*coverage
+    tS = random.randint(0, size, NN)
+    tE = tS + array(map(int, random.exponential(lScale, NN) + 1))
+    ar = array([tS, tE]).transpose()
+    ar = ar[lexsort((tE, tS)),]
+    return(ar)
+
+def brute_force_search(tStart, tEnd, nBack, nOverlap, start, end):
+    # nBack and nOverlap mirror the indexed-search signature; the
+    # brute-force reference only needs the interval endpoints.
+    toKeep = array([False]*len(tStart))
+    res = array(range(0, len(tStart)))
+
+    for i in range(0, len(tStart)):
+        # four cases, which together are just "interval overlaps [start, end]":
+        # starts inside, ends inside, spans the window, or is contained in it.
+        if (tStart[i] >= start and tStart[i] <= end):
+            toKeep[i] = True
+        elif (tEnd[i] > start and tEnd[i] <= end):
+            toKeep[i] = True
+        elif (tStart[i] <= start and tEnd[i] >= end):
+            toKeep[i] = True
+        elif (tStart[i] >= start and tEnd[i] <= end):
+            toKeep[i] = True
+        else:
+            continue
+    return(res[toKeep])
+
+def brute_force_get_reads(aIdx, start, end, format):
+    if aIdx.shape[0] == 0:
+        return aIdx
+    # module-level helper, not a method: call brute_force_search directly
+    idxs = brute_force_search(aIdx[:,format.TARGET_START],
+                              aIdx[:,format.TARGET_END],
+                              aIdx[:,format.N_BACK],
+                              aIdx[:,format.N_OVERLAP],
+                              start, end)
+    return(aIdx[idxs,])
+
+def compare_implementations(size, coverage = 1):
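+    # both index builders should produce identical values in their first
+    # two columns when given the same interval set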
+    NN = size * coverage
+    ar = generate_positions(size, coverage)
+    res = CS.computeIndices(ar[:,0], ar[:,1])
+    resDP = CS.computeIndicesDP(ar[:,0], ar[:,1])
+    assert(sum(res[:,0:2] == resDP[:,0:2]) == NN*2)
+
+class TestCmpH5Format:
+    def test___init__(self):
+        cmpH5_format = CS.CmpH5Format(h.File(data.getCmpH5(),"r"))
+        assert_equal(cmpH5_format.ALN_INFO, 'AlnInfo')
+
+class TestNumberWithinRange:
+    def test_number_within_range(self):
+        for j in range(0, 100):
+            a = sort(random.randint(0, 100, 100))
+            s,e = sort(random.randint(0, 100, 2))
+            assert_equal(CS.numberWithinRange(s,e,a), 
+                         brute_force_number_in_range(s,e,a))
+       
+class TestComputeIndices:
+    def test_compute_indices(self):
+        for i in [100, 200]:
+            for j in [1, 5]:
+                compare_implementations(i, j)
+
+class TestComputeIndicesDP:
+    def test_compute_indices_d_p(self):
+        for i in [100, 200]:
+            for j in [1, 5]:
+                compare_implementations(i, j)
+
+class TestComputeRefIndexTable:
+    def test_compute_ref_index_table(self):
+        refIDs = [5,1,1,1,1,1,1,1,1,1,2,2,2,2,2]
+        tbl    = CS.computeRefIndexTable(array(refIDs))
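+        # expected table, flattened: one (refID, offsetBegin, offsetEnd) row per reference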
+        utbl   = [ 5, 0, 1,
+                   1, 1, 10,
+                   2, 10, 15 ]
+        assert_equal(sum(tbl.ravel() == utbl), len(utbl))
+
+class TestGetOverlappingRanges:
+    def test_get_overlapping_ranges(self):
+        for i in [100, 500]:
+            for j in [.1, 1, 5, 10]:
+                for k in range(0, 10):
+                    ar = generate_positions(i, j)
+                    idx = CS.computeIndicesDP(ar[:,0], ar[:,1])
+                    aArray = hstack((ar, idx))
+                    s = random.randint(0, i, 1)
+                    e = int(1 + random.exponential(30, 1))
+                    x = RQ.getOverlappingRanges(aArray[:,0], aArray[:,1], 
+                                                aArray[:,2], aArray[:,3], s, 
+                                                s + e + 1)
+                    y = brute_force_search(aArray[:,0], aArray[:,1], aArray[:,2], 
+                                           aArray[:,3], s, s + e)
+                    assert(all(sort(x) == sort(y)))
+
+
+class TestGetReadsInRange:
+    def __init__(self):
+        self.h5FileName = data.getCmpH5()
+        self.cmpH5 = CmpH5Reader(self.h5FileName)
+        
+    def test_get_reads_in_range(self):
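+        # exercises numberWithinRange against the brute-force count;
+        # self.cmpH5 is opened in __init__ but not queried here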
+        for j in range(0, 100):
+            a = sort(random.randint(0, 100, 100))
+            s,e = sort(random.randint(0, 100, 2))
+            assert_equal(CS.numberWithinRange(s,e,a),  brute_force_number_in_range(s,e,a))
+       
+

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/python-pbh5tools.git



More information about the debian-med-commit mailing list