[med-svn] [python-pbcore] 01/02: Imported Upstream version 1.0.0
Afif Elghraoui
afif-guest at moszumanska.debian.org
Sun Jun 7 10:13:48 UTC 2015
This is an automated email from the git hooks/post-receive script.
afif-guest pushed a commit to branch master
in repository python-pbcore.
commit 25f1d31c7ca61c4685718d91da161c227bdedbfa
Author: Afif Elghraoui <afif at ghraoui.name>
Date: Sat Jun 6 23:34:11 2015 -0700
Imported Upstream version 1.0.0
---
.gitignore | 7 +
CHANGELOG.org | 45 +
LICENSES.txt | 32 +
Makefile | 52 +
README.md | 19 +
doc/Makefile | 156 +++
doc/conf.py | 253 ++++
doc/index.rst | 24 +
doc/modules.rst | 7 +
doc/pacbio-theme/static/headerGradient.jpg | Bin 0 -> 7099 bytes
doc/pacbio-theme/static/pacbio.css | 238 ++++
doc/pacbio-theme/static/pacbioLogo.png | Bin 0 -> 3128 bytes
doc/pacbio-theme/static/pygments.css | 55 +
doc/pacbio-theme/theme.conf | 4 +
doc/pbcore.chemistry.rst | 22 +
doc/pbcore.data.rst | 10 +
doc/pbcore.deprecated.rst | 30 +
doc/pbcore.io.rst | 182 +++
doc/pbcore.model.rst | 2 +
doc/pbcore.rst | 22 +
doc/pbcore.util.rst | 18 +
pbcore/__init__.py | 31 +
pbcore/chemistry/__init__.py | 31 +
pbcore/chemistry/chemistry.py | 91 ++
pbcore/chemistry/resources/mapping.xml | 154 +++
pbcore/data/1.4_bas_files.fofn | 2 +
pbcore/data/2.0_bax_files.fofn | 3 +
pbcore/data/2.1_bax_files.fofn | 3 +
pbcore/data/2.1_ccs_files.fofn | 1 +
pbcore/data/2.3_bax_files.fofn | 3 +
pbcore/data/Fluidigm_human_amplicons.fasta | 250 ++++
pbcore/data/Fluidigm_human_amplicons.fasta.fai | 48 +
pbcore/data/Fluidigm_human_amplicons_tiny.fasta | 19 +
pbcore/data/__init__.py | 167 +++
pbcore/data/aligned_reads_1.bam | Bin 0 -> 34798 bytes
pbcore/data/aligned_reads_1.bam.bai | Bin 0 -> 160 bytes
pbcore/data/aligned_reads_1.cmp.h5 | Bin 0 -> 263540 bytes
pbcore/data/bam_mapping.bam | Bin 0 -> 172060 bytes
pbcore/data/bam_mapping.bam.bai | Bin 0 -> 112 bytes
pbcore/data/bam_mapping.bam.pbi | Bin 0 -> 41344 bytes
pbcore/data/barcodes-ed65-450.fasta | 900 ++++++++++++++
pbcore/data/barcodes-ed65-450.fasta.fai | 450 +++++++
pbcore/data/bc_files.fofn | 3 +
pbcore/data/blasr-output.m4 | 2 +
pbcore/data/blasr-output.m5 | 2 +
pbcore/data/cmph5_mapping.cmp.h5 | Bin 0 -> 236542 bytes
pbcore/data/lambdaNEB.fa | 608 ++++++++++
pbcore/data/lambdaNEB.fa.fai | 1 +
...c100129202555500000315043109121112_s1_p0.bas.h5 | Bin 0 -> 1159590 bytes
...c100129202555500000315043109121112_s2_p0.bas.h5 | Bin 0 -> 984538 bytes
...00497142550000001823078008081323_s1_p0.1.bax.h5 | Bin 0 -> 485799 bytes
...00497142550000001823078008081323_s1_p0.2.bax.h5 | Bin 0 -> 715572 bytes
...00497142550000001823078008081323_s1_p0.3.bax.h5 | Bin 0 -> 739046 bytes
...c100497142550000001823078008081323_s1_p0.bas.h5 | Bin 0 -> 260202 bytes
...00569412550000001823090301191423_s1_p0.1.ccs.h5 | Bin 0 -> 487912 bytes
...00564662550000001823085912221321_s1_p0.1.bax.h5 | Bin 0 -> 501435 bytes
...00564662550000001823085912221321_s1_p0.1.rgn.h5 | Bin 0 -> 17000 bytes
...00564662550000001823085912221321_s1_p0.2.bax.h5 | Bin 0 -> 406010 bytes
...00564662550000001823085912221321_s1_p0.2.rgn.h5 | Bin 0 -> 17000 bytes
...00564662550000001823085912221321_s1_p0.3.bax.h5 | Bin 0 -> 588082 bytes
...00564662550000001823085912221321_s1_p0.3.rgn.h5 | Bin 0 -> 17060 bytes
...c100564662550000001823085912221321_s1_p0.bas.h5 | Bin 0 -> 260202 bytes
...100626172550000001823119008061414_s1_p0.1.bc.h5 | Bin 0 -> 62704 bytes
...100626172550000001823119008061414_s1_p0.2.bc.h5 | Bin 0 -> 45120 bytes
...100626172550000001823119008061414_s1_p0.3.bc.h5 | Bin 0 -> 27920 bytes
...00564852550000001823085912221377_s1_X0.1.bax.h5 | Bin 0 -> 1284247 bytes
...4852550000001823085912221377_s1_X0.subreads.bam | Bin 0 -> 202934 bytes
...00702482550000001823141103261590_s1_p0.1.bax.h5 | Bin 0 -> 856704 bytes
...00702482550000001823141103261590_s1_p0.2.bax.h5 | Bin 0 -> 404288 bytes
...00702482550000001823141103261590_s1_p0.3.bax.h5 | Bin 0 -> 610688 bytes
...c100702482550000001823141103261590_s1_p0.bas.h5 | Bin 0 -> 1318480 bytes
pbcore/data/variants.gff | 11 +
pbcore/io/BarcodeH5Reader.py | 374 ++++++
pbcore/io/BasH5IO.py | 1026 ++++++++++++++++
pbcore/io/FastaIO.py | 459 +++++++
pbcore/io/FastqIO.py | 259 ++++
pbcore/io/FofnIO.py | 96 ++
pbcore/io/GffIO.py | 233 ++++
pbcore/io/__init__.py | 40 +
pbcore/io/_utils.py | 246 ++++
pbcore/io/align/BamAlignment.py | 571 +++++++++
pbcore/io/align/BamIO.py | 394 ++++++
pbcore/io/align/BlasrIO.py | 116 ++
pbcore/io/align/CmpH5IO.py | 1277 ++++++++++++++++++++
pbcore/io/align/PacBioBamIndex.py | 121 ++
pbcore/io/align/_AlignmentMixin.py | 210 ++++
pbcore/io/align/_BamSupport.py | 127 ++
pbcore/io/align/__init__.py | 34 +
pbcore/io/base.py | 109 ++
pbcore/io/opener.py | 134 ++
pbcore/io/rangeQueries.py | 182 +++
pbcore/model/__init__.py | 29 +
pbcore/sequence.py | 62 +
pbcore/util/Process.py | 68 ++
pbcore/util/ToolRunner.py | 115 ++
pbcore/util/__init__.py | 29 +
pbcore/util/decorators.py | 17 +
setup.py | 32 +
tests/test_pbcore_data.py | 12 +
tests/test_pbcore_io_AlnFileReaders.py | 375 ++++++
tests/test_pbcore_io_BarcodeH5Reader.py | 141 +++
tests/test_pbcore_io_BasH5Collection.py | 28 +
tests/test_pbcore_io_BasH5Reader.py | 494 ++++++++
tests/test_pbcore_io_BlasrIO.py | 10 +
tests/test_pbcore_io_FastaIO.py | 133 ++
tests/test_pbcore_io_FastaTable.py | 80 ++
tests/test_pbcore_io_FastqIO.py | 183 +++
tests/test_pbcore_io_FofnIO.py | 22 +
tests/test_pbcore_io_GffIO.py | 100 ++
tests/test_pbcore_io_rangeQueries.py | 71 ++
tests/test_pbcore_io_unaligned_bam.py | 68 ++
tests/test_pbcore_util_sequences.py | 48 +
112 files changed, 12083 insertions(+)
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..a0638ec
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,7 @@
+build
+dist
+doc/_build
+*~
+*.pyc
+*.egg-info
+nosetests.xml
\ No newline at end of file
diff --git a/CHANGELOG.org b/CHANGELOG.org
new file mode 100644
index 0000000..2764237
--- /dev/null
+++ b/CHANGELOG.org
@@ -0,0 +1,45 @@
+* Version 1.0.0
+- BAM fixes
+- Better FOFN handling
+- Recognize additional part numbers for P6 chemistry
+- Better --debug in ToolRunner
+
+* Version 0.9.5
+- BAM fixes
+- Adopt aStart, aEnd conventions from BAM spec, replacing rStart, rEnd
+- AlnIndex sharing among CmpH5Reader objects
+
+* Version 0.9.4
+- BAM support moved to 3.0 spec; support for earlier PacBio BAMs
+ dropped
+- Deprecation warning decorators added
+- BAM-incompatible cmp.h5 accessors deprecated
+- Moved to support 3.0 FASTA conventions in Fasta readers
+- Rename FastaTable to IndexedFastaReader
+
+* Version 0.9.3
+- ".open" script added for convenience
+- openers added (factory methods invoking the appropriate Reader
+ class; useful for applications that want to transparently use either
+ BAM or cmp.h5)
+
+* Version 0.9.2
+- BAM support: Addition of BamReader, IndexedBamReader, and BamAlignment
+- Minor CmpH5Reader API changes for greater compatibility with
+ BamReader (deprecation of movieInfo in favor of readGroupInfo)
+- Removed unused components from CmpH5Reader API
+- Add example BAM file
+- Length accessors for FAST[AQ] record types
+
+* Version 0.9.1 (SMRTanalysis 2.3.0p1)
+- FASTA header parsing into "id" and "metadata" now available in the
+ FastaRecord types
+
+* Version 0.9.0 (SMRTanalysis 2.3.0)
+- pbcore.chemistry: a new subpackage for decoding barcode information
+ to the human-readable chemistry name
+- BasH5Reader: more robust handling of broken region tables
+- CmpH5Reader: loading an empty cmp.h5 will raise an EmptyCmpH5Error.
+ This is because the semantics of an empty cmp.h5 were never defined,
+ and for example it is not defined whether or not a cmp.h5 lacking a
+ movie table is compliant.
diff --git a/LICENSES.txt b/LICENSES.txt
new file mode 100644
index 0000000..5360bf2
--- /dev/null
+++ b/LICENSES.txt
@@ -0,0 +1,32 @@
+Copyright (c) 2011-2015, Pacific Biosciences of California, Inc.
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+* Neither the name of Pacific Biosciences nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS CONTRIBUTORS
+BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
+IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..64e10c7
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,52 @@
+.PHONY: clean doc doc-clean tests check test install build bdist gh-pages
+
+build:
+ python setup.py build
+
+bdist:
+ python setup.py build --executable="/usr/bin/env python"
+ python setup.py bdist --formats=egg
+
+install:
+ python setup.py install
+
+clean: doc-clean
+ rm -rf build/;\
+ find . -name "*.egg-info" | xargs rm -rf;\
+ rm -rf dist/;\
+ find . -name "*.pyc" | xargs rm -f;
+ rm -f nosetests.xml
+
+doc:
+ sphinx-apidoc -o doc/ pbcore/ && cd doc/ && make html
+doc-clean:
+ cd doc && rm -rf _templates _static _build searchindex.js objects.inv
+
+doctest:
+ cd doc && make doctest
+
+unit-test:
+ nosetests --with-xunit tests -v
+
+test: doctest unit-test
+
+tests: test
+check: test
+
+GH_PAGES_SOURCES = pbcore doc
+
+gh-pages:
+ git checkout gh-pages
+ rm -rf _static _sources *.js *.html *.inv
+ git checkout master $(GH_PAGES_SOURCES)
+ cd doc && make html
+ mv -fv doc/_build/html/* .
+ rm -rf $(GH_PAGES_SOURCES)
+ git add --all && git commit -m "Automatic update of gh-pages branch" && git checkout master
+
+pip-install:
+ @which pip > /dev/null
+ @pip freeze|grep 'pbcore=='>/dev/null \
+ && pip uninstall -y pbcore \
+ || echo -n ''
+ @pip install --no-index ./
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..4ffa878
--- /dev/null
+++ b/README.md
@@ -0,0 +1,19 @@
+
+The pbcore package provides Python APIs for interacting with PacBio
+data files and writing bioinformatics applications.
+
+Installation:
+-------------
+ % pip install numpy
+ % pip install h5py
+ % python setup.py install
+
+Requirements:
+-------------
+- Python 2.7
+- h5py >= 2.0
+- numpy >= 1.6.0
+
+Documentation:
+--------------
+http://pacificbiosciences.github.io/pbcore/
diff --git a/doc/Makefile b/doc/Makefile
new file mode 100644
index 0000000..e367d03
--- /dev/null
+++ b/doc/Makefile
@@ -0,0 +1,156 @@
+# Makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line.
+SPHINXOPTS =
+SPHINXBUILD = sphinx-build
+PAPER =
+BUILDDIR = _build
+
+# Internal variables.
+PAPEROPT_a4 = -D latex_paper_size=a4
+PAPEROPT_letter = -D latex_paper_size=letter
+ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
+# the i18n builder cannot share the environment and doctrees with the others
+I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
+
+.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext api-doc
+
+help:
+ @echo "Please use \`make <target>' where <target> is one of"
+ @echo " html to make standalone HTML files"
+ @echo " dirhtml to make HTML files named index.html in directories"
+ @echo " singlehtml to make a single large HTML file"
+ @echo " pickle to make pickle files"
+ @echo " json to make JSON files"
+ @echo " htmlhelp to make HTML files and a HTML help project"
+ @echo " qthelp to make HTML files and a qthelp project"
+ @echo " devhelp to make HTML files and a Devhelp project"
+ @echo " epub to make an epub"
+ @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
+ @echo " latexpdf to make LaTeX files and run them through pdflatex"
+ @echo " text to make text files"
+ @echo " man to make manual pages"
+ @echo " texinfo to make Texinfo files"
+ @echo " info to make Texinfo files and run them through makeinfo"
+ @echo " gettext to make PO message catalogs"
+ @echo " changes to make an overview of all changed/added/deprecated items"
+ @echo " linkcheck to check all external links for integrity"
+ @echo " doctest to run all doctests embedded in the documentation (if enabled)"
+
+clean:
+ -rm -rf $(BUILDDIR)/*
+
+api-doc:
+ sphinx-apidoc -o . -d 4 ../pbcore/
+
+html: api-doc
+ $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
+ @echo
+ @echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
+
+dirhtml:
+ $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
+ @echo
+ @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
+
+singlehtml:
+ $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
+ @echo
+ @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
+
+pickle:
+ $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
+ @echo
+ @echo "Build finished; now you can process the pickle files."
+
+json:
+ $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
+ @echo
+ @echo "Build finished; now you can process the JSON files."
+
+htmlhelp:
+ $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
+ @echo
+ @echo "Build finished; now you can run HTML Help Workshop with the" \
+ ".hhp project file in $(BUILDDIR)/htmlhelp."
+
+qthelp:
+ $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
+ @echo
+ @echo "Build finished; now you can run "qcollectiongenerator" with the" \
+ ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
+ @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/pbcore.qhcp"
+ @echo "To view the help file:"
+ @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/pbcore.qhc"
+
+devhelp:
+ $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
+ @echo
+ @echo "Build finished."
+ @echo "To view the help file:"
+ @echo "# mkdir -p $$HOME/.local/share/devhelp/pbcore"
+ @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/pbcore"
+ @echo "# devhelp"
+
+epub:
+ $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
+ @echo
+ @echo "Build finished. The epub file is in $(BUILDDIR)/epub."
+
+latex:
+ $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+ @echo
+ @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
+ @echo "Run \`make' in that directory to run these through (pdf)latex" \
+ "(use \`make latexpdf' here to do that automatically)."
+
+latexpdf:
+ $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+ @echo "Running LaTeX files through pdflatex..."
+ $(MAKE) -C $(BUILDDIR)/latex all-pdf
+ @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
+
+text:
+ $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
+ @echo
+ @echo "Build finished. The text files are in $(BUILDDIR)/text."
+
+man:
+ $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
+ @echo
+ @echo "Build finished. The manual pages are in $(BUILDDIR)/man."
+
+texinfo:
+ $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
+ @echo
+ @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
+ @echo "Run \`make' in that directory to run these through makeinfo" \
+ "(use \`make info' here to do that automatically)."
+
+info:
+ $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
+ @echo "Running Texinfo files through makeinfo..."
+ make -C $(BUILDDIR)/texinfo info
+ @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
+
+gettext:
+ $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
+ @echo
+ @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
+
+changes:
+ $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
+ @echo
+ @echo "The overview file is in $(BUILDDIR)/changes."
+
+linkcheck:
+ $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
+ @echo
+ @echo "Link check complete; look for any errors in the above output " \
+ "or in $(BUILDDIR)/linkcheck/output.txt."
+
+doctest:
+ $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
+ @echo "Testing of doctests in the sources finished, look at the " \
+ "results in $(BUILDDIR)/doctest/output.txt."
diff --git a/doc/conf.py b/doc/conf.py
new file mode 100755
index 0000000..6f0945d
--- /dev/null
+++ b/doc/conf.py
@@ -0,0 +1,253 @@
+# -*- coding: utf-8 -*-
+#
+# pbcore documentation build configuration file, created by
+# sphinx-quickstart on Thu Nov 10 14:37:34 2011.
+#
+# This file is execfile()d with the current directory set to its containing dir.
+#
+# Note that not all possible configuration values are present in this
+# autogenerated file.
+#
+# All configuration values have a default; values that are commented out
+# serve to show the default.
+
+import sys, os
+globals = {}
+execfile("../pbcore/__init__.py", globals)
+__VERSION__ = globals["__VERSION__"]
+
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#sys.path.insert(0, os.path.abspath('.'))
+
+# -- General configuration -----------------------------------------------------
+
+# If your documentation needs a minimal Sphinx version, state it here.
+#needs_sphinx = '1.0'
+
+# Add any Sphinx extension module names here, as strings. They can be extensions
+# coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
+extensions = ['sphinx.ext.autodoc',
+ 'sphinx.ext.intersphinx',
+ 'sphinx.ext.todo',
+ 'sphinx.ext.doctest']
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# The suffix of source filenames.
+source_suffix = '.rst'
+
+# The encoding of source files.
+#source_encoding = 'utf-8-sig'
+
+# The master toctree document.
+master_doc = 'index'
+
+# General information about the project.
+project = u'pbcore'
+copyright = u'2011-2015, Pacific Biosciences'
+
+# The version info for the project you're documenting, acts as replacement for
+# |version| and |release|, also used in various other places throughout the
+# built documents.
+#
+# The short X.Y version.
+version = __VERSION__
+# The full version, including alpha/beta/rc tags.
+release = __VERSION__
+
+# The language for content autogenerated by Sphinx. Refer to documentation
+# for a list of supported languages.
+#language = None
+
+# There are two options for replacing |today|: either, you set today to some
+# non-false value, then it is used:
+#today = ''
+# Else, today_fmt is used as the format for a strftime call.
+#today_fmt = '%B %d, %Y'
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+exclude_patterns = ['_build']
+
+# The reST default role (used for this markup: `text`) to use for all documents.
+#default_role = None
+
+# If true, '()' will be appended to :func: etc. cross-reference text.
+#add_function_parentheses = True
+
+# If true, the current module name will be prepended to all description
+# unit titles (such as .. function::).
+#add_module_names = True
+
+# If true, sectionauthor and moduleauthor directives will be shown in the
+# output. They are ignored by default.
+#show_authors = False
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = 'sphinx'
+
+# A list of ignored prefixes for module index sorting.
+#modindex_common_prefix = []
+
+
+# -- Options for HTML output ---------------------------------------------------
+
+# The theme to use for HTML and HTML Help pages. See the documentation for
+# a list of builtin themes.
+html_theme = 'pacbio-theme'
+
+# Theme options are theme-specific and customize the look and feel of a theme
+# further. For a list of options available for each theme, see the
+# documentation.
+#html_theme_options = {}
+
+# Add any paths that contain custom themes here, relative to this directory.
+#html_theme_path = []
+html_theme_path = ["../../../../doc/theme/","./"]
+
+# The name for this set of Sphinx documents. If None, it defaults to
+# "<project> v<release> documentation".
+#html_title = None
+
+# A shorter title for the navigation bar. Default is the same as html_title.
+#html_short_title = None
+
+# The name of an image file (relative to this directory) to place at the top
+# of the sidebar.
+#html_logo = None
+
+# The name of an image file (within the static path) to use as favicon of the
+# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
+# pixels large.
+#html_favicon = None
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+#html_static_path = ['_static']
+
+# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
+# using the given strftime format.
+#html_last_updated_fmt = '%b %d, %Y'
+
+# If true, SmartyPants will be used to convert quotes and dashes to
+# typographically correct entities.
+#html_use_smartypants = True
+
+# Custom sidebar templates, maps document names to template names.
+#html_sidebars = {}
+
+# Additional templates that should be rendered to pages, maps page names to
+# template names.
+#html_additional_pages = {}
+
+# If false, no module index is generated.
+#html_domain_indices = True
+
+# If false, no index is generated.
+#html_use_index = True
+
+# If true, the index is split into individual pages for each letter.
+#html_split_index = False
+
+# If true, links to the reST sources are added to the pages.
+#html_show_sourcelink = True
+
+# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
+#html_show_sphinx = True
+
+# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
+#html_show_copyright = True
+
+# If true, an OpenSearch description file will be output, and all pages will
+# contain a <link> tag referring to it. The value of this option must be the
+# base URL from which the finished HTML is served.
+#html_use_opensearch = ''
+
+# This is the file name suffix for HTML files (e.g. ".xhtml").
+#html_file_suffix = None
+
+# Output file base name for HTML help builder.
+htmlhelp_basename = 'pbcoredoc'
+
+
+# -- Options for LaTeX output --------------------------------------------------
+
+latex_elements = {
+# The paper size ('letterpaper' or 'a4paper').
+#'papersize': 'letterpaper',
+
+# The font size ('10pt', '11pt' or '12pt').
+#'pointsize': '10pt',
+
+# Additional stuff for the LaTeX preamble.
+#'preamble': '',
+}
+
+# Grouping the document tree into LaTeX files. List of tuples
+# (source start file, target name, title, author, documentclass [howto/manual]).
+latex_documents = [
+ ('index', 'pbcore.tex', u'pbcore Documentation',
+ u'devnet@pacificbiosciences.com', 'manual'),
+]
+
+# The name of an image file (relative to this directory) to place at the top of
+# the title page.
+#latex_logo = None
+
+# For "manual" documents, if this is true, then toplevel headings are parts,
+# not chapters.
+#latex_use_parts = False
+
+# If true, show page references after internal links.
+#latex_show_pagerefs = False
+
+# If true, show URL addresses after external links.
+#latex_show_urls = False
+
+# Documents to append as an appendix to all manuals.
+#latex_appendices = []
+
+# If false, no module index is generated.
+#latex_domain_indices = True
+
+
+# -- Options for manual page output --------------------------------------------
+
+# One entry per manual page. List of tuples
+# (source start file, name, description, authors, manual section).
+man_pages = [
+ ('index', 'pbcore', u'pbcore Documentation',
+ [u'devnet@pacificbiosciences.com'], 1)
+]
+
+# If true, show URL addresses after external links.
+#man_show_urls = False
+
+
+# -- Options for Texinfo output ------------------------------------------------
+
+# Grouping the document tree into Texinfo files. List of tuples
+# (source start file, target name, title, author,
+# dir menu entry, description, category)
+texinfo_documents = [
+ ('index', 'pbcore', u'pbcore Documentation', u'devnet@pacificbiosciences.com',
+ 'pbcore', 'One line description of project.', 'Miscellaneous'),
+]
+
+# Documents to append as an appendix to all manuals.
+#texinfo_appendices = []
+
+# If false, no module index is generated.
+#texinfo_domain_indices = True
+
+# How to display URL addresses: 'footnote', 'no', or 'inline'.
+#texinfo_show_urls = 'footnote'
+
+
+# Example configuration for intersphinx: refer to the Python standard library.
+intersphinx_mapping = {'http://docs.python.org/': None}
diff --git a/doc/index.rst b/doc/index.rst
new file mode 100644
index 0000000..1a4889a
--- /dev/null
+++ b/doc/index.rst
@@ -0,0 +1,24 @@
+pbcore
+######
+
+The `pbcore` package provides Python modules for processing PacBio
+data files and building PacBio bioinformatics applications.
+
+
+Library API documentation
+=========================
+
+ :doc:`pbcore.io`: Classes for reading/writing PacBio data formats and essential common data formats
+
+ :doc:`pbcore.model`: Common base classes
+
+ :doc:`pbcore.util`: Utilities for building bioinformatics applications
+
+ :doc:`pbcore.data`: Small bundled data files that are handy for testing and debugging
+
+Indices and tables
+==================
+
+* :ref:`genindex`
+* :ref:`modindex`
+* :ref:`search`
diff --git a/doc/modules.rst b/doc/modules.rst
new file mode 100644
index 0000000..e7de000
--- /dev/null
+++ b/doc/modules.rst
@@ -0,0 +1,7 @@
+pbcore
+======
+
+.. toctree::
+ :maxdepth: 4
+
+ pbcore
diff --git a/doc/pacbio-theme/static/headerGradient.jpg b/doc/pacbio-theme/static/headerGradient.jpg
new file mode 100644
index 0000000..883f147
Binary files /dev/null and b/doc/pacbio-theme/static/headerGradient.jpg differ
diff --git a/doc/pacbio-theme/static/pacbio.css b/doc/pacbio-theme/static/pacbio.css
new file mode 100644
index 0000000..b4ab87f
--- /dev/null
+++ b/doc/pacbio-theme/static/pacbio.css
@@ -0,0 +1,238 @@
+/**
+ * Sphinx stylesheet -- default theme
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ */
+
+@import url("basic.css");
+
+/* -- page layout ----------------------------------------------------------- */
+
+body {
+ font-family: Arial, sans-serif;
+ font-size: 100%;
+ background-color: #555;
+ color: #555;
+ margin: 0;
+ padding: 0;
+ min-width: 500px;
+ max-width: 956px;
+ margin: 0 auto;
+}
+
+div.documentwrapper {
+ float: left;
+ width: 100%;
+}
+
+div.bodywrapper {
+ margin: 0 0 0 230px;
+}
+
+hr{
+ border: 1px solid #B1B4B6;
+
+}
+
+div.document {
+ background-color: #eee;
+}
+
+div.body {
+ background-color: #ffffff;
+ color: #3E4349;
+ padding: 30px 30px 30px 30px;
+ font-size: 0.8em;
+}
+
+div.footer {
+ color: #555;
+ background-color: #fff;
+ padding: 13px 0;
+ text-align: center;
+ font-size: 75%;
+
+}
+div.footer a {
+ color: #444;
+ text-decoration: underline;
+}
+
+div.related {
+ background: #fff url(headerGradient.jpg);
+ line-height: 80px;
+ color: #fff;
+ font-size: 0.80em;
+ height: 79px;
+ z-index: -1;
+}
+
+div.related ul {
+ background: url(pacbioLogo.png) 10px no-repeat;
+ padding: 0 0 0 200px;
+}
+
+div.related a {
+ color: #E2F3CC;
+}
+
+div.sphinxsidebar {
+ font-size: 0.75em;
+ line-height: 1.5em;
+}
+
+div.sphinxsidebarwrapper{
+ padding: 20px 0;
+}
+
+div.sphinxsidebar h3,
+div.sphinxsidebar h4 {
+ font-family: Arial, sans-serif;
+ color: #222;
+ font-size: 1.2em;
+ font-weight: bold;
+ margin: 0;
+ padding: 5px 10px 0 10px;
+}
+
+div.sphinxsidebar h4{
+ font-size: 1.1em;
+}
+
+div.sphinxsidebar h3 a {
+ color: #444;
+}
+
+
+div.sphinxsidebar p {
+ color: #888;
+ padding: 0px 20px;
+ margin-top: 5px;
+}
+
+div.sphinxsidebar p.topless {
+}
+
+div.sphinxsidebar ul {
+ margin: 5px 20px 10px 20px;
+ padding: 0;
+ color: #000;
+}
+
+div.sphinxsidebar a {
+ color: #444;
+}
+
+div.sphinxsidebar input {
+ border: 1px solid #ccc;
+ font-family: sans-serif;
+ font-size: 1em;
+}
+
+div.sphinxsidebar input[type=text]{
+ margin-left: 20px;
+}
+
+/* -- body styles ----------------------------------------------------------- */
+
+a {
+ color: #005B81;
+ text-decoration: none;
+}
+
+a:hover {
+ color: #E32E00;
+ text-decoration: underline;
+}
+
+div.body h1,
+div.body h2,
+div.body h3,
+div.body h4,
+div.body h5,
+div.body h6 {
+ font-family: Arial, sans-serif;
+ font-weight: bold;
+ color: #264868;
+ margin: 30px 0px 10px 0px;
+ padding: 5px 0 5px 0px;
+}
+
+div.body h1 { border-top: 20px solid white; margin-top: 0; font-size: 180%; font-weight: normal; }
+div.body h2 { font-size: 125%; }
+div.body h3 { font-size: 110%; }
+div.body h4 { font-size: 100%; }
+div.body h5 { font-size: 100%; }
+div.body h6 { font-size: 100%; }
+
+a.headerlink {
+ color: #c60f0f;
+ font-size: 0.8em;
+ padding: 0 4px 0 4px;
+ text-decoration: none;
+}
+
+a.headerlink:hover {
+ background-color: #c60f0f;
+ color: white;
+}
+
+div.body p, div.body dd, div.body li {
+ line-height: 1.5em;
+ font-size: 1em;
+}
+
+div.admonition p.admonition-title + p {
+ display: inline;
+}
+
+div.highlight{
+ background-color: white;
+}
+
+div.note {
+ background-color: #eee;
+ border: 1px solid #ccc;
+}
+
+div.seealso {
+ background-color: #ffc;
+ border: 1px solid #ff6;
+}
+
+div.topic {
+ background-color: #eee;
+}
+
+div.warning {
+ background-color: #ffe4e4;
+ border: 1px solid #f66;
+}
+
+p.admonition-title {
+ display: inline;
+}
+
+p.admonition-title:after {
+ content: ":";
+}
+
+pre {
+ padding: 10px;
+ background-color: White;
+ color: #222;
+ line-height: 1.2em;
+ border: 1px solid #C6C9CB;
+ font-size: 1.2em;
+ margin: 1.5em 0 1.5em 0;
+ -webkit-box-shadow: 1px 1px 1px #d8d8d8;
+ -moz-box-shadow: 1px 1px 1px #d8d8d8;
+}
+
+tt {
+ background-color: #ecf0f3;
+ color: #222;
+ padding: 1px 2px;
+ font-size: 1.2em;
+ font-family: monospace;
+}
+
diff --git a/doc/pacbio-theme/static/pacbioLogo.png b/doc/pacbio-theme/static/pacbioLogo.png
new file mode 100644
index 0000000..b2e4887
Binary files /dev/null and b/doc/pacbio-theme/static/pacbioLogo.png differ
diff --git a/doc/pacbio-theme/static/pygments.css b/doc/pacbio-theme/static/pygments.css
new file mode 100644
index 0000000..4588cde
--- /dev/null
+++ b/doc/pacbio-theme/static/pygments.css
@@ -0,0 +1,55 @@
+.c { color: #999988; font-style: italic } /* Comment */
+.k { font-weight: bold } /* Keyword */
+.o { font-weight: bold } /* Operator */
+.cm { color: #999988; font-style: italic } /* Comment.Multiline */
+.cp { color: #999999; font-weight: bold } /* Comment.preproc */
+.c1 { color: #999988; font-style: italic } /* Comment.Single */
+.gd { color: #000000; background-color: #ffdddd } /* Generic.Deleted */
+.ge { font-style: italic } /* Generic.Emph */
+.gr { color: #aa0000 } /* Generic.Error */
+.gh { color: #999999 } /* Generic.Heading */
+.gi { color: #000000; background-color: #ddffdd } /* Generic.Inserted */
+.go { color: #111 } /* Generic.Output */
+.gp { color: #555555 } /* Generic.Prompt */
+.gs { font-weight: bold } /* Generic.Strong */
+.gu { color: #aaaaaa } /* Generic.Subheading */
+.gt { color: #aa0000 } /* Generic.Traceback */
+.kc { font-weight: bold } /* Keyword.Constant */
+.kd { font-weight: bold } /* Keyword.Declaration */
+.kp { font-weight: bold } /* Keyword.Pseudo */
+.kr { font-weight: bold } /* Keyword.Reserved */
+.kt { color: #445588; font-weight: bold } /* Keyword.Type */
+.m { color: #009999 } /* Literal.Number */
+.s { color: #bb8844 } /* Literal.String */
+.na { color: #008080 } /* Name.Attribute */
+.nb { color: #999999 } /* Name.Builtin */
+.nc { color: #445588; font-weight: bold } /* Name.Class */
+.no { color: #ff99ff } /* Name.Constant */
+.ni { color: #800080 } /* Name.Entity */
+.ne { color: #990000; font-weight: bold } /* Name.Exception */
+.nf { color: #990000; font-weight: bold } /* Name.Function */
+.nn { color: #555555 } /* Name.Namespace */
+.nt { color: #000080 } /* Name.Tag */
+.nv { color: purple } /* Name.Variable */
+.ow { font-weight: bold } /* Operator.Word */
+.mf { color: #009999 } /* Literal.Number.Float */
+.mh { color: #009999 } /* Literal.Number.Hex */
+.mi { color: #009999 } /* Literal.Number.Integer */
+.mo { color: #009999 } /* Literal.Number.Oct */
+.sb { color: #bb8844 } /* Literal.String.Backtick */
+.sc { color: #bb8844 } /* Literal.String.Char */
+.sd { color: #bb8844 } /* Literal.String.Doc */
+.s2 { color: #bb8844 } /* Literal.String.Double */
+.se { color: #bb8844 } /* Literal.String.Escape */
+.sh { color: #bb8844 } /* Literal.String.Heredoc */
+.si { color: #bb8844 } /* Literal.String.Interpol */
+.sx { color: #bb8844 } /* Literal.String.Other */
+.sr { color: #808000 } /* Literal.String.Regex */
+.s1 { color: #bb8844 } /* Literal.String.Single */
+.ss { color: #bb8844 } /* Literal.String.Symbol */
+.bp { color: #999999 } /* Name.Builtin.Pseudo */
+.vc { color: #ff99ff } /* Name.Variable.Class */
+.vg { color: #ff99ff } /* Name.Variable.Global */
+.vi { color: #ff99ff } /* Name.Variable.Instance */
+.il { color: #009999 } /* Literal.Number.Integer.Long */
+
diff --git a/doc/pacbio-theme/theme.conf b/doc/pacbio-theme/theme.conf
new file mode 100644
index 0000000..dd24a1a
--- /dev/null
+++ b/doc/pacbio-theme/theme.conf
@@ -0,0 +1,4 @@
+[theme]
+inherit = default
+stylesheet = pacbio.css
+pygments_style = tango
diff --git a/doc/pbcore.chemistry.rst b/doc/pbcore.chemistry.rst
new file mode 100644
index 0000000..ad7d687
--- /dev/null
+++ b/doc/pbcore.chemistry.rst
@@ -0,0 +1,22 @@
+pbcore.chemistry package
+========================
+
+Submodules
+----------
+
+pbcore.chemistry.chemistry module
+---------------------------------
+
+.. automodule:: pbcore.chemistry.chemistry
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+
+Module contents
+---------------
+
+.. automodule:: pbcore.chemistry
+ :members:
+ :undoc-members:
+ :show-inheritance:
diff --git a/doc/pbcore.data.rst b/doc/pbcore.data.rst
new file mode 100644
index 0000000..e74af8b
--- /dev/null
+++ b/doc/pbcore.data.rst
@@ -0,0 +1,10 @@
+pbcore.data
+===========
+
+:mod:`pbcore.data`
+------------------
+
+.. automodule:: pbcore.data
+ :members:
+ :undoc-members:
+ :show-inheritance:
diff --git a/doc/pbcore.deprecated.rst b/doc/pbcore.deprecated.rst
new file mode 100644
index 0000000..2141176
--- /dev/null
+++ b/doc/pbcore.deprecated.rst
@@ -0,0 +1,30 @@
+pbcore.deprecated package
+=========================
+
+Submodules
+----------
+
+pbcore.deprecated.BasH5IO module
+--------------------------------
+
+.. automodule:: pbcore.deprecated.BasH5IO
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+pbcore.deprecated.ReferenceEntry module
+---------------------------------------
+
+.. automodule:: pbcore.deprecated.ReferenceEntry
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+
+Module contents
+---------------
+
+.. automodule:: pbcore.deprecated
+ :members:
+ :undoc-members:
+ :show-inheritance:
diff --git a/doc/pbcore.io.rst b/doc/pbcore.io.rst
new file mode 100644
index 0000000..34a9744
--- /dev/null
+++ b/doc/pbcore.io.rst
@@ -0,0 +1,182 @@
+pbcore.io
+=========
+
+The ``pbcore.io`` package provides a number of lightweight interfaces
+to PacBio data files and other standard bioinformatics file formats.
+Preferred usage is to import classes directly from the ``pbcore.io``
+package, e.g.::
+
+ >>> from pbcore.io import CmpH5Reader
+
+The classes within ``pbcore.io`` adhere to a few conventions, in order
+to provide a uniform API:
+
+ - Each data file type is thought of as a container of a `Record`
+ type; all `Reader` classes support streaming access, and
+ `CmpH5Reader` and `BasH5Reader` additionally provide random-access
+ to alignments/reads.
+
+ - The constructor argument needed to instantiate `Reader` and
+ `Writer` objects can be either a filename (which can be suffixed
+ by ".gz" for all but the h5 file types) or an open file handle.
+ The reader/writer classes will do what you would expect.
+
+
+ - The reader/writer classes all support the context manager idiom.
+ Meaning, if you write::
+
+ >>> with CmpH5Reader("aligned_reads.cmp.h5") as r:
+ ... print r[0].read()
+
+ the `CmpH5Reader` object will be automatically closed after the
+ block within the "with" statement is executed.
+
+
+BAM/cmp.h5 compatibility: quick start
+-------------------------------------
+
+If you have an application that uses the `CmpH5Reader` and you want to
+start using BAM files, your best bet is to use the following generic
+factory functions:
+
+.. autofunction:: pbcore.io.openIndexedAlignmentFile
+
+.. autofunction:: pbcore.io.openAlignmentFile
+
+.. note::
+
+ Since BAM files contain a subset of the information that was
+ present in cmp.h5 files, you will need to provide these functions
+ an indexed FASTA file for your reference. For *full*
+ compatibility, you need the `openIndexedAlignmentFile` function,
+ which requires the existence of a `bam.pbi` file (PacBio BAM index
+ companion file).
+
+
+
+
+`bas.h5` / `bax.h5` Formats (PacBio basecalls file)
+---------------------------------------------------
+
+The `bas.h5`/ `bax.h5` file formats are container formats for PacBio
+reads, built on top of the HDF5 standard. Originally there was just
+one `bas.h5`, but eventually "multistreaming" came along and we had to
+split the file into three `bax.h5` *parts* and one `bas.h5` file
+containing pointers to the *parts*. Use ``BasH5Reader`` to read any
+kind of `bas.h5` file, and ``BaxH5Reader`` to read a `bax.h5`.
+
+.. note::
+
+ In contrast to GFF, for example, the `bas.h5` read coordinate
+ system is 0-based and start-inclusive/end-exclusive, i.e. the same
+ convention as Python and the C++ STL.
+
+.. autoclass:: pbcore.io.BasH5Reader
+ :members:
+ :undoc-members:
+
+.. autoclass:: pbcore.io.BasH5IO.Zmw
+ :members:
+ :undoc-members:
+
+.. autoclass:: pbcore.io.BasH5IO.ZmwRead
+ :members:
+ :undoc-members:
+
+
+BAM format
+----------
+
+The BAM format is a standard format describing aligned and unaligned
+reads. PacBio is transitioning from the cmp.h5 format to the BAM
+format. For basic functionality, one should use :class:`BamReader`;
+for full compatibility with the :class:`CmpH5Reader` API (including
+alignment index functionality) one should use
+:class:`IndexedBamReader`, which requires the auxiliary *PacBio BAM
+index file* (``bam.pbi`` file).
+
+.. autoclass:: pbcore.io.BamAlignment
+ :members:
+ :undoc-members:
+
+.. autoclass:: pbcore.io.BamReader
+ :members:
+ :undoc-members:
+
+.. autoclass:: pbcore.io.IndexedBamReader
+ :members:
+ :undoc-members:
+
+
+
+`cmp.h5` format (legacy PacBio alignment file)
+----------------------------------------------
+
+The `cmp.h5` file format is an alignment format built on top of the HDF5
+standard. It is a simple container format for PacBio alignment records.
+
+.. note::
+
+ In contrast to GFF, for example, all `cmp.h5` coordinate systems
+ (reference, read) are 0-based and start-inclusive/end-exclusive,
+ i.e. the same convention as Python and the C++ STL.
+
+
+.. autoclass:: pbcore.io.CmpH5Reader
+ :members:
+ :undoc-members:
+
+.. autoclass:: pbcore.io.CmpH5Alignment
+ :members:
+ :undoc-members:
+
+
+FASTA Format
+------------
+
+FASTA is a standard format for sequence data. We recommend using the
+`FastaTable` class, which provides random access to indexed FASTA
+files (using the conventional SAMtools "fai" index).
+
+.. autoclass:: pbcore.io.FastaTable
+ :members:
+
+.. autoclass:: pbcore.io.FastaRecord
+ :members:
+
+.. autoclass:: pbcore.io.FastaReader
+ :members:
+
+.. autoclass:: pbcore.io.FastaWriter
+ :members:
+
+
+FASTQ Format
+------------
+
+FASTQ is a standard format for sequence data with associated quality scores.
+
+.. autoclass:: pbcore.io.FastqRecord
+ :members:
+
+.. autoclass:: pbcore.io.FastqReader
+ :members:
+
+.. autoclass:: pbcore.io.FastqWriter
+ :members:
+
+
+
+GFF Format (Version 3)
+----------------------
+
+The GFF format is an open and flexible standard for representing genomic features.
+
+.. autoclass:: pbcore.io.Gff3Record
+ :members:
+
+.. autoclass:: pbcore.io.GffReader
+ :members:
+
+.. autoclass:: pbcore.io.GffWriter
+ :members:
diff --git a/doc/pbcore.model.rst b/doc/pbcore.model.rst
new file mode 100644
index 0000000..957bedb
--- /dev/null
+++ b/doc/pbcore.model.rst
@@ -0,0 +1,2 @@
+pbcore.model
+============
diff --git a/doc/pbcore.rst b/doc/pbcore.rst
new file mode 100644
index 0000000..b390063
--- /dev/null
+++ b/doc/pbcore.rst
@@ -0,0 +1,22 @@
+pbcore package
+==============
+
+Subpackages
+-----------
+
+.. toctree::
+
+ pbcore.chemistry
+ pbcore.data
+ pbcore.deprecated
+ pbcore.io
+ pbcore.model
+ pbcore.util
+
+Module contents
+---------------
+
+.. automodule:: pbcore
+ :members:
+ :undoc-members:
+ :show-inheritance:
diff --git a/doc/pbcore.util.rst b/doc/pbcore.util.rst
new file mode 100644
index 0000000..ddf521d
--- /dev/null
+++ b/doc/pbcore.util.rst
@@ -0,0 +1,18 @@
+pbcore.util
+===========
+
+:mod:`Process` Module
+---------------------
+
+.. automodule:: pbcore.util.Process
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+:mod:`ToolRunner` Module
+------------------------
+
+.. automodule:: pbcore.util.ToolRunner
+ :members:
+ :undoc-members:
+ :show-inheritance:
diff --git a/pbcore/__init__.py b/pbcore/__init__.py
new file mode 100644
index 0000000..394f477
--- /dev/null
+++ b/pbcore/__init__.py
@@ -0,0 +1,31 @@
+#################################################################################
+# Copyright (c) 2011-2015, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#################################################################################
+
+# Version string of the pbcore package.
+__VERSION__ = "1.0.0"
diff --git a/pbcore/chemistry/__init__.py b/pbcore/chemistry/__init__.py
new file mode 100644
index 0000000..47d189f
--- /dev/null
+++ b/pbcore/chemistry/__init__.py
@@ -0,0 +1,31 @@
+#################################################################################
+# Copyright (c) 2011-2015, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#################################################################################
+
+from .chemistry import *
diff --git a/pbcore/chemistry/chemistry.py b/pbcore/chemistry/chemistry.py
new file mode 100644
index 0000000..e9c31fd
--- /dev/null
+++ b/pbcore/chemistry/chemistry.py
@@ -0,0 +1,91 @@
+#################################################################################
+# Copyright (c) 2011-2015, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#################################################################################
+
+
+# Names exported by a star-import of this module.
+__all__ = ["tripleFromMetadataXML",
+ "decodeTriple",
+ "ChemistryLookupError" ]
+
+import xml.etree.ElementTree as ET, os.path
+from pkg_resources import Requirement, resource_filename
+from collections import OrderedDict
+
+class ChemistryLookupError(Exception):
+    """Raised when chemistry information cannot be loaded or extracted."""
+
+def _loadBarcodeMappingsFromFile(mapFile):
+    """
+    Load the chemistry mapping table from an XML file.
+
+    Every ``Mapping`` child of the document root contributes one entry,
+    keyed by the text of its (BindingKit, SequencingKit, SoftwareVersion)
+    children, with the text of SequencingChemistry as the value.  Entries
+    are kept in document order.
+
+    Raises ChemistryLookupError if the file cannot be parsed or if a
+    ``Mapping`` element lacks one of the expected children.
+    """
+    try:
+        root = ET.parse(mapFile).getroot()
+        mappings = OrderedDict()
+        for mapElement in root.findall("Mapping"):
+            triple = (mapElement.find("BindingKit").text,
+                      mapElement.find("SequencingKit").text,
+                      mapElement.find("SoftwareVersion").text)
+            mappings[triple] = mapElement.find("SequencingChemistry").text
+        return mappings
+    except Exception:
+        # Catch Exception rather than using a bare "except:" so that
+        # KeyboardInterrupt and SystemExit are not turned into a lookup
+        # error.  The call form of raise is valid on Python 2 and 3.
+        raise ChemistryLookupError("Error loading chemistry mapping xml")
+
+def _loadBarcodeMappings():
+    """
+    Load the chemistry mapping table from the ``mapping.xml`` resource
+    of the pbcore distribution.
+    """
+    pbcoreRequirement = Requirement.parse('pbcore')
+    resourcePath = 'pbcore/chemistry/resources/mapping.xml'
+    return _loadBarcodeMappingsFromFile(
+        resource_filename(pbcoreRequirement, resourcePath))
+
+# Chemistry lookup table, built once when this module is imported.
+_BARCODE_MAPPINGS = _loadBarcodeMappings()
+
+def tripleFromMetadataXML(metadataXmlPath):
+    """
+    Scrape the (bindingKit, sequencingKit, softwareVersion) triple from
+    a metadata.xml file.
+
+    The binding kit and sequencing kit are the ``PartNumber`` values found
+    under ``BindingKit`` and ``SequencingKit``; the software version is
+    ``InstCtrlVer`` truncated to its first two dot-delimited components.
+
+    Raises ChemistryLookupError if the file or the relevant contents are
+    not found.
+    """
+    # Every path below uses the explicit "pb:" prefix, so only that prefix
+    # is mapped.  Mixing a None key with a string key makes the lookup fail
+    # on Python 3, where ElementTree sorts the namespace items.
+    nsd = {"pb": "http://pacificbiosciences.com/PAP/Metadata.xsd"}
+    try:
+        root = ET.parse(metadataXmlPath).getroot()
+        bindingKit = root.find("pb:BindingKit/pb:PartNumber", namespaces=nsd).text
+        sequencingKit = root.find("pb:SequencingKit/pb:PartNumber", namespaces=nsd).text
+        # The instrument version is truncated to the first 2 dot delimited components
+        instrumentControlVersion = root.find("pb:InstCtrlVer", namespaces=nsd).text
+        verComponents = instrumentControlVersion.split(".")[0:2]
+        return (bindingKit, sequencingKit, ".".join(verComponents))
+    except Exception:
+        raise ChemistryLookupError(
+            "Could not find, or extract chemistry information from, %s" % (metadataXmlPath,))
+
+def decodeTriple(bindingKit, sequencingKit, softwareVersion):
+    """
+    Look up the name of the chemistry configuration for the
+    configuration triple that was recorded on the instrument.
+    Returns "unknown" when the triple is not in the mapping table.
+    """
+    triple = (bindingKit, sequencingKit, softwareVersion)
+    return _BARCODE_MAPPINGS.get(triple, "unknown")
diff --git a/pbcore/chemistry/resources/mapping.xml b/pbcore/chemistry/resources/mapping.xml
new file mode 100644
index 0000000..8eda49c
--- /dev/null
+++ b/pbcore/chemistry/resources/mapping.xml
@@ -0,0 +1,154 @@
+<?xml version="1.0" encoding="utf-8"?>
+<MappingTable>
+ <DefaultSequencingChemistry>XL-C2</DefaultSequencingChemistry>
+ <Mapping>
+ <SequencingChemistry>C2</SequencingChemistry>
+ <BindingKit>001672551</BindingKit>
+ <SequencingKit>001558034</SequencingKit>
+ <SoftwareVersion>1.3</SoftwareVersion>
+ </Mapping>
+ <Mapping>
+ <SequencingChemistry>C2</SequencingChemistry>
+ <BindingKit>001672551</BindingKit>
+ <SequencingKit>001558034</SequencingKit>
+ <SoftwareVersion>2.0</SoftwareVersion>
+ </Mapping>
+ <Mapping>
+ <SequencingChemistry>C2</SequencingChemistry>
+ <BindingKit>001672551</BindingKit>
+ <SequencingKit>001558034</SequencingKit>
+ <SoftwareVersion>2.1</SoftwareVersion>
+ </Mapping>
+ <Mapping>
+ <SequencingChemistry>C2</SequencingChemistry>
+ <BindingKit>001672551</BindingKit>
+ <SequencingKit>001558034</SequencingKit>
+ <SoftwareVersion>2.3</SoftwareVersion>
+ </Mapping>
+ <Mapping>
+ <SequencingChemistry>XL-C2</SequencingChemistry>
+ <BindingKit>100150800</BindingKit>
+ <SequencingKit>001558034</SequencingKit>
+ <SoftwareVersion>1.3</SoftwareVersion>
+ </Mapping>
+ <Mapping>
+ <SequencingChemistry>XL-C2</SequencingChemistry>
+ <BindingKit>100150800</BindingKit>
+ <SequencingKit>001558034</SequencingKit>
+ <SoftwareVersion>2.0</SoftwareVersion>
+ </Mapping>
+ <Mapping>
+ <SequencingChemistry>XL-C2</SequencingChemistry>
+ <BindingKit>100150800</BindingKit>
+ <SequencingKit>001558034</SequencingKit>
+ <SoftwareVersion>2.1</SoftwareVersion>
+ </Mapping>
+ <Mapping>
+ <SequencingChemistry>XL-C2</SequencingChemistry>
+ <BindingKit>100150800</BindingKit>
+ <SequencingKit>001558034</SequencingKit>
+ <SoftwareVersion>2.3</SoftwareVersion>
+ </Mapping>
+ <Mapping>
+ <SequencingChemistry>XL-XL</SequencingChemistry>
+ <BindingKit>100150800</BindingKit>
+ <SequencingKit>100180800</SequencingKit>
+ <SoftwareVersion>1.3</SoftwareVersion>
+ </Mapping>
+ <Mapping>
+ <SequencingChemistry>XL-XL</SequencingChemistry>
+ <BindingKit>100150800</BindingKit>
+ <SequencingKit>100180800</SequencingKit>
+ <SoftwareVersion>2.0</SoftwareVersion>
+ </Mapping>
+ <Mapping>
+ <SequencingChemistry>XL-XL</SequencingChemistry>
+ <BindingKit>100150800</BindingKit>
+ <SequencingKit>100180800</SequencingKit>
+ <SoftwareVersion>2.1</SoftwareVersion>
+ </Mapping>
+ <Mapping>
+ <SequencingChemistry>XL-XL</SequencingChemistry>
+ <BindingKit>100150800</BindingKit>
+ <SequencingKit>100180800</SequencingKit>
+ <SoftwareVersion>2.3</SoftwareVersion>
+ </Mapping>
+ <Mapping>
+ <SequencingChemistry>P4-C2</SequencingChemistry>
+ <BindingKit>100236500</BindingKit>
+ <SequencingKit>001558034</SequencingKit>
+ <SoftwareVersion>1.3</SoftwareVersion>
+ </Mapping>
+ <Mapping>
+ <SequencingChemistry>P4-C2</SequencingChemistry>
+ <BindingKit>100236500</BindingKit>
+ <SequencingKit>001558034</SequencingKit>
+ <SoftwareVersion>2.0</SoftwareVersion>
+ </Mapping>
+ <Mapping>
+ <SequencingChemistry>P4-C2</SequencingChemistry>
+ <BindingKit>100236500</BindingKit>
+ <SequencingKit>001558034</SequencingKit>
+ <SoftwareVersion>2.1</SoftwareVersion>
+ </Mapping>
+ <Mapping>
+ <SequencingChemistry>P4-C2</SequencingChemistry>
+ <BindingKit>100236500</BindingKit>
+ <SequencingKit>001558034</SequencingKit>
+ <SoftwareVersion>2.3</SoftwareVersion>
+ </Mapping>
+ <Mapping>
+ <SequencingChemistry>P4-XL</SequencingChemistry>
+ <BindingKit>100236500</BindingKit>
+ <SequencingKit>100180800</SequencingKit>
+ <SoftwareVersion>2.0</SoftwareVersion>
+ </Mapping>
+ <Mapping>
+ <SequencingChemistry>P4-XL</SequencingChemistry>
+ <BindingKit>100236500</BindingKit>
+ <SequencingKit>100180800</SequencingKit>
+ <SoftwareVersion>2.1</SoftwareVersion>
+ </Mapping>
+ <Mapping>
+ <SequencingChemistry>P4-XL</SequencingChemistry>
+ <BindingKit>100236500</BindingKit>
+ <SequencingKit>100180800</SequencingKit>
+ <SoftwareVersion>2.3</SoftwareVersion>
+ </Mapping>
+ <Mapping>
+ <SequencingChemistry>P5-C3</SequencingChemistry>
+ <BindingKit>100256000</BindingKit>
+ <SequencingKit>100254800</SequencingKit>
+ <SoftwareVersion>2.1</SoftwareVersion>
+ </Mapping>
+ <Mapping>
+ <SequencingChemistry>P5-C3</SequencingChemistry>
+ <BindingKit>100256000</BindingKit>
+ <SequencingKit>100254800</SequencingKit>
+ <SoftwareVersion>2.3</SoftwareVersion>
+ </Mapping>
+ <Mapping>
+ <SequencingChemistry>P6-C4</SequencingChemistry>
+ <BindingKit>100356300</BindingKit>
+ <SequencingKit>100356200</SequencingKit>
+ <SoftwareVersion>2.1</SoftwareVersion>
+ </Mapping>
+ <Mapping>
+ <SequencingChemistry>P6-C4</SequencingChemistry>
+ <BindingKit>100356300</BindingKit>
+ <SequencingKit>100356200</SequencingKit>
+ <SoftwareVersion>2.3</SoftwareVersion>
+ </Mapping>
+ <Mapping>
+ <SequencingChemistry>P6-C4</SequencingChemistry>
+ <BindingKit>100372700</BindingKit>
+ <SequencingKit>100356200</SequencingKit>
+ <SoftwareVersion>2.1</SoftwareVersion>
+ </Mapping>
+ <Mapping>
+ <SequencingChemistry>P6-C4</SequencingChemistry>
+ <BindingKit>100372700</BindingKit>
+ <SequencingKit>100356200</SequencingKit>
+ <SoftwareVersion>2.3</SoftwareVersion>
+ </Mapping>
+</MappingTable>
diff --git a/pbcore/data/1.4_bas_files.fofn b/pbcore/data/1.4_bas_files.fofn
new file mode 100644
index 0000000..2cb97cd
--- /dev/null
+++ b/pbcore/data/1.4_bas_files.fofn
@@ -0,0 +1,2 @@
+m110818_075520_42141_c100129202555500000315043109121112_s1_p0.bas.h5
+m110818_075520_42141_c100129202555500000315043109121112_s2_p0.bas.h5
diff --git a/pbcore/data/2.0_bax_files.fofn b/pbcore/data/2.0_bax_files.fofn
new file mode 100644
index 0000000..d77163a
--- /dev/null
+++ b/pbcore/data/2.0_bax_files.fofn
@@ -0,0 +1,3 @@
+m130522_092457_42208_c100497142550000001823078008081323_s1_p0.1.bax.h5
+m130522_092457_42208_c100497142550000001823078008081323_s1_p0.2.bax.h5
+m130522_092457_42208_c100497142550000001823078008081323_s1_p0.3.bax.h5
diff --git a/pbcore/data/2.1_bax_files.fofn b/pbcore/data/2.1_bax_files.fofn
new file mode 100644
index 0000000..a0f0097
--- /dev/null
+++ b/pbcore/data/2.1_bax_files.fofn
@@ -0,0 +1,3 @@
+m130731_192718_42129_c100564662550000001823085912221321_s1_p0.1.bax.h5
+m130731_192718_42129_c100564662550000001823085912221321_s1_p0.2.bax.h5
+m130731_192718_42129_c100564662550000001823085912221321_s1_p0.3.bax.h5
diff --git a/pbcore/data/2.1_ccs_files.fofn b/pbcore/data/2.1_ccs_files.fofn
new file mode 100644
index 0000000..55d2143
--- /dev/null
+++ b/pbcore/data/2.1_ccs_files.fofn
@@ -0,0 +1 @@
+m130727_114215_42211_c100569412550000001823090301191423_s1_p0.1.ccs.h5
diff --git a/pbcore/data/2.3_bax_files.fofn b/pbcore/data/2.3_bax_files.fofn
new file mode 100644
index 0000000..4a4fec0
--- /dev/null
+++ b/pbcore/data/2.3_bax_files.fofn
@@ -0,0 +1,3 @@
+m140912_020930_00114_c100702482550000001823141103261590_s1_p0.1.bax.h5
+m140912_020930_00114_c100702482550000001823141103261590_s1_p0.2.bax.h5
+m140912_020930_00114_c100702482550000001823141103261590_s1_p0.3.bax.h5
diff --git a/pbcore/data/Fluidigm_human_amplicons.fasta b/pbcore/data/Fluidigm_human_amplicons.fasta
new file mode 100644
index 0000000..b6d7ac5
--- /dev/null
+++ b/pbcore/data/Fluidigm_human_amplicons.fasta
@@ -0,0 +1,250 @@
+>ref000001|EGFR_Exon_2
+TTTCTTCCAGTTTGCCAAGGCACGAGTAACAAGCTCACGCAGTTGGGCACTTTTGAAGAT
+CATTTTCTCAGCCTCCAGAGGATGTTCAATAACTGTGAGGTGGTCCTTGGGAATTTGGAA
+ATTACCTATGTGCAGAGGAATTATGATCTTTCCTTCTTAAAGGTTGGTGACTTTGATTTT
+CCT
+>ref000002|EGFR_Exon_3
+TTCTTAGACCATCCAGGAGGTGGCTGGTTATGTCCTCATTGCCCTCAACACAGTGGAGCG
+AATTCCTTTGGAAAACCTGCAGATCATCAGAGGAAATATGTACTACGAAAATTCCTATGC
+CTTAGCAGTCTTATCTAACTATGATGCAAATAAAACCGGACTGAAGGAGCTGCCCATGAG
+AAATTTACAGGGTGAGAGGCTGG
+>ref000003|EGFR_Exon_4
+AGCTGGAAAGAGTGCTCACCGCAGTTCCATTCTCCCGCAGAAATCCTGCATGGCGCCGTG
+CGGTTCAGCAACAACCCTGCCCTGTGCAACGTGGAGAGCATCCAGTGGCGGGACATAGTC
+AGCAGTGACTTTCTCAGCAACATGTCGATGGACTTCCAGAACCACCTGGGCAGCTGTAAG
+TGTCGCATACACACTATCTCTGCCTCCAGCTCCTA
+>ref000004|EGFR_Exon_5
+GCGTCATCAGTTTCTCATCATTTCACTGAGATATGCATCTATTACTTTTACATTTCAGGC
+CAAAAGTGTGATCCAAGCTGTCCCAATGGGAGCTGCTGGGGTGCAGGAGAGGAGAACTGC
+CAGAAACGTAAGTCAGTGAACAGCCTCAGACCCATGT
+>ref000005|EGFR_Exon_6
+CCCTGGGAAATGATCCTACCCTCACTCTTCAGCTCACAGGGAACCTTTGCTCTTTTTCAG
+TGACCAAAATCATCTGTGCCCAGCAGTGCTCCGGGCGCTGCCGTGGCAAGTCCCCCAGTG
+ACTGCTGCCACAACCAGTGTGCTGCAGGCTGCACAGGCCCCCGGGAGAGCGACTGCCTGG
+TAAGA
+>ref000006|EGFR_Exon_7
+CCAGCGTGTCCTCTCTCCTCCATAGGTCTGCCGCAAATTCCGAGACGAAGCCACGTGCAA
+GGACACCTGCCCCCCACTCATGCTCTACAACCCCACCACGTACCAGATGGATGTGAACCC
+CGAGGGCAAATACAGCTTTGGTGCCACCTGCGTGAAGAAGTGTCCCCGTGAGTCCTCCTC
+TGTGGGCCCTCTAACTGGTCAGGCATCCTTGTC
+>ref000007|EGFR_Exon_8
+CAAAGGAGGATGGAGCCTTTCCATCACCCCTCAAGAGGACCTGGACCGCCTGTGTGAGGC
+CCGAGCACCTGGTGCCACCGTCATCACCTTCCTTTCATGCTCTCTTCCCCAGGTAATTAT
+GTGGTGACAGATCACGGCTCGTGCGTCCGAGCCTGTGGGGCCGACAGCTATGAGATGGAG
+GAAGACGGCGTCCGCAAGTGTAAGAAGTGCGAAGGGCCTTGCCGCAAAGGTAGGAAGCCC
+GCCGGTGTGCGGACGAGGCTTGTTCTCGGCTGCTGAGGCTGGGCTCTCATGCCACCTCCA
+AAGGAACACATC
+>ref000008|EGFR_Exon_9
+TCCAACAAATGTGAACGGAATACACGTCTCTCTTATCTCTGCAGTGTGTAACGGAATAGG
+TATTGGTGAATTTAAAGACTCACTCTCCATAAATGCTACGAATATTAAACACTTCAAAAA
+CTGCACCTCCATCAGTGGCGATCTCCACATCCTGCCGGTGGCATTTAGGGGGTGAGTCAC
+AGGTTCAGTTGCTTG
+>ref000009|EGFR_Exon_10
+GATCAATAATCACCCTGTTGTTTGTTTCAGTGACTCCTTCACACATACTCCTCCTCTGGA
+TCCACAGGAACTGGATATTCTGAAAACCGTAAAGGAAATCACAGGTTTGAGCTGAATTAT
+CACATGAATATAAATGGGAAATCAGTGTTTTAGAGAGAGAACTTTTCGACATATTTCCTG
+TTCCCTTGGAA
+>ref000010|EGFR_Exon_11
+TCCTACGTGGTGTGTGTCTGAAGTCTTTCATCTGCCTTACAGGGTTTTTGCTGATTCAGG
+CTTGGCCTGAAAACAGGACGGACCTCCATGCCTTTGAGAACCTAGAAATCATACGCGGCA
+GGACCAAGCAACAGTAAGTTGACCACAGCCAAAGC
+>ref000011|EGFR_Exon_12
+CCACATGATTTTTCTTCTCTCCAATGTAGTGGTCAGTTTTCTCTTGCAGTCGTCAGCCTG
+AACATAACATCCTTGGGATTACGCTCCCTCAAGGAGATAAGTGATGGAGATGTGATAATT
+TCAGGAAACAAAAATTTGTGCTATGCAAATACAATAAACTGGAAAAAACTGTTTGGGACC
+TCCGGTCAGAAAACCAAAATTATAAGCAACAGAGGTGAAAACAGCTGCAGTAAGTCACCG
+>ref000012|EGFR_Exon_13
+GCTCTGTCACTGACTGCTGTGACCCACTCTGTCTCCGCAGAGGCCACAGGCCAGGTCTGC
+CATGCCTTGTGCTCCCCCGAGGGCTGCTGGGGCCCGGAGCCCAGGGACTGCGTCTCTTGC
+CGGAATGTCAGCCGAGGCAGGGAATGCGTGGACAAGTGCAACCTTCTGGAGGGGTAGGAG
+GTTATTTCTTTAATCCCCTTGCGTTGATCAAAAATAAGGCTCCAGGTTGTTGTTATAGC
+>ref000013|EGFR_Exon_14
+GCTGACGGGTTTCCTCTTCCTCCTCTCAGTGAGCCAAGGGAGTTTGTGGAGAACTCTGAG
+TGCATACAGTGCCACCCAGAGTGCCTGCCTCAGGCCATGAACATCACCTGCACAGGACGG
+GTAAGAGCCCCTTGCTGCTATCCACGTC
+>ref000014|EGFR_Exon_15
+GCATGAACATTTTTCTCCACCTTGGTGCAGGGACCAGACAACTGTATCCAGTGTGCCCAC
+TACATTGACGGCCCCCACTGCGTCAAGACCTGCCCGGCAGGAGTCATGGGAGAAAACAAC
+ACCCTGGTCTGGAAGTACGCAGACGCCGGCCATGTGTGCCACCTGTGCCATCCAAACTGC
+ACCTACGGGTGAGTGGAAAGTGAAGGAGAACAGAA
+>ref000015|EGFR_Exon_16
+TTTCTCTTTCACTTCCTACAGATGCACTGGGCCAGGTCTTGAAGGCTGTCCAACGAATGG
+GTAAGTGTTCACAGCTCTGTGTCACATGGACCTCGTCAAGAATGACCACACTGCTGTGG
+>ref000016|EGFR_Exon_17
+TGGAATCTGTCAGCAACCTCACCCTTCCTTGTTCCTCCACCTCATTCCAGGCCTAAGATC
+CCGTCCATCGCCACTGGGATGGTGGGGGCCCTCCTCTTGCTGCTGGTGGTGGCCCTGGGG
+ATCGGCCTCTTCATGCGAAGGCGCCACATCGTTCGGAAGCGCACGCTGCGGAGGCTGCTG
+CAGGAGAGGGAGGTGAGTGCCAGTCCTGGG
+>ref000017|EGFR_Exon_18
+GCTGAGGTGACCCTTGTCTCTGTGTTCTTGTCCCCCCCAGCTTGTGGAGCCTCTTACACC
+CAGTGGAGAAGCTCCCAACCAAGCTCTCTTGAGGATCTTGAAGGAAACTGAATTCAAAAA
+GATCAAAGTGCTGGGCTCCGGTGCGTTCGGCACGGTGTATAAGGTAAGGTCCCTGGCACA
+GGCCTCTGGGCTGGGCCGCAGGGCCTCTCATGGTCTGGTGGG
+>ref000018|EGFR_Exon_19
+TCACAATTGCCAGTTAACGTCTTCCTTCTCTCTCTGTCATAGGGACTCTGGATCCCAGAA
+GGTGAGAAAGTTAAAATTCCCGTCGCTATCAAGGAATTAAGAGAAGCAACATCTCCGAAA
+GCCAACAAGGAAATCCTCGATGTGAGTTTCTGCTTTGCTGTGTGG
+>ref000019|EGFR_Exon_20
+CCACACTGACGTGCCTCTCCCTCCCTCCAGGAAGCCTACGTGATGGCCAGCGTGGACAAC
+CCCCACGTGTGCCGCCTGCTGGGCATCTGCCTCACCTCCACCGTGCAGCTCATCACGCAG
+CTCATGCCCTTCGGCTGCCTCCTGGACTATGTCCGGGAACACAAAGACAATATTGGCTCC
+CAGTACCTGCTCAACTGGTGTGTGCAGATCGCAAAGGTAATCAGGGAAGGGAGATACGG
+>ref000020|EGFR_Exon_21
+CCTCACAGCAGGGTCTTCTCTGTTTCAGGGCATGAACTACTTGGAGGACCGTCGCTTGGT
+GCACCGCGACCTGGCAGCCAGGAACGTACTGGTGAAAACACCGCAGCATGTCAAGATCAC
+AGATTTTGGGCTGGCCAAACTGCTGGGTGCGGAAGAGAAAGAATACCATGCAGAAGGAGG
+CAAAGTAAGGAGGTGGCTTTAGGTCAG
+>ref000021|EGFR_Exon_22 MetadataTest
+CACTGCCTCATCTCTCACCATCCCAAGGTGCCTATCAAGTGGATGGCATTGGAATCAATT
+TTACACAGAATCTATACCCACCAGAGTGATGTCTGGAGCTACGGTGAGTCATAATCCTGA
+TGCTAATGAGTTTGTACTGAGGCCAAGCTGG
+>ref000022|EGFR_Exon_23
+CATGATCCCACTGCCTTCTTTTCTTGCTTCATCCTCTCAGGGGTGACTGTTTGGGAGTTG
+ATGACCTTTGGATCCAAGCCATATGACGGAATCCCTGCCAGCGAGATCTCCTCCATCCTG
+GAGAAAGGAGAACGCCTCCCTCAGCCACCCATATGTACCATCGATGTCTACATGATCATG
+GTCAAGTGTGAGTGACTGGTGGGTCTGTCCACACT
+>ref000023|EGFR_Exon_24
+TTCCAGTGTTCTAATTGCACTGTTTTTTCTCATTCCTTCCCCAGGCTGGATGATAGACGC
+AGATAGTCGCCCAAAGTTCCGTGAGTTGATCATCGAATTCTCCAAAATGGCCCGAGACCC
+CCAGCGCTACCTTGTCATTCAGGTACAAATTGCAGTCTGTGCTTCCATTGGGAAGAGTCC
+CTC
+>ref000024|EGFR_Exon_25
+CTAATAGCCTCAAAATCTCTGCACCAGGGGGATGAAAGAATGCATTTGCCAAGTCCTACA
+GACTCCAACTTCTACCGTGCCCTGATGGATGAAGAAGACATGGACGACGTGGTGGATGCC
+GACGAGTACCTCATCCCACAGCAGGGCTTCTTCAGCAGCCCCTCCACGTCACGGACTCCC
+CTCCTGAGCTCTCTGGTATGAAATCTCTGTCTCTCTCTCTCTCTCAAGCTGTGTCTACTC
+ATTTGAACAAA
+>ref000025|EGFR_Exon_26
+CATTCCATGGGCAACTTCTCTGTTTCTTTTTCAGAGTGCAACCAGCAACAATTCCACCGT
+GGCTTGCATTGATAGAAATGGGGTATGTATGAACACCTTATAAGCCAGAA
+>ref000026|EGFR_Exon_27
+CCTTCCCTCATTTCCTCCTGCAGCTGCAAAGCTGTCCCATCAAGGAAGACAGCTTCTTGC
+AGCGATACAGCTCAGACCCCACAGGCGCCTTGACTGAGGACAGCATAGACGACACCTTCC
+TCCCAGTGCCTGGTGAGTGGCTTGTCTGGA
+>ref000027|EGFR_Exon_28.1
+CCTCTGATTTCTTTCCACTTTCAGAATACATAAACCAGTCCGTTCCCAAAAGGCCCGCTG
+GCTCTGTGCAGAATCCTGTCTATCACAATCAGCCTCTGAACCCCGCGCCCAGCAGAGACC
+CACACTACCAGGACCCCCACAGCACTGCAGTGGGCAACCCCGAGTATCTCAACACTGTCC
+AGCCCACCTGTGTCAACAGCACATTCGACAGCCCTGCCCACTGGGCCCAGAAAGGCAGCC
+ACCAAATTAG
+>ref000028|EGFR_Exon_28.2
+TGTCAACAGCACATTCGACAGCCCTGCCCACTGGGCCCAGAAAGGCAGCCACCAAATTAG
+CCTGGACAACCCTGACTACCAGCAGGACTTCTTTCCCAAGGAAGCCAAGCCAAATGGCAT
+CTTTAAGGGCTCCACAGCTGAAAATGCAGAATACCTAAGGGTCGCGCCACAAAGCAGTGA
+ATTTATTGGAGCATGACCACGGAGGATAGTATGAGCCCTAAAAATCCAGACTCTTTCGAT
+ACCCAGGACC
+>ref000029|MET_Exon_1.1
+CTCTCGCCTTGAACCTGTTTTGGCAGATAAACCTCTCATAATGAAGGCCCCCGCTGTGCT
+TGCACCTGGCATCCTCGTGCTCCTGTTTACCTTGGTGCAGAGGAGCAATGGGGAGTGTAA
+AGAGGCACTAGCAAAGTCCGAGATGAATGTGAATATGAAGTATCAGCTTCCCAACTTCAC
+CGCGGAAACACCCATCCAGAATGTCATTCTACATGAGCATCACATTTTCCTTGGTGCCAC
+TAACTACATTTATGTTTTAAATGAGGAAGACCTTCAGAAGGTTGCTGAGTACAAGACTGG
+GCCTGTGCTG
+>ref000030|MET_Exon_1.2
+TTCCTTGGTGCCACTAACTACATTTATGTTTTAAATGAGGAAGACCTTCAGAAGGTTGCT
+GAGTACAAGACTGGGCCTGTGCTGGAACACCCAGATTGTTTCCCATGTCAGGACTGCAGC
+AGCAAAGCCAATTTATCAGGAGGTGTTTGGAAAGATAACATCAACATGGCTCTAGTTGTC
+GACACCTACTATGATGATCAACTCATTAGCTGTGGCAGCGTCAACAGAGGGACCTGCCAG
+CGACATGTCTTTCCCCACAATCATACTGCTGACATACAGTCGGAGGTTCACTGCATATTC
+TCCC
+>ref000031|MET_Exon_2
+TGGATTCACATTAACTCTATGACCATATTTTATTCCAGACACTTCTGAGAAATTCATCAG
+GCTGTGAAGCGCGCCGTGATGAATATCGAACAGAGTTTACCACAGCTTTGCAGCGCGTTG
+ACTTATTCATGGGTCAATTCAGCGAAGTCCTCTTAACATCTATATCCACCTTCATTAAAG
+GAGACCTCACCATAGCTAATCTTGGGACATCAGAGGGTCGCTTCATGCAGGTAAGTGCTT
+TCTGAGAGTAGCTGTGTCTGTTCTATCTGGTATTGTGCAA
+>ref000032|MET_Exon_3
+TGAGCTTGTTGGAATAAGGATGTTATAACTTTTTTGCTGTTTAGGTTGTGGTTTCTCGAT
+CAGGACCATCAACCCCTCATGTGAATTTTCTCCTGGACTCCCATCCAGTGTCTCCAGAAG
+TGATTGTGGAGCATACATTAAACCAAAATGGCTACACACTGGTTATCACTGGGAAGAAGG
+TAAGCTGTTCCCACAGGGAATTTCCATAGACG
+>ref000033|MET_Exon_4
+GAAGCTCTTTCCACCCCTTCTCTTCACAGATCACGAAGATCCCATTGAATGGCTTGGGCT
+GCAGACATTTCCAGTCCTGCAGTCAATGCCTCTCTGCCCCACCCTTTGTTCAGTGTGGCT
+GGTGCCACGACAAATGTGTGCGATCGGAGGAATGCCTGAGCGGGACATGGACTCAACAGA
+TCTGTCTGCCTGCAATCTACAAGGTAGGAATCTCTAACAGCTGGCA
+>ref000034|MET_Exon_5
+TGTCCTTGTAGGTTTTCCCAAATAGTGCACCCCTTGAAGGAGGGACAAGGCTGACCATAT
+GTGGCTGGGACTTTGGATTTCGGAGGAATAATAAATTTGATTTAAAGAAAACTAGAGTTC
+TCCTTGGAAATGAGAGCTGCACCTTGACTTTAAGTGAGAGCACGATGAATACGTAAGGAT
+CTTAAAATGCTTTGCTGGGG
+>ref000035|MET_Exon_6
+GAAAATTCCTTGGATTTGTCATGTATTAAACTTTGGGTTTTTTTTCCAGATTGAAATGCA
+CAGTTGGTCCTGCCATGAATAAGCATTTCAATATGTCCATAATTATTTCAAATGGCCACG
+GGACAACACAATACAGTACATTCTCCTATGTGGTAAGGAAGATTCTATCCTATCATG
+>ref000036|MET_Exon_7
+GTTTTGTTTTTATCTCCCCTCCAGGATCCTGTAATAACAAGTATTTCGCCGAAATACGGT
+CCTATGGCTGGTGGCACTTTACTTACTTTAACTGGAAATTACCTAAACAGTGGGAATTCT
+AGACACATTTCAATTGGTGGAAAAACATGTACTTTAAAAAGGTGTTGTAAATTTATTTTT
+TGTTGCATCTGTCAATTTGAA
+>ref000037|MET_Exon_8
+GGAACCATTGAGTTATATCCTTTTGATTTGTGGATATAATTCTAAAATATGTGTATCTCT
+AATAGCTAAAATTCACTTCCTTAATTTTTTTTGTTCAGTGTGTCAAACAGTATTCTTGAA
+TGTTATACCCCAGCCCAAACCATTTCAACTGAGTTTGCTGTTAAATTGAAAATTGACTTA
+GCCAACCGAGAGACAAGCATCTTCAGTTACCGTGAAGATCCCATTGTCTATGAAATTCAT
+CCAACCAAATCTTTTATTAGGTAAGTAGAAGCTTCTGATGGGTATAAGAAAACAA
+>ref000038|MET_Exon_9
+TTGGTGGAAAGAACCTCTCAACATTGTCAGTTTTCTATTTTGCTTTGCCAGTGGTGGGAG
+CACAATAACAGGTGTTGGGAAAAACCTGAATTCAGTTAGTGTCCCGAGAATGGTCATAAA
+TGTGCATGAAGCAGGAAGGAACTTTACAGTGGTAAGTCCTTTGAGCAATGGTTCTACTCA
+GAGCTCTGCATCTTTGCCTCTAACCATGTGGCTTTCATGGTACCTG
+>ref000039|MET_Exon_10
+TGTTGCCAAGCTGTATTCTGTTTACAGTGGATAATTGTGTCTTTCTCTAGGCATGTCAAC
+ATCGCTCTAATTCAGAGATAATCTGTTGTACCACTCCTTCCCTGCAACAGCTGAATCTGC
+AACTCCCCCTGAAAACCAAAGCCTTTTTCATGTTAGATGGGATCCTTTCCAAATACTTTG
+ATCTCATTTATGTACATAATCCTGTGTTTAAGCCTTTTGAAAAGCCAGTGATGATCTCAA
+TGGGCAATGAAAATGTACTGGAAATTAAGGTAAGAAATGCTTTAAACACTGTCTTAAATC
+ATCAGCTCAAA
+>ref000040|MET_Exon_12
+GGACCCAAAGTGCTACAACCTGTGTAGTACAAATATCTATCATGGCTAAATGCTGACTTT
+TCTTTATTTGTCATTTTTAGTGGAAGCAAGCAATTTCTTCAACCGTCCTTGGAAAAGTAA
+TAGTTCAACCAGATCAGAATTTCACAGGATTGATTGCTGGTGTTGTCTCAATATCAACAG
+CACTGTTATTACTACTTGGGTTTTTCCTGTGGCTGAAAAAGAGAAAGCAAATTAAAGGTG
+CATTTTTGTTACTGTTCATTTTTAGAAGTTACCTTAAGAACACAGTCATTACAGTTTAAG
+ATTGTCGTCGATTCTTG
+>ref000041|MET_Exon_13
+GCCCATGATAGCCGTCTTTAACAAGCTCTTTCTTTCTCTCTGTTTTAAGATCTGGGCAGT
+GAATTAGTTCGCTACGATGCAAGAGTACACACTCCTCATTTGGATAGGCTTGTAAGTGCC
+CGAAGTGTAAGCCCAACTACAGAAATGGTTTCAAATGAATCTGTAGACTACCGAGCTACT
+TTTCCAGAAGGTATATTTCAGTTTATTGTTCTGAGAAATACCTATACATATACCTCAGTG
+GGTTGTGACATTGTTG
+>ref000042|MET_Exon_14
+CCTTCATCTTACAGATCAGTTTCCTAATTCATCTCAGAACGGTTCATGCCGACAAGTGCA
+GTATCCTCTGACAGACATGTCCCCCATCCTAACTAGTGGGGACTCTGATATATCCAGTCC
+ATTACTGCAAAATACTGTCCACATTGACCTCAGTGCTCTAAATCCAGAGCTGGTCCAGGC
+AGTGCAGCATGTAGTGATTGGGCCCAGTAGCCTGATTGTGCATTTCAATGAAGTCATAGG
+AAGAGGTAAGTATTTCCACTCAGCTTTTTGTTAAATACGATTTTCCAGTAAGC
+>ref000043|MET_Exon_15
+ACGCAGTGCTAACCAAGTTCTTTCTTTTGCACAGGGCATTTTGGTTGTGTATATCATGGG
+ACTTTGTTGGACAATGATGGCAAGAAAATTCACTGTGCTGTGAAATCCTTGAACAGTAAG
+TGGCATTTTATTTAACCATGGAGTATACTTTTGTGGTTTGCAACCTAATAAATAGCTTAT
+AATAAAACGTTGATTTACACTTTCCCCTTGTGGA
+>ref000044|MET_Exon_16
+TGTCTCCACCACTGGATTTCTCAGGAATCACTGACATAGGAGAAGTTTCCCAATTTCTGA
+CCGAGGGAATCATCATGAAAGATTTTAGTCATCCCAATGTCCTCTCGCTCCTGGGAATCT
+GCCTGCGAAGTGAAGGGTCTCCGCTGGTGGTCCTACCATACATGAAACATGGAGATCTTC
+GAAATTTCATTCGAAATGAGACTCATGTAAGTTGACTGCCAAGCTTACTAACTGGCAAAC
+TAGCTGTAAGCC
+>ref000045|MET_Exon_17
+TGCTTTTCTAACTCTCTTTGACTGCAGAATCCAACTGTAAAAGATCTTATTGGCTTTGGT
+CTTCAAGTAGCCAAAGGCATGAAATATCTTGCAAGCAAAAAGTTTGTCCACAGAGACTTG
+GCTGCAAGAAACTGTATGTAAGTATCAGAATCTCTGTGCCACAATCCAAATTAAGTGACA
+AGGAGGA
+>ref000046|MET_Exon_18
+TTCTATTTCAGCCACGGGTAATAATTTTTGTCCTTTCTGTAGGCTGGATGAAAAATTCAC
+AGTCAAGGTTGCTGATTTTGGTCTTGCCAGAGACATGTATGATAAAGAATACTATAGTGT
+ACACAACAAAACAGGTGCAAAGCTGCCAGTGAAGTGGATGGCTTTGGAAAGTCTGCAAAC
+TCAAAAGTTTACCACCAAGTCAGATGTGGTAATGTATTGGTTATCTCTGAGTTTCTCCTC
+T
+>ref000047|MET_Exon_19
+CTCACCTCATCTGTCCTGTTTCTTGTTTTACTAGTGGTCCTTTGGCGTGCTCCTCTGGGA
+GCTGATGACAAGAGGAGCCCCACCTTATCCTGACGTAAACACCTTTGATATAACTGTTTA
+CTTGTTGCAAGGGAGAAGACTCCTACAACCCGAATACTGCCCAGACCCCTTGTAAGTAGT
+CTTTCTGTACCTCTTACGTTCTTTACTTTTACAGAAATGCC
+>ref000048|MET_Exon_20
+CCTGCCTTCAAAGGGTCTCTTACAGCATGTCTTTCTTTTTGGAACAGATATGAAGTAATG
+CTAAAATGCTGGCACCCTAAAGCCGAAATGCGCCCATCCTTTTCTGAACTGGTGTCCCGG
+ATATCAGCGATCTTCTCTACTTTCATTGGGGAGCACTATGTCCATGTGAACGCTACTTAT
+GTGAACGTAAAATGTGTCGCTCCGTATCCTTCTCTGTTGTCATCAGAAGATAACGCTGAT
+GATGAGGTGGACACACGACCAGCCTCCTTCTGGGAGACATCATAGTGCTAGTACTATGTC
+AAAGCAACAGTCCACAC
diff --git a/pbcore/data/Fluidigm_human_amplicons.fasta.fai b/pbcore/data/Fluidigm_human_amplicons.fasta.fai
new file mode 100644
index 0000000..b1fc715
--- /dev/null
+++ b/pbcore/data/Fluidigm_human_amplicons.fasta.fai
@@ -0,0 +1,48 @@
+ref000001|EGFR_Exon_2 183 23 60 61
+ref000002|EGFR_Exon_3 203 233 60 61
+ref000003|EGFR_Exon_4 215 463 60 61
+ref000004|EGFR_Exon_5 157 705 60 61
+ref000005|EGFR_Exon_6 185 888 60 61
+ref000006|EGFR_Exon_7 213 1100 60 61
+ref000007|EGFR_Exon_8 312 1340 60 61
+ref000008|EGFR_Exon_9 195 1681 60 61
+ref000009|EGFR_Exon_10 191 1904 60 61
+ref000010|EGFR_Exon_11 155 2123 60 61
+ref000011|EGFR_Exon_12 240 2305 60 61
+ref000012|EGFR_Exon_13 239 2573 60 61
+ref000013|EGFR_Exon_14 148 2840 60 61
+ref000014|EGFR_Exon_15 215 3015 60 61
+ref000015|EGFR_Exon_16 119 3258 60 61
+ref000016|EGFR_Exon_17 210 3403 60 61
+ref000017|EGFR_Exon_18 222 3641 60 61
+ref000018|EGFR_Exon_19 165 3891 60 61
+ref000019|EGFR_Exon_20 239 4083 60 61
+ref000020|EGFR_Exon_21 207 4350 60 61
+ref000021|EGFR_Exon_22 151 4598 60 61
+ref000022|EGFR_Exon_23 215 4776 60 61
+ref000023|EGFR_Exon_24 183 5019 60 61
+ref000024|EGFR_Exon_25 251 5230 60 61
+ref000025|EGFR_Exon_26 110 5510 60 61
+ref000026|EGFR_Exon_27 150 5646 60 61
+ref000027|EGFR_Exon_28.1 250 5825 60 61
+ref000028|EGFR_Exon_28.2 250 6106 60 61
+ref000029|MET_Exon_1.1 310 6385 60 61
+ref000030|MET_Exon_1.2 304 6725 60 61
+ref000031|MET_Exon_2 280 7057 60 61
+ref000032|MET_Exon_3 212 7364 60 61
+ref000033|MET_Exon_4 226 7602 60 61
+ref000034|MET_Exon_5 200 7854 60 61
+ref000035|MET_Exon_6 177 8080 60 61
+ref000036|MET_Exon_7 201 8282 60 61
+ref000037|MET_Exon_8 295 8509 60 61
+ref000038|MET_Exon_9 226 8831 60 61
+ref000039|MET_Exon_10 311 9084 60 61
+ref000040|MET_Exon_12 317 9424 60 61
+ref000041|MET_Exon_13 256 9770 60 61
+ref000042|MET_Exon_14 293 10054 60 61
+ref000043|MET_Exon_15 214 10375 60 61
+ref000044|MET_Exon_16 252 10616 60 61
+ref000045|MET_Exon_17 187 10896 60 61
+ref000046|MET_Exon_18 241 11110 60 61
+ref000047|MET_Exon_19 221 11379 60 61
+ref000048|MET_Exon_20 317 11627 60 61
diff --git a/pbcore/data/Fluidigm_human_amplicons_tiny.fasta b/pbcore/data/Fluidigm_human_amplicons_tiny.fasta
new file mode 100644
index 0000000..7fa541e
--- /dev/null
+++ b/pbcore/data/Fluidigm_human_amplicons_tiny.fasta
@@ -0,0 +1,19 @@
+>ref000001|EGFR_Exon_2
+TTTCTTCCAGTTTGCCAAGGCACGAGTAACAAGCTCACGCAGTTGGGCACTTTTGAAGAT
+CATTTTCTCAGCCTCCAGAGGATGTTCAATAACTGTGAGGTGGTCCTTGGGAATTTGGAA
+ATTACCTATGTGCAGAGGAATTATGATCTTTCCTTCTTAAAGGTTGGTGACTTTGATTTT
+CCT
+>ref000002|EGFR_Exon_3
+TTCTTAGACCATCCAGGAGGTGGCTGGTTATGTCCTCATTGCCCTCAACACAGTGGAGCG
+AATTCCTTTGGAAAACCTGCAGATCATCAGAGGAAATATGTACTACGAAAATTCCTATGC
+CTTAGCAGTCTTATCTAACTATGATGCAAATAAAACCGGACTGAAGGAGCTGCCCATGAG
+AAATTTACAGGGTGAGAGGCTGG
+>ref000003|EGFR_Exon_4
+AGCTGGAAAGAGTGCTCACCGCAGTTCCATTCTCCCGCAGAAATCCTGCATGGCGCCGTG
+CGGTTCAGCAACAACCCTGCCCTGTGCAACGTGGAGAGCATCCAGTGGCGGGACATAGTC
+AGCAGTGACTTTCTCAGCAACATGTCGATGGACTTCCAGAACCACCTGGGCAGCTGTAAG
+TGTCGCATACACACTATCTCTGCCTCCAGCTCCTA
+>ref000004|EGFR_Exon_5
+GCGTCATCAGTTTCTCATCATTTCACTGAGATATGCATCTATTACTTTTACATTTCAGGC
+CAAAAGTGTGATCCAAGCTGTCCCAATGGGAGCTGCTGGGGTGCAGGAGAGGAGAACTGC
+CAGAAACGTAAGTCAGTGAACAGCCTCAGACCCATGT
\ No newline at end of file
diff --git a/pbcore/data/__init__.py b/pbcore/data/__init__.py
new file mode 100644
index 0000000..b00ce44
--- /dev/null
+++ b/pbcore/data/__init__.py
@@ -0,0 +1,167 @@
+#################################################################################
+# Copyright (c) 2011-2015, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#################################################################################
+
+from pkg_resources import Requirement, resource_filename
+
+DATA_FILES = {'aligned_reads_1.cmp.h5':
+ ['m110818_075520_42141_c100129202555500000315043109121112_s1_p0.bas.h5',
+ 'm110818_075520_42141_c100129202555500000315043109121112_s2_p0.bas.h5']}
+
+MOVIE_NAME_14 = "m110818_075520_42141_c100129202555500000315043109121112_s1_p0"
+MOVIE_NAME_20 = "m130522_092457_42208_c100497142550000001823078008081323_s1_p0"
+MOVIE_NAME_21 = "m130731_192718_42129_c100564662550000001823085912221321_s1_p0"
+MOVIE_NAME_23 = "m140912_020930_00114_c100702482550000001823141103261590_s1_p0"
+MOVIE_NAME_CCS = "m130727_114215_42211_c100569412550000001823090301191423_s1_p0"
+MOVIE_NAME_BC = "m140307_221913_42203_c100626172550000001823119008061414_s1_p0"
+
+def _getAbsPath(fname):
+ return resource_filename(Requirement.parse('pbcore'),'pbcore/data/%s' % fname)
+
+def getBasH5_v20():
+ return _getAbsPath(MOVIE_NAME_20 + '.bas.h5')
+
+def getBaxH5_v20():
+ return [_getAbsPath('.'.join((MOVIE_NAME_20, str(k), 'bax.h5')))
+ for k in range(1,4)]
+
+def getBasH5_v21():
+ return _getAbsPath(MOVIE_NAME_21 + '.bas.h5')
+
+def getBaxH5_v21():
+ return [_getAbsPath('.'.join((MOVIE_NAME_21, str(k), 'bax.h5')))
+ for k in range(1,4)]
+
+def getBasH5_v23():
+ return _getAbsPath(MOVIE_NAME_23 + '.bas.h5')
+
+def getBaxH5_v23():
+ return [_getAbsPath('.'.join((MOVIE_NAME_23, str(k), 'bax.h5')))
+ for k in range(1,4)]
+
+def getCCSH5():
+ return _getAbsPath(MOVIE_NAME_CCS + '.1.ccs.h5')
+
+def getBcH5s():
+ return [_getAbsPath('.'.join((MOVIE_NAME_BC, str(k), 'bc.h5')))
+ for k in range(1,4)]
+
+def getCmpH5s():
+ '''
+ Returns a list of dictionaries containing 2 keys: cmph5 and
+ bash5s. The latter are the bash5s that were used to generate the
+ cmp.h5 file.
+ '''
+ return [{'cmph5' : _getAbsPath(cmph5),
+ 'bash5s': map(_getAbsPath, bash5s)}
+ for cmph5, bash5s in DATA_FILES.items()]
+
+def getCmpH5AndBas():
+ '''
+ The returned value is a dictionary containing 2 keys: cmph5
+ and bash5s. The latter are the bash5s that were used to generate
+ the cmp.h5 file.
+ '''
+ return getCmpH5s()[0]
+
+def getCmpH5():
+ return getCmpH5AndBas()["cmph5"]
+
+def getBasH5s():
+ return getCmpH5AndBas()["bash5s"]
+
+def getGff3():
+ '''
+ Returns the filename of an example GFFv3 file
+ '''
+ return _getAbsPath("variants.gff")
+
+def getFasta():
+ '''
+ Returns the filename of an example FASTA file.
+ '''
+ return _getAbsPath('Fluidigm_human_amplicons.fasta')
+
+
+def getTinyFasta():
+ """
+ Returns the filename of an example FASTA file.
+ """
+ return _getAbsPath('Fluidigm_human_amplicons_tiny.fasta')
+
+def getLambdaFasta():
+ """
+ Returns the filename of the FASTA of the lambda phage reference.
+ """
+ return _getAbsPath('lambdaNEB.fa')
+
+def getDosFormattedFasta():
+ """
+ Returns the filename of an example FASTA file with DOS line endings
+ """
+ return _getAbsPath('barcodes-ed65-450.fasta')
+
+def getBlasrM4():
+ return _getAbsPath('blasr-output.m4')
+
+def getBlasrM5():
+ return _getAbsPath('blasr-output.m5')
+
+def getFofns():
+ """
+ Returns a list of FOFN files
+ """
+ return map(_getAbsPath,
+ ["1.4_bas_files.fofn",
+ "2.0_bax_files.fofn",
+ "2.1_bax_files.fofn",
+ "2.1_ccs_files.fofn"])
+
+def getBcFofn():
+ return _getAbsPath("bc_files.fofn")
+
+
+def getBamAndCmpH5():
+ """
+ Get a "matched" BAM and cmp.h5 file
+ """
+ return (_getAbsPath("bam_mapping.bam"),
+ _getAbsPath("cmph5_mapping.cmp.h5"))
+
+def getBaxForBam():
+ """
+ Get the bax file that was mapped to produce the bam
+ """
+ return _getAbsPath("m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.1.bax.h5")
+
+def getUnalignedBam():
+ """
+ Get the unaligned BAM file, corresponding to the same bax above
+ """
+ return _getAbsPath("m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.subreads.bam")
diff --git a/pbcore/data/aligned_reads_1.bam b/pbcore/data/aligned_reads_1.bam
new file mode 100644
index 0000000..3993fb0
Binary files /dev/null and b/pbcore/data/aligned_reads_1.bam differ
diff --git a/pbcore/data/aligned_reads_1.bam.bai b/pbcore/data/aligned_reads_1.bam.bai
new file mode 100644
index 0000000..81bc1ce
Binary files /dev/null and b/pbcore/data/aligned_reads_1.bam.bai differ
diff --git a/pbcore/data/aligned_reads_1.cmp.h5 b/pbcore/data/aligned_reads_1.cmp.h5
new file mode 100644
index 0000000..f05c634
Binary files /dev/null and b/pbcore/data/aligned_reads_1.cmp.h5 differ
diff --git a/pbcore/data/bam_mapping.bam b/pbcore/data/bam_mapping.bam
new file mode 100644
index 0000000..6fcaa68
Binary files /dev/null and b/pbcore/data/bam_mapping.bam differ
diff --git a/pbcore/data/bam_mapping.bam.bai b/pbcore/data/bam_mapping.bam.bai
new file mode 100644
index 0000000..d64c44d
Binary files /dev/null and b/pbcore/data/bam_mapping.bam.bai differ
diff --git a/pbcore/data/bam_mapping.bam.pbi b/pbcore/data/bam_mapping.bam.pbi
new file mode 100644
index 0000000..b80e837
Binary files /dev/null and b/pbcore/data/bam_mapping.bam.pbi differ
diff --git a/pbcore/data/barcodes-ed65-450.fasta b/pbcore/data/barcodes-ed65-450.fasta
new file mode 100644
index 0000000..a9d2889
--- /dev/null
+++ b/pbcore/data/barcodes-ed65-450.fasta
@@ -0,0 +1,900 @@
+>lbc1
+TCAGACGATGCGTCAT
+>lbc2
+CTATACATGACTCTGC
+>lbc3
+TACTAGAGTAGCACTC
+>lbc4
+TGTGTATCAGTACATG
+>lbc5
+ACACGCATGACACACT
+>lbc6
+GATCTCTACTATATGC
+>lbc7
+ACAGTCTATACTGCTG
+>lbc8
+ATGATGTGCTACATCT
+>lbc9
+CTGCGTGCTCTACGAC
+>lbc10
+GCGCGATACGATGACT
+>lbc11
+CGCGCTCAGCTGATCG
+>lbc12
+GCGCACGCACTACAGA
+>lbc13
+ACACTGACGTCGCGAC
+>lbc14
+CGTCTATATACGTATA
+>lbc15
+ATAGAGACTCAGAGCT
+>lbc16
+TAGATGCGAGAGTAGA
+>lbc17
+CATAGCGACTATCGTG
+>lbc18
+CATCACTACGCTAGAT
+>lbc19
+CGCATCTGTGCATGCA
+>lbc20
+TATGTGATCGTCTCTC
+>lbc21
+GTACACGCTGTGACTA
+>lbc22
+CGTGTCGCGCATATCT
+>lbc23
+ATATCAGTCATGCATA
+>lbc24
+GAGATCGACAGTCTCG
+>lbc25
+CACGCACACACGCGCG
+>lbc26
+CGAGCACGCGCGTGTG
+>lbc27
+GTAGTCTCGCACAGAT
+>lbc28
+GAGACTCTGTGCGCGT
+>lbc29
+GCTCGACTGTGAGAGA
+>lbc30
+AGAGATGTGTGATGAC
+>lbc31
+TACGACTACATATCAG
+>lbc32
+TATCTCTGTAGAGTCT
+>lbc33
+AGAGAGAGACATGCGC
+>lbc34
+ACTCTCGCTCTGTAGA
+>lbc35
+TCTATGTCTCAGTAGT
+>lbc36
+GCGTATATCTCATGCG
+>lbc37
+GTGCGTATGTCGCTAC
+>lbc38
+TGCTCGCAGTATCACA
+>lbc39
+CTGTGTGTGATAGAGT
+>lbc40
+CAGTGAGAGCGCGATA
+>lbc41
+GTACATATGCGTCTGT
+>lbc42
+GAGACTAGAGATAGTG
+>lbc43
+TACGCGTGTACGCAGA
+>lbc44
+TGTCACTCATCTGAGT
+>lbc45
+GCACATACACGCTCAC
+>lbc46
+GCTCGTCGCGCGCACA
+>lbc47
+ACAGTGCGCTGTCTAT
+>lbc48
+TCACACTCTAGAGCGA
+>lbc49
+TCACATATGTATACAT
+>lbc50
+CGCTGCGAGAGACAGT
+>lbc51
+ACACACAGACTGTGAG
+>lbc52
+GCAGACTCTCACACGC
+>lbc53
+TGCTCTCGTGTACTGT
+>lbc54
+GTGTGAGATATATATC
+>lbc55
+CTCAGTGTGACACATG
+>lbc56
+TGCGAGCGACTCTATC
+>lbc57
+GTCAGCTAGTGTCAGC
+>lbc58
+AGATATCATCAGCGAG
+>lbc59
+GTGCAGTGATCGATGA
+>lbc60
+TGACTCGCTCATAGTC
+>lbc61
+ATGCTGATGACGCGCT
+>lbc62
+GACAGCATCTGCGCTC
+>lbc63
+AGCGTCTGACGTGAGT
+>lbc64
+TCGATATACGACGTGC
+>lbc65
+TCGTCATACGCTCTAG
+>lbc66
+CGACTACGTACAGTAG
+>lbc67
+GCGTAGACAGACTACA
+>lbc68
+ACAGTATGATGTACTC
+>lbc69
+GTCTGATAGATACAGA
+>lbc70
+CTGCGCAGTACGTGCA
+>lbc71
+TAGATCTCTGACTCAC
+>lbc72
+CTGATGCGCGCTGTAC
+>lbc73
+CACTCGTGCACGATGC
+>lbc74
+TGACAGTATCACAGTG
+>lbc75
+GAGATACGCTGCAGTC
+>lbc76
+ACGTGAGCTCACTCGC
+>lbc77
+ATAGAGAGTGTCTCAG
+>lbc78
+CATAGAGAGATAGTAT
+>lbc79
+ATCTCGAGATGTAGCG
+>lbc80
+ACGATCACTCGTGTCA
+>lbc81
+GATCGACTCGAGCATC
+>lbc82
+ATGCTCACTACTACAT
+>lbc83
+CGTGCACATCTATAGC
+>lbc84
+GACTGCACATGCACGA
+>lbc85
+TATGACTAGTGTACTA
+>lbc86
+GACGTGTCGTAGATAT
+>lbc87
+ATAGCGACGCGATATA
+>lbc88
+ATCGCTGTGTCTATAG
+>lbc89
+TCTCACTGATAGCGTG
+>lbc90
+TGTCGTCTATCATGTA
+>lbc91
+CACACGAGATCTCATC
+>lbc92
+AGATACACATGATACT
+>lbc93
+CGTGAGTAGTCAGACG
+>lbc94
+TCTCGACTGCACATAT
+>lbc95
+TGAGTGACGTGTAGCG
+>lbc96
+GTGTGCACTCACACTC
+>lbc97
+TACGATCGTAGCTGCT
+>lbc98
+TATACACACTCGCTCG
+>lbc99
+AGCGCTGCGACACGCG
+>lbc100
+GTCGTAGCTGCTGTAT
+>lbc101
+CTGTACTAGAGCGTCT
+>lbc102
+TCGAGTGTATAGCTCA
+>lbc103
+ACTGTGACAGTATGAT
+>lbc104
+TGTCTGAGACGCATAC
+>lbc105
+CACTCACGTGTGATAT
+>lbc106
+ATCGCATCGCAGAGAC
+>lbc107
+TACTCATATATGCTAC
+>lbc108
+GTCTACGCTCGTCGCG
+>lbc109
+TGCGAGACTATCGCGA
+>lbc110
+CAGATCTCTCTGATGT
+>lbc111
+GTAGAGTGATCGCGTC
+>lbc112
+ACGACAGTCAGAGTAT
+>lbc113
+ATATATAGCTGATGCG
+>lbc114
+TGCTATCTGAGATACT
+>lbc115
+CAGCAGATCATGTCGA
+>lbc116
+TGCTGCGAGCGCTCTG
+>lbc117
+ACTATCGCAGCTCAGT
+>lbc118
+CGTCTCTCGTCTGTGC
+>lbc119
+GAGTCTCGATATACTA
+>lbc120
+TGTCATGTGTACACAC
+>lbc121
+TCTGTCGATATACACT
+>lbc122
+ACGTGCTCTATAGAGA
+>lbc123
+TATCAGCACGACATGC
+>lbc124
+GCTCTCACGATATCAG
+>lbc125
+TATATGCTCTGTGTGA
+>lbc126
+GATAGCTGCTAGCTGA
+>lbc127
+TCTCATGTGTGAGCTA
+>lbc128
+TCAGATGTGTCGCGAG
+>lbc129
+CGTAGCTCAGACACTC
+>lbc130
+TCAGAGACACTACGAG
+>lbc131
+ATCGAGCAGCAGTCGT
+>lbc132
+CGTAGCTCGAGATGAG
+>lbc133
+GCTAGTCGATGACAGC
+>lbc134
+CATGATGCGAGACGCT
+>lbc135
+GTGTAGCGTAGACAGT
+>lbc136
+AGCACGTGTGTCGACA
+>lbc137
+CTAGACACGCAGTCAC
+>lbc138
+TAGCGTGAGAGTGTCG
+>lbc139
+GTCTCTCTCTCACGCA
+>lbc140
+TGCATAGTAGTGCTCT
+>lbc141
+CATATCAGTGCTACAG
+>lbc142
+CGACGTCATAGTGCGT
+>lbc143
+ACACACTCTATCAGAT
+>lbc144
+GCTGTGTGTGCTCGTC
+>lbc145
+AGCGTAGCATCTGAGC
+>lbc146
+GAGTCTGCACGCGCTA
+>lbc147
+AGACGCGAGCGCGTAG
+>lbc148
+CTACGATGCTATGTAT
+>lbc149
+CGACTAGATCTATCAT
+>lbc150
+ATCTCTGTGCGCGCAG
+>lbc151
+GCTAGCATGCTCTCAG
+>lbc152
+GTCACGATATAGTGAC
+>lbc153
+TCTACTGCATGATGTC
+>lbc154
+AGTCGTGACTATGCTC
+>lbc155
+GTATAGACAGATGTGC
+>lbc156
+TAGTGTGCGACTCTGA
+>lbc157
+GCACTCAGAGACGCGA
+>lbc158
+TCTATCAGCGCTGATG
+>lbc159
+ATGTCGCATATATCGC
+>lbc160
+CACGACTATATGCTCT
+>lbc161
+AGTCACACGCACGCTG
+>lbc162
+CATACATCGCGCAGTA
+>lbc163
+TGCGAGCGTGCACAGA
+>lbc164
+CTCTGACTCGCGTCGA
+>lbc165
+CTATCTAGCACTCACA
+>lbc166
+ACACGTGATAGCTACG
+>lbc167
+GCGATCACTGTACACT
+>lbc168
+CGCTAGAGATCTGCTA
+>lbc169
+GATACTGACACACTAT
+>lbc170
+GAGCTGATGTACATGT
+>lbc171
+AGTCGCGTAGCTCATC
+>lbc172
+TGTAGAGATACTCACT
+>lbc173
+TCGCTGACTCGACACA
+>lbc174
+TACATCTCGCTGCGCA
+>lbc175
+GTATATATATACGTCT
+>lbc176
+TCGCGAGCAGCGACAT
+>lbc177
+AGCTCAGTATCATCTG
+>lbc178
+ACACAGTAGAGCGAGC
+>lbc179
+ACGACGCGCACTGACA
+>lbc180
+CTCATAGCGTGTACTC
+>lbc181
+GACGACAGACTGCATA
+>lbc182
+GTCTGTATAGCTATCT
+>lbc183
+TGTCTCGTGCTGAGAC
+>lbc184
+CATATGCTCGTGCACT
+>lbc185
+ACTACATACTAGATCA
+>lbc186
+TGTGCACGACAGCAGT
+>lbc187
+ATGATACACGCGCGAC
+>lbc188
+TGTCTGATCTGTATCA
+>lbc189
+CTCTCGCATACGCGAG
+>lbc190
+GAGCGTGTATACAGCG
+>lbc191
+GAGCTCATGTAGACAC
+>lbc192
+TACATATGTCACGCGC
+>lbc193
+ATCGCTCTCATGTCTA
+>lbc194
+ACGATGTATCTACGCA
+>lbc195
+TCGATACGCACTCGAT
+>lbc196
+CACGACACGACGATGT
+>lbc197
+CTGCAGCTCACTACTA
+>lbc198
+CTATATGAGACGAGTG
+>lbc199
+CTCTCGTAGACAGATA
+>lbc200
+CGCATGACACGTGTGT
+>lbc201
+CACATACTACTACTGA
+>lbc202
+AGTCAGATGCGCACTC
+>lbc203
+AGCGACGCGAGAGTGC
+>lbc204
+ATACACTCATGTGCAC
+>lbc205
+GCTACGCTATAGACAT
+>lbc206
+TATCTATCGCATATCG
+>lbc207
+TCACGTGCAGATATAG
+>lbc208
+GCACAGCGTAGCGCAT
+>lbc209
+CATGCTACGTCTCTGT
+>lbc210
+CTCACGTACGTCACAC
+>lbc211
+TCTGAGACACAGACTC
+>lbc212
+CTAGTCTCTATCGCAT
+>lbc213
+ACGCTCGCTGAGCATA
+>lbc214
+ACTCATGTATATGAGT
+>lbc215
+AGCGTAGCGCGCGTCA
+>lbc216
+TCTCGTCGCAGTCTCT
+>lbc217
+GACGAGCGTCTGAGAG
+>lbc218
+GTATGATCACTAGTAG
+>lbc219
+CTCACACATACACGTC
+>lbc220
+GTATCGAGCGTATAGC
+>lbc221
+GCTGCGCTGATATGCG
+>lbc222
+GTCAGAGCTCTCGTGC
+>lbc223
+ATATGACATACACGCA
+>lbc224
+CTCGCTCGACGAGCGC
+>lbc225
+CGTCATCTATATACAG
+>lbc226
+TGTACGCTCTCTATAT
+>lbc227
+AGATCGCGCATGTGTA
+>lbc228
+GACACAGTGTGTAGTC
+>lbc229
+GTGCGCTACAGTCTCT
+>lbc230
+CATCGTCTAGCACTCG
+>lbc231
+CAGCGCATCTCACGTC
+>lbc232
+GTCTCATCATGCTGCG
+>lbc233
+ATCGTATAGTCATACA
+>lbc234
+AGTGCGCACATGTCAG
+>lbc235
+ATCTACGACTAGCAGA
+>lbc236
+TCGCGACATATAGATG
+>lbc237
+AGATATACTGTCTGAT
+>lbc238
+AGTCACTGTCTACTCG
+>lbc239
+TATACGAGATACGTGA
+>lbc240
+ACATGCGTGACAGTCA
+>lbc241
+GTGAGAGTCTGATACT
+>lbc242
+GCACGATGTCAGCGCG
+>lbc243
+CACGTGCTCGAGAGTC
+>lbc244
+GACACTCAGTCTCTCA
+>lbc245
+ACAGTAGACTCTCAGA
+>lbc246
+ACACTAGATCGCGTGT
+>lbc247
+ACGTCAGCACTGCTCT
+>lbc248
+CACAGTCGCAGTACGC
+>lbc249
+GTGACTCTATGCTATA
+>lbc250
+CTCTACATCAGTGCTA
+>lbc251
+GATGAGTATAGACACA
+>lbc252
+ATCTGAGTCTGACACG
+>lbc253
+GCGAGACTCAGCTCTG
+>lbc254
+CGTACGACTGCAGCGT
+>lbc255
+CGTGTCACTCTGCGTG
+>lbc256
+AGCTCTGTCACTAGAC
+>lbc257
+GCGAGAGTGAGACGCA
+>lbc258
+TCTACTACACTGTACT
+>lbc259
+CATCGTCACAGACATA
+>lbc260
+GTGCACTCGCGCTCTC
+>lbc261
+TGACATCTACACATAC
+>lbc262
+GTCGTCTAGATCGACG
+>lbc263
+GACATAGCTAGATCGC
+>lbc264
+TATATATGTCTATAGA
+>lbc265
+CTGTGTATCTGTGTAC
+>lbc266
+CGACGCACGATACTAT
+>lbc267
+TGATATATACGCGCGT
+>lbc268
+CGCGTATGTATGTCGC
+>lbc269
+CTCGAGCAGTAGATAC
+>lbc270
+CTGTGCTATGTACGCG
+>lbc271
+ACTCAGCGCGTACATA
+>lbc272
+TGAGATATGCATGATG
+>lbc273
+ACTCTATGTCGATGTA
+>lbc274
+GCGCGTGCTGCGTCTA
+>lbc275
+GATCATGTGAGCATAG
+>lbc276
+CATGTAGAGCAGAGAG
+>lbc277
+GTGTGTCTCGATGCGC
+>lbc278
+CTCGCACGTCGCATAG
+>lbc279
+CGAGCTACTCTGACAG
+>lbc280
+CGTGAGTATATGTCAT
+>lbc281
+ACAGTACTAGTGCGAG
+>lbc282
+CTCACTACGCGCGCGT
+>lbc283
+GACTCTCTATCGTACT
+>lbc284
+TATATACAGAGTCGAG
+>lbc285
+TGAGTGAGACATATCA
+>lbc286
+GTGACACACAGAGCAC
+>lbc287
+CTGCGTATAGATATGA
+>lbc288
+GAGAGTGTGAGAGTGT
+>lbc289
+CGTCTCTATCTCTCTA
+>lbc290
+TACATGTGTCTATGTC
+>lbc291
+TCTCGCGCGTGCACGC
+>lbc292
+TATGTGTCTGCGCATA
+>lbc293
+AGTCTGAGAGAGCTAT
+>lbc294
+ACAGTCGAGCGCTGCG
+>lbc295
+GAGAGTAGCGTGTACA
+>lbc296
+GATATATCGAGTATAT
+>lbc297
+GCACACATATCTGATG
+>lbc298
+CATCGCGAGTGCGCTC
+>lbc299
+ACATATCGTACTCTCT
+>lbc300
+AGCACAGTCACATGTC
+>lbc301
+GCGCACAGACATCTGT
+>lbc302
+ACGCGCTATCTCAGAG
+>lbc303
+CTGTAGACATCACACG
+>lbc304
+TATCTGAGCGCGAGCA
+>lbc305
+CTCTGCTCTGACTCTC
+>lbc306
+ACGTAGTGCACACAGA
+>lbc307
+TGTATGAGTGTCTGAC
+>lbc308
+CTCTGCAGCGATCACT
+>lbc309
+ACTGCGAGATACACAC
+>lbc310
+TATAGTGCGCAGCGAC
+>lbc311
+GATGTGTGCGCAGTGC
+>lbc312
+AGACACACACGCACAT
+>lbc313
+CACATGTGACTCGACG
+>lbc314
+GATCTGTCGTGAGCGT
+>lbc315
+ATATAGCGCATAGCTC
+>lbc316
+ACTCATCACGTCTCGA
+>lbc317
+CTCTCTAGAGTGACAT
+>lbc318
+TCACACTGTGCGAGAC
+>lbc319
+CGCGCGAGTATCTCGT
+>lbc320
+TATCTCTCGAGTCGCG
+>lbc321
+TAGATGAGTACACGTA
+>lbc322
+CATGTGCGCTCATCAC
+>lbc323
+GTATAGCACTCGAGCG
+>lbc324
+ACTCTGCTGTCATCGC
+>lbc325
+CGCATATCTCACTAGT
+>lbc326
+CACTATACACTGCGCT
+>lbc327
+CGCACAGATACGCTCT
+>lbc328
+CAGATCTCGCGTGACA
+>lbc329
+GCGCTCTCTCACATAC
+>lbc330
+ACACATCTCGTGAGAG
+>lbc331
+AGTAGTGTGATACTAG
+>lbc332
+CGAGCATATATATCTC
+>lbc333
+CTATACGTATATCTAT
+>lbc334
+GTGTATCAGCGAGTAT
+>lbc335
+GCTGAGACGACGCGCG
+>lbc336
+GCGCAGTGTCACATCA
+>lbc337
+TCATACACACAGATAG
+>lbc338
+CACTCGACTCTCGCGT
+>lbc339
+CACATATCAGAGTGCG
+>lbc340
+CGTATACAGTCACGCT
+>lbc341
+TGTAGACTAGCGCTGC
+>lbc342
+AGCACACATATAGCGC
+>lbc343
+GATATCTCGATCTCTG
+>lbc344
+TCTCACGAGAGCGCAC
+>lbc345
+TGTGCTCTCTACACAG
+>lbc346
+TGTCATATGAGAGTGT
+>lbc347
+CTGTGTGCTCGCTATG
+>lbc348
+TATAGAGCTCTACATA
+>lbc349
+CTATACATAGTGATGT
+>lbc350
+TCTCTCTATCGCGCTC
+>lbc351
+ATAGCGACATCTCTCT
+>lbc352
+GCGCGCGCACTCTCTG
+>lbc353
+TCTCTCGATATGATAG
+>lbc354
+GATCACAGAGATGCTC
+>lbc355
+GCTCGCACAGCGCGTC
+>lbc356
+CACAGAGACACGCACA
+>lbc357
+GCGTGTGTCGAGTGTA
+>lbc358
+GTCATCTGTACGCTAT
+>lbc359
+CACACGCACTGAGATA
+>lbc360
+ACACATATCGCACTAC
+>lbc361
+GAGAGCGCTGACTCTG
+>lbc362
+ACACGTGTGCTCTCTC
+>lbc363
+CGAGTGTGTCTATACT
+>lbc364
+GTGATGCATACGTACA
+>lbc365
+CTCGTGACGCTGACTG
+>lbc366
+TCTGTATCTCTATGTG
+>lbc367
+TGTGTCTCTGAGAGTA
+>lbc368
+TAGATCTATCATCGTC
+>lbc369
+ACATATACAGCGTATC
+>lbc370
+CGCTCATATGAGCTCA
+>lbc371
+GTCGCGCATAGAGCGC
+>lbc372
+TACACACTATGTGCGT
+>lbc373
+ATACGCGCGCGCATGC
+>lbc374
+GTGCGCGAGAGTATAC
+>lbc375
+GCGCTAGTGTGTACGA
+>lbc376
+GAGACACGTCGCACAC
+>lbc377
+ACAGAGTGTGCAGATA
+>lbc378
+TAGAGCGTCTCTCGTA
+>lbc379
+TCTATGAGCACTCTCG
+>lbc380
+ATGTGTATATAGATAT
+>lbc381
+CTCACACTCTCTCACA
+>lbc382
+TCAGCGCACTGTGCTG
+>lbc383
+GTGCATACATACATAT
+>lbc384
+CAGAGAGATATCTCTG
+>lbc385
+TCTCAGATAGTCTATA
+>lbc386
+AGTAGACAGAGCGTGA
+>lbc387
+AGTCGAGATATACAGT
+>lbc388
+AGAGAGCTCTCTCATC
+>lbc389
+AGCTACGCGTGCACTG
+>lbc390
+CAGTCTGTGAGTCACT
+>lbc391
+AGAGCAGACGAGACTC
+>lbc392
+GTCTATCTCGCGAGAG
+>lbc393
+GATGTCTGAGTGTGTG
+>lbc394
+ACTCGCGCACGCGCGA
+>lbc395
+ATATGAGTGACTCGTG
+>lbc396
+AGTCTGCGAGACAGAG
+>lbc397
+GTGTGTGTCACACTAT
+>lbc398
+CGTACGTGCGAGTACA
+>lbc399
+TATCGCTAGATGCGCA
+>lbc400
+GTGAGTATGTACTCTG
+>lbc401
+GTCATACGAGTGAGCA
+>lbc402
+ATGAGTCTCACTGTAT
+>lbc403
+TCGATGCGCATACAGC
+>lbc404
+CTATGTGAGTGTGATC
+>lbc405
+GCATACTGTGCGCTCG
+>lbc406
+CGACATAGCGCGACGA
+>lbc407
+ATGCGATACATAGTCT
+>lbc408
+CGTCTAGATAGAGATG
+>lbc409
+TACTCACTGCGCTCAC
+>lbc410
+ACACACACACTCTATA
+>lbc411
+CTCTATATATCTCGTC
+>lbc412
+ACATATGTCTGAGACA
+>lbc413
+ACACACGCGAGACAGA
+>lbc414
+GACACTCGCATGTGCG
+>lbc415
+CAGTATAGAGTCATAG
+>lbc416
+ATCTCTGCTACACTCA
+>lbc417
+CACACGCGCGCTATAT
+>lbc418
+GATACGAGAGCTGATG
+>lbc419
+CGCGACACGCTCGCGC
+>lbc420
+CATATATATCAGCTGT
+>lbc421
+TACACAGCATCTCGCA
+>lbc422
+AGATGTCATGTCTCTA
+>lbc423
+TCACGTGCTCACTGTG
+>lbc424
+GTGACAGACGTCACGC
+>lbc425
+CGTGTCTAGCGCGCGC
+>lbc426
+CGCTCTGTCACGTCTG
+>lbc427
+TGTGTCAGAGACTGTC
+>lbc428
+CTACGAGACAGATCGC
+>lbc429
+GATATACGCGAGAGAG
+>lbc430
+TAGAGAGCGTCGCGTG
+>lbc431
+ATAGTACACTCTGTGT
+>lbc432
+ACGACATCGCTCACAG
+>lbc433
+AGCATACGCACTATAG
+>lbc434
+AGTAGCTCGTCGAGTG
+>lbc435
+GTGCTATAGCACACGC
+>lbc436
+GCGAGCTATACATATA
+>lbc437
+CGTGTCTCTCGATACA
+>lbc438
+GACGCGCTCACAGTGA
+>lbc439
+GAGCACAGAGCGCGCT
+>lbc440
+CATAGATACGCACGCG
+>lbc441
+AGACACGAGTCTAGAT
+>lbc442
+GACTCGCGATACTAGA
+>lbc443
+TAGAGCGTGCATATAT
+>lbc444
+ACGTGTATGACGATAC
+>lbc445
+ATACGCATATCGCAGT
+>lbc446
+GATATATATGTGTGTA
+>lbc447
+GCGATACACAGTCGCA
+>lbc448
+TCACTGTGTGTGTCTG
+>lbc449
+CGCACACATAGATACA
+>lbc450
+CACTACTAGCGTGTGC
diff --git a/pbcore/data/barcodes-ed65-450.fasta.fai b/pbcore/data/barcodes-ed65-450.fasta.fai
new file mode 100644
index 0000000..57aa3fb
--- /dev/null
+++ b/pbcore/data/barcodes-ed65-450.fasta.fai
@@ -0,0 +1,450 @@
+lbc1 16 7 16 18
+lbc2 16 32 16 18
+lbc3 16 57 16 18
+lbc4 16 82 16 18
+lbc5 16 107 16 18
+lbc6 16 132 16 18
+lbc7 16 157 16 18
+lbc8 16 182 16 18
+lbc9 16 207 16 18
+lbc10 16 233 16 18
+lbc11 16 259 16 18
+lbc12 16 285 16 18
+lbc13 16 311 16 18
+lbc14 16 337 16 18
+lbc15 16 363 16 18
+lbc16 16 389 16 18
+lbc17 16 415 16 18
+lbc18 16 441 16 18
+lbc19 16 467 16 18
+lbc20 16 493 16 18
+lbc21 16 519 16 18
+lbc22 16 545 16 18
+lbc23 16 571 16 18
+lbc24 16 597 16 18
+lbc25 16 623 16 18
+lbc26 16 649 16 18
+lbc27 16 675 16 18
+lbc28 16 701 16 18
+lbc29 16 727 16 18
+lbc30 16 753 16 18
+lbc31 16 779 16 18
+lbc32 16 805 16 18
+lbc33 16 831 16 18
+lbc34 16 857 16 18
+lbc35 16 883 16 18
+lbc36 16 909 16 18
+lbc37 16 935 16 18
+lbc38 16 961 16 18
+lbc39 16 987 16 18
+lbc40 16 1013 16 18
+lbc41 16 1039 16 18
+lbc42 16 1065 16 18
+lbc43 16 1091 16 18
+lbc44 16 1117 16 18
+lbc45 16 1143 16 18
+lbc46 16 1169 16 18
+lbc47 16 1195 16 18
+lbc48 16 1221 16 18
+lbc49 16 1247 16 18
+lbc50 16 1273 16 18
+lbc51 16 1299 16 18
+lbc52 16 1325 16 18
+lbc53 16 1351 16 18
+lbc54 16 1377 16 18
+lbc55 16 1403 16 18
+lbc56 16 1429 16 18
+lbc57 16 1455 16 18
+lbc58 16 1481 16 18
+lbc59 16 1507 16 18
+lbc60 16 1533 16 18
+lbc61 16 1559 16 18
+lbc62 16 1585 16 18
+lbc63 16 1611 16 18
+lbc64 16 1637 16 18
+lbc65 16 1663 16 18
+lbc66 16 1689 16 18
+lbc67 16 1715 16 18
+lbc68 16 1741 16 18
+lbc69 16 1767 16 18
+lbc70 16 1793 16 18
+lbc71 16 1819 16 18
+lbc72 16 1845 16 18
+lbc73 16 1871 16 18
+lbc74 16 1897 16 18
+lbc75 16 1923 16 18
+lbc76 16 1949 16 18
+lbc77 16 1975 16 18
+lbc78 16 2001 16 18
+lbc79 16 2027 16 18
+lbc80 16 2053 16 18
+lbc81 16 2079 16 18
+lbc82 16 2105 16 18
+lbc83 16 2131 16 18
+lbc84 16 2157 16 18
+lbc85 16 2183 16 18
+lbc86 16 2209 16 18
+lbc87 16 2235 16 18
+lbc88 16 2261 16 18
+lbc89 16 2287 16 18
+lbc90 16 2313 16 18
+lbc91 16 2339 16 18
+lbc92 16 2365 16 18
+lbc93 16 2391 16 18
+lbc94 16 2417 16 18
+lbc95 16 2443 16 18
+lbc96 16 2469 16 18
+lbc97 16 2495 16 18
+lbc98 16 2521 16 18
+lbc99 16 2547 16 18
+lbc100 16 2574 16 18
+lbc101 16 2601 16 18
+lbc102 16 2628 16 18
+lbc103 16 2655 16 18
+lbc104 16 2682 16 18
+lbc105 16 2709 16 18
+lbc106 16 2736 16 18
+lbc107 16 2763 16 18
+lbc108 16 2790 16 18
+lbc109 16 2817 16 18
+lbc110 16 2844 16 18
+lbc111 16 2871 16 18
+lbc112 16 2898 16 18
+lbc113 16 2925 16 18
+lbc114 16 2952 16 18
+lbc115 16 2979 16 18
+lbc116 16 3006 16 18
+lbc117 16 3033 16 18
+lbc118 16 3060 16 18
+lbc119 16 3087 16 18
+lbc120 16 3114 16 18
+lbc121 16 3141 16 18
+lbc122 16 3168 16 18
+lbc123 16 3195 16 18
+lbc124 16 3222 16 18
+lbc125 16 3249 16 18
+lbc126 16 3276 16 18
+lbc127 16 3303 16 18
+lbc128 16 3330 16 18
+lbc129 16 3357 16 18
+lbc130 16 3384 16 18
+lbc131 16 3411 16 18
+lbc132 16 3438 16 18
+lbc133 16 3465 16 18
+lbc134 16 3492 16 18
+lbc135 16 3519 16 18
+lbc136 16 3546 16 18
+lbc137 16 3573 16 18
+lbc138 16 3600 16 18
+lbc139 16 3627 16 18
+lbc140 16 3654 16 18
+lbc141 16 3681 16 18
+lbc142 16 3708 16 18
+lbc143 16 3735 16 18
+lbc144 16 3762 16 18
+lbc145 16 3789 16 18
+lbc146 16 3816 16 18
+lbc147 16 3843 16 18
+lbc148 16 3870 16 18
+lbc149 16 3897 16 18
+lbc150 16 3924 16 18
+lbc151 16 3951 16 18
+lbc152 16 3978 16 18
+lbc153 16 4005 16 18
+lbc154 16 4032 16 18
+lbc155 16 4059 16 18
+lbc156 16 4086 16 18
+lbc157 16 4113 16 18
+lbc158 16 4140 16 18
+lbc159 16 4167 16 18
+lbc160 16 4194 16 18
+lbc161 16 4221 16 18
+lbc162 16 4248 16 18
+lbc163 16 4275 16 18
+lbc164 16 4302 16 18
+lbc165 16 4329 16 18
+lbc166 16 4356 16 18
+lbc167 16 4383 16 18
+lbc168 16 4410 16 18
+lbc169 16 4437 16 18
+lbc170 16 4464 16 18
+lbc171 16 4491 16 18
+lbc172 16 4518 16 18
+lbc173 16 4545 16 18
+lbc174 16 4572 16 18
+lbc175 16 4599 16 18
+lbc176 16 4626 16 18
+lbc177 16 4653 16 18
+lbc178 16 4680 16 18
+lbc179 16 4707 16 18
+lbc180 16 4734 16 18
+lbc181 16 4761 16 18
+lbc182 16 4788 16 18
+lbc183 16 4815 16 18
+lbc184 16 4842 16 18
+lbc185 16 4869 16 18
+lbc186 16 4896 16 18
+lbc187 16 4923 16 18
+lbc188 16 4950 16 18
+lbc189 16 4977 16 18
+lbc190 16 5004 16 18
+lbc191 16 5031 16 18
+lbc192 16 5058 16 18
+lbc193 16 5085 16 18
+lbc194 16 5112 16 18
+lbc195 16 5139 16 18
+lbc196 16 5166 16 18
+lbc197 16 5193 16 18
+lbc198 16 5220 16 18
+lbc199 16 5247 16 18
+lbc200 16 5274 16 18
+lbc201 16 5301 16 18
+lbc202 16 5328 16 18
+lbc203 16 5355 16 18
+lbc204 16 5382 16 18
+lbc205 16 5409 16 18
+lbc206 16 5436 16 18
+lbc207 16 5463 16 18
+lbc208 16 5490 16 18
+lbc209 16 5517 16 18
+lbc210 16 5544 16 18
+lbc211 16 5571 16 18
+lbc212 16 5598 16 18
+lbc213 16 5625 16 18
+lbc214 16 5652 16 18
+lbc215 16 5679 16 18
+lbc216 16 5706 16 18
+lbc217 16 5733 16 18
+lbc218 16 5760 16 18
+lbc219 16 5787 16 18
+lbc220 16 5814 16 18
+lbc221 16 5841 16 18
+lbc222 16 5868 16 18
+lbc223 16 5895 16 18
+lbc224 16 5922 16 18
+lbc225 16 5949 16 18
+lbc226 16 5976 16 18
+lbc227 16 6003 16 18
+lbc228 16 6030 16 18
+lbc229 16 6057 16 18
+lbc230 16 6084 16 18
+lbc231 16 6111 16 18
+lbc232 16 6138 16 18
+lbc233 16 6165 16 18
+lbc234 16 6192 16 18
+lbc235 16 6219 16 18
+lbc236 16 6246 16 18
+lbc237 16 6273 16 18
+lbc238 16 6300 16 18
+lbc239 16 6327 16 18
+lbc240 16 6354 16 18
+lbc241 16 6381 16 18
+lbc242 16 6408 16 18
+lbc243 16 6435 16 18
+lbc244 16 6462 16 18
+lbc245 16 6489 16 18
+lbc246 16 6516 16 18
+lbc247 16 6543 16 18
+lbc248 16 6570 16 18
+lbc249 16 6597 16 18
+lbc250 16 6624 16 18
+lbc251 16 6651 16 18
+lbc252 16 6678 16 18
+lbc253 16 6705 16 18
+lbc254 16 6732 16 18
+lbc255 16 6759 16 18
+lbc256 16 6786 16 18
+lbc257 16 6813 16 18
+lbc258 16 6840 16 18
+lbc259 16 6867 16 18
+lbc260 16 6894 16 18
+lbc261 16 6921 16 18
+lbc262 16 6948 16 18
+lbc263 16 6975 16 18
+lbc264 16 7002 16 18
+lbc265 16 7029 16 18
+lbc266 16 7056 16 18
+lbc267 16 7083 16 18
+lbc268 16 7110 16 18
+lbc269 16 7137 16 18
+lbc270 16 7164 16 18
+lbc271 16 7191 16 18
+lbc272 16 7218 16 18
+lbc273 16 7245 16 18
+lbc274 16 7272 16 18
+lbc275 16 7299 16 18
+lbc276 16 7326 16 18
+lbc277 16 7353 16 18
+lbc278 16 7380 16 18
+lbc279 16 7407 16 18
+lbc280 16 7434 16 18
+lbc281 16 7461 16 18
+lbc282 16 7488 16 18
+lbc283 16 7515 16 18
+lbc284 16 7542 16 18
+lbc285 16 7569 16 18
+lbc286 16 7596 16 18
+lbc287 16 7623 16 18
+lbc288 16 7650 16 18
+lbc289 16 7677 16 18
+lbc290 16 7704 16 18
+lbc291 16 7731 16 18
+lbc292 16 7758 16 18
+lbc293 16 7785 16 18
+lbc294 16 7812 16 18
+lbc295 16 7839 16 18
+lbc296 16 7866 16 18
+lbc297 16 7893 16 18
+lbc298 16 7920 16 18
+lbc299 16 7947 16 18
+lbc300 16 7974 16 18
+lbc301 16 8001 16 18
+lbc302 16 8028 16 18
+lbc303 16 8055 16 18
+lbc304 16 8082 16 18
+lbc305 16 8109 16 18
+lbc306 16 8136 16 18
+lbc307 16 8163 16 18
+lbc308 16 8190 16 18
+lbc309 16 8217 16 18
+lbc310 16 8244 16 18
+lbc311 16 8271 16 18
+lbc312 16 8298 16 18
+lbc313 16 8325 16 18
+lbc314 16 8352 16 18
+lbc315 16 8379 16 18
+lbc316 16 8406 16 18
+lbc317 16 8433 16 18
+lbc318 16 8460 16 18
+lbc319 16 8487 16 18
+lbc320 16 8514 16 18
+lbc321 16 8541 16 18
+lbc322 16 8568 16 18
+lbc323 16 8595 16 18
+lbc324 16 8622 16 18
+lbc325 16 8649 16 18
+lbc326 16 8676 16 18
+lbc327 16 8703 16 18
+lbc328 16 8730 16 18
+lbc329 16 8757 16 18
+lbc330 16 8784 16 18
+lbc331 16 8811 16 18
+lbc332 16 8838 16 18
+lbc333 16 8865 16 18
+lbc334 16 8892 16 18
+lbc335 16 8919 16 18
+lbc336 16 8946 16 18
+lbc337 16 8973 16 18
+lbc338 16 9000 16 18
+lbc339 16 9027 16 18
+lbc340 16 9054 16 18
+lbc341 16 9081 16 18
+lbc342 16 9108 16 18
+lbc343 16 9135 16 18
+lbc344 16 9162 16 18
+lbc345 16 9189 16 18
+lbc346 16 9216 16 18
+lbc347 16 9243 16 18
+lbc348 16 9270 16 18
+lbc349 16 9297 16 18
+lbc350 16 9324 16 18
+lbc351 16 9351 16 18
+lbc352 16 9378 16 18
+lbc353 16 9405 16 18
+lbc354 16 9432 16 18
+lbc355 16 9459 16 18
+lbc356 16 9486 16 18
+lbc357 16 9513 16 18
+lbc358 16 9540 16 18
+lbc359 16 9567 16 18
+lbc360 16 9594 16 18
+lbc361 16 9621 16 18
+lbc362 16 9648 16 18
+lbc363 16 9675 16 18
+lbc364 16 9702 16 18
+lbc365 16 9729 16 18
+lbc366 16 9756 16 18
+lbc367 16 9783 16 18
+lbc368 16 9810 16 18
+lbc369 16 9837 16 18
+lbc370 16 9864 16 18
+lbc371 16 9891 16 18
+lbc372 16 9918 16 18
+lbc373 16 9945 16 18
+lbc374 16 9972 16 18
+lbc375 16 9999 16 18
+lbc376 16 10026 16 18
+lbc377 16 10053 16 18
+lbc378 16 10080 16 18
+lbc379 16 10107 16 18
+lbc380 16 10134 16 18
+lbc381 16 10161 16 18
+lbc382 16 10188 16 18
+lbc383 16 10215 16 18
+lbc384 16 10242 16 18
+lbc385 16 10269 16 18
+lbc386 16 10296 16 18
+lbc387 16 10323 16 18
+lbc388 16 10350 16 18
+lbc389 16 10377 16 18
+lbc390 16 10404 16 18
+lbc391 16 10431 16 18
+lbc392 16 10458 16 18
+lbc393 16 10485 16 18
+lbc394 16 10512 16 18
+lbc395 16 10539 16 18
+lbc396 16 10566 16 18
+lbc397 16 10593 16 18
+lbc398 16 10620 16 18
+lbc399 16 10647 16 18
+lbc400 16 10674 16 18
+lbc401 16 10701 16 18
+lbc402 16 10728 16 18
+lbc403 16 10755 16 18
+lbc404 16 10782 16 18
+lbc405 16 10809 16 18
+lbc406 16 10836 16 18
+lbc407 16 10863 16 18
+lbc408 16 10890 16 18
+lbc409 16 10917 16 18
+lbc410 16 10944 16 18
+lbc411 16 10971 16 18
+lbc412 16 10998 16 18
+lbc413 16 11025 16 18
+lbc414 16 11052 16 18
+lbc415 16 11079 16 18
+lbc416 16 11106 16 18
+lbc417 16 11133 16 18
+lbc418 16 11160 16 18
+lbc419 16 11187 16 18
+lbc420 16 11214 16 18
+lbc421 16 11241 16 18
+lbc422 16 11268 16 18
+lbc423 16 11295 16 18
+lbc424 16 11322 16 18
+lbc425 16 11349 16 18
+lbc426 16 11376 16 18
+lbc427 16 11403 16 18
+lbc428 16 11430 16 18
+lbc429 16 11457 16 18
+lbc430 16 11484 16 18
+lbc431 16 11511 16 18
+lbc432 16 11538 16 18
+lbc433 16 11565 16 18
+lbc434 16 11592 16 18
+lbc435 16 11619 16 18
+lbc436 16 11646 16 18
+lbc437 16 11673 16 18
+lbc438 16 11700 16 18
+lbc439 16 11727 16 18
+lbc440 16 11754 16 18
+lbc441 16 11781 16 18
+lbc442 16 11808 16 18
+lbc443 16 11835 16 18
+lbc444 16 11862 16 18
+lbc445 16 11889 16 18
+lbc446 16 11916 16 18
+lbc447 16 11943 16 18
+lbc448 16 11970 16 18
+lbc449 16 11997 16 18
+lbc450 16 12024 16 18
diff --git a/pbcore/data/bc_files.fofn b/pbcore/data/bc_files.fofn
new file mode 100644
index 0000000..758a46d
--- /dev/null
+++ b/pbcore/data/bc_files.fofn
@@ -0,0 +1,3 @@
+m140307_221913_42203_c100626172550000001823119008061414_s1_p0.1.bc.h5
+m140307_221913_42203_c100626172550000001823119008061414_s1_p0.2.bc.h5
+m140307_221913_42203_c100626172550000001823119008061414_s1_p0.3.bc.h5
diff --git a/pbcore/data/blasr-output.m4 b/pbcore/data/blasr-output.m4
new file mode 100644
index 0000000..230bc9e
--- /dev/null
+++ b/pbcore/data/blasr-output.m4
@@ -0,0 +1,2 @@
+read1/0_60 lambda_NEB3011 -285 96.7213 0 0 60 60 0 100 160 48502 254
+read2/0_63 lambda_NEB3011 -274 93.6508 0 0 63 63 0 200 260 48502 254
diff --git a/pbcore/data/blasr-output.m5 b/pbcore/data/blasr-output.m5
new file mode 100644
index 0000000..700ae2a
--- /dev/null
+++ b/pbcore/data/blasr-output.m5
@@ -0,0 +1,2 @@
+read1/0_60 60 0 60 + lambda_NEB3011 48502 100 160 + -285 59 0 1 1 254 CTCTGAAAAGAAAGG-AACGACAGGTGCTGAAAGCGTAGCTTTTTGGCCTCTGTCGTTTCC |||||||||||||||*||||||||||||||||||||*|||||||||||||||||||||||| CTCTGAAAAGAAAGGAAACGACAGGTGCTGAAAGCG-AGCTTTTTGGCCTCTGTCGTTTCC
+read2/0_63 63 0 63 + lambda_NEB3011 48502 200 260 + -274 59 1 3 0 254 CAAAAAACAGCTGGCTGACATTTTCGGTGCGAGTATCCGTACCATTTTCCAGAACTGGCAGGA ||||||*|||||||||||||||||||||||||||||||||||||**||*|||||||||||||| CAAAAAGCAGCTGGCTGACATTTTCGGTGCGAGTATCCGTACCA--TT-CAGAACTGGCAGGA
diff --git a/pbcore/data/cmph5_mapping.cmp.h5 b/pbcore/data/cmph5_mapping.cmp.h5
new file mode 100644
index 0000000..ae4b5e7
Binary files /dev/null and b/pbcore/data/cmph5_mapping.cmp.h5 differ
diff --git a/pbcore/data/lambdaNEB.fa b/pbcore/data/lambdaNEB.fa
new file mode 100644
index 0000000..33011e5
--- /dev/null
+++ b/pbcore/data/lambdaNEB.fa
@@ -0,0 +1,608 @@
+>lambda_NEB3011
+GGGCGGCGACCTCGCGGGTTTTCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCGTTCTTCTTCGTCATAACTTA
+ATGTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAACGACAGGTGCTGAAAGCGAGCTTTTTGGCCTCTGTCGTTTCC
+TTTCTCTGTTTTTGTCCGTGGAATGAACAATGGAAGTCAACAAAAAGCAGCTGGCTGACATTTTCGGTGCGAGTATCCGT
+ACCATTCAGAACTGGCAGGAACAGGGAATGCCCGTTCTGCGAGGCGGTGGCAAGGGTAATGAGGTGCTTTATGACTCTGC
+CGCCGTCATAAAATGGTATGCCGAAAGGGATGCTGAAATTGAGAACGAAAAGCTGCGCCGGGAGGTTGAAGAACTGCGGC
+AGGCCAGCGAGGCAGATCTCCAGCCAGGAACTATTGAGTACGAACGCCATCGACTTACGCGTGCGCAGGCCGACGCACAG
+GAACTGAAGAATGCCAGAGACTCCGCTGAAGTGGTGGAAACCGCATTCTGTACTTTCGTGCTGTCGCGGATCGCAGGTGA
+AATTGCCAGTATTCTCGACGGGCTCCCCCTGTCGGTGCAGCGGCGTTTTCCGGAACTGGAAAACCGACATGTTGATTTCC
+TGAAACGGGATATCATCAAAGCCATGAACAAAGCAGCCGCGCTGGATGAACTGATACCGGGGTTGCTGAGTGAATATATC
+GAACAGTCAGGTTAACAGGCTGCGGCATTTTGTCCGCGCCGGGCTTCGCTCACTGTTCAGGCCGGAGCCACAGACCGCCG
+TTGAATGGGCGGATGCTAATTACTATCTCCCGAAAGAATCCGCATACCAGGAAGGGCGCTGGGAAACACTGCCCTTTCAG
+CGGGCCATCATGAATGCGATGGGCAGCGACTACATCCGTGAGGTGAATGTGGTGAAGTCTGCCCGTGTCGGTTATTCCAA
+AATGCTGCTGGGTGTTTATGCCTACTTTATAGAGCATAAGCAGCGCAACACCCTTATCTGGTTGCCGACGGATGGTGATG
+CCGAGAACTTTATGAAAACCCACGTTGAGCCGACTATTCGTGATATTCCGTCGCTGCTGGCGCTGGCCCCGTGGTATGGC
+AAAAAGCACCGGGATAACACGCTCACCATGAAGCGTTTCACTAATGGGCGTGGCTTCTGGTGCCTGGGCGGTAAAGCGGC
+AAAAAACTACCGTGAAAAGTCGGTGGATGTGGCGGGTTATGATGAACTTGCTGCTTTTGATGATGATATTGAACAGGAAG
+GCTCTCCGACGTTCCTGGGTGACAAGCGTATTGAAGGCTCGGTCTGGCCAAAGTCCATCCGTGGCTCCACGCCAAAAGTG
+AGAGGCACCTGTCAGATTGAGCGTGCAGCCAGTGAATCCCCGCATTTTATGCGTTTTCATGTTGCCTGCCCGCATTGCGG
+GGAGGAGCAGTATCTTAAATTTGGCGACAAAGAGACGCCGTTTGGCCTCAAATGGACGCCGGATGACCCCTCCAGCGTGT
+TTTATCTCTGCGAGCATAATGCCTGCGTCATCCGCCAGCAGGAGCTGGACTTTACTGATGCCCGTTATATCTGCGAAAAG
+ACCGGGATCTGGACCCGTGATGGCATTCTCTGGTTTTCGTCATCCGGTGAAGAGATTGAGCCACCTGACAGTGTGACCTT
+TCACATCTGGACAGCGTACAGCCCGTTCACCACCTGGGTGCAGATTGTCAAAGACTGGATGAAAACGAAAGGGGATACGG
+GAAAACGTAAAACCTTCGTAAACACCACGCTCGGTGAGACGTGGGAGGCGAAAATTGGCGAACGTCCGGATGCTGAAGTG
+ATGGCAGAGCGGAAAGAGCATTATTCAGCGCCCGTTCCTGACCGTGTGGCTTACCTGACCGCCGGTATCGACTCCCAGCT
+GGACCGCTACGAAATGCGCGTATGGGGATGGGGGCCGGGTGAGGAAAGCTGGCTGATTGACCGGCAGATTATTATGGGCC
+GCCACGACGATGAACAGACGCTGCTGCGTGTGGATGAGGCCATCAATAAAACCTATACCCGCCGGAATGGTGCAGAAATG
+TCGATATCCCGTATCTGCTGGGATACTGGCGGGATTGACCCGACCATTGTGTATGAACGCTCGAAAAAACATGGGCTGTT
+CCGGGTGATCCCCATTAAAGGGGCATCCGTCTACGGAAAGCCGGTGGCCAGCATGCCACGTAAGCGAAACAAAAACGGGG
+TTTACCTTACCGAAATCGGTACGGATACCGCGAAAGAGCAGATTTATAACCGCTTCACACTGACGCCGGAAGGGGATGAA
+CCGCTTCCCGGTGCCGTTCACTTCCCGAATAACCCGGATATTTTTGATCTGACCGAAGCGCAGCAGCTGACTGCTGAAGA
+GCAGGTCGAAAAATGGGTGGATGGCAGGAAAAAAATACTGTGGGACAGCAAAAAGCGACGCAATGAGGCACTCGACTGCT
+TCGTTTATGCGCTGGCGGCGCTGCGCATCAGTATTTCCCGCTGGCAGCTGGATCTCAGTGCGCTGCTGGCGAGCCTGCAG
+GAAGAGGATGGTGCAGCAACCAACAAGAAAACACTGGCAGATTACGCCCGTGCCTTATCCGGAGAGGATGAATGACGCGA
+CAGGAAGAACTTGCCGCTGCCCGTGCGGCACTGCATGACCTGATGACAGGTAAACGGGTGGCAACAGTACAGAAAGACGG
+ACGAAGGGTGGAGTTTACGGCCACTTCCGTGTCTGACCTGAAAAAATATATTGCAGAGCTGGAAGTGCAGACCGGCATGA
+CACAGCGACGCAGGGGACCTGCAGGATTTTATGTATGAAAACGCCCACCATTCCCACCCTTCTGGGGCCGGACGGCATGA
+CATCGCTGCGCGAATATGCCGGTTATCACGGCGGTGGCAGCGGATTTGGAGGGCAGTTGCGGTCGTGGAACCCACCGAGT
+GAAAGTGTGGATGCAGCCCTGTTGCCCAACTTTACCCGTGGCAATGCCCGCGCAGACGATCTGGTACGCAATAACGGCTA
+TGCCGCCAACGCCATCCAGCTGCATCAGGATCATATCGTCGGGTCTTTTTTCCGGCTCAGTCATCGCCCAAGCTGGCGCT
+ATCTGGGCATCGGGGAGGAAGAAGCCCGTGCCTTTTCCCGCGAGGTTGAAGCGGCATGGAAAGAGTTTGCCGAGGATGAC
+TGCTGCTGCATTGACGTTGAGCGAAAACGCACGTTTACCATGATGATTCGGGAAGGTGTGGCCATGCACGCCTTTAACGG
+TGAACTGTTCGTTCAGGCCACCTGGGATACCAGTTCGTCGCGGCTTTTCCGGACACAGTTCCGGATGGTCAGCCCGAAGC
+GCATCAGCAACCCGAACAATACCGGCGACAGCCGGAACTGCCGTGCCGGTGTGCAGATTAATGACAGCGGTGCGGCGCTG
+GGATATTACGTCAGCGAGGACGGGTATCCTGGCTGGATGCCGCAGAAATGGACATGGATACCCCGTGAGTTACCCGGCGG
+GCGCGCCTCGTTCATTCACGTTTTTGAACCCGTGGAGGACGGGCAGACTCGCGGTGCAAATGTGTTTTACAGCGTGATGG
+AGCAGATGAAGATGCTCGACACGCTGCAGAACACGCAGCTGCAGAGCGCCATTGTGAAGGCGATGTATGCCGCCACCATT
+GAGAGTGAGCTGGATACGCAGTCAGCGATGGATTTTATTCTGGGCGCGAACAGTCAGGAGCAGCGGGAAAGGCTGACCGG
+CTGGATTGGTGAAATTGCCGCGTATTACGCCGCAGCGCCGGTCCGGCTGGGAGGCGCAAAAGTACCGCACCTGATGCCGG
+GTGACTCACTGAACCTGCAGACGGCTCAGGATACGGATAACGGCTACTCCGTGTTTGAGCAGTCACTGCTGCGGTATATC
+GCTGCCGGGCTGGGTGTCTCGTATGAGCAGCTTTCCCGGAATTACGCCCAGATGAGCTACTCCACGGCACGGGCCAGTGC
+GAACGAGTCGTGGGCGTACTTTATGGGGCGGCGAAAATTCGTCGCATCCCGTCAGGCGAGCCAGATGTTTCTGTGCTGGC
+TGGAAGAGGCCATCGTTCGCCGCGTGGTGACGTTACCTTCAAAAGCGCGCTTCAGTTTTCAGGAAGCCCGCAGTGCCTGG
+GGGAACTGCGACTGGATAGGCTCCGGTCGTATGGCCATCGATGGTCTGAAAGAAGTTCAGGAAGCGGTGATGCTGATAGA
+AGCCGGACTGAGTACCTACGAGAAAGAGTGCGCAAAACGCGGTGACGACTATCAGGAAATTTTTGCCCAGCAGGTCCGTG
+AAACGATGGAGCGCCGTGCAGCCGGTCTTAAACCGCCCGCCTGGGCGGCTGCAGCATTTGAATCCGGGCTGCGACAATCA
+ACAGAGGAGGAGAAGAGTGACAGCAGAGCTGCGTAATCTCCCGCATATTGCCAGCATGGCCTTTAATGAGCCGCTGATGC
+TTGAACCCGCCTATGCGCGGGTTTTCTTTTGTGCGCTTGCAGGCCAGCTTGGGATCAGCAGCCTGACGGATGCGGTGTCC
+GGCGACAGCCTGACTGCCCAGGAGGCACTCGCGACGCTGGCATTATCCGGTGATGATGACGGACCACGACAGGCCCGCAG
+TTATCAGGTCATGAACGGCATCGCCGTGCTGCCGGTGTCCGGCACGCTGGTCAGCCGGACGCGGGCGCTGCAGCCGTACT
+CGGGGATGACCGGTTACAACGGCATTATCGCCCGTCTGCAACAGGCTGCCAGCGATCCGATGGTGGACGGCATTCTGCTC
+GATATGGACACGCCCGGCGGGATGGTGGCGGGGGCATTTGACTGCGCTGACATCATCGCCCGTGTGCGTGACATAAAACC
+GGTATGGGCGCTTGCCAACGACATGAACTGCAGTGCAGGTCAGTTGCTTGCCAGTGCCGCCTCCCGGCGTCTGGTCACGC
+AGACCGCCCGGACAGGCTCCATCGGCGTCATGATGGCTCACAGTAATTACGGTGCTGCGCTGGAGAAACAGGGTGTGGAA
+ATCACGCTGATTTACAGCGGCAGCCATAAGGTGGATGGCAACCCCTACAGCCATCTTCCGGATGACGTCCGGGAGACACT
+GCAGTCCCGGATGGACGCAACCCGCCAGATGTTTGCGCAGAAGGTGTCGGCATATACCGGCCTGTCCGTGCAGGTTGTGC
+TGGATACCGAGGCTGCAGTGTACAGCGGTCAGGAGGCCATTGATGCCGGACTGGCTGATGAACTTGTTAACAGCACCGAT
+GCGATCACCGTCATGCGTGATGCACTGGATGCACGTAAATCCCGTCTCTCAGGAGGGCGAATGACCAAAGAGACTCAATC
+AACAACTGTTTCAGCCACTGCTTCGCAGGCTGACGTTACTGACGTGGTGCCAGCGACGGAGGGCGAGAACGCCAGCGCGG
+CGCAGCCGGACGTGAACGCGCAGATCACCGCAGCGGTTGCGGCAGAAAACAGCCGCATTATGGGGATCCTCAACTGTGAG
+GAGGCTCACGGACGCGAAGAACAGGCACGCGTGCTGGCAGAAACCCCCGGTATGACCGTGAAAACGGCCCGCCGCATTCT
+GGCCGCAGCACCACAGAGTGCACAGGCGCGCAGTGACACTGCGCTGGATCGTCTGATGCAGGGGGCACCGGCACCGCTGG
+CTGCAGGTAACCCGGCATCTGATGCCGTTAACGATTTGCTGAACACACCAGTGTAAGGGATGTTTATGACGAGCAAAGAA
+ACCTTTACCCATTACCAGCCGCAGGGCAACAGTGACCCGGCTCATACCGCAACCGCGCCCGGCGGATTGAGTGCGAAAGC
+GCCTGCAATGACCCCGCTGATGCTGGACACCTCCAGCCGTAAGCTGGTTGCGTGGGATGGCACCACCGACGGTGCTGCCG
+TTGGCATTCTTGCGGTTGCTGCTGACCAGACCAGCACCACGCTGACGTTCTACAAGTCCGGCACGTTCCGTTATGAGGAT
+GTGCTCTGGCCGGAGGCTGCCAGCGACGAGACGAAAAAACGGACCGCGTTTGCCGGAACGGCAATCAGCATCGTTTAACT
+TTACCCTTCATCACTAAAGGCCGCCTGTGCGGCTTTTTTTACGGGATTTTTTTATGTCGATGTACACAACCGCCCAACTG
+CTGGCGGCAAATGAGCAGAAATTTAAGTTTGATCCGCTGTTTCTGCGTCTCTTTTTCCGTGAGAGCTATCCCTTCACCAC
+GGAGAAAGTCTATCTCTCACAAATTCCGGGACTGGTAAACATGGCGCTGTACGTTTCGCCGATTGTTTCCGGTGAGGTTA
+TCCGTTCCCGTGGCGGCTCCACCTCTGAATTTACGCCGGGATATGTCAAGCCGAAGCATGAAGTGAATCCGCAGATGACC
+CTGCGTCGCCTGCCGGATGAAGATCCGCAGAATCTGGCGGACCCGGCTTACCGCCGCCGTCGCATCATCATGCAGAACAT
+GCGTGACGAAGAGCTGGCCATTGCTCAGGTCGAAGAGATGCAGGCAGTTTCTGCCGTGCTTAAGGGCAAATACACCATGA
+CCGGTGAAGCCTTCGATCCGGTTGAGGTGGATATGGGCCGCAGTGAGGAGAATAACATCACGCAGTCCGGCGGCACGGAG
+TGGAGCAAGCGTGACAAGTCCACGTATGACCCGACCGACGATATCGAAGCCTACGCGCTGAACGCCAGCGGTGTGGTGAA
+TATCATCGTGTTCGATCCGAAAGGCTGGGCGCTGTTCCGTTCCTTCAAAGCCGTCAAGGAGAAGCTGGATACCCGTCGTG
+GCTCTAATTCCGAGCTGGAGACAGCGGTGAAAGACCTGGGCAAAGCGGTGTCCTATAAGGGGATGTATGGCGATGTGGCC
+ATCGTCGTGTATTCCGGACAGTACGTGGAAAACGGCGTCAAAAAGAACTTCCTGCCGGACAACACGATGGTGCTGGGGAA
+CACTCAGGCACGCGGTCTGCGCACCTATGGCTGCATTCAGGATGCGGACGCACAGCGCGAAGGCATTAACGCCTCTGCCC
+GTTACCCGAAAAACTGGGTGACCACCGGCGATCCGGCGCGTGAGTTCACCATGATTCAGTCAGCACCGCTGATGCTGCTG
+GCTGACCCTGATGAGTTCGTGTCCGTACAACTGGCGTAATCATGGCCCTTCGGGGCCATTGTTTCTCTGTGGAGGAGTCC
+ATGACGAAAGATGAACTGATTGCCCGTCTCCGCTCGCTGGGTGAACAACTGAACCGTGATGTCAGCCTGACGGGGACGAA
+AGAAGAACTGGCGCTCCGTGTGGCAGAGCTGAAAGAGGAGCTTGATGACACGGATGAAACTGCCGGTCAGGACACCCCTC
+TCAGCCGGGAAAATGTGCTGACCGGACATGAAAATGAGGTGGGATCAGCGCAGCCGGATACCGTGATTCTGGATACGTCT
+GAACTGGTCACGGTCGTGGCACTGGTGAAGCTGCATACTGATGCACTTCACGCCACGCGGGATGAACCTGTGGCATTTGT
+GCTGCCGGGAACGGCGTTTCGTGTCTCTGCCGGTGTGGCAGCCGAAATGACAGAGCGCGGCCTGGCCAGAATGCAATAAC
+GGGAGGCGCTGTGGCTGATTTCGATAACCTGTTCGATGCTGCCATTGCCCGCGCCGATGAAACGATACGCGGGTACATGG
+GAACGTCAGCCACCATTACATCCGGTGAGCAGTCAGGTGCGGTGATACGTGGTGTTTTTGATGACCCTGAAAATATCAGC
+TATGCCGGACAGGGCGTGCGCGTTGAAGGCTCCAGCCCGTCCCTGTTTGTCCGGACTGATGAGGTGCGGCAGCTGCGGCG
+TGGAGACACGCTGACCATCGGTGAGGAAAATTTCTGGGTAGATCGGGTTTCGCCGGATGATGGCGGAAGTTGTCATCTCT
+GGCTTGGACGGGGCGTACCGCCTGCCGTTAACCGTCGCCGCTGAAAGGGGGATGTATGGCCATAAAAGGTCTTGAGCAGG
+CCGTTGAAAACCTCAGCCGTATCAGCAAAACGGCGGTGCCTGGTGCCGCCGCAATGGCCATTAACCGCGTTGCTTCATCC
+GCGATATCGCAGTCGGCGTCACAGGTTGCCCGTGAGACAAAGGTACGCCGGAAACTGGTAAAGGAAAGGGCCAGGCTGAA
+AAGGGCCACGGTCAAAAATCCGCAGGCCAGAATCAAAGTTAACCGGGGGGATTTGCCCGTAATCAAGCTGGGTAATGCGC
+GGGTTGTCCTTTCGCGCCGCAGGCGTCGTAAAAAGGGGCAGCGTTCATCCCTGAAAGGTGGCGGCAGCGTGCTTGTGGTG
+GGTAACCGTCGTATTCCCGGCGCGTTTATTCAGCAACTGAAAAATGGCCGGTGGCATGTCATGCAGCGTGTGGCTGGGAA
+AAACCGTTACCCCATTGATGTGGTGAAAATCCCGATGGCGGTGCCGCTGACCACGGCGTTTAAACAAAATATTGAGCGGA
+TACGGCGTGAACGTCTTCCGAAAGAGCTGGGCTATGCGCTGCAGCATCAACTGAGGATGGTAATAAAGCGATGAAACATA
+CTGAACTCCGTGCAGCCGTACTGGATGCACTGGAGAAGCATGACACCGGGGCGACGTTTTTTGATGGTCGCCCCGCTGTT
+TTTGATGAGGCGGATTTTCCGGCAGTTGCCGTTTATCTCACCGGCGCTGAATACACGGGCGAAGAGCTGGACAGCGATAC
+CTGGCAGGCGGAGCTGCATATCGAAGTTTTCCTGCCTGCTCAGGTGCCGGATTCAGAGCTGGATGCGTGGATGGAGTCCC
+GGATTTATCCGGTGATGAGCGATATCCCGGCACTGTCAGATTTGATCACCAGTATGGTGGCCAGCGGCTATGACTACCGG
+CGCGACGATGATGCGGGCTTGTGGAGTTCAGCCGATCTGACTTATGTCATTACCTATGAAATGTGAGGACGCTATGCCTG
+TACCAAATCCTACAATGCCGGTGAAAGGTGCCGGGACCACCCTGTGGGTTTATAAGGGGAGCGGTGACCCTTACGCGAAT
+CCGCTTTCAGACGTTGACTGGTCGCGTCTGGCAAAAGTTAAAGACCTGACGCCCGGCGAACTGACCGCTGAGTCCTATGA
+CGACAGCTATCTCGATGATGAAGATGCAGACTGGACTGCGACCGGGCAGGGGCAGAAATCTGCCGGAGATACCAGCTTCA
+CGCTGGCGTGGATGCCCGGAGAGCAGGGGCAGCAGGCGCTGCTGGCGTGGTTTAATGAAGGCGATACCCGTGCCTATAAA
+ATCCGCTTCCCGAACGGCACGGTCGATGTGTTCCGTGGCTGGGTCAGCAGTATCGGTAAGGCGGTGACGGCGAAGGAAGT
+GATCACCCGCACGGTGAAAGTCACCAATGTGGGACGTCCGTCGATGGCAGAAGATCGCAGCACGGTAACAGCGGCAACCG
+GCATGACCGTGACGCCTGCCAGCACCTCGGTGGTGAAAGGGCAGAGCACCACGCTGACCGTGGCCTTCCAGCCGGAGGGC
+GTAACCGACAAGAGCTTTCGTGCGGTGTCTGCGGATAAAACAAAAGCCACCGTGTCGGTCAGTGGTATGACCATCACCGT
+GAACGGCGTTGCTGCAGGCAAGGTCAACATTCCGGTTGTATCCGGTAATGGTGAGTTTGCTGCGGTTGCAGAAATTACCG
+TCACCGCCAGTTAATCCGGAGAGTCAGCGATGTTCCTGAAAACCGAATCATTTGAACATAACGGTGTGACCGTCACGCTT
+TCTGAACTGTCAGCCCTGCAGCGCATTGAGCATCTCGCCCTGATGAAACGGCAGGCAGAACAGGCGGAGTCAGACAGCAA
+CCGGAAGTTTACTGTGGAAGACGCCATCAGAACCGGCGCGTTTCTGGTGGCGATGTCCCTGTGGCATAACCATCCGCAGA
+AGACGCAGATGCCGTCCATGAATGAAGCCGTTAAACAGATTGAGCAGGAAGTGCTTACCACCTGGCCCACGGAGGCAATT
+TCTCATGCTGAAAACGTGGTGTACCGGCTGTCTGGTATGTATGAGTTTGTGGTGAATAATGCCCCTGAACAGACAGAGGA
+CGCCGGGCCCGCAGAGCCTGTTTCTGCGGGAAAGTGTTCGACGGTGAGCTGAGTTTTGCCCTGAAACTGGCGCGTGAGAT
+GGGGCGACCCGACTGGCGTGCCATGCTTGCCGGGATGTCATCCACGGAGTATGCCGACTGGCACCGCTTTTACAGTACCC
+ATTATTTTCATGATGTTCTGCTGGATATGCACTTTTCCGGGCTGACGTACACCGTGCTCAGCCTGTTTTTCAGCGATCCG
+GATATGCATCCGCTGGATTTCAGTCTGCTGAACCGGCGCGAGGCTGACGAAGAGCCTGAAGATGATGTGCTGATGCAGAA
+AGCGGCAGGGCTTGCCGGAGGTGTCCGCTTTGGCCCGGACGGGAATGAAGTTATCCCCGCTTCCCCGGATGTGGCGGACA
+TGACGGAGGATGACGTAATGCTGATGACAGTATCAGAAGGGATCGCAGGAGGAGTCCGGTATGGCTGAACCGGTAGGCGA
+TCTGGTCGTTGATTTGAGTCTGGATGCGGCCAGATTTGACGAGCAGATGGCCAGAGTCAGGCGTCATTTTTCTGGTACGG
+AAAGTGATGCGAAAAAAACAGCGGCAGTCGTTGAACAGTCGCTGAGCCGACAGGCGCTGGCTGCACAGAAAGCGGGGATT
+TCCGTCGGGCAGTATAAAGCCGCCATGCGTATGCTGCCTGCACAGTTCACCGACGTGGCCACGCAGCTTGCAGGCGGGCA
+AAGTCCGTGGCTGATCCTGCTGCAACAGGGGGGGCAGGTGAAGGACTCCTTCGGCGGGATGATCCCCATGTTCAGGGGGC
+TTGCCGGTGCGATCACCCTGCCGATGGTGGGGGCCACCTCGCTGGCGGTGGCGACCGGTGCGCTGGCGTATGCCTGGTAT
+CAGGGCAACTCAACCCTGTCCGATTTCAACAAAACGCTGGTCCTTTCCGGCAATCAGGCGGGACTGACGGCAGATCGTAT
+GCTGGTCCTGTCCAGAGCCGGGCAGGCGGCAGGGCTGACGTTTAACCAGACCAGCGAGTCACTCAGCGCACTGGTTAAGG
+CGGGGGTAAGCGGTGAGGCTCAGATTGCGTCCATCAGCCAGAGTGTGGCGCGTTTCTCCTCTGCATCCGGCGTGGAGGTG
+GACAAGGTCGCTGAAGCCTTCGGGAAGCTGACCACAGACCCGACGTCGGGGCTGACGGCGATGGCTCGCCAGTTCCATAA
+CGTGTCGGCGGAGCAGATTGCGTATGTTGCTCAGTTGCAGCGTTCCGGCGATGAAGCCGGGGCATTGCAGGCGGCGAACG
+AGGCCGCAACGAAAGGGTTTGATGACCAGACCCGCCGCCTGAAAGAGAACATGGGCACGCTGGAGACCTGGGCAGACAGG
+ACTGCGCGGGCATTCAAATCCATGTGGGATGCGGTGCTGGATATTGGTCGTCCTGATACCGCGCAGGAGATGCTGATTAA
+GGCAGAGGCTGCGTATAAGAAAGCAGACGACATCTGGAATCTGCGCAAGGATGATTATTTTGTTAACGATGAAGCGCGGG
+CGCGTTACTGGGATGATCGTGAAAAGGCCCGTCTTGCGCTTGAAGCCGCCCGAAAGAAGGCTGAGCAGCAGACTCAACAG
+GACAAAAATGCGCAGCAGCAGAGCGATACCGAAGCGTCACGGCTGAAATATACCGAAGAGGCGCAGAAGGCTTACGAACG
+GCTGCAGACGCCGCTGGAGAAATATACCGCCCGTCAGGAAGAACTGAACAAGGCACTGAAAGACGGGAAAATCCTGCAGG
+CGGATTACAACACGCTGATGGCGGCGGCGAAAAAGGATTATGAAGCGACGCTGAAAAAGCCGAAACAGTCCAGCGTGAAG
+GTGTCTGCGGGCGATCGTCAGGAAGACAGTGCTCATGCTGCCCTGCTGACGCTTCAGGCAGAACTCCGGACGCTGGAGAA
+GCATGCCGGAGCAAATGAGAAAATCAGCCAGCAGCGCCGGGATTTGTGGAAGGCGGAGAGTCAGTTCGCGGTACTGGAGG
+AGGCGGCGCAACGTCGCCAGCTGTCTGCACAGGAGAAATCCCTGCTGGCGCATAAAGATGAGACGCTGGAGTACAAACGC
+CAGCTGGCTGCACTTGGCGACAAGGTTACGTATCAGGAGCGCCTGAACGCGCTGGCGCAGCAGGCGGATAAATTCGCACA
+GCAGCAACGGGCAAAACGGGCCGCCATTGATGCGAAAAGCCGGGGGCTGACTGACCGGCAGGCAGAACGGGAAGCCACGG
+AACAGCGCCTGAAGGAACAGTATGGCGATAATCCGCTGGCGCTGAATAACGTCATGTCAGAGCAGAAAAAGACCTGGGCG
+GCTGAAGACCAGCTTCGCGGGAACTGGATGGCAGGCCTGAAGTCCGGCTGGAGTGAGTGGGAAGAGAGCGCCACGGACAG
+TATGTCGCAGGTAAAAAGTGCAGCCACGCAGACCTTTGATGGTATTGCACAGAATATGGCGGCGATGCTGACCGGCAGTG
+AGCAGAACTGGCGCAGCTTCACCCGTTCCGTGCTGTCCATGATGACAGAAATTCTGCTTAAGCAGGCAATGGTGGGGATT
+GTCGGGAGTATCGGCAGCGCCATTGGCGGGGCTGTTGGTGGCGGCGCATCCGCGTCAGGCGGTACAGCCATTCAGGCCGC
+TGCGGCGAAATTCCATTTTGCAACCGGAGGATTTACGGGAACCGGCGGCAAATATGAGCCAGCGGGGATTGTTCACCGTG
+GTGAGTTTGTCTTCACGAAGGAGGCAACCAGCCGGATTGGCGTGGGGAATCTTTACCGGCTGATGCGCGGCTATGCCACC
+GGCGGTTATGTCGGTACACCGGGCAGCATGGCAGACAGCCGGTCGCAGGCGTCCGGGACGTTTGAGCAGAATAACCATGT
+GGTGATTAACAACGACGGCACGAACGGGCAGATAGGTCCGGCTGCTCTGAAGGCGGTGTATGACATGGCCCGCAAGGGTG
+CCCGTGATGAAATTCAGACACAGATGCGTGATGGTGGCCTGTTCTCCGGAGGTGGACGATGAAGACCTTCCGCTGGAAAG
+TGAAACCCGGTATGGATGTGGCTTCGGTCCCTTCTGTAAGAAAGGTGCGCTTTGGTGATGGCTATTCTCAGCGAGCGCCT
+GCCGGGCTGAATGCCAACCTGAAAACGTACAGCGTGACGCTTTCTGTCCCCCGTGAGGAGGCCACGGTACTGGAGTCGTT
+TCTGGAAGAGCACGGGGGCTGGAAATCCTTTCTGTGGACGCCGCCTTATGAGTGGCGGCAGATAAAGGTGACCTGCGCAA
+AATGGTCGTCGCGGGTCAGTATGCTGCGTGTTGAGTTCAGCGCAGAGTTTGAACAGGTGGTGAACTGATGCAGGATATCC
+GGCAGGAAACACTGAATGAATGCACCCGTGCGGAGCAGTCGGCCAGCGTGGTGCTCTGGGAAATCGACCTGACAGAGGTC
+GGTGGAGAACGTTATTTTTTCTGTAATGAGCAGAACGAAAAAGGTGAGCCGGTCACCTGGCAGGGGCGACAGTATCAGCC
+GTATCCCATTCAGGGGAGCGGTTTTGAACTGAATGGCAAAGGCACCAGTACGCGCCCCACGCTGACGGTTTCTAACCTGT
+ACGGTATGGTCACCGGGATGGCGGAAGATATGCAGAGTCTGGTCGGCGGAACGGTGGTCCGGCGTAAGGTTTACGCCCGT
+TTTCTGGATGCGGTGAACTTCGTCAACGGAAACAGTTACGCCGATCCGGAGCAGGAGGTGATCAGCCGCTGGCGCATTGA
+GCAGTGCAGCGAACTGAGCGCGGTGAGTGCCTCCTTTGTACTGTCCACGCCGACGGAAACGGATGGCGCTGTTTTTCCGG
+GACGTATCATGCTGGCCAACACCTGCACCTGGACCTATCGCGGTGACGAGTGCGGTTATAGCGGTCCGGCTGTCGCGGAT
+GAATATGACCAGCCAACGTCCGATATCACGAAGGATAAATGCAGCAAATGCCTGAGCGGTTGTAAGTTCCGCAATAACGT
+CGGCAACTTTGGCGGCTTCCTTTCCATTAACAAACTTTCGCAGTAAATCCCATGACACAGACAGAATCAGCGATTCTGGC
+GCACGCCCGGCGATGTGCGCCAGCGGAGTCGTGCGGCTTCGTGGTAAGCACGCCGGAGGGGGAAAGATATTTCCCCTGCG
+TGAATATCTCCGGTGAGCCGGAGGCGTATTTCCGTATGTCGCCGGAAGACTGGCTGCAGGCAGAAATGCAGGGTGAGATT
+GTGGCGCTGGTCCACAGCCACCCCGGTGGTCTGCCCTGGCTGAGTGAGGCCGACCGGCGGCTGCAGGTGCAGAGTGATTT
+GCCGTGGTGGCTGGTCTGCCGGGGGACGATTCATAAGTTCCGCTGTGTGCCGCATCTCACCGGGCGGCGCTTTGAGCACG
+GTGTGACGGACTGTTACACACTGTTCCGGGATGCTTATCATCTGGCGGGGATTGAGATGCCGGACTTTCATCGTGAGGAT
+GACTGGTGGCGTAACGGCCAGAATCTCTATCTGGATAATCTGGAGGCGACGGGGCTGTATCAGGTGCCGTTGTCAGCGGC
+ACAGCCGGGCGATGTGCTGCTGTGCTGTTTTGGTTCATCAGTGCCGAATCACGCCGCAATTTACTGCGGCGACGGCGAGC
+TGCTGCACCATATTCCTGAACAACTGAGCAAACGAGAGAGGTACACCGACAAATGGCAGCGACGCACACACTCCCTCTGG
+CGTCACCGGGCATGGCGCGCATCTGCCTTTACGGGGATTTACAACGATTTGGTCGCCGCATCGACCTTCGTGTGAAAACG
+GGGGCTGAAGCCATCCGGGCACTGGCCACACAGCTCCCGGCGTTTCGTCAGAAACTGAGCGACGGCTGGTATCAGGTACG
+GATTGCCGGGCGGGACGTCAGCACGTCCGGGTTAACGGCGCAGTTACATGAGACTCTGCCTGATGGCGCTGTAATTCATA
+TTGTTCCCAGAGTCGCCGGGGCCAAGTCAGGTGGCGTATTCCAGATTGTCCTGGGGGCTGCCGCCATTGCCGGATCATTC
+TTTACCGCCGGAGCCACCCTTGCAGCATGGGGGGCAGCCATTGGGGCCGGTGGTATGACCGGCATCCTGTTTTCTCTCGG
+TGCCAGTATGGTGCTCGGTGGTGTGGCGCAGATGCTGGCACCGAAAGCCAGAACTCCCCGTATACAGACAACGGATAACG
+GTAAGCAGAACACCTATTTCTCCTCACTGGATAACATGGTTGCCCAGGGCAATGTTCTGCCTGTTCTGTACGGGGAAATG
+CGCGTGGGGTCACGCGTGGTTTCTCAGGAGATCAGCACGGCAGACGAAGGGGACGGTGGTCAGGTTGTGGTGATTGGTCG
+CTGATGCAAAATGTTTTATGTGAAACCGCCTGCGGGCGGTTTTGTCATTTATGGAGCGTGAGGAATGGGTAAAGGAAGCA
+GTAAGGGGCATACCCCGCGCGAAGCGAAGGACAACCTGAAGTCCACGCAGTTGCTGAGTGTGATCGATGCCATCAGCGAA
+GGGCCGATTGAAGGTCCGGTGGATGGCTTAAAAAGCGTGCTGCTGAACAGTACGCCGGTGCTGGACACTGAGGGGAATAC
+CAACATATCCGGTGTCACGGTGGTGTTCCGGGCTGGTGAGCAGGAGCAGACTCCGCCGGAGGGATTTGAATCCTCCGGCT
+CCGAGACGGTGCTGGGTACGGAAGTGAAATATGACACGCCGATCACCCGCACCATTACGTCTGCAAACATCGACCGTCTG
+CGCTTTACCTTCGGTGTACAGGCACTGGTGGAAACCACCTCAAAGGGTGACAGGAATCCGTCGGAAGTCCGCCTGCTGGT
+TCAGATACAACGTAACGGTGGCTGGGTGACGGAAAAAGACATCACCATTAAGGGCAAAACCACCTCGCAGTATCTGGCCT
+CGGTGGTGATGGGTAACCTGCCGCCGCGCCCGTTTAATATCCGGATGCGCAGGATGACGCCGGACAGCACCACAGACCAG
+CTGCAGAACAAAACGCTCTGGTCGTCATACACTGAAATCATCGATGTGAAACAGTGCTACCCGAACACGGCACTGGTCGG
+CGTGCAGGTGGACTCGGAGCAGTTCGGCAGCCAGCAGGTGAGCCGTAATTATCATCTGCGCGGGCGTATTCTGCAGGTGC
+CGTCGAACTATAACCCGCAGACGCGGCAATACAGCGGTATCTGGGACGGAACGTTTAAACCGGCATACAGCAACAACATG
+GCCTGGTGTCTGTGGGATATGCTGACCCATCCGCGCTACGGCATGGGGAAACGTCTTGGTGCGGCGGATGTGGATAAATG
+GGCGCTGTATGTCATCGGCCAGTACTGCGACCAGTCAGTGCCGGACGGCTTTGGCGGCACGGAGCCGCGCATCACCTGTA
+ATGCGTACCTGACCACACAGCGTAAGGCGTGGGATGTGCTCAGCGATTTCTGCTCGGCGATGCGCTGTATGCCGGTATGG
+AACGGGCAGACGCTGACGTTCGTGCAGGACCGACCGTCGGATAAGACGTGGACCTATAACCGCAGTAATGTGGTGATGCC
+GGATGATGGCGCGCCGTTCCGCTACAGCTTCAGCGCCCTGAAGGACCGCCATAATGCCGTTGAGGTGAACTGGATTGACC
+CGAACAACGGCTGGGAGACGGCGACAGAGCTTGTTGAAGATACGCAGGCCATTGCCCGTTACGGTCGTAATGTTACGAAG
+ATGGATGCCTTTGGCTGTACCAGCCGGGGGCAGGCACACCGCGCCGGGCTGTGGCTGATTAAAACAGAACTGCTGGAAAC
+GCAGACCGTGGATTTCAGCGTCGGCGCAGAAGGGCTTCGCCATGTACCGGGCGATGTTATTGAAATCTGCGATGATGACT
+ATGCCGGTATCAGCACCGGTGGTCGTGTGCTGGCGGTGAACAGCCAGACCCGGACGCTGACGCTCGACCGTGAAATCACG
+CTGCCATCCTCCGGTACCGCGCTGATAAGCCTGGTTGACGGAAGTGGCAATCCGGTCAGCGTGGAGGTTCAGTCCGTCAC
+CGACGGCGTGAAGGTAAAAGTGAGCCGTGTTCCTGACGGTGTTGCTGAATACAGCGTATGGGAGCTGAAGCTGCCGACGC
+TGCGCCAGCGACTGTTCCGCTGCGTGAGTATCCGTGAGAACGACGACGGCACGTATGCCATCACCGCCGTGCAGCATGTG
+CCGGAAAAAGAGGCCATCGTGGATAACGGGGCGCACTTTGACGGCGAACAGAGTGGCACGGTGAATGGTGTCACGCCGCC
+AGCGGTGCAGCACCTGACCGCAGAAGTCACTGCAGACAGCGGGGAATATCAGGTGCTGGCGCGATGGGACACACCGAAGG
+TGGTGAAGGGCGTGAGTTTCCTGCTCCGTCTGACCGTAACAGCGGACGACGGCAGTGAGCGGCTGGTCAGCACGGCCCGG
+ACGACGGAAACCACATACCGCTTCACGCAACTGGCGCTGGGGAACTACAGGCTGACAGTCCGGGCGGTAAATGCGTGGGG
+GCAGCAGGGCGATCCGGCGTCGGTATCGTTCCGGATTGCCGCACCGGCAGCACCGTCGAGGATTGAGCTGACGCCGGGCT
+ATTTTCAGATAACCGCCACGCCGCATCTTGCCGTTTATGACCCGACGGTACAGTTTGAGTTCTGGTTCTCGGAAAAGCAG
+ATTGCGGATATCAGACAGGTTGAAACCAGCACGCGTTATCTTGGTACGGCGCTGTACTGGATAGCCGCCAGTATCAATAT
+CAAACCGGGCCATGATTATTACTTTTATATCCGCAGTGTGAACACCGTTGGCAAATCGGCATTCGTGGAGGCCGTCGGTC
+GGGCGAGCGATGATGCGGAAGGTTACCTGGATTTTTTCAAAGGCAAGATAACCGAATCCCATCTCGGCAAGGAGCTGCTG
+GAAAAAGTCGAGCTGACGGAGGATAACGCCAGCAGACTGGAGGAGTTTTCGAAAGAGTGGAAGGATGCCAGTGATAAGTG
+GAATGCCATGTGGGCTGTCAAAATTGAGCAGACCAAAGACGGCAAACATTATGTCGCGGGTATTGGCCTCAGCATGGAGG
+ACACGGAGGAAGGCAAACTGAGCCAGTTTCTGGTTGCCGCCAATCGTATCGCATTTATTGACCCGGCAAACGGGAATGAA
+ACGCCGATGTTTGTGGCGCAGGGCAACCAGATATTCATGAACGACGTGTTCCTGAAGCGCCTGACGGCCCCCACCATTAC
+CAGCGGCGGCAATCCTCCGGCCTTTTCCCTGACACCGGACGGAAAGCTGACCGCTAAAAATGCGGATATCAGTGGCAGTG
+TGAATGCGAACTCCGGGACGCTCAGTAATGTGACGATAGCTGAAAACTGTACGATAAACGGTACGCTGAGGGCGGAAAAA
+ATCGTCGGGGACATTGTAAAGGCGGCGAGCGCGGCTTTTCCGCGCCAGCGTGAAAGCAGTGTGGACTGGCCGTCAGGTAC
+CCGTACTGTCACCGTGACCGATGACCATCCTTTTGATCGCCAGATAGTGGTGCTTCCGCTGACGTTTCGCGGAAGTAAGC
+GTACTGTCAGCGGCAGGACAACGTATTCGATGTGTTATCTGAAAGTACTGATGAACGGTGCGGTGATTTATGATGGCGCG
+GCGAACGAGGCGGTACAGGTGTTCTCCCGTATTGTTGACATGCCAGCGGGTCGGGGAAACGTGATCCTGACGTTCACGCT
+TACGTCCACACGGCATTCGGCAGATATTCCGCCGTATACGTTTGCCAGCGATGTGCAGGTTATGGTGATTAAGAAACAGG
+CGCTGGGCATCAGCGTGGTCTGAGTGTGTTACAGAGGTTCGTCCGGGAACGGGCGTTTTATTATAAAACAGTGAGAGGTG
+AACGATGCGTAATGTGTGTATTGCCGTTGCTGTCTTTGCCGCACTTGCGGTGACAGTCACTCCGGCCCGTGCGGAAGGTG
+GACATGGTACGTTTACGGTGGGCTATTTTCAAGTGAAACCGGGTACATTGCCGTCGTTGTCGGGCGGGGATACCGGTGTG
+AGTCATCTGAAAGGGATTAACGTGAAGTACCGTTATGAGCTGACGGACAGTGTGGGGGTGATGGCTTCCCTGGGGTTCGC
+CGCGTCGAAAAAGAGCAGCACAGTGATGACCGGGGAGGATACGTTTCACTATGAGAGCCTGCGTGGACGTTATGTGAGCG
+TGATGGCCGGACCGGTTTTACAAATCAGTAAGCAGGTCAGTGCGTACGCCATGGCCGGAGTGGCTCACAGTCGGTGGTCC
+GGCAGTACAATGGATTACCGTAAGACGGAAATCACTCCCGGGTATATGAAAGAGACGACCACTGCCAGGGACGAAAGTGC
+AATGCGGCATACCTCAGTGGCGTGGAGTGCAGGTATACAGATTAATCCGGCAGCGTCCGTCGTTGTTGATATTGCTTATG
+AAGGCTCCGGCAGTGGCGACTGGCGTACTGACGGATTCATCGTTGGGGTCGGTTATAAATTCTGATTAGCCAGGTAACAC
+AGTGTTATGACAGCCCGCCGGAACCGGTGGGCTTTTTTGTGGGGTGAATATGGCAGTAAAGATTTCAGGAGTCCTGAAAG
+ACGGCACAGGAAAACCGGTACAGAACTGCACCATTCAGCTGAAAGCCAGACGTAACAGCACCACGGTGGTGGTGAACACG
+GTGGGCTCAGAGAATCCGGATGAAGCCGGGCGTTACAGCATGGATGTGGAGTACGGTCAGTACAGTGTCATCCTGCAGGT
+TGACGGTTTTCCACCATCGCACGCCGGGACCATCACCGTGTATGAAGATTCACAACCGGGGACGCTGAATGATTTTCTCT
+GTGCCATGACGGAGGATGATGCCCGGCCGGAGGTGCTGCGTCGTCTTGAACTGATGGTGGAAGAGGTGGCGCGTAACGCG
+TCCGTGGTGGCACAGAGTACGGCAGACGCGAAGAAATCAGCCGGCGATGCCAGTGCATCAGCTGCTCAGGTCGCGGCCCT
+TGTGACTGATGCAACTGACTCAGCACGCGCCGCCAGCACGTCCGCCGGACAGGCTGCATCGTCAGCTCAGGAAGCGTCCT
+CCGGCGCAGAAGCGGCATCAGCAAAGGCCACTGAAGCGGAAAAAAGTGCCGCAGCCGCAGAGTCCTCAAAAAACGCGGCG
+GCCACCAGTGCCGGTGCGGCGAAAACGTCAGAAACGAATGCTGCAGCGTCACAACAATCAGCCGCCACGTCTGCCTCCAC
+CGCGGCCACGAAAGCGTCAGAGGCCGCCACTTCAGCACGAGATGCGGTGGCCTCAAAAGAGGCAGCAAAATCATCAGAAA
+CGAACGCATCATCAAGTGCCGGTCGTGCAGCTTCCTCGGCAACGGCGGCAGAAAATTCTGCCAGGGCGGCAAAAACGTCC
+GAGACGAATGCCAGGTCATCTGAAACAGCAGCGGAACGGAGCGCCTCTGCCGCGGCAGACGCAAAAACAGCGGCGGCGGG
+GAGTGCGTCAACGGCATCCACGAAGGCGACAGAGGCTGCGGGAAGTGCGGTATCAGCATCGCAGAGCAAAAGTGCGGCAG
+AAGCGGCGGCAATACGTGCAAAAAATTCGGCAAAACGTGCAGAAGATATAGCTTCAGCTGTCGCGCTTGAGGATGCGGAC
+ACAACGAGAAAGGGGATAGTGCAGCTCAGCAGTGCAACCAACAGCACGTCTGAAACGCTTGCTGCAACGCCAAAGGCGGT
+TAAGGTGGTAATGGATGAAACGAACAGAAAAGCCCACTGGACAGTCCGGCACTGACCGGAACGCCAACAGCACCAACCGC
+GCTCAGGGGAACAAACAATACCCAGATTGCGAACACCGCTTTTGTACTGGCCGCGATTGCAGATGTTATCGACGCGTCAC
+CTGACGCACTGAATACGCTGAATGAACTGGCCGCAGCGCTCGGGAATGATCCAGATTTTGCTACCACCATGACTAACGCG
+CTTGCGGGTAAACAACCGAAGAATGCGACACTGACGGCGCTGGCAGGGCTTTCCACGGCGAAAAATAAATTACCGTATTT
+TGCGGAAAATGATGCCGCCAGCCTGACTGAACTGACTCAGGTTGGCAGGGATATTCTGGCAAAAAATTCCGTTGCAGATG
+TTCTTGAATACCTTGGGGCCGGTGAGAATTCGGCCTTTCCGGCAGGTGCGCCGATCCCGTGGCCATCAGATATCGTTCCG
+TCTGGCTACGTCCTGATGCAGGGGCAGGCGTTTGACAAATCAGCCTACCCAAAACTTGCTGTCGCGTATCCATCGGGTGT
+GCTTCCTGATATGCGAGGCTGGACAATCAAGGGGAAACCCGCCAGCGGTCGTGCTGTATTGTCTCAGGAACAGGATGGAA
+TTAAGTCGCACACCCACAGTGCCAGTGCATCCGGTACGGATTTGGGGACGAAAACCACATCGTCGTTTGATTACGGGACG
+AAAACAACAGGCAGTTTCGATTACGGCACCAAATCGACGAATAACACGGGGGCTCATGCTCACAGTCTGAGCGGTTCAAC
+AGGGGCCGCGGGTGCTCATGCCCACACAAGTGGTTTAAGGATGAACAGTTCTGGCTGGAGTCAGTATGGAACAGCAACCA
+TTACAGGAAGTTTATCCACAGTTAAAGGAACCAGCACACAGGGTATTGCTTATTTATCGAAAACGGACAGTCAGGGCAGC
+CACAGTCACTCATTGTCCGGTACAGCCGTGAGTGCCGGTGCACATGCGCATACAGTTGGTATTGGTGCGCACCAGCATCC
+GGTTGTTATCGGTGCTCATGCCCATTCTTTCAGTATTGGTTCACACGGACACACCATCACCGTTAACGCTGCGGGTAACG
+CGGAAAACACCGTCAAAAACATTGCATTTAACTATATTGTGAGGCTTGCATAATGGCATTCAGAATGAGTGAACAACCAC
+GGACCATAAAAATTTATAATCTGCTGGCCGGAACTAATGAATTTATTGGTGAAGGTGACGCATATATTCCGCCTCATACC
+GGTCTGCCTGCAAACAGTACCGATATTGCACCGCCAGATATTCCGGCTGGCTTTGTGGCTGTTTTCAACAGTGATGAGGC
+ATCGTGGCATCTCGTTGAAGACCATCGGGGTAAAACCGTCTATGACGTGGCTTCCGGCGACGCGTTATTTATTTCTGAAC
+TCGGTCCGTTACCGGAAAATTTTACCTGGTTATCGCCGGGAGGGGAATATCAGAAGTGGAACGGCACAGCCTGGGTGAAG
+GATACGGAAGCAGAAAAACTGTTCCGGATCCGGGAGGCGGAAGAAACAAAAAAAAGCCTGATGCAGGTAGCCAGTGAGCA
+TATTGCGCCGCTTCAGGATGCTGCAGATCTGGAAATTGCAACGAAGGAAGAAACCTCGTTGCTGGAAGCCTGGAAGAAGT
+ATCGGGTGTTGCTGAACCGTGTTGATACATCAACTGCACCTGATATTGAGTGGCCTGCTGTCCCTGTTATGGAGTAATCG
+TTTTGTGATATGCCGCAGAAACGTTGTATGAAATAACGTTCTGCGGTTAGTTAGTATATTGTAAAGCTGAGTATTGGTTT
+ATTTGGCGATTATTATCTTCAGGAGAATAATGGAAGTTCTATGACTCAATTGTTCATAGTGTTTACATCACCGCCAATTG
+CTTTTAAGACTGAACGCATGAAATATGGTTTTTCGTCATGTTTTGAGTCTGCTGTTGATATTTCTAAAGTCGGTTTTTTT
+TCTTCGTTTTCTCTAACTATTTTCCATGAAATACATTTTTGATTATTATTTGAATCAATTCCAATTACCTGAAGTCTTTC
+ATCTATAATTGGCATTGTATGTATTGGTTTATTGGAGTAGATGCTTGCTTTTCTGAGCCATAGCTCTGATATCCAAATGA
+AGCCATAGGCATTTGTTATTTTGGCTCTGTCAGCTGCATAACGCCAAAAAATATATTTATCTGCTTGATCTTCAAATGTT
+GTATTGATTAAATCAATTGGATGGAATTGTTTATCATAAAAAATTAATGTTTGAATGTGATAACCGTCCTTTAAAAAAGT
+CGTTTCTGCAAGCTTGGCTGTATAGTCAACTAACTCTTCTGTCGAAGTGATATTTTTAGGCTTATCTACCAGTTTTAGAC
+GCTCTTTAATATCTTCAGGAATTATTTTATTGTCATATTGTATCATGCTAAATGACAATTTGCTTATGGAGTAATCTTTT
+AATTTTAAATAAGTTATTCTCCTGGCTTCATCAAATAAAGAGTCGAATGATGTTGGCGAAATCACATCGTCACCCATTGG
+ATTGTTTATTTGTATGCCAAGAGAGTTACAGCAGTTATACATTCTGCCATAGATTATAGCTAAGGCATGTAATAATTCGT
+AATCTTTTAGCGTATTAGCGACCCATCGTCTTTCTGATTTAATAATAGATGATTCAGTTAAATATGAAGGTAATTTCTTT
+TGTGCAAGTCTGACTAACTTTTTTATACCAATGTTTAACATACTTTCATTTGTAATAAACTCAATGTCATTTTCTTCAAT
+GTAAGATGAAATAAGAGTAGCCTTTGCCTCGCTATACATTTCTAAATCGCCTTGTTTTTCTATCGTATTGCGAGAATTTT
+TAGCCCAAGCCATTAATGGATCATTTTTCCATTTTTCAATAACATTATTGTTATACCAAATGTCATATCCTATAATCTGG
+TTTTTGTTTTTTTGAATAATAAATGTTACTGTTCTTGCGGTTTGGAGGAATTGATTCAAATTCAAGCGAAATAATTCAGG
+GTCAAAATATGTATCAATGCAGCATTTGAGCAAGTGCGATAAATCTTTAAGTCTTCTTTCCCATGGTTTTTTAGTCATAA
+AACTCTCCATTTTGATAGGTTGCATGCTAGATGCTGATATATTTTAGAGGTGATAAAATTAACTGCTTAACTGTCAATGT
+AATACAAGTTGTTTGATCTTTGCAATGATTCTTATCAGAAACCATATAGTAAATTAGTTACACAGGAAATTTTTAATATT
+ATTATTATCATTCATTATGTATTAAAATTAGAGTTGTGGCTTGGCTCTGCTAACACGTTGCTCATAGGAGATATGGTAGA
+GCCGCAGACACGTCGTATGCAGGAACGTGCTGCGGCTGGCTGGTGAACTTCCGATAGTGCGGGTGTTGAATGATTTCCAG
+TTGCTACCGATTTTACATATTTTTTGCATGAGAGAATTTGTACCACCTCCCACCGACCATCTATGACTGTACGCCACTGT
+CCCTAGGACTGCTATGTGCCGGAGCGGACATTACAAACGTCCTTCTCGGTGCATGCCACTGTTGCCAATGACCTGCCTAG
+GAATTGGTTAGCAAGTTACTACCGGATTTTGTAAAAACAGCCCTCCTCATATAAAAAGTATTCGTTCACTTCCGATAAGC
+GTCGTAATTTTCTATCTTTCATCATATTCTAGATCCCTCTGAAAAAATCTTCCGAGTTTGCTAGGCACTGATACATAACT
+CTTTTCCAATAATTGGGGAAGTCATTCAAATCTATAATAGGTTTCAGATTTGCTTCAATAAATTCTGACTGTAGCTGCTG
+AAACGTTGCGGTTGAACTATATTTCCTTATAACTTTTACGAAAGAGTTTCTTTGAGTAATCACTTCACTCAAGTGCTTCC
+CTGCCTCCAAACGATACCTGTTAGCAATATTTAATAGCTTGAAATGATGAAGAGCTCTGTGTTTGTCTTCCTGCCTCCAG
+TTCGCCGGGCATTCAACATAAAAACTGATAGCACCCGGAGTTCCGGAAACGAAATTTGCATATACCCATTGCTCACGAAA
+AAAAATGTCCTTGTCGATATAGGGATGAATCGCTTGGTGTACCTCATCTACTGCGAAAACTTGACCTTTCTCTCCCATAT
+TGCAGTCGCGGCACGATGGAACTAAATTAATAGGCATCACCGAAAATTCAGGATAATGTGCAATAGGAAGAAAATGATCT
+ATATTTTTTGTCTGTCCTATATCACCACAAAATGGACATTTTTCACCTGATGAAACAAGCATGTCATCGTAATATGTTCT
+AGCGGGTTTGTTTTTATCTCGGAGATTATTTTCATAAAGCTTTTCTAATTTAACCTTTGTCAGGTTACCAACTACTAAGG
+TTGTAGGCTCAAGAGGGTGTGTCCTGTCGTAGGTAAATAACTGACCTGTCGAGCTTAATATTCTATATTGTTGTTCTTTC
+TGCAAAAAAGTGGGGAAGTGAGTAATGAAATTATTTCTAACATTTATCTGCATCATACCTTCCGAGCATTTATTAAGCAT
+TTCGCTATAAGTTCTCGCTGGAAGAGGTAGTTTTTTCATTGTACTTTACCTTCATCTCTGTTCATTATCATCGCTTTTAA
+AACGGTTCGACCTTCTAATCCTATCTGACCATTATAATTTTTTAGAATGGTTTCATAAGAAAGCTCTGAATCAACGGACT
+GCGATAATAAGTGGTGGTATCCAGAATTTGTCACTTCAAGTAAAAACACCTCACGAGTTAAAACACCTAAGTTCTCACCG
+AATGTCTCAATATCCGGACGGATAATATTTATTGCTTCTCTTGACCGTAGGACTTTCCACATGCAGGATTTTGGAACCTC
+TTGCAGTACTACTGGGGAATGAGTTGCAATTATTGCTACACCATTGCGTGCATCGAGTAAGTCGCTTAATGTTCGTAAAA
+AAGCAGAGAGCAAAGGTGGATGCAGATGAACCTCTGGTTCATCGAATAAAACTAATGACTTTTCGCCAACGACATCTACT
+AATCTTGTGATAGTAAATAAAACAATTGCATGTCCAGAGCTCATTCGAAGCAGATATTTCTGGATATTGTCATAAAACAA
+TTTAGTGAATTTATCATCGTCCACTTGAATCTGTGGTTCATTACGTCTTAACTCTTCATATTTAGAAATGAGGCTGATGA
+GTTCCATATTTGAAAAGTTTTCATCACTACTTAGTTTTTTGATAGCTTCAAGCCAGAGTTGTCTTTTTCTATCTACTCTC
+ATACAACCAATAAATGCTGAAATGAATTCTAAGCGGAGATCGCCTAGTGATTTTAAACTATTGCTGGCAGCATTCTTGAG
+TCCAATATAAAAGTATTGTGTACCTTTTGCTGGGTCAGGTTGTTCTTTAGGAGGAGTAAAAGGATCAAATGCACTAAACG
+AAACTGAAACAAGCGATCGAAAATATCCCTTTGGGATTCTTGACTCGATAAGTCTATTATTTTCAGAGAAAAAATATTCA
+TTGTTTTCTGGGTTGGTGATTGCACCAATCATTCCATTCAAAATTGTTGTTTTACCACACCCATTCCGCCCGATAAAAGC
+ATGAATGTTCGTGCTGGGCATAGAATTAACCGTCACCTCAAAAGGTATAGTTAAATCACTGAATCCGGGAGCACTTTTTC
+TATTAAATGAAAAGTGGAAATCTGACAATTCTGGCAAACCATTTAACACACGTGCGAACTGTCCATGAATTTCTGAAAGA
+GTTACCCCTCTAAGTAATGAGGTGTTAAGGACGCTTTCATTTTCAATGTCGGCTAATCGATTTGGCCATACTACTAAATC
+CTGAATAGCTTTAAGAAGGTTATGTTTAAAACCATCGCTTAATTTGCTGAGATTAACATAGTAGTCAATGCTTTCACCTA
+AGGAAAAAAACATTTCAGGGAGTTGACTGAATTTTTTATCTATTAATGAATAAGTGCTTACTTCTTCTTTTTGACCTACA
+AAACCAATTTTAACATTTCCGATATCGCATTTTTCACCATGCTCATCAAAGACAGTAAGATAAAACATTGTAACAAAGGA
+ATAGTCATTCCAACCATCTGCTCGTAGGAATGCCTTATTTTTTTCTACTGCAGGAATATACCCGCCTCTTTCAATAACAC
+TAAACTCCAACATATAGTAACCCTTAATTTTATTAAAATAACCGCAATTTATTTGGCGGCAACACAGGATCTCTCTTTTA
+AGTTACTCTCTATTACATACGTTTTCCATCTAAAAATTAGTAGTATTGAACTTAACGGGGCATCGTATTGTAGTTTTCCA
+TATTTAGCTTTCTGCTTCCTTTTGGATAACCCACTGTTATTCATGTTGCATGGTGCACTGTTTATACCAACGATATAGTC
+TATTAATGCATATATAGTATCGCCGAACGATTAGCTCTTCAGGCTTCTGAAGAAGCGTTTCAAGTACTAATAAGCCGATA
+GATAGCCACGGACTTCGTAGCCATTTTTCATAAGTGTTAACTTCCGCTCCTCGCTCATAACAGACATTCACTACAGTTAT
+GGCGGAAAGGTATGCATGCTGGGTGTGGGGAAGTCGTGAAAGAAAAGAAGTCAGCTGCGTCGTTTGACATCACTGCTATC
+TTCTTACTGGTTATGCAGGTCGTAGTGGGTGGCACACAAAGCTTTGCACTGGATTGCGAGGCTTTGTGCTTCTCTGGAGT
+GCGACAGGTTTGATGACAAAAAATTAGCGCAAGAAGACAAAAATCACCTTGCGCTAATGCTCTGTTACAGGTCACTAATA
+CCATCTAAGTAGTTGATTCATAGTGACTGCATATGTTGTGTTTTACAGTATTATGTAGTCTGTTTTTTATGCAAAATCTA
+ATTTAATATATTGATATTTATATCATTTTACGTTTCTCGTTCAGCTTTTTTATACTAAGTTGGCATTATAAAAAAGCATT
+GCTTATCAATTTGTTGCAACGAACAGGTCACTATCAGTCAAAATAAAATCATTATTTGATTTCAATTTTGTCCCACTCCC
+TGCCTCTGTCATCACGATACTGTGATGCCATGGTGTCCGACTTATGCCCGAGAAGATGTTGAGCAAACTTATCGCTTATC
+TGCTTCTCATAGAGTCTTGCAGACAAACTGCGCAACTCGTGAAAGGTAGGCGGATCCCCTTCGAAGGAAAGACCTGATGC
+TTTTCGTGCGCGCATAAAATACCTTGATACTGTGCCGGATGAAAGCGGTTCGCGACGAGTAGATGCAATTATGGTTTCTC
+CGCCAAGAATCTCTTTGCATTTATCAAGTGTTTCCTTCATTGATATTCCGAGAGCATCAATATGCAATGCTGTTGGGATG
+GCAATTTTTACGCCTGTTTTGCTTTGCTCGACATAAAGATATCCATCTACGATATCAGACCACTTCATTTCGCATAAATC
+ACCAACTCGTTGCCCGGTAACAACAGCCAGTTCCATTGCAAGTCTGAGCCAACATGGTGATGATTCTGCTGCTTGATAAA
+TTTTCAGGTATTCGTCAGCCGTAAGTCTTGATCTCCTTACCTCTGATTTTGCTGCGCGAGTGGCAGCGACATGGTTTGTT
+GTTATATGGCCTTCAGCTATTGCCTCTCGGAATGCATCGCTCAGTGTTGATCTGATTAACTTGGCTGACGCCGCCTTGCC
+CTCGTCTATGTATCCATTGAGCATTGCCGCAATTTCTTTTGTGGTGATGTCTTCAAGTGGAGCATCAGGCAGACCCCTCC
+TTATTGCTTTAATTTTGCTCATGTAATTTATGAGTGTCTTCTGCTTGATTCCTCTGCTGGCCAGGATTTTTTCGTAGCGA
+TCAAGCCATGAATGTAACGTAACGGAATTATCACTGTTGATTCTCGCTGTCAGAGGCTTGTGTTTGTGTCCTGAAAATAA
+CTCAATGTTGGCCTGTATAGCTTCAGTGATTGCGATTCGCCTGTCTCTGCCTAATCCAAACTCTTTACCCGTCCTTGGGT
+CCCTGTAGCAGTAATATCCATTGTTTCTTATATAAAGGTTAGGGGGTAAATCCCGGCGCTCATGACTTCGCCTTCTTCCC
+ATTTCTGATCCTCTTCAAAAGGCCACCTGTTACTGGTCGATTTAAGTCAACCTTTACCGCTGATTCGTGGAACAGATACT
+CTCTTCCATCCTTAACCGGAGGTGGGAATATCCTGCATTCCCGAACCCATCGACGAACTGTTTCAAGGCTTCTTGGACGT
+CGCTGGCGTGCGTTCCACTCCTGAAGTGTCAAGTACATCGCAAAGTCTCCGCAATTACACGCAAGAAAAAACCGCCATCA
+GGCGGCTTGGTGTTCTTTCAGTTCTTCAATTCGAATATTGGTTACGTCTGCATGTGCTATCTGCGCCCATATCATCCAGT
+GGTCGTAGCAGTCGTTGATGTTCTCCGCTTCGATAACTCTGTTGAATGGCTCTCCATTCCATTCTCCTGTGACTCGGAAG
+TGCATTTATCATCTCCATAAAACAAAACCCGCCGTAGCGAGTTCAGATAAAATAAATCCCCGCGAGTGCGAGGATTGTTA
+TGTAATATTGGGTTTAATCATCTATATGTTTTGTACAGAGAGGGCAAGTATCGTTTCCACCGTACTCGTGATAATAATTT
+TGCACGGTATCAGTCATTTCTCGCACATTGCAGAATGGGGATTTGTCTTCATTAGACTTATAAACCTTCATGGAATATTT
+GTATGCCGACTCTATATCTATACCTTCATCTACATAAACACCTTCGTGATGTCTGCATGGAGACAAGACACCGGATCTGC
+ACAACATTGATAACGCCCAATCTTTTTGCTCAGACTCTAACTCATTGATACTCATTTATAAACTCCTTGCAATGTATGTC
+GTTTCAGCTAAACGGTATCAGCAATGTTTATGTAAAGAAACAGTAAGATAATACTCAACCCGATGTTTGAGTACGGTCAT
+CATCTGACACTACAGACTCTGGCATCGCTGTGAAGACGACGCGAAATTCAGCATTTTCACAAGCGTTATCTTTTACAAAA
+CCGATCTCACTCTCCTTTGATGCGAATGCCAGCGTCAGACATCATATGCAGATACTCACCTGCATCCTGAACCCATTGAC
+CTCCAACCCCGTAATAGCGATGCGTAATGATGTCGATAGTTACTAACGGGTCTTGTTCGATTAACTGCCGCAGAAACTCT
+TCCAGGTCACCAGTGCAGTGCTTGATAACAGGAGTCTTCCCAGGATGGCGAACAACAAGAAACTGGTTTCCGTCTTCACG
+GACTTCGTTGCTTTCCAGTTTAGCAATACGCTTACTCCCATCCGAGATAACACCTTCGTAATACTCACGCTGCTCGTTGA
+GTTTTGATTTTGCTGTTTCAAGCTCAACACGCAGTTTCCCTACTGTTAGCGCAATATCCTCGTTCTCCTGGTCGCGGCGT
+TTGATGTATTGCTGGTTTCTTTCCCGTTCATCCAGCAGTTCCAGCACAATCGATGGTGTTACCAATTCATGGAAAAGGTC
+TGCGTCAAATCCCCAGTCGTCATGCATTGCCTGCTCTGCCGCTTCACGCAGTGCCTGAGAGTTAATTTCGCTCACTTCGA
+ACCTCTCTGTTTACTGATAAGTTCCAGATCCTCCTGGCAACTTGCACAAGTCCGACAACCCTGAACGACCAGGCGTCTTC
+GTTCATCTATCGGATCGCCACACTCACAACAATGAGTGGCAGATATAGCCTGGTGGTTCAGGCGGCGCATTTTTATTGCT
+GTGTTGCGCTGTAATTCTTCTATTTCTGATGCTGAATCAATGATGTCTGCCATCTTTCATTAATCCCTGAACTGTTGGTT
+AATACGCTTGAGGGTGAATGCGAATAATAAAAAAGGAGCCTGTAGCTCCCTGATGATTTTGCTTTTCATGTTCATCGTTC
+CTTAAAGACGCCGTTTAACATGCCGATTGCCAGGCTTAAATGAGTCGGTGTGAATCCCATCAGCGTTACCGTTTCGCGGT
+GCTTCTTCAGTACGCTACGGCAAATGTCATCGACGTTTTTATCCGGAAACTGCTGTCTGGCTTTTTTTGATTTCAGAATT
+AGCCTGACGGGCAATGCTGCGAAGGGCGTTTTCCTGCTGAGGTGTCATTGAACAAGTCCCATGTCGGCAAGCATAAGCAC
+ACAGAATATGAAGCCCGCTGCCAGAAAAATGCATTCCGTGGTTGTCATACCTGGTTTCTCTCATCTGCTTCTGCTTTCGC
+CACCATCATTTCCAGCTTTTGTGAAAGGGATGCGGCTAACGTATGAAATTCTTCGTCTGTTTCTACTGGTATTGGCACAA
+ACCTGATTCCAATTTGAGCAAGGCTATGTGCCATCTCGATACTCGTTCTTAACTCAACAGAAGATGCTTTGTGCATACAG
+CCCCTCGTTTATTATTTATCTCCTCAGCCAGCCGCTGTGCTTTCAGTGGATTTCGGATAACAGAAAGGCCGGGAAATACC
+CAGCCTCGCTTTGTAACGGAGTAGACGAAAGTGATTGCGCCTACCCGGATATTATCGTGAGGATGCGTCATCGCCATTGC
+TCCCCAAATACAAAACCAATTTCAGCCAGTGCCTCGTCCATTTTTTCGATGAACTCCGGCACGATCTCGTCAAAACTCGC
+CATGTACTTTTCATCCCGCTCAATCACGACATAATGCAGGCCTTCACGCTTCATACGCGGGTCATAGTTGGCAAAGTACC
+AGGCATTTTTTCGCGTCACCCACATGCTGTACTGCACCTGGGCCATGTAAGCTGACTTTATGGCCTCGAAACCACCGAGC
+CGGAACTTCATGAAATCCCGGGAGGTAAACGGGCATTTCAGTTCAAGGCCGTTGCCGTCACTGCATAAACCATCGGGAGA
+GCAGGCGGTACGCATACTTTCGTCGCGATAGATGATCGGGGATTCAGTAACATTCACGCCGGAAGTGAATTCAAACAGGG
+TTCTGGCGTCGTTCTCGTACTGTTTTCCCCAGGCCAGTGCTTTAGCGTTAACTTCCGGAGCCACACCGGTGCAAACCTCA
+GCAAGCAGGGTGTGGAAGTAGGACATTTTCATGTCAGGCCACTTCTTTCCGGAGCGGGGTTTTGCTATCACGTTGTGAAC
+TTCTGAAGCGGTGATGACGCCGAGCCGTAATTTGTGCCACGCATCATCCCCCTGTTCGACAGCTCTCACATCGATCCCGG
+TACGCTGCAGGATAATGTCCGGTGTCATGCTGCCACCTTCTGCTCTGCGGCTTTCTGTTTCAGGAATCCAAGAGCTTTTA
+CTGCTTCGGCCTGTGTCAGTTCTGACGATGCACGAATGTCGCGGCGAAATATCTGGGAACAGAGCGGCAATAAGTCGTCA
+TCCCATGTTTTATCCAGGGCGATCAGCAGAGTGTTAATCTCCTGCATGGTTTCATCGTTAACCGGAGTGATGTCGCGTTC
+CGGCTGACGTTCTGCAGTGTATGCAGTATTTTCGACAATGCGCTCGGCTTCATCCTTGTCATAGATACCAGCAAATCCGA
+AGGCCAGACGGGCACACTGAATCATGGCTTTATGACGTAACATCCGTTTGGGATGCGACTGCCACGGCCCCGTGATTTCT
+CTGCCTTCGCGAGTTTTGAATGGTTCGCGGCGGCATTCATCCATCCATTCGGTAACGCAGATCGGATGATTACGGTCCTT
+GCGGTAAATCCGGCATGTACAGGATTCATTGTCCTGCTCAAAGTCCATGCCATCAAACTGCTGGTTTTCATTGATGATGC
+GGGACCAGCCATCAACGCCCACCACCGGAACGATGCCATTCTGCTTATCAGGAAAGGCGTAAATTTCTTTCGTCCACGGA
+TTAAGGCCGTACTGGTTGGCAACGATCAGTAATGCGATGAACTGCGCATCGCTGGCATCACCTTTAAATGCCGTCTGGCG
+AAGAGTGGTGATCAGTTCCTGTGGGTCGACAGAATCCATGCCGACACGTTCAGCCAGCTTCCCAGCCAGCGTTGCGAGTG
+CAGTACTCATTCGTTTTATACCTCTGAATCAATATCAACCTGGTGGTGAGCAATGGTTTCAACCATGTACCGGATGTGTT
+CTGCCATGCGCTCCTGAAACTCAACATCGTCATCAAACGCACGGGTAATGGATTTTTTGCTGGCCCCGTGGCGTTGCAAA
+TGATCGATGCATAGCGATTCAAACAGGTGCTGGGGCAGGCCTTTTTCCATGTCGTCTGCCAGTTCTGCCTCTTTCTCTTC
+ACGGGCGAGCTGCTGGTAGTGACGCGCCCAGCTCTGAGCCTCAAGACGATCCTGAATGTAATAAGCGTTCATGGCTGAAC
+TCCTGAAATAGCTGTGAAAATATCGCCCGCGAAATGCCGGGCTGATTAGGAAAACAGGAAAGGGGGTTAGTGAATGCTTT
+TGCTTGATCTCAGTTTCAGTATTAATATCCATTTTTTATAAGCGTCGACGGCTTCACGAAACATCTTTTCATCGCCAATA
+AAAGTGGCGATAGTGAATTTAGTCTGGATAGCCATAAGTGTTTGATCCATTCTTTGGGACTCCTGGCTGATTAAGTATGT
+CGATAAGGCGTTTCCATCCGTCACGTAATTTACGGGTGATTCGTTCAAGTAAAGATTCGGAAGGGCAGCCAGCAACAGGC
+CACCCTGCAATGGCATATTGCATGGTGTGCTCCTTATTTATACATAACGAAAAACGCCTCGAGTGAAGCGTTATTGGTAT
+GCGGTAAAACCGCACTCAGGCGGCCTTGATAGTCATATCATCTGAATCAAATATTCCTGATGTATCGATATCGGTAATTC
+TTATTCCTTCGCTACCATCCATTGGAGGCCATCCTTCCTGACCATTTCCATCATTCCAGTCGAACTCACACACAACACCA
+TATGCATTTAAGTCGCTTGAAATTGCTATAAGCAGAGCATGTTGCGCCAGCATGATTAATACAGCATTTAATACAGAGCC
+GTGTTTATTGAGTCGGTATTCAGAGTCTGACCAGAAATTATTAATCTGGTGAAGTTTTTCCTCTGTCATTACGTCATGGT
+CGATTTCAATTTCTATTGATGCTTTCCAGTCGTAATCAATGATGTATTTTTTGATGTTTGACATCTGTTCATATCCTCAC
+AGATAAAAAATCGCCCTCACACTGGAGGGCAAAGAAGATTTCCAATAATCAGAACAAGTCGGCTCCTGTTTAGTTACGAG
+CGACATTGCTCCGTGTATTCACTCGTTGGAATGAATACACAGTGCAGTGTTTATTCTGTTATTTATGCCAAAAATAAAGG
+CCACTATCAGGCAGCTTTGTTGTTCTGTTTACCAAGTTCTCTGGCAATCATTGCCGTCGTTCGTATTGCCCATTTATCGA
+CATATTTCCCATCTTCCATTACAGGAAACATTTCTTCAGGCTTAACCATGCATTCCGATTGCAGCTTGCATCCATTGCAT
+CGCTTGAATTGTCCACACCATTGATTTTTATCAATAGTCGTAGTCATACGGATAGTCCTGGTATTGTTCCATCACATCCT
+GAGGATGCTCTTCGAACTCTTCAAATTCTTCTTCCATATATCACCTTAAATAGTGGATTGCGGTAGTAAAGATTGTGCCT
+GTCTTTTAACCACATCAGGCTCGGTGGTTCTCGTGTACCCCTACAGCGAGAAATCGGATAAACTATTACAACCCCTACAG
+TTTGATGAGTATAGAAATGGATCCACTCGTTATTCTCGGACGAGTGTTCAGTAATGAACCTCTGGAGAGAACCATGTATA
+TGATCGTTATCTGGGTTGGACTTCTGCTTTTAAGCCCAGATAACTGGCCTGAATATGTTAATGAGAGAATCGGTATTCCT
+CATGTGTGGCATGTTTTCGTCTTTGCTCTTGCATTTTCGCTAGCAATTAATGTGCATCGATTATCAGCTATTGCCAGCGC
+CAGATATAAGCGATTTAAGCTAAGAAAACGCATTAAGATGCAAAACGATAAAGTGCGATCAGTAATTCAAAACCTTACAG
+AAGAGCAATCTATGGTTTTGTGCGCAGCCCTTAATGAAGGCAGGAAGTATGTGGTTACATCAAAACAATTCCCATACATT
+AGTGAGTTGATTGAGCTTGGTGTGTTGAACAAAACTTTTTCCCGATGGAATGGAAAGCATATATTATTCCCTATTGAGGA
+TATTTACTGGACTGAATTAGTTGCCAGCTATGATCCATATAATATTGAGATAAAGCCAAGGCCAATATCTAAGTAACTAG
+ATAAGAGGAATCGATTTTCCCTTAATTTTCTGGCGTCCACTGCATGTTATGCCGCGTTCGCCAGGCTTGCTGTACCATGT
+GCGCTGATTCTTGCGCTCAATACGTTGCAGGTTGCTTTCAATCTGTTTGTGGTATTCAGCCAGCACTGTAAGGTCTATCG
+GATTTAGTGCGCTTTCTACTCGTGATTTCGGTTTGCGATTCAGCGAGAGAATAGGGCGGTTAACTGGTTTTGCGCTTACC
+CCAACCAACAGGGGATTTGCTGCTTTCCATTGAGCCTGTTTCTCTGCGCGACGTTCGCGGCGGCGTGTTTGTGCATCCAT
+CTGGATTCTCCTGTCAGTTAGCTTTGGTGGTGTGTGGCAGTTGTAGTCCTGAACGAAAACCCCCCGCGATTGGCACATTG
+GCAGCTAATCCGGAATCGCACTTACGGCCAATGCTTCGTTTCGTATCACACACCCCAAAGCCTTCTGCTTTGAATGCTGC
+CCTTCTTCAGGGCTTAATTTTTAAGAGCGTCACCTTCATGGTGGTCAGTGCGTCCTGCTGATGTGCTCAGTATCACCGCC
+AGTGGTATTTATGTCAACACCGCCAGAGATAATTTATCACCGCAGATGGTTATCTGTATGTTTTTTATATGAATTTATTT
+TTTGCAGGGGGGCATTGTTTGGTAGGTGAGAGATCTGAATTGCTATGTTTAGTGAGTTGTATCTATTTATTTTTCAATAA
+ATACAATTGGTTATGTGTTTTGGGGGCGATCGTGAGGCAAAGAAAACCCGGCGCTGAGGCCGGGTTATTCTTGTTCTCTG
+GTCAAATTATATAGTTGGAAAACAAGGATGCATATATGAATGAACGATGCAGAGGCAATGCCGATGGCGATAGTGGGTAT
+CATGTAGCCGCTTATGCTGGAAAGAAGCAATAACCCGCAGAAAAACAAAGCTCCAAGCTCAACAAAACTAAGGGCATAGA
+CAATAACTACCGATGTCATATACCCATACTCTCTAATCTTGGCCAGTCGGCGCGTTCTGCTTCCGATTAGAAACGTCAAG
+GCAGCAATCAGGATTGCAATCATGGTTCCTGCATATGATGACAATGTCGCCCCAAGACCATCTCTATGAGCTGAAAAAGA
+AACACCAGGAATGTAGTGGCGGAAAAGGAGATAGCAAATGCTTACGATAACGTAAGGAATTATTACTATGTAAACACCAG
+GCATGATTCTGTTCCGCATAATTACTCCTGATAATTAATCCTTAACTTTGCCCACCTGCCTTTTAAAACATTCCAGTATA
+TCACTTTTCATTCTTGCGTAGCAATATGCCATCTCTTCAGCTATCTCAGCATTGGTGACCTTGTTCAGAGGCGCTGAGAG
+ATGGCCTTTTTCTGATAGATAATGTTCTGTTAAAATATCTCCGGCCTCATCTTTTGCCCGCAGGCTAATGTCTGAAAATT
+GAGGTGACGGGTTAAAAATAATATCCTTGGCAACCTTTTTTATATCCCTTTTAAATTTTGGCTTAATGACTATATCCAAT
+GAGTCAAAAAGCTCCCCTTCAATATCTGTTGCCCCTAAGACCTTTAATATATCGCCAAATACAGGTAGCTTGGCTTCTAC
+CTTCACCGTTGTTCGGCCGATGAAATGCATATGCATAACATCGTCTTTGGTGGTTCCCCTCATCAGTGGCTCTATCTGAA
+CGCGCTCTCCACTGCTTAATGACATTCCTTTCCCGATTAAAAAATCTGTCAGATCGGATGTGGTCGGCCCGAAAACAGTT
+CTGGCAAAACCAATGGTGTCGCCTTCAACAAACAAAAAAGATGGGAATCCCAATGATTCGTCATCTGCGAGGCTGTTCTT
+AATATCTTCAACTGAAGCTTTAGAGCGATTTATCTTCTGAACCAGACTCTTGTCATTTGTTTTGGTAAAGAGAAAAGTTT
+TTCCATCGATTTTATGAATATACAAATAATTGGAGCCAACCTGCAGGTGATGATTATCAGCCAGCAGAGAATTAAGGAAA
+ACAGACAGGTTTATTGAGCGCTTATCTTTCCCTTTATTTTTGCTGCGGTAAGTCGCATAAAAACCATTCTTCATAATTCA
+ATCCATTTACTATGTTATGTTCTGAGGGGAGTGAAAATTCCCCTAATTCGATGAAGATTCTTGCTCAATTGTTATCAGCT
+ATGCGCCGACCAGAACACCTTGCCGATCAGCCAAACGTCTCTTCAGGCCACTGACTAGCGATAACTTTCCCCACAACGGA
+ACAACTCTCATTGCATGGGATCATTGGGTACTGTGGGTTTAGTGGTTGTAAAAACACCTGACCGCTATCCCTGATCAGTT
+TCTTGAAGGTAAACTCATCACCCCCAAGTCTGGCTATGCAGAAATCACCTGGCTCAACAGCCTGCTCAGGGTCAACGAGA
+ATTAACATTCCGTCAGGAAAGCTTGGCTTGGAGCCTGTTGGTGCGGTCATGGAATTACCTTCAACCTCAAGCCAGAATGC
+AGAATCACTGGCTTTTTTGGTTGTGCTTACCCATCTCTCCGCATCACCTTTGGTAAAGGTTCTAAGCTTAGGTGAGAACA
+TCCCTGCCTGAACATGAGAAAAAACAGGGTACTCATACTCACTTCTAAGTGACGGCTGCATACTAACCGCTTCATACATC
+TCGTAGATTTCTCTGGCGATTGAAGGGCTAAATTCTTCAACGCTAACTTTGAGAATTTTTGTAAGCAATGCGGCGTTATA
+AGCATTTAATGCATTGATGCCATTAAATAAAGCACCAACGCCTGACTGCCCCATCCCCATCTTGTCTGCGACAGATTCCT
+GGGATAAGCCAAGTTCATTTTTCTTTTTTTCATAAATTGCTTTAAGGCGACGTGCGTCCTCAAGCTGCTCTTGTGTTAAT
+GGTTTCTTTTTTGTGCTCATACGTTAAATCTATCACCGCAAGGGATAAATATCTAACACCGTGCGTGTTGACTATTTTAC
+CTCTGGCGGTGATAATGGTTGCATGTACTAAGGAGGTTGTATGGAACAACGCATAACCCTGAAAGATTATGCAATGCGCT
+TTGGGCAAACCAAGACAGCTAAAGATCTCGGCGTATATCAAAGCGCGATCAACAAGGCCATTCATGCAGGCCGAAAGATT
+TTTTTAACTATAAACGCTGATGGAAGCGTTTATGCGGAAGAGGTAAAGCCCTTCCCGAGTAACAAAAAAACAACAGCATA
+AATAACCCCGCTCTTACACATTCCAGCCCTGAAAAAGGGCATCAAATTAAACCACACCTATGGTGTATGCATTTATTTGC
+ATACATTCAATCAATTGTTATCTAAGGAAATACTTACATATGGTTCGTGCAAACAAACGCAACGAGGCTCTACGAATCGA
+GAGTGCGTTGCTTAACAAAATCGCAATGCTTGGAACTGAGAAGACAGCGGAAGCTGTGGGCGTTGATAAGTCGCAGATCA
+GCAGGTGGAAGAGGGACTGGATTCCAAAGTTCTCAATGCTGCTTGCTGTTCTTGAATGGGGGGTCGTTGACGACGACATG
+GCTCGATTGGCGCGACAAGTTGCTGCGATTCTCACCAATAAAAAACGCCCGGCGGCAACCGAGCGTTCTGAACAAATCCA
+GATGGAGTTCTGAGGTCATTACTGGATCTATCAACAGGAGTCATTATGACAAATACAGCAAAAATACTCAACTTCGGCAG
+AGGTAACTTTGCCGGACAGGAGCGTAATGTGGCAGATCTCGATGATGGTTACGCCAGACTATCAAATATGCTGCTTGAGG
+CTTATTCGGGCGCAGATCTGACCAAGCGACAGTTTAAAGTGCTGCTTGCCATTCTGCGTAAAACCTATGGGTGGAATAAA
+CCAATGGACAGAATCACCGATTCTCAACTTAGCGAGATTACAAAGTTACCTGTCAAACGGTGCAATGAAGCCAAGTTAGA
+ACTCGTCAGAATGAATATTATCAAGCAGCAAGGCGGCATGTTTGGACCAAATAAAAACATCTCAGAATGGTGCATCCCTC
+AAAACGAGGGAAAATCCCCTAAAACGAGGGATAAAACATCCCTCAAATTGGGGGATTGCTATCCCTCAAAACAGGGGGAC
+ACAAAAGACACTATTACAAAAGAAAAAAGAAAAGATTATTCGTCAGAGAATTCTGGCGAATCCTCTGACCAGCCAGAAAA
+CGACCTTTCTGTGGTGAAACCGGATGCTGCAATTCAGAGCGGCAGCAAGTGGGGGACAGCAGAAGACCTGACCGCCGCAG
+AGTGGATGTTTGACATGGTGAAGACTATCGCACCATCAGCCAGAAAACCGAATTTTGCTGGGTGGGCTAACGATATCCGC
+CTGATGCGTGAACGTGACGGACGTAACCACCGCGACATGTGTGTGCTGTTCCGCTGGGCATGCCAGGACAACTTCTGGTC
+CGGTAACGTGCTGAGCCCGGCCAAACTCCGCGATAAGTGGACCCAACTCGAAATCAACCGTAACAAGCAACAGGCAGGCG
+TGACAGCCAGCAAACCAAAACTCGACCTGACAAACACAGACTGGATTTACGGGGTGGATCTATGAAAAACATCGCCGCAC
+AGATGGTTAACTTTGACCGTGAGCAGATGCGTCGGATCGCCAACAACATGCCGGAACAGTACGACGAAAAGCCGCAGGTA
+CAGCAGGTAGCGCAGATCATCAACGGTGTGTTCAGCCAGTTACTGGCAACTTTCCCGGCGAGCCTGGCTAACCGTGACCA
+GAACGAAGTGAACGAAATCCGTCGCCAGTGGGTTCTGGCTTTTCGGGAAAACGGGATCACCACGATGGAACAGGTTAACG
+CAGGAATGCGCGTAGCCCGTCGGCAGAATCGACCATTTCTGCCATCACCCGGGCAGTTTGTTGCATGGTGCCGGGAAGAA
+GCATCCGTTACCGCCGGACTGCCAAACGTCAGCGAGCTGGTTGATATGGTTTACGAGTATTGCCGGAAGCGAGGCCTGTA
+TCCGGATGCGGAGTCTTATCCGTGGAAATCAAACGCGCACTACTGGCTGGTTACCAACCTGTATCAGAACATGCGGGCCA
+ATGCGCTTACTGATGCGGAATTACGCCGTAAGGCCGCAGATGAGCTTGTCCATATGACTGCGAGAATTAACCGTGGTGAG
+GCGATCCCTGAACCAGTAAAACAACTTCCTGTCATGGGCGGTAGACCTCTAAATCGTGCACAGGCTCTGGCGAAGATCGC
+AGAAATCAAAGCTAAGTTCGGACTGAAAGGAGCAAGTGTATGACGGGCAAAGAGGCAATTATTCATTACCTGGGGACGCA
+TAATAGCTTCTGTGCGCCGGACGTTGCCGCGCTAACAGGCGCAACAGTAACCAGCATAAATCAGGCCGCGGCTAAAATGG
+CACGGGCAGGTCTTCTGGTTATCGAAGGTAAGGTCTGGCGAACGGTGTATTACCGGTTTGCTACCAGGGAAGAACGGGAA
+GGAAAGATGAGCACGAACCTGGTTTTTAAGGAGTGTCGCCAGAGTGCCGCGATGAAACGGGTATTGGCGGTATATGGAGT
+TAAAAGATGACCATCTACATTACTGAGCTAATAACAGGCCTGCTGGTAATCGCAGGCCTTTTTATTTGGGGGAGAGGGAA
+GTCATGAAAAAACTAACCTTTGAAATTCGATCTCCAGCACATCAGCAAAACGCTATTCACGCAGTACAGCAAATCCTTCC
+AGACCCAACCAAACCAATCGTAGTAACCATTCAGGAACGCAACCGCAGCTTAGACCAAAACAGGAAGCTATGGGCCTGCT
+TAGGTGACGTCTCTCGTCAGGTTGAATGGCATGGTCGCTGGCTGGATGCAGAAAGCTGGAAGTGTGTGTTTACCGCAGCA
+TTAAAGCAGCAGGATGTTGTTCCTAACCTTGCCGGGAATGGCTTTGTGGTAATAGGCCAGTCAACCAGCAGGATGCGTGT
+AGGCGAATTTGCGGAGCTATTAGAGCTTATACAGGCATTCGGTACAGAGCGTGGCGTTAAGTGGTCAGACGAAGCGAGAC
+TGGCTCTGGAGTGGAAAGCGAGATGGGGAGACAGGGCTGCATGATAAATGTCGTTAGTTTCTCCGGTGGCAGGACGTCAG
+CATATTTGCTCTGGCTAATGGAGCAAAAGCGACGGGCAGGTAAAGACGTGCATTACGTTTTCATGGATACAGGTTGTGAA
+CATCCAATGACATATCGGTTTGTCAGGGAAGTTGTGAAGTTCTGGGATATACCGCTCACCGTATTGCAGGTTGATATCAA
+CCCGGAGCTTGGACAGCCAAATGGTTATACGGTATGGGAACCAAAGGATATTCAGACGCGAATGCCTGTTCTGAAGCCAT
+TTATCGATATGGTAAAGAAATATGGCACTCCATACGTCGGCGGCGCGTTCTGCACTGACAGATTAAAACTCGTTCCCTTC
+ACCAAATACTGTGATGACCATTTCGGGCGAGGGAATTACACCACGTGGATTGGCATCAGAGCTGATGAACCGAAGCGGCT
+AAAGCCAAAGCCTGGAATCAGATATCTTGCTGAACTGTCAGACTTTGAGAAGGAAGATATCCTCGCATGGTGGAAGCAAC
+AACCATTCGATTTGCAAATACCGGAACATCTCGGTAACTGCATATTCTGCATTAAAAAATCAACGCAAAAAATCGGACTT
+GCCTGCAAAGATGAGGAGGGATTGCAGCGTGTTTTTAATGAGGTCATCACGGGATCCCATGTGCGTGACGGACATCGGGA
+AACGCCAAAGGAGATTATGTACCGAGGAAGAATGTCGCTGGACGGTATCGCGAAAATGTATTCAGAAAATGATTATCAAG
+CCCTGTATCAGGACATGGTACGAGCTAAAAGATTCGATACCGGCTCTTGTTCTGAGTCATGCGAAATATTTGGAGGGCAG
+CTTGATTTCGACTTCGGGAGGGAAGCTGCATGATGCGATGTTATCGGTGCGGTGAATGCAAAGAAGATAACCGCTTCCGA
+CCAAATCAACCTTACTGGAATCGATGGTGTCTCCGGTGTGAAAGAACACCAACAGGGGTGTTACCACTACCGCAGGAAAA
+GGAGGACGTGTGGCGAGACAGCGACGAAGTATCACCGACATAATCTGCGAAAACTGCAAATACCTTCCAACGAAACGCAC
+CAGAAATAAACCCAAGCCAATCCCAAAAGAATCTGACGTAAAAACCTTCAACTACACGGCTCACCTGTGGGATATCCGGT
+GGCTAAGACGTCGTGCGAGGAAAACAAGGTGATTGACCAAAATCGAAGTTACGAACAAGAAAGCGTCGAGCGAGCTTTAA
+CGTGCGCTAACTGCGGTCAGAAGCTGCATGTGCTGGAAGTTCACGTGTGTGAGCACTGCTGCGCAGAACTGATGAGCGAT
+CCGAATAGCTCGATGCACGAGGAAGAAGATGATGGCTAAACCAGCGCGAAGACGATGTAAAAACGATGAATGCCGGGAAT
+GGTTTCACCCTGCATTCGCTAATCAGTGGTGGTGCTCTCCAGAGTGTGGAACCAAGATAGCACTCGAACGACGAAGTAAA
+GAACGCGAAAAAGCGGAAAAAGCAGCAGAGAAGAAACGACGACGAGAGGAGCAGAAACAGAAAGATAAACTTAAGATTCG
+AAAACTCGCCTTAAAGCCCCGCAGTTACTGGATTAAACAAGCCCAACAAGCCGTAAACGCCTTCATCAGAGAAAGAGACC
+GCGACTTACCATGTATCTCGTGCGGAACGCTCACGTCTGCTCAGTGGGATGCCGGACATTACCGGACAACTGCTGCGGCA
+CCTCAACTCCGATTTAATGAACGCAATATTCACAAGCAATGCGTGGTGTGCAACCAGCACAAAAGCGGAAATCTCGTTCC
+GTATCGCGTCGAACTGATTAGCCGCATCGGGCAGGAAGCAGTAGACGAAATCGAATCAAACCATAACCGCCATCGCTGGA
+CTATCGAAGAGTGCAAGGCGATCAAGGCAGAGTACCAACAGAAACTCAAAGACCTGCGAAATAGCAGAAGTGAGGCCGCA
+TGACGTTCTCAGTAAAAACCATTCCAGACATGCTCGTTGAAACATACGGAAATCAGACAGAAGTAGCACGCAGACTGAAA
+TGTAGTCGCGGTACGGTCAGAAAATACGTTGATGATAAAGACGGGAAAATGCACGCCATCGTCAACGACGTTCTCATGGT
+TCATCGCGGATGGAGTGAAAGAGATGCGCTATTACGAAAAAATTGATGGCAGCAAATACCGAAATATTTGGGTAGTTGGC
+GATCTGCACGGATGCTACACGAACCTGATGAACAAACTGGATACGATTGGATTCGACAACAAAAAAGACCTGCTTATCTC
+GGTGGGCGATTTGGTTGATCGTGGTGCAGAGAACGTTGAATGCCTGGAATTAATCACATTCCCCTGGTTCAGAGCTGTAC
+GTGGAAACCATGAGCAAATGATGATTGATGGCTTATCAGAGCGTGGAAACGTTAATCACTGGCTGCTTAATGGCGGTGGC
+TGGTTCTTTAATCTCGATTACGACAAAGAAATTCTGGCTAAAGCTCTTGCCCATAAAGCAGATGAACTTCCGTTAATCAT
+CGAACTGGTGAGCAAAGATAAAAAATATGTTATCTGCCACGCCGATTATCCCTTTGACGAATACGAGTTTGGAAAGCCAG
+TTGATCATCAGCAGGTAATCTGGAACCGCGAACGAATCAGCAACTCACAAAACGGGATCGTGAAAGAAATCAAAGGCGCG
+GACACGTTCATCTTTGGTCATACGCCAGCAGTGAAACCACTCAAGTTTGCCAACCAAATGTATATCGATACCGGCGCAGT
+GTTCTGCGGAAACCTAACATTGATTCAGGTACAGGGAGAAGGCGCATGAGACTCGAAAGCGTAGCTAAATTTCATTCGCC
+AAAAAGCCCGATGATGAGCGACTCACCACGGGCCACGGCTTCTGACTCTCTTTCCGGTACTGATGTGATGGCTGCTATGG
+GGATGGCGCAATCACAAGCCGGATTCGGTATGGCTGCATTCTGCGGTAAGCACGAACTCAGCCAGAACGACAAACAAAAG
+GCTATCAACTATCTGATGCAATTTGCACACAAGGTATCGGGGAAATACCGTGGTGTGGCAAAGCTTGAAGGAAATACTAA
+GGCAAAGGTACTGCAAGTGCTCGCAACATTCGCTTATGCGGATTATTGCCGTAGTGCCGCGACGCCGGGGGCAAGATGCA
+GAGATTGCCATGGTACAGGCCGTGCGGTTGATATTGCCAAAACAGAGCTGTGGGGGAGAGTTGTCGAGAAAGAGTGCGGA
+AGATGCAAAGGCGTCGGCTATTCAAGGATGCCAGCAAGCGCAGCATATCGCGCTGTGACGATGCTAATCCCAAACCTTAC
+CCAACCCACCTGGTCACGCACTGTTAAGCCGCTGTATGACGCTCTGGTGGTGCAATGCCACAAAGAAGAGTCAATCGCAG
+ACAACATTTTGAATGCGGTCACACGTTAGCAGCATGATTGCCACGGATGGCAACATATTAACGGCATGATATTGACTTAT
+TGAATAAAATTGGGTAAATTTGACTCAACGATGGGTTAATTCGCTCGTTGTGGTAGTGAGATGAAAAGAGGCGGCGCTTA
+CTACCGATTCCGCCTAGTTGGTCACTTCGACGTATCGTCTGGAACTCCAACCATCGCAGGCAGAGAGGTCTGCAAAATGC
+AATCCCGAAACAGTTCGCAGGTAATAGTTAGAGCCTGCATAACGGTTTCGGGATTTTTTATATCTGCACAACAGGTAAGA
+GCATTGAGTCGATAATCGTGAAGAGTCGGCGAGCCTGGTTAGCCAGTGCTCTTTCCGTTGTGCTGAATTAAGCGAATACC
+GGAAGCAGAACCGGATCACCAAATGCGTACAGGCGTCATCGCCGCCCAGCAACAGCACAACCCAAACTGAGCCGTAGCCA
+CTGTCTGTCCTGAATTCATTAGTAATAGTTACGCTGCGGCCTTTTACACATGACCTTCGTGAAAGCGGGTGGCAGGAGGT
+CGCGCTAACAACCTCCTGCCGTTTTGCCCGTGCATATCGGTCACGAACAAATCTGATTACTAAACACAGTAGCCTGGATT
+TGTTCTATCAGTAATCGACCTTATTCCTAATTAAATAGAGCAAATCCCCTTATTGGGGGTAAGACATGAAGATGCCAGAA
+AAACATGACCTGTTGGCCGCCATTCTCGCGGCAAAGGAACAAGGCATCGGGGCAATCCTTGCGTTTGCAATGGCGTACCT
+TCGCGGCAGATATAATGGCGGTGCGTTTACAAAAACAGTAATCGACGCAACGATGTGCGCCATTATCGCCTAGTTCATTC
+GTGACCTTCTCGACTTCGCCGGACTAAGTAGCAATCTCGCTTATATAACGAGCGTGTTTATCGGCTACATCGGTACTGAC
+TCGATTGGTTCGCTTATCAAACGCTTCGCTGCTAAAAAAGCCGGAGTAGAAGATGGTAGAAATCAATAATCAACGTAAGG
+CGTTCCTCGATATGCTGGCGTGGTCGGAGGGAACTGATAACGGACGTCAGAAAACCAGAAATCATGGTTATGACGTCATT
+GTAGGCGGAGAGCTATTTACTGATTACTCCGATCACCCTCGCAAACTTGTCACGCTAAACCCAAAACTCAAATCAACAGG
+CGCCGGACGCTACCAGCTTCTTTCCCGTTGGTGGGATGCCTACCGCAAGCAGCTTGGCCTGAAAGACTTCTCTCCGAAAA
+GTCAGGACGCTGTGGCATTGCAGCAGATTAAGGAGCGTGGCGCTTTACCTATGATTGATCGTGGTGATATCCGTCAGGCA
+ATCGACCGTTGCAGCAATATCTGGGCTTCACTGCCGGGCGCTGGTTATGGTCAGTTCGAGCATAAGGCTGACAGCCTGAT
+TGCAAAATTCAAAGAAGCGGGCGGAACGGTCAGAGAGATTGATGTATGAGCAGAGTCACCGCGATTATCTCCGCTCTGGT
+TATCTGCATCATCGTCTGCCTGTCATGGGCTGTTAATCATTACCGTGATAACGCCATTACCTACAAAGCCCAGCGCGACA
+AAAATGCCAGAGAACTGAAGCTGGCGAACGCGGCAATTACTGACATGCAGATGCGTCAGCGTGATGTTGCTGCGCTCGAT
+GCAAAATACACGAAGGAGTTAGCTGATGCTAAAGCTGAAAATGATGCTCTGCGTGATGATGTTGCCGCTGGTCGTCGTCG
+GTTGCACATCAAAGCAGTCTGTCAGTCAGTGCGTGAAGCCACCACCGCCTCCGGCGTGGATAATGCAGCCTCCCCCCGAC
+TGGCAGACACCGCTGAACGGGATTATTTCACCCTCAGAGAGAGGCTGATCACTATGCAAAAACAACTGGAAGGAACCCAG
+AAGTATATTAATGAGCAGTGCAGATAGAGTTGCCCATATCGATGGGCAACTCATGCAATTATTGTGAGCAATACACACGC
+GCTTCCAGCGGAGTATAAATGCCTAAAGTAATAAAACCGAGCAATCCATTTACGAATGTTTGCTGGGTTTCTGTTTTAAC
+AACATTTTCTGCGCCGCCACAAATTTTGGCTGCATCGACAGTTTTCTTCTGCCCAATTCCAGAAACGAAGAAATGATGGG
+TGATGGTTTCCTTTGGTGCTACTGCTGCCGGTTTGTTTTGAACAGTAAACGTCTGTTGAGCACATCCTGTAATAAGCAGG
+GCCAGCGCAGTAGCGAGTAGCATTTTTTTCATGGTGTTATTCCCGATGCTTTTTGAAGTTCGCAGAATCGTATGTGTAGA
+AAATTAAACAAACCCTAAACAATGAGTTGAAATTTCATATTGTTAATATTTATTAATGTATGTCAGGTGCGATGAATCGT
+CATTGTATTCCCGGATTAACTATGTCCACAGCCCTGACGGGGAACTTCTCTGCGGGAGTGTCCGGGAATAATTAAAACGA
+TGCACACAGGGTTTAGCGCGTACACGTATTGCATTATGCCAACGCCCCGGTGCTGACACGGAAGAAACCGGACGTTATGA
+TTTAGCGTGGAAAGATTTGTGTAGTGTTCTGAATGCTCTCAGTAAATAGTAATGAATTATCAAAGGTATAGTAATATCTT
+TTATGTTCATGGATATTTGTAACCCATCGGAAAACTCCTGCTTTAGCAAGATTTTCCCTGTATTGCTGAAATGTGATTTC
+TCTTGATTTCAACCTATCATAGGACGTTTCTATAAGATGCGTGTTTCTTGAGAATTTAACATTTACAACCTTTTTAAGTC
+CTTTTATTAACACGGTGTTATCGTTTTCTAACACGATGTGAATATTATCTGTGGCTAGATAGTAAATATAATGTGAGACG
+TTGTGACGTTTTAGTTCAGAATAAAACAATTCACAGTCTAAATCTTTTCGCACTTGATCGAATATTTCTTTAAAAATGGC
+AACCTGAGCCATTGGTAAAACCTTCCATGTGATACGAGGGCGCGTAGTTTGCATTATCGTTTTTATCGTTTCAATCTGGT
+CTGACCTCCTTGTGTTTTGTTGATGATTTATGTCAAATATTAGGAATGTTTTCACTTAATAGTATTGGTTGCGTAACAAA
+GTGCGGTCCTGCTGGCATTCTGGAGGGAAATACAACCGACAGATGTATGTAAGGCCAACGTGCTCAAATCTTCATACAGA
+AAGATTTGAAGTAATATTTTAACCGCTAGATGAAGAGCAAGCGCATGGAGCGACAAAATGAATAAAGAACAATCTGCTGA
+TGATCCCTCCGTGGATCTGATTCGTGTAAAAAATATGCTTAATAGCACCATTTCTATGAGTTACCCTGATGTTGTAATTG
+CATGTATAGAACATAAGGTGTCTCTGGAAGCATTCAGAGCAATTGAGGCAGCGTTGGTGAAGCACGATAATAATATGAAG
+GATTATTCCCTGGTGGTTGACTGATCACCATAACTGCTAATCATTCAAACTATTTAGTCTGTGACAGAGCCAACACGCAG
+TCTGTCACTGTCAGGAAAGTGGTAAAACTGCAACTCAATTACTGCAATGCCCTCGTAATTAAGTGAATTTACAATATCGT
+CCTGTTCGGAGGGAAGAACGCGGGATGTTCATTCTTCATCACTTTTAATTGATGTATATGCTCTCTTTTCTGACGTTAGT
+CTCCGACGGCAGGCTTCAATGACCCAGGCTGAGAAATTCCCGGACCCTTTTTGCTCAAGAGCGATGTTAATTTGTTCAAT
+CATTTGGTTAGGAAAGCGGATGTTGCGGGTTGTTGTTCTGCGGGTTCTGTTCTTCGTTGACATGAGGTTGCCCCGTATTC
+AGTGTCGCTGATTTGTATTGTCTGAAGTTGTTTTTACGTTAAGTTGATGCAGATCAATTAATACGATACCTGCGTCATAA
+TTGATTATTTGACGTGGTTTGATGGCCTCCACGCACGTTGTGATATGTAGATGATAATCATTATCACTTTACGGGTCCTT
+TCCGGTGATCCGACAGGTTACG
diff --git a/pbcore/data/lambdaNEB.fa.fai b/pbcore/data/lambdaNEB.fa.fai
new file mode 100644
index 0000000..064af36
--- /dev/null
+++ b/pbcore/data/lambdaNEB.fa.fai
@@ -0,0 +1 @@
+lambda_NEB3011 48502 16 80 81
diff --git a/pbcore/data/m110818_075520_42141_c100129202555500000315043109121112_s1_p0.bas.h5 b/pbcore/data/m110818_075520_42141_c100129202555500000315043109121112_s1_p0.bas.h5
new file mode 100644
index 0000000..34df9ed
Binary files /dev/null and b/pbcore/data/m110818_075520_42141_c100129202555500000315043109121112_s1_p0.bas.h5 differ
diff --git a/pbcore/data/m110818_075520_42141_c100129202555500000315043109121112_s2_p0.bas.h5 b/pbcore/data/m110818_075520_42141_c100129202555500000315043109121112_s2_p0.bas.h5
new file mode 100644
index 0000000..5e254dc
Binary files /dev/null and b/pbcore/data/m110818_075520_42141_c100129202555500000315043109121112_s2_p0.bas.h5 differ
diff --git a/pbcore/data/m130522_092457_42208_c100497142550000001823078008081323_s1_p0.1.bax.h5 b/pbcore/data/m130522_092457_42208_c100497142550000001823078008081323_s1_p0.1.bax.h5
new file mode 100755
index 0000000..60eb003
Binary files /dev/null and b/pbcore/data/m130522_092457_42208_c100497142550000001823078008081323_s1_p0.1.bax.h5 differ
diff --git a/pbcore/data/m130522_092457_42208_c100497142550000001823078008081323_s1_p0.2.bax.h5 b/pbcore/data/m130522_092457_42208_c100497142550000001823078008081323_s1_p0.2.bax.h5
new file mode 100755
index 0000000..ebf625f
Binary files /dev/null and b/pbcore/data/m130522_092457_42208_c100497142550000001823078008081323_s1_p0.2.bax.h5 differ
diff --git a/pbcore/data/m130522_092457_42208_c100497142550000001823078008081323_s1_p0.3.bax.h5 b/pbcore/data/m130522_092457_42208_c100497142550000001823078008081323_s1_p0.3.bax.h5
new file mode 100755
index 0000000..0a2f3ac
Binary files /dev/null and b/pbcore/data/m130522_092457_42208_c100497142550000001823078008081323_s1_p0.3.bax.h5 differ
diff --git a/pbcore/data/m130522_092457_42208_c100497142550000001823078008081323_s1_p0.bas.h5 b/pbcore/data/m130522_092457_42208_c100497142550000001823078008081323_s1_p0.bas.h5
new file mode 100755
index 0000000..441e404
Binary files /dev/null and b/pbcore/data/m130522_092457_42208_c100497142550000001823078008081323_s1_p0.bas.h5 differ
diff --git a/pbcore/data/m130727_114215_42211_c100569412550000001823090301191423_s1_p0.1.ccs.h5 b/pbcore/data/m130727_114215_42211_c100569412550000001823090301191423_s1_p0.1.ccs.h5
new file mode 100755
index 0000000..3097c4d
Binary files /dev/null and b/pbcore/data/m130727_114215_42211_c100569412550000001823090301191423_s1_p0.1.ccs.h5 differ
diff --git a/pbcore/data/m130731_192718_42129_c100564662550000001823085912221321_s1_p0.1.bax.h5 b/pbcore/data/m130731_192718_42129_c100564662550000001823085912221321_s1_p0.1.bax.h5
new file mode 100755
index 0000000..e8904f6
Binary files /dev/null and b/pbcore/data/m130731_192718_42129_c100564662550000001823085912221321_s1_p0.1.bax.h5 differ
diff --git a/pbcore/data/m130731_192718_42129_c100564662550000001823085912221321_s1_p0.1.rgn.h5 b/pbcore/data/m130731_192718_42129_c100564662550000001823085912221321_s1_p0.1.rgn.h5
new file mode 100644
index 0000000..00b0bb4
Binary files /dev/null and b/pbcore/data/m130731_192718_42129_c100564662550000001823085912221321_s1_p0.1.rgn.h5 differ
diff --git a/pbcore/data/m130731_192718_42129_c100564662550000001823085912221321_s1_p0.2.bax.h5 b/pbcore/data/m130731_192718_42129_c100564662550000001823085912221321_s1_p0.2.bax.h5
new file mode 100755
index 0000000..c603d36
Binary files /dev/null and b/pbcore/data/m130731_192718_42129_c100564662550000001823085912221321_s1_p0.2.bax.h5 differ
diff --git a/pbcore/data/m130731_192718_42129_c100564662550000001823085912221321_s1_p0.2.rgn.h5 b/pbcore/data/m130731_192718_42129_c100564662550000001823085912221321_s1_p0.2.rgn.h5
new file mode 100644
index 0000000..6abd850
Binary files /dev/null and b/pbcore/data/m130731_192718_42129_c100564662550000001823085912221321_s1_p0.2.rgn.h5 differ
diff --git a/pbcore/data/m130731_192718_42129_c100564662550000001823085912221321_s1_p0.3.bax.h5 b/pbcore/data/m130731_192718_42129_c100564662550000001823085912221321_s1_p0.3.bax.h5
new file mode 100755
index 0000000..3b27701
Binary files /dev/null and b/pbcore/data/m130731_192718_42129_c100564662550000001823085912221321_s1_p0.3.bax.h5 differ
diff --git a/pbcore/data/m130731_192718_42129_c100564662550000001823085912221321_s1_p0.3.rgn.h5 b/pbcore/data/m130731_192718_42129_c100564662550000001823085912221321_s1_p0.3.rgn.h5
new file mode 100644
index 0000000..6fa5269
Binary files /dev/null and b/pbcore/data/m130731_192718_42129_c100564662550000001823085912221321_s1_p0.3.rgn.h5 differ
diff --git a/pbcore/data/m130731_192718_42129_c100564662550000001823085912221321_s1_p0.bas.h5 b/pbcore/data/m130731_192718_42129_c100564662550000001823085912221321_s1_p0.bas.h5
new file mode 100755
index 0000000..eaf01e9
Binary files /dev/null and b/pbcore/data/m130731_192718_42129_c100564662550000001823085912221321_s1_p0.bas.h5 differ
diff --git a/pbcore/data/m140307_221913_42203_c100626172550000001823119008061414_s1_p0.1.bc.h5 b/pbcore/data/m140307_221913_42203_c100626172550000001823119008061414_s1_p0.1.bc.h5
new file mode 100644
index 0000000..479164c
Binary files /dev/null and b/pbcore/data/m140307_221913_42203_c100626172550000001823119008061414_s1_p0.1.bc.h5 differ
diff --git a/pbcore/data/m140307_221913_42203_c100626172550000001823119008061414_s1_p0.2.bc.h5 b/pbcore/data/m140307_221913_42203_c100626172550000001823119008061414_s1_p0.2.bc.h5
new file mode 100644
index 0000000..dcc9265
Binary files /dev/null and b/pbcore/data/m140307_221913_42203_c100626172550000001823119008061414_s1_p0.2.bc.h5 differ
diff --git a/pbcore/data/m140307_221913_42203_c100626172550000001823119008061414_s1_p0.3.bc.h5 b/pbcore/data/m140307_221913_42203_c100626172550000001823119008061414_s1_p0.3.bc.h5
new file mode 100644
index 0000000..3b26e19
Binary files /dev/null and b/pbcore/data/m140307_221913_42203_c100626172550000001823119008061414_s1_p0.3.bc.h5 differ
diff --git a/pbcore/data/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.1.bax.h5 b/pbcore/data/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.1.bax.h5
new file mode 100644
index 0000000..f50c379
Binary files /dev/null and b/pbcore/data/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.1.bax.h5 differ
diff --git a/pbcore/data/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.subreads.bam b/pbcore/data/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.subreads.bam
new file mode 100644
index 0000000..9697285
Binary files /dev/null and b/pbcore/data/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.subreads.bam differ
diff --git a/pbcore/data/m140912_020930_00114_c100702482550000001823141103261590_s1_p0.1.bax.h5 b/pbcore/data/m140912_020930_00114_c100702482550000001823141103261590_s1_p0.1.bax.h5
new file mode 100644
index 0000000..31e8768
Binary files /dev/null and b/pbcore/data/m140912_020930_00114_c100702482550000001823141103261590_s1_p0.1.bax.h5 differ
diff --git a/pbcore/data/m140912_020930_00114_c100702482550000001823141103261590_s1_p0.2.bax.h5 b/pbcore/data/m140912_020930_00114_c100702482550000001823141103261590_s1_p0.2.bax.h5
new file mode 100644
index 0000000..c4b117f
Binary files /dev/null and b/pbcore/data/m140912_020930_00114_c100702482550000001823141103261590_s1_p0.2.bax.h5 differ
diff --git a/pbcore/data/m140912_020930_00114_c100702482550000001823141103261590_s1_p0.3.bax.h5 b/pbcore/data/m140912_020930_00114_c100702482550000001823141103261590_s1_p0.3.bax.h5
new file mode 100644
index 0000000..e0bfcfb
Binary files /dev/null and b/pbcore/data/m140912_020930_00114_c100702482550000001823141103261590_s1_p0.3.bax.h5 differ
diff --git a/pbcore/data/m140912_020930_00114_c100702482550000001823141103261590_s1_p0.bas.h5 b/pbcore/data/m140912_020930_00114_c100702482550000001823141103261590_s1_p0.bas.h5
new file mode 100755
index 0000000..1056612
Binary files /dev/null and b/pbcore/data/m140912_020930_00114_c100702482550000001823141103261590_s1_p0.bas.h5 differ
diff --git a/pbcore/data/variants.gff b/pbcore/data/variants.gff
new file mode 100644
index 0000000..5e15571
--- /dev/null
+++ b/pbcore/data/variants.gff
@@ -0,0 +1,11 @@
+##gff-version 3
+##pacbio-variant-version 2.1
+##date Sat Mar 22 12:16:13 2014
+##feature-ontology http://song.cvs.sourceforge.net/*checkout*/song/ontology/sofa.obo?revision=1.12
+##source GenomicConsensus 0.8.0
+##source-commandline /Users/dalexander/.virtualenvs/VE/bin/variantCaller.py --algorithm=plurality -q20 -x5 pbcore/data/aligned_reads_1.cmp.h5 -r /Users/dalexander/Data/lambdaNEB.fa -o /tmp/v.gff
+##source-alignment-file /Users/dalexander/Dropbox/Sources/git/pbcore/pbcore/data/aligned_reads_1.cmp.h5
+##source-reference-file /Users/dalexander/Data/lambdaNEB.fa
+##sequence-region lambda_NEB3011 1 48502
+lambda_NEB3011 . deletion 30890 30890 . . . reference=G;variantSeq=.;frequency=2;coverage=5;confidence=25
+lambda_NEB3011 . insertion 30924 30924 . . . reference=.;variantSeq=G;frequency=2;coverage=5;confidence=25
diff --git a/pbcore/io/BarcodeH5Reader.py b/pbcore/io/BarcodeH5Reader.py
new file mode 100644
index 0000000..c6c1fad
--- /dev/null
+++ b/pbcore/io/BarcodeH5Reader.py
@@ -0,0 +1,374 @@
+#################################################################################$$
+# Copyright (c) 2011,2012, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#################################################################################$$
+
+import h5py as h5
+import numpy as n
+
+from pbcore.io.FofnIO import readFofn
+
+# Separator string; presumably joins the two barcode names of a paired
+# label such as "F3--R3" (not referenced elsewhere in this module).
+BARCODE_DELIMITER = "--"
+# Path of the HDF5 dataset holding one best-call summary row per labeled ZMW.
+BC_DS_PATH = "BarcodeCalls/best"
+# Path of the HDF5 dataset holding the extended (per-adapter, per-barcode)
+# scores; writeBarcodeH5 only writes it when writeExtendedInfo is set.
+BC_DS_ALL_PATH = "BarcodeCalls/all"
+
+
+class LabeledZmw(object):
+    """
+    A ZMW together with its barcode scoring summary: how many times it
+    was scored, the best and second-best barcode index/score, and
+    (optionally) the complete list of scores.  The second-best fields
+    and `allScores` are optional.
+    """
+    def __init__(self, holeNumber, nScored, bestIdx, bestScore,
+                 secondBestIdx = -1, secondBestScore = 0,
+                 allScores = None):
+        self._holeNumber      = holeNumber
+        self._nScored         = nScored
+        self._bestIdx         = bestIdx
+        self._bestScore       = bestScore
+        self._secondBestIdx   = secondBestIdx
+        self._secondBestScore = secondBestScore
+        self._allScores       = allScores
+
+    def toBestRecord(self):
+        """
+        Return the six summary fields as a tuple, in the column order
+        used for storage (allScores is not included).
+        """
+        record = (self.holeNumber, self.nScored,
+                  self.bestIdx, self.bestScore,
+                  self.secondBestIdx, self.secondBestScore)
+        return record
+
+    @staticmethod
+    def fromBestRecord(npRow):
+        """
+        Build a LabeledZmw from a stored summary row (the inverse of
+        toBestRecord); the resulting object has no allScores.
+        """
+        holeNumber, nScored = npRow[0], npRow[1]
+        bestIdx, bestScore = npRow[2], npRow[3]
+        secondBestIdx, secondBestScore = npRow[4], npRow[5]
+        return LabeledZmw(holeNumber, nScored, bestIdx, bestScore,
+                          secondBestIdx, secondBestScore, None)
+
+    @property
+    def holeNumber(self):
+        return self._holeNumber
+
+    @property
+    def nScored(self):
+        return self._nScored
+
+    @property
+    def bestIdx(self):
+        return self._bestIdx
+
+    @property
+    def bestScore(self):
+        return self._bestScore
+
+    @property
+    def averageScore(self):
+        """Best score divided by nScored; 0 when nothing was scored."""
+        if self.nScored <= 0:
+            return 0
+        return self.bestScore/self.nScored
+
+    @property
+    def scoreRatio(self):
+        """
+        Ratio of the best score to the second-best score; 1 when
+        either of the two scores is zero.
+        """
+        if self.secondBestScore == 0 or self.bestScore == 0:
+            return 1
+        return self.bestScore/(1.0 * self.secondBestScore)
+
+    @property
+    def secondBestIdx(self):
+        return self._secondBestIdx
+
+    @property
+    def secondBestScore(self):
+        return self._secondBestScore
+
+    @property
+    def allScores(self):
+        return self._allScores
+
+    def __repr__(self):
+        fields = (self.holeNumber, self.nScored, self.bestIdx,
+                  self.bestScore, self.averageScore)
+        return "(holeNumber = %d, nScored = %d, bestIdx = %d, bestScore = %d, averageScore = %d)" % fields
+
+
+def writeBarcodeH5(labeledZmws, labeler, outFile,
+                   writeExtendedInfo = False):
+    """
+    Write a barcode file from a list of labeled ZMWs.  In addition
+    to labeledZmws, this function takes a pbbarcode.BarcodeLabeler.
+
+    `labeler` supplies `movieName`, `barcodeLabels` and `scoreMode`,
+    which are stored as attributes of the "best" dataset; with
+    `writeExtendedInfo` it must also supply `barcodeNames`.
+
+    `outFile` is opened in append mode; an existing "best" (and, with
+    `writeExtendedInfo`, "all") dataset is deleted and rewritten.  With
+    `writeExtendedInfo`, only ZMWs carrying `allScores` contribute rows
+    to the "all" dataset.
+    """
+    bestScores = [z.toBestRecord() for z in labeledZmws]
+    outDta = n.vstack(bestScores)
+    outH5 = h5.File(outFile, 'a')
+
+    # try/finally so the file handle is released even if a write fails.
+    try:
+        if BC_DS_PATH in outH5:
+            del outH5[BC_DS_PATH]
+
+        bestDS = outH5.create_dataset(BC_DS_PATH, data = outDta, dtype = "int32")
+        bestDS.attrs['movieName'] = labeler.movieName
+        bestDS.attrs['barcodes'] = n.array(labeler.barcodeLabels, dtype = h5.new_vlen(str))
+        bestDS.attrs['columnNames'] = n.array(['holeNumber', 'nAdapters', 'barcodeIdx1',
+                                               'barcodeScore1', 'barcodeIdx2', 'barcodeScore2'],
+                                              dtype = h5.new_vlen(str))
+        bestDS.attrs['scoreMode'] = labeler.scoreMode
+
+        if writeExtendedInfo:
+            # here we use the 'names' because each barcode is scored
+            # individually.
+            nBarcodes = len(labeler.barcodeNames)
+
+            def makeArray(l, v):
+                # length-l array filled with the value v
+                a = n.zeros(l, dtype = type(v))
+                a.fill(v)
+                return a
+
+            def makeRecord(lZmw):
+                # One row per (adapter, barcode) pair for this ZMW:
+                # columns are holeNumber, adapter (1-based), barcodeIdx, score.
+                zmws = makeArray(nBarcodes * lZmw.nScored, lZmw.holeNumber)
+                adapters = n.concatenate([makeArray(nBarcodes, i) for i in \
+                                              xrange(1, lZmw.nScored + 1)])
+                idxs = n.concatenate([range(0, nBarcodes) for i in \
+                                          xrange(0, lZmw.nScored)])
+                scores = n.concatenate(lZmw.allScores)
+                return n.transpose(n.vstack((zmws, adapters, idxs, scores)))
+
+            records = [makeRecord(lZmw) for lZmw in labeledZmws if lZmw.allScores]
+            records = n.vstack(records)
+
+            if BC_DS_ALL_PATH in outH5:
+                del outH5[BC_DS_ALL_PATH]
+            allDS = outH5.create_dataset(BC_DS_ALL_PATH, data = records, dtype = 'int32')
+            allDS.attrs['movieName'] = labeler.movieName
+            # note names versus labels.
+            allDS.attrs['barcodes'] = n.array(labeler.barcodeNames, dtype = h5.new_vlen(str))
+            allDS.attrs['columnNames'] = n.array(['holeNumber', 'adapter', 'barcodeIdx', 'score'],
+                                                 dtype = h5.new_vlen(str))
+    finally:
+        # close the file at the very end.
+        outH5.close()
+
+
+class BarcodeH5Reader(object):
+    """
+    Reader for a single barcode HDF5 file.  The "best" dataset is
+    loaded eagerly into LabeledZmw objects, indexed both by hole
+    number and by barcode label.
+    """
+    def __init__(self, fname):
+        try:
+            self.h5File = h5.File(fname, "r")
+        except IOError:
+            raise IOError("Invalid or nonexistent bc file %s" % fname)
+
+        self.bestDS = self.h5File[BC_DS_PATH]
+
+        self._scoreMode     = self.bestDS.attrs['scoreMode']
+        self._barcodeLabels = self.bestDS.attrs['barcodes']
+        self._movieName     = self.bestDS.attrs['movieName']
+
+        # zmw => LabeledZmw
+        self.labeledZmws = {}
+        for rowIdx in xrange(0, self.bestDS.shape[0]):
+            lZmw = LabeledZmw.fromBestRecord(self.bestDS[rowIdx,:])
+            self.labeledZmws[lZmw.holeNumber] = lZmw
+
+        # barcode => LabeledZmws
+        self.bcLabelToLabeledZmws = dict((label, []) for label in self.barcodeLabels)
+        for lZmw in self.labeledZmws.values():
+            bestLabel = self.barcodeLabels[lZmw.bestIdx]
+            self.bcLabelToLabeledZmws[bestLabel].append(lZmw)
+
+    @property
+    def holeNumbers(self):
+        """Sorted list of the hole numbers that have a label."""
+        return sorted(self.labeledZmws)
+
+    @property
+    def barcodeLabels(self):
+        return self._barcodeLabels
+
+    @property
+    def scoreMode(self):
+        """String specifying whether the barcodes were scored symmetrically or in pairs"""
+        return self._scoreMode
+
+    @property
+    def movieName(self):
+        return self._movieName
+
+    def labeledZmwFromHoleNumber(self, holeNumber):
+        """Returns a LabeledZmw object from the holeNumber"""
+        if holeNumber not in self.labeledZmws:
+            raise KeyError("holeNumber %d not labeled" % holeNumber)
+        return self.labeledZmws[holeNumber]
+
+    def labeledZmwsFromBarcodeLabel(self, bcLabel):
+        """Returns a list of LabeledZmw objects for the particular
+        barcode label, an empty list if there are no ZMWs for this
+        barcode."""
+        return self.bcLabelToLabeledZmws[bcLabel]
+
+    def __iter__(self):
+        """Yield the LabeledZmw objects in increasing hole-number order."""
+        for holeNumber in self.holeNumbers:
+            yield self.labeledZmws[holeNumber]
+
+
+class MPBarcodeH5Reader(object):
+    """
+    Presents several reader parts (BarcodeH5Fofn passes the
+    BarcodeH5Readers that share one movie name) as a single reader.
+    Each part must provide `holeNumbers`, `barcodeLabels`, `scoreMode`,
+    `labeledZmwFromHoleNumber` and `labeledZmwsFromBarcodeLabel`.
+    """
+    def __init__(self, parts):
+        self._parts = parts
+        # these aren't the ranges of ZMWs, but the ranges for the
+        # scored ZMWs.  Kept as a list because choosePart scans it on
+        # every lookup.
+        self._bins = [ (n.min(part.holeNumbers), n.max(part.holeNumbers))
+                       for part in self._parts ]
+
+    def choosePart(self, holeNumber):
+        """
+        Return the first part whose scored hole-number range contains
+        holeNumber, or None if no part's range does.
+        """
+        for i, (lo, hi) in enumerate(self._bins):
+            if lo <= holeNumber <= hi:
+                return self._parts[i]
+        # Return None meaning the zmw is out of the range of
+        # the scored ZMWs for all parts.
+        return None
+
+    @property
+    def barcodeLabels(self):
+        return self._parts[0].barcodeLabels
+
+    @property
+    def scoreMode(self):
+        """String specifying whether the barcodes were scored symmetrically or in pairs"""
+        return self._parts[0].scoreMode
+
+    def labeledZmwFromHoleNumber(self, holeNumber):
+        """
+        Returns a LabeledZmw object from the holeNumber.  Raises
+        KeyError if the hole number is not labeled.
+        """
+        part = self.choosePart(holeNumber)
+        # Explicit None test: do not depend on the truthiness of a part.
+        if part is not None:
+            return part.labeledZmwFromHoleNumber(holeNumber)
+        else:
+            raise KeyError("holeNumber: %d not labeled" % holeNumber)
+
+    def labeledZmwsFromBarcodeLabel(self, bcLabel):
+        """
+        Return the LabeledZmws of all parts for this barcode label,
+        sorted by hole number.
+        """
+        lzmws = []
+        for part in self._parts:
+            lzmws.extend(part.labeledZmwsFromBarcodeLabel(bcLabel))
+        return sorted(lzmws, key=lambda z: z.holeNumber)
+
+    def __iter__(self):
+        for reader in self._parts:
+            for labeledZmw in reader:
+                yield labeledZmw
+
+    def __getitem__(self, item):
+        """
+        Index by hole number (int), barcode label (str), slice of hole
+        numbers, list/array of hole numbers, or boolean mask.
+
+        A slice is resolved against the range 0..(largest scored hole
+        number) and returns only the hole numbers in it that are
+        labeled; the other forms raise KeyError for an unlabeled hole
+        number.
+        """
+        if (isinstance(item, int) or
+            issubclass(type(item), n.integer)):
+            return self.labeledZmwFromHoleNumber(item)
+        elif isinstance(item, str):
+            return self.labeledZmwsFromBarcodeLabel(item)
+        elif isinstance(item, slice):
+            # The original called len(self), which this class does not
+            # define, and passed (self, item) instead of the loop
+            # variable, so this branch could never succeed.
+            upper = max(hi for (_, hi) in self._bins) + 1
+            hits = []
+            for r in range(*item.indices(upper)):
+                try:
+                    hits.append(self.labeledZmwFromHoleNumber(r))
+                except KeyError:
+                    # scored ZMWs are sparse; skip unlabeled holes
+                    pass
+            return hits
+        elif isinstance(item, list) or isinstance(item, n.ndarray):
+            if len(item) == 0:
+                return []
+            else:
+                entryType = type(item[0])
+                if entryType == int or issubclass(entryType, n.integer):
+                    return [ self.labeledZmwFromHoleNumber(r) for r in item ]
+                elif entryType == bool or issubclass(entryType, n.bool_):
+                    return [ self.labeledZmwFromHoleNumber(r) for r in n.flatnonzero(item) ]
+        raise TypeError("Invalid type for MPBarcodeH5Reader slicing")
+
+
+class BarcodeH5Fofn(object):
+    """
+    A collection of barcode HDF5 files, given as file names and/or
+    ".fofn" files listing them.  Files are grouped by movie name; a
+    movie with one file is served by its BarcodeH5Reader directly, a
+    movie with several files by an MPBarcodeH5Reader over them.
+    """
+    def __init__(self, *args):
+
+        bcFilenames = []
+        for arg in args:
+            if arg.endswith(".fofn"):
+                for fn in readFofn(arg):
+                    bcFilenames.append(fn)
+            else:
+                bcFilenames.append(arg)
+
+        self._bcH5s = [BarcodeH5Reader(fname) for fname in
+                       bcFilenames]
+        self._byMovie = {}
+        for bc in self._bcH5s:
+            if bc.movieName not in self._byMovie:
+                self._byMovie[bc.movieName] = [bc]
+            else:
+                self._byMovie[bc.movieName].append(bc)
+
+        self.mpReaders = { movieName: parts[0] if len(parts) == 1 else MPBarcodeH5Reader(parts)
+                           for movieName, parts in self._byMovie.items() }
+
+    @property
+    def holeNumbers(self):
+        return sorted([hn for reader in self._bcH5s
+                       for hn in reader.holeNumbers])
+
+    @property
+    def movieNames(self):
+        return list(self.mpReaders.keys())
+
+    @property
+    def barcodeLabels(self):
+        return self._bcH5s[0].barcodeLabels
+
+    @property
+    def scoreMode(self):
+        """String specifying whether the barcodes were scored symmetrically or in pairs"""
+        return self._bcH5s[0].scoreMode
+
+    def labeledZmwsFromBarcodeLabel(self, item):
+        """
+        Return the LabeledZmws of all files for this barcode label,
+        sorted by hole number.
+        """
+        lzmws = []
+        for reader in self._bcH5s:
+            lzmws.extend(reader.labeledZmwsFromBarcodeLabel(item))
+        return sorted(lzmws, key=lambda z: z.holeNumber )
+
+    def labeledZmwFromName(self, item):
+        """
+        Resolve a "/"-separated name: "movie" gives the reader for that
+        movie, "movie/barcodeLabel" a list of LabeledZmws and
+        "movie/holeNumber" a single LabeledZmw (any further components
+        are ignored).
+
+        Raises KeyError for an unknown movie or unlabeled hole number,
+        and ValueError if the second component is neither a barcode
+        label nor an integer.
+        """
+        # str.split always yields at least one element: the movie name.
+        indices = item.rstrip("/").split("/")
+
+        result = self.readerForMovie(indices[0])
+        if len(indices) >= 2:
+            if indices[1] in self.barcodeLabels:
+                return result.labeledZmwsFromBarcodeLabel(indices[1])
+            try:
+                indexNum = int(indices[1])
+            except ValueError:
+                # The original built this exception but never raised it.
+                raise ValueError("Invalid hole number or barcode name {0} as second index".format(indices[1]))
+            # Both reader types provide this method; a plain
+            # BarcodeH5Reader has no __getitem__.
+            result = result.labeledZmwFromHoleNumber(indexNum)
+        return result
+
+    def labeledZmwFromHoleNumber(self, item):
+        """
+        Look up a hole number; only possible when the collection holds
+        a single movie, otherwise ValueError is raised.
+        """
+        if len(self.movieNames) > 1:
+            raise ValueError("Cannot slice by holeNumber with multiple movies")
+        else:
+            movie = self.movieNames[0]
+            reader = self.mpReaders[movie]
+            # A single-file movie is served by a plain BarcodeH5Reader,
+            # which does not implement __getitem__.
+            if hasattr(reader, "__getitem__"):
+                return reader[item]
+            return reader.labeledZmwFromHoleNumber(item)
+
+    def readerForMovie(self, movieName):
+        """Return a BarcodeH5Reader for a movieName"""
+        return self.mpReaders[movieName]
+
+    def __iter__(self):
+        for reader in self._bcH5s:
+            for labeledZmw in reader:
+                yield labeledZmw
+
+    def __getitem__(self, item):
+        """
+        Get a BarcodeH5Reader or LabeledZmw by movie name, zmw name, subread id,
+        or ccs id, using standard PacBio naming conventions.  Examples:
+
+        - ["F3--R3"]                       -> List of LabeledZmws
+        - ["m110818_..._s1_p0"]            -> BarcodeH5Reader
+        - ["m110818_..._s1_p0/F3--R3"]     -> List of LabeledZmws
+        - ["m110818_..._s1_p0/24480"]      -> LabeledZmw
+        - ["m110818_..._s1_p0/24480/20_67"] -> LabeledZmw
+        """
+
+        if (isinstance(item, int) or
+            issubclass(type(item), n.integer)):
+            return self.labeledZmwFromHoleNumber(item)
+        elif isinstance(item, str):
+            if item in self.barcodeLabels:
+                return self.labeledZmwsFromBarcodeLabel(item)
+            elif item in self.movieNames:
+                return self.readerForMovie(item)
+            else:
+                return self.labeledZmwFromName(item)
+        else:
+            raise ValueError("BcH5Fofn slice must be a barcode, name or hole number")
diff --git a/pbcore/io/BasH5IO.py b/pbcore/io/BasH5IO.py
new file mode 100644
index 0000000..2dd4b2e
--- /dev/null
+++ b/pbcore/io/BasH5IO.py
@@ -0,0 +1,1026 @@
+#################################################################################
+# Copyright (c) 2011-2015, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#################################################################################
+
+# Authors: David Alexander, Jim Bullard
+
+__all__ = [ "BasH5Reader" ,
+ "BaxH5Reader" ,
+ "BasH5Collection" ]
+
+import h5py, numpy as np, os.path as op
+from bisect import bisect_left, bisect_right
+from operator import getitem
+from itertools import groupby
+from collections import OrderedDict
+
+from pbcore.io.FofnIO import readFofn
+from pbcore.chemistry import (decodeTriple,
+ tripleFromMetadataXML,
+ ChemistryLookupError)
+from ._utils import arrayFromDataset, CommonEqualityMixin
+
+
+def intersectRanges(r1, r2):
+    """
+    Intersect two (begin, end) intervals.  Returns the overlapping
+    (begin, end) interval, or None when the overlap is empty.
+    """
+    (begin1, end1), (begin2, end2) = r1, r2
+    begin = max(begin1, begin2)
+    end = min(end1, end2)
+    if begin < end:
+        return (begin, end)
+    return None
+
+def rangeLength(r):
+    """Return the length (end - begin) of a (begin, end) interval."""
+    (begin, end) = r
+    length = end - begin
+    return length
+
+def removeNones(lst):
+    """
+    Return a list of the elements of `lst` that are not None, in their
+    original order.
+    """
+    # Identity test rather than `!= None`, which would invoke an
+    # element's own comparison; the comprehension always yields a list.
+    return [x for x in lst if x is not None]
+
+# ZMW hole Types
+SEQUENCING_ZMW = 0
+
+# Region types: the codes compared against the "regionType" field of
+# region table rows.
+ADAPTER_REGION = 0
+INSERT_REGION = 1
+HQ_REGION = 2
+
+def toRecArray(dtype, arr):
+    """
+    Convert `arr` to a flattened numpy record array of the given
+    dtype.  This seems to be the magic incantation to get a RecArray
+    that can be indexed to yield a record that can then be accessed
+    using dot notation.
+    """
+    recArray = np.rec.array(arr, dtype=dtype)
+    return recArray.flatten()
+
+# Field layout of one region table row, as passed to toRecArray: the hole
+# the region belongs to, its type code (see the *_REGION constants), its
+# start and end (used as an interval by the Zmw region accessors), and a
+# score.
+REGION_TABLE_DTYPE = [("holeNumber", np.int32),
+ ("regionType", np.int32),
+ ("regionStart", np.int32),
+ ("regionEnd", np.int32),
+ ("regionScore", np.int32) ]
+
+# Build a method that returns self.qv(featureName).  The returned function
+# takes only `self`, so it can be attached to any class that provides a
+# qv(name) method.
+def _makeQvAccessor(featureName):
+ def f(self):
+ return self.qv(featureName)
+ return f
+
+class Zmw(CommonEqualityMixin):
+ """
+ A Zmw represents all data from a ZMW (zero-mode waveguide) hole
+ within a bas.h5 movie file. Accessor methods provide convenient
+ access to the read (or subreads), and to the region table entries
+ for this hole.
+ """
+ __slots__ = [ "baxH5", "holeNumber", "index"]
+
+ def __init__(self, baxH5, holeNumber):
+ self.baxH5 = baxH5
+ self.holeNumber = holeNumber
+ self.index = self.baxH5._holeNumberToIndex[holeNumber]
+
+ @property
+ def regionTable(self):
+ if self.holeNumber in self.baxH5._regionTableIndex:
+ startRow, endRow = self.baxH5._regionTableIndex[self.holeNumber]
+ return self.baxH5.regionTable[startRow:endRow]
+ else:
+ # Broken region table---primary pipeline bug (see bugs
+ # 23585, 25273). Work around this by returning a fake
+ # regiontable consisting of an empty HQ region
+ return toRecArray(REGION_TABLE_DTYPE,
+ [ (self.holeNumber, HQ_REGION, 0, 0, 0) ])
+
+ #
+ # The following "region" calls return one or more intervals ((int, int)).
+ # - The default implementations perform clipping to the hqRegion.
+ # - The "unclipped" implementations entail no clipping
+ #
+ @property
+ def adapterRegionsNoQC(self):
+ """
+ Get adapter regions as intervals, without clipping to the HQ
+ region. Don't use this unless you know what you're doing.
+ """
+ return [ (region.regionStart, region.regionEnd)
+ for region in self.regionTable
+ if region.regionType == ADAPTER_REGION ]
+
+ @property
+ def adapterRegions(self):
+ """
+ Get adapter regions as intervals, performing clipping to the HQ region
+ """
+ hqRegion = self.hqRegion
+ return removeNones([ intersectRanges(hqRegion, region)
+ for region in self.adapterRegionsNoQC ])
+
+ @property
+ def insertRegionsNoQC(self):
+ """
+ Get insert regions as intervals, without clipping to the HQ
+ region. Don't use this unless you know what you're doing.
+ """
+ return [ (region.regionStart, region.regionEnd)
+ for region in self.regionTable
+ if region.regionType == INSERT_REGION ]
+
+ @property
+ def insertRegions(self):
+ """
+ Get insert regions as intervals, clipped to the HQ region
+ """
+ hqRegion = self.hqRegion
+ return removeNones([ intersectRanges(hqRegion, region)
+ for region in self.insertRegionsNoQC ])
+ @property
+ def hqRegion(self):
+ """
+ Return the HQ region interval.
+
+ The HQ region is an interval of basecalls where the basecaller has
+ inferred that a single sequencing reaction is taking place.
+ Secondary analysis should only use subreads within the HQ
+ region. Methods in this class, with the exception of the
+ "NoQC" methods, return data appropriately clipped/filtered to
+ the HQ region.
+ """
+ rt = self.regionTable
+ hqRows = rt[rt.regionType == HQ_REGION]
+ assert len(hqRows) == 1
+ hqRow = hqRows[0]
+ return hqRow.regionStart, hqRow.regionEnd
+
+ @property
+ def readScore(self):
+ """
+ Return the "read score", a prediction of the accuracy (between 0 and 1) of the
+ basecalls from this ZMW, from the `ReadScore` dataset in the
+ file
+ """
+ return self.zmwMetric("ReadScore")
+
+ @property
+ def productivity(self):
+ """
+ Return the 'productivity' of this ZMW, which is the estimated
+ number of polymerase reactions taking place within it. For
+ example, a doubly-loaded ZMW would have productivity 2.
+ """
+ return self.zmwMetric("Productivity")
+
+ @property
+ def hqRegionSnr(self):
+ """
+ Return the SNRs, as a vector by channel.
+ """
+ return self.zmwMetric("HQRegionSNR")
+
+ def zmwMetric(self, name):
+ """
+ Return the value of metric 'name' from the ZMW metrics.
+ """
+ return self.baxH5.zmwMetric(name, self.index)
+
+ def listZmwMetrics(self):
+ """
+ List the available ZMW metrics for this bax.h5 file.
+ """
+ return self.baxH5.listZmwMetrics()
+
+ @property
+ def numPasses(self):
+ """
+ Return the number of passes (forward + back) across the SMRTbell
+ insert, used to forming the CCS consensus.
+ """
+ if not self.baxH5.hasConsensusBasecalls:
+ raise ValueError, "No CCS reads in this file"
+ return self.baxH5._ccsNumPasses[self.index]
+
+ #
+ # The following calls return one or more ZmwRead objects.
+ #
+ def read(self, readStart=None, readEnd=None):
+ """
+ Given no arguments, returns the entire (HQ-clipped) polymerase
+ read. With readStart, readEnd arguments, returns the
+ specified extent of the polymerase read.
+ """
+ if not self.baxH5.hasRawBasecalls:
+ raise ValueError, "No raw reads in this file"
+ hqStart, hqEnd = self.hqRegion
+ readStart = hqStart if readStart is None else readStart
+ readEnd = hqEnd if readEnd is None else readEnd
+ return ZmwRead(self.baxH5, self.holeNumber, readStart, readEnd)
+
+
+ def readNoQC(self, readStart=None, readEnd=None):
+ """
+ Given no arguments, returns the entire polymerase read, *not
+ HQ-clipped*. With readStart, readEnd arguments, returns the
+ specified extent of the polymerase read.
+
+ .. warning::
+
+ It is not recommended that production code use this method
+ as we make no guarantees about what happens outside of the
+ HQ region.
+ """
+ if not self.baxH5.hasRawBasecalls:
+ raise ValueError, "No raw reads in this file"
+ offsets = self.baxH5._offsetsByHole[self.holeNumber]
+ numEvent = offsets[1] - offsets[0]
+ polymeraseBegin = 0
+ polymeraseEnd = numEvent
+ readStart = polymeraseBegin if readStart is None else readStart
+ readEnd = polymeraseEnd if readEnd is None else readEnd
+ return ZmwRead(self.baxH5, self.holeNumber, readStart, readEnd)
+
+ @property
+ def subreadsNoQC(self):
+ """
+ Get the subreads, including data beyond the bounds of the HQ region.
+
+ .. warning::
+
+ It is not recommended that production code use this method
+ as we make no guarantees about what happens outside of the
+ HQ region.
+ """
+ if not self.baxH5.hasRawBasecalls:
+ raise ValueError, "No raw reads in this file"
+ return [ self.read(readStart, readEnd)
+ for (readStart, readEnd) in self.unclippedInsertRegions ]
+
+ @property
+ def subreads(self):
+ """
+ Get the subreads as a list of ZmwRead objects. Restricts focus,
+ and clips to, the HQ region. This method can be used by
+ production code.
+ """
+ if not self.baxH5.hasRawBasecalls:
+ raise ValueError, "No raw reads in this file"
+ return [ self.read(readStart, readEnd)
+ for (readStart, readEnd) in self.insertRegions ]
+
+
+ @property
+ def adapters(self):
+ """
+ Get the adapter hits as a list of ZmwRead objects. Restricts
+ focus, and clips to, the HQ region. This method can be used
+ by production code.
+ """
+ if not self.baxH5.hasRawBasecalls:
+ raise ValueError, "No raw reads in this file"
+ return [ self.read(readStart, readEnd)
+ for (readStart, readEnd) in self.adapterRegions ]
+
@property
def adaptersNoQC(self):
    """
    Get the adapters, including data beyond the bounds of the HQ
    region.

    Returns a list with one read per ``(readStart, readEnd)`` entry of
    ``self.unclippedAdapterRegions``, each built by ``self.read``.

    Raises ``ValueError`` if the file has no raw basecalls.

    .. warning::

        It is not recommended that production code use this method
        as we make no guarantees about what happens outside of the
        HQ region.
    """
    if not self.baxH5.hasRawBasecalls:
        # Call-form raise: the ``raise X, "msg"`` statement form is
        # Python-2-only; this spelling behaves identically there.
        raise ValueError("No raw reads in this file")
    return [self.read(readStart, readEnd)
            for (readStart, readEnd) in self.unclippedAdapterRegions]
+
@property
def ccsRead(self):
    """
    Get the CCS (consensus) read recorded for this ZMW.

    Returns a ``CCSZmwRead`` covering the whole consensus sequence, or
    ``None`` when the consensus offsets stored for this hole span no
    bases.

    Raises ``ValueError`` if the file has no consensus basecalls.
    """
    if not self.baxH5.hasConsensusBasecalls:
        # Call-form raise: the ``raise X, "msg"`` statement form is
        # Python-2-only; this spelling behaves identically there.
        raise ValueError("No CCS reads in this file")
    baseOffset = self.baxH5._ccsOffsetsByHole[self.holeNumber]
    numBases = baseOffset[1] - baseOffset[0]
    if numBases <= 0:
        return None
    # Read coordinates are relative to the start of this ZMW's data.
    return CCSZmwRead(self.baxH5, self.holeNumber, 0, numBases)
+
@property
def zmwName(self):
    """
    The name of this ZMW, formatted as ``<movieName>/<holeNumber>``.
    """
    movie = self.baxH5.movieName
    hole = self.holeNumber
    return "%s/%d" % (movie, hole)
+
def __repr__(self):
    """Return ``<Zmw: NAME>``, where NAME is this object's ``zmwName``."""
    name = self.zmwName
    return "<Zmw: %s>" % name
+
+
class ZmwRead(CommonEqualityMixin):
    """
    A ZmwRead represents the data features (basecalls as well as pulse
    features) recorded from the ZMW, delimited by readStart and readEnd.

    `readStart` and `readEnd` are relative to the start of the ZMW's
    data; `offsetBegin` and `offsetEnd` are the corresponding absolute
    positions within the file-wide datasets.
    """
    __slots__ = [ "baxH5", "holeNumber",
                  "readStart", "readEnd",
                  "offsetBegin", "offsetEnd" ]

    def __init__(self, baxH5, holeNumber, readStart, readEnd):
        """
        Raises `IndexError` if [readStart, readEnd) is not a valid,
        non-reversed interval inside the data recorded for the ZMW.
        """
        self.baxH5      = baxH5
        self.holeNumber = holeNumber
        self.readStart  = readStart
        self.readEnd    = readEnd
        zmwOffsetBegin, zmwOffsetEnd = self._getOffsets()[self.holeNumber]
        self.offsetBegin = zmwOffsetBegin + self.readStart
        self.offsetEnd   = zmwOffsetBegin + self.readEnd
        if not (zmwOffsetBegin   <=
                self.offsetBegin <=
                self.offsetEnd   <=
                zmwOffsetEnd):
            # Call-form raise: the ``raise X, "msg"`` statement form is
            # Python-2-only; this spelling behaves identically there.
            raise IndexError("Invalid slice of Zmw!")

    def _getBasecallsGroup(self):
        # Hook overridden by subclasses that read from a different group.
        return self.baxH5._basecallsGroup

    def _getOffsets(self):
        # Hook overridden by subclasses that use a different offset table.
        return self.baxH5._offsetsByHole

    @property
    def zmw(self):
        """The object obtained by indexing `baxH5` with this read's hole number."""
        return self.baxH5[self.holeNumber]

    @property
    def readName(self):
        """Read name of the form ``<zmwName>/<readStart>_<readEnd>``."""
        return "%s/%d_%d" % (self.zmw.zmwName,
                             self.readStart,
                             self.readEnd)

    def __repr__(self):
        return "<%s: %s>" % (self.__class__.__name__,
                             self.readName)

    def __len__(self):
        return self.readEnd - self.readStart

    def basecalls(self):
        """Return the "Basecall" data for this read's interval, via `tostring()`."""
        return arrayFromDataset(self._getBasecallsGroup()["Basecall"],
                                self.offsetBegin, self.offsetEnd).tostring()

    def qv(self, qvName):
        """Return the dataset named `qvName`, restricted to this read's interval."""
        return arrayFromDataset(self._getBasecallsGroup()[qvName],
                                self.offsetBegin, self.offsetEnd)

    # Named accessors.  IPD and PulseWidth are aliases for the
    # PreBaseFrames and WidthInFrames datasets respectively.
    PreBaseFrames   = _makeQvAccessor("PreBaseFrames")
    IPD             = _makeQvAccessor("PreBaseFrames")

    WidthInFrames   = _makeQvAccessor("WidthInFrames")
    PulseWidth      = _makeQvAccessor("WidthInFrames")

    QualityValue    = _makeQvAccessor("QualityValue")
    InsertionQV     = _makeQvAccessor("InsertionQV")
    DeletionQV      = _makeQvAccessor("DeletionQV")
    DeletionTag     = _makeQvAccessor("DeletionTag")
    MergeQV         = _makeQvAccessor("MergeQV")
    SubstitutionQV  = _makeQvAccessor("SubstitutionQV")
    SubstitutionTag = _makeQvAccessor("SubstitutionTag")
+
+
class CCSZmwRead(ZmwRead):
    """
    A `ZmwRead` over the CCS (circular consensus sequencing) data
    calculated for a ZMW, rather than over its raw basecalls.
    """
    @property
    def readName(self):
        """Read name of the form ``<zmwName>/ccs``."""
        parentName = self.zmw.zmwName
        return "%s/ccs" % parentName

    def _getOffsets(self):
        # Use the consensus offset table instead of the raw one.
        reader = self.baxH5
        return reader._ccsOffsetsByHole

    def _getBasecallsGroup(self):
        # Use the consensus basecalls group instead of the raw one.
        reader = self.baxH5
        return reader._ccsBasecallsGroup
+
def _makeOffsetsDataStructure(h5Group):
    """
    Build a dict: holeNumber -> (beginOffset, endOffset).

    Offsets are running totals of the "ZMW/NumEvent" counts, so each
    hole's interval starts where the previous hole's interval ended.
    """
    eventCounts = h5Group["ZMW/NumEvent"].value
    holes       = h5Group["ZMW/HoleNumber"].value
    ends   = np.cumsum(eventCounts)
    # The first hole starts at 0; every other hole starts at the
    # previous hole's end.
    begins = np.hstack(([0], ends[0:-1]))
    return { hole : (begin, end)
             for (hole, begin, end) in zip(holes, begins, ends) }
+
def _makeRegionTableIndex(regionTableHoleNumbers):
    """
    Build a dict: holeNumber -> (startRow, endRow), giving the
    half-open range of region table rows belonging to each hole.
    """
    # A nonzero difference marks a row where the hole number changes;
    # the sentinel 1s at both ends make row 0 and the end-of-table row
    # count as changepoints too.
    steps = np.ediff1d(regionTableHoleNumbers,
                       to_begin=[1], to_end=[1])
    boundaries = np.flatnonzero(steps)
    rowRanges = zip(boundaries[:-1], boundaries[1:])
    # NOTE(review): np.unique returns sorted values, so pairing it with
    # the row ranges presumably relies on the table being ordered by
    # hole number -- confirm.
    holes = np.unique(regionTableHoleNumbers)
    return dict(zip(holes, rowRanges))
+
class BaxH5Reader(object):
    """
    The `BaxH5Reader` class provides access to bax.h5 file and
    single-part bas.h5 files.

    Instances are usable as context managers; the underlying HDF5 file
    is closed on exit.
    """
    def __init__(self, filename, regionH5Filename=None):
        """
        Open `filename` read-only and index its basecall data.

        If `regionH5Filename` is given, region information is loaded
        from that file instead of from `filename`.

        Raises `IOError` if `filename` cannot be opened.
        """
        try:
            self.filename = op.abspath(op.expanduser(filename))
            self.file = h5py.File(self.filename, "r")
        except IOError:
            raise IOError("Invalid or nonexistent bax/bas file %s" % filename)

        #
        # Raw base calls?
        #
        if "/PulseData/BaseCalls/Basecall" in self.file:
            self._basecallsGroup = self.file["/PulseData/BaseCalls"]
            self._offsetsByHole  = _makeOffsetsDataStructure(self._basecallsGroup)
            self.hasRawBasecalls = True
        else:
            self.hasRawBasecalls = False
        #
        # CCS base calls?
        #
        if "/PulseData/ConsensusBaseCalls" in self.file:
            self._ccsBasecallsGroup = self.file["/PulseData/ConsensusBaseCalls"]
            self._ccsOffsetsByHole  = _makeOffsetsDataStructure(self._ccsBasecallsGroup)
            self._ccsNumPasses      = self._ccsBasecallsGroup["Passes/NumPasses"]
            self.hasConsensusBasecalls = True
        else:
            self.hasConsensusBasecalls = False

        # ZMW-level information is read from the raw group when it is
        # present, and from the CCS group otherwise.
        self._mainBasecallsGroup = self._basecallsGroup if self.hasRawBasecalls \
                                   else self._ccsBasecallsGroup

        if regionH5Filename is None:
            # load region information from the bas/bax file
            self._loadRegions(self.file)
        else:
            # load region information from a separate region file
            self.loadExternalRegions(regionH5Filename)

        # Create a variable to store the chemistry information
        self._sequencingChemistry = None
        #
        # ZMW metric cache -- probably want to move prod and readScore
        # here.
        #
        self.__metricCache = {}

    def _loadRegions(self, fh):
        """
        Loads region table information from the given file handle and applies
        it to the ZMW data.
        """
        holeNumbers = self._mainBasecallsGroup["ZMW/HoleNumber"].value
        self._holeNumberToIndex = dict(zip(holeNumbers, range(len(holeNumbers))))

        #
        # Region table
        #
        self.regionTable = toRecArray(REGION_TABLE_DTYPE,
                                      fh["/PulseData/Regions"].value)

        self._regionTableIndex = _makeRegionTableIndex(self.regionTable.holeNumber)
        isHqRegion = self.regionTable.regionType == HQ_REGION
        hqRegions  = self.regionTable[isHqRegion]

        if len(hqRegions) != len(holeNumbers):
            # Bug 23585: pre-2.1 primary had a bug where a bas file
            # could get a broken region table, lacking an HQ region
            # entry for a ZMW.  This happened fairly rarely, mostly on
            # very long traces.  Workaround here is to rebuild HQ
            # regions table with empty HQ region entries for those
            # ZMWs.
            hqRegions_ = toRecArray(REGION_TABLE_DTYPE,
                                    np.zeros(shape=len(holeNumbers),
                                             dtype=REGION_TABLE_DTYPE))
            hqRegions_.holeNumber = holeNumbers
            for record in hqRegions:
                hn = record.holeNumber
                hqRegions_[self._holeNumberToIndex[hn]] = record
            hqRegions = hqRegions_

        hqRegionLength = hqRegions.regionEnd - hqRegions.regionStart
        holeStatus     = self._mainBasecallsGroup["ZMW/HoleStatus"].value

        #
        # Sequencing ZMWs - Note: this differs from Primary's
        # definition. To obtain those values, one would use the
        # `allSequencingZmws` property.
        #
        self._sequencingZmws = \
            holeNumbers[(holeStatus == SEQUENCING_ZMW)                        &
                        (self._mainBasecallsGroup["ZMW/NumEvent"].value >  0) &
                        (hqRegionLength >  0)]

        self._allSequencingZmws = holeNumbers[holeStatus == SEQUENCING_ZMW]

    def loadExternalRegions(self, regionH5Filename):
        """
        Loads regions defined in the given file, overriding those found in the
        bas/bax file.

        Raises `IOError` if the region file cannot be opened, or if it
        lists hole numbers that are absent from this bas/bax file.
        """
        try:
            fh = h5py.File(op.abspath(op.expanduser(regionH5Filename)), "r")
        except IOError:
            raise IOError("Invalid or nonexistent file %s" % regionH5Filename)

        # Close the region file even when loading fails part-way.
        try:
            self._loadRegions(fh)
        finally:
            fh.close()

        # A sanity check that every hole number in the given region
        # table is also contained in this base file.
        baxHoleNumbers = self._mainBasecallsGroup["ZMW/HoleNumber"].value
        rgnHoleNumbers = self.regionTable.holeNumber
        if not np.in1d(rgnHoleNumbers, baxHoleNumbers).all():
            msg = "Region file (%s) does not contain the same hole numbers as " \
                  "bas/bax file (%s)"
            raise IOError(msg % (regionH5Filename, self.filename))

    @property
    def sequencingZmws(self):
        """
        A list of the hole numbers that produced useable sequence data.
        Specifically, this means ZMWs that have an HQ region.
        """
        return self._sequencingZmws

    @property
    def allSequencingZmws(self):
        """
        A list of the hole numbers that are capable of producing
        sequencing data.  This differs from the `sequencingZmws` in
        that zmws are not filtered according to their HQ status.  This
        number is fixed per chip, whereas the `sequencingZmws` depends
        on things such as loading.
        """
        return self._allSequencingZmws

    def __getitem__(self, holeNumber):
        return Zmw(self, holeNumber)

    #
    # Iterators over Zmws, ZmwReads
    #

    def __iter__(self):
        for holeNumber in self.sequencingZmws:
            yield self[holeNumber]

    def reads(self):
        """Yield the full read of each sequencing ZMW (nothing if no raw basecalls)."""
        if self.hasRawBasecalls:
            for zmw in self:
                yield zmw.read()

    def subreads(self):
        """Yield every subread of each sequencing ZMW (nothing if no raw basecalls)."""
        if self.hasRawBasecalls:
            for zmw in self:
                for subread in zmw.subreads:
                    yield subread

    def ccsReads(self):
        """Yield the CCS read of each sequencing ZMW that has one."""
        if self.hasConsensusBasecalls:
            for zmw in self:
                # Evaluate the property once instead of building the
                # read a second time just to yield it.
                ccsRead = zmw.ccsRead
                if ccsRead is not None:
                    yield ccsRead

    # ------------------------------

    @property
    def movieName(self):
        """
        The "MovieName" attribute of /ScanData/RunInfo, as a string.

        Raises `TypeError` if the stored value is not a string.
        """
        movieNameAttr = self.file["/ScanData/RunInfo"].attrs["MovieName"]

        # In old bas.h5 files, attributes of ScanData/RunInfo are stored as
        # strings in arrays of length one.
        if (isinstance(movieNameAttr, (np.ndarray, list)) and
                len(movieNameAttr) == 1):
            movieNameString = movieNameAttr[0]
        else:
            movieNameString = movieNameAttr

        # `basestring` only exists on Python 2.
        try:
            stringTypes = basestring
        except NameError:
            stringTypes = (str, bytes)

        if not isinstance(movieNameString, stringTypes):
            raise TypeError("Unsupported movieName {m} of type {t}."
                            .format(m=movieNameString,
                                    t=type(movieNameString)))
        return movieNameString

    @property
    def _chemistryBarcodeTripleInFile(self):
        """
        The chemistry barcode triple consists of (BindingKit,
        SequencingKit, SoftwareVersion) and is written on the
        instrument to the bax file as of primary version 2.1.  Prior
        to that, it was only written in the metadata.xml.

        Returns None when the triple cannot be read from the file.
        """
        try:
            bindingKit    = self.file["/ScanData/RunInfo"].attrs["BindingKit"]
            sequencingKit = self.file["/ScanData/RunInfo"].attrs["SequencingKit"]
            # version string in bas file looks like "2.1.1.1.x", we have to extract
            # the "2.1"
            tmp = self.file["/PulseData/BaseCalls"].attrs["ChangeListID"]
            swVersion = ".".join(tmp.split(".")[0:2])
            return (bindingKit, sequencingKit, swVersion)
        except Exception:
            # Deliberately best-effort, but no longer a bare `except:`,
            # so KeyboardInterrupt and SystemExit still propagate.
            return None

    @property
    def _chemistryBarcodeTripleFromMetadataXML(self):
        """
        The barcode triple read from `<movieName>.metadata.xml`, looked
        for two directory levels above this file; None if the lookup
        raises `ChemistryLookupError`.
        """
        try:
            movieName = self.movieName
            _up = op.dirname(op.dirname(self.filename))
            metadataLocation = op.join(_up, movieName + ".metadata.xml")
            triple = tripleFromMetadataXML(metadataLocation)
            return triple
        except ChemistryLookupError:
            return None

    @property
    def chemistryBarcodeTriple(self):
        """
        The barcode triple from the file or, failing that, from the
        companion metadata.xml.

        Raises `ChemistryLookupError` if neither source provides it.
        """
        triple = self._chemistryBarcodeTripleInFile or self._chemistryBarcodeTripleFromMetadataXML
        if triple:
            return triple
        else:
            raise ChemistryLookupError("Could not find chemistry barcodes in file or companion metadata.xml")

    @property
    def sequencingChemistry(self):
        """
        Find the name of the chemistry by consulting, in order of preference:
          1) Barcode triple in file
          2) "SequencingChemistry" attr in file (chemistry override)
          3) metadata.xml companion file

        The result is cached after the first successful lookup.
        Raises `ChemistryLookupError` if no source provides it.
        """
        if self._sequencingChemistry is None:
            triple = self._chemistryBarcodeTripleInFile
            if triple is not None:
                self._sequencingChemistry = decodeTriple(*triple)
            elif "SequencingChemistry" in self.file["/ScanData/RunInfo"].attrs:
                self._sequencingChemistry = self.file["/ScanData/RunInfo"].attrs["SequencingChemistry"]
            else:
                tripleFromXML = self._chemistryBarcodeTripleFromMetadataXML
                if tripleFromXML is not None:
                    self._sequencingChemistry = decodeTriple(*tripleFromXML)
                else:
                    raise ChemistryLookupError("Chemistry information could not be found for this file")
        return self._sequencingChemistry

    def __len__(self):
        return len(self.sequencingZmws)

    def close(self):
        """Close the underlying file; calling this more than once is harmless."""
        if hasattr(self, "file") and self.file is not None:
            self.file.close()
            self.file = None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def listZmwMetrics(self):
        """Return the names of the entries under the "ZMWMetrics" group."""
        # Read from the same group as zmwMetric(); `_basecallsGroup` is
        # never assigned for files without raw basecalls.
        return self._mainBasecallsGroup["ZMWMetrics"].keys()

    def zmwMetric(self, name, index):
        """Return entry `index` of the ZMW metric dataset called `name`."""
        # we are going to cache these lazily because it is very likely
        # that if one ZMW asked for the metric others aren't far
        # behind.
        if name not in self.__metricCache:
            k = "/".join(("ZMWMetrics", name))
            self.__metricCache[name] = self._mainBasecallsGroup[k].value

        v = self.__metricCache[name]
        if len(v.shape) > 1:
            return v[index,]
        else:
            return v[index]
+
+
class BasH5Reader(object):
    """
    .. testsetup:: *

        from pbcore.io import BasH5Reader
        from pbcore import data
        filename = data.getBasH5s()[0]
        b = BasH5Reader(filename)
        zmw8 = b[8]

    The `BasH5Reader` provides access to the basecall and pulse metric
    data encoded in PacBio bas.h5 files.  To access data using a
    `BasH5Reader`, the standard idiom is:

    1. Index into the `BasH5Reader` using the ZMW hole number to get a `Zmw` object::

        >>> b
        <BasH5Reader: m110818_075520_42141_c100129202555500000315043109121112_s1_p0>
        >>> zmw8 = b[8]
        >>> zmw8
        <Zmw: m110818_075520_42141_c100129202555500000315043109121112_s1_p0/8>

    2. Extract `ZmwRead` objects from the `Zmw` object by:

       - Using the `.subreads` property to extract the subreads, which
         are the subintervals of the raw read corresponding to the
         SMRTbell insert::

           >>> subreads = zmw8.subreads
           >>> print subreads
           [<ZmwRead: m110818_075520_42141_c100129202555500000315043109121112_s1_p0/8/3381_3881>,
            <ZmwRead: m110818_075520_42141_c100129202555500000315043109121112_s1_p0/8/3924_4398>,
            <ZmwRead: m110818_075520_42141_c100129202555500000315043109121112_s1_p0/8/4445_4873>,
            <ZmwRead: m110818_075520_42141_c100129202555500000315043109121112_s1_p0/8/4920_5354>,
            <ZmwRead: m110818_075520_42141_c100129202555500000315043109121112_s1_p0/8/5413_5495>]

       - For CCS bas files, using the `.ccsRead` property to extract
         the CCS (consensus) read, which is a consensus sequence
         precomputed from the subreads.  Older bas files, from when
         CCS was computed on the instrument, may contain both CCS- and
         sub- reads.

           >>> zmw8.ccsRead
           <CCSZmwRead: m110818_075520_42141_c100129202555500000315043109121112_s1_p0/8/ccs>

       - Use the `.read()` method to get the full raw read, or
         `.read(start, end)` to extract a custom subinterval.

           >>> zmw8.read()
           <ZmwRead: m110818_075520_42141_c100129202555500000315043109121112_s1_p0/8/3381_5495>
           >>> zmw8.read(3390, 3400)
           <ZmwRead: m110818_075520_42141_c100129202555500000315043109121112_s1_p0/8/3390_3400>

    3. With a `ZmwRead` object in hand, extract the desired
       basecalls and pulse metrics::

         >>> subreads[0].readName
         "m110818_075520_42141_c100129202555500000315043109121112_s1_p0/8/3381_3881"
         >>> subreads[0].basecalls()
         "AGCCCCGTCGAGAACATACAGGTGGCCAATTTCACAGCCTCTTGCCTGGGCGATCCCGAACATCGCACCGGA..."
         >>> subreads[0].InsertionQV()
         array([12, 12, 10,  2,  7, 14, 13, 18, 15, 16, 16, 15, 10, 12,  3, 14, ...])

    Note that not every ZMW on a chip produces usable sequencing
    data.  The `BasH5Reader` has a property `sequencingZmws`, which is
    a list of the hole numbers where usable sequence was recorded.
    Iteration over the `BasH5Reader` object allows you to iterate over
    the `Zmw` objects providing usable sequence.
    """
    def __init__(self, *args):
        """
        Accepts either a single filename (a multi-part bas.h5, or a
        single-part bas/bax file) or several part filenames.

        Raises `IOError` if a single given file cannot be opened.
        """
        assert len(args) > 0

        if len(args) == 1:
            filename = args[0]
            try:
                self.filename = op.abspath(op.expanduser(filename))
                self.file = h5py.File(self.filename, "r")
            except IOError:
                raise IOError("Invalid or nonexistent bas/bax file %s" % filename)

            # Is this a multi-part or single-part?
            if self.file.get("MultiPart"):
                directory = op.dirname(self.filename)
                self._parts = [ BaxH5Reader(op.join(directory, fn))
                                for fn in self.file["/MultiPart/Parts"] ]
                self._holeLookupVector = self.file["/MultiPart/HoleLookup"][:,1]
                self._holeLookup = self._holeLookupVector.__getitem__
            else:
                self._parts = [ BaxH5Reader(self.filename) ]
                # Every hole lives in the one and only part.
                self._holeLookup = (lambda holeNumber: 1)
        else:
            partFilenames = args
            self.filename = None
            self.file = None
            self._parts = [ BaxH5Reader(fn) for fn in partFilenames ]
            # hole number -> 1-based index of the part that holds it
            holeLookupDict = { hn : (i + 1)
                               for (i, part) in enumerate(self._parts)
                               for hn in part._holeNumberToIndex }
            self._holeLookup = lambda hn: holeLookupDict[hn]
        self._sequencingZmws = np.concatenate([ part.sequencingZmws
                                                for part in self._parts ])

    @property
    def parts(self):
        """The list of per-part `BaxH5Reader` objects."""
        return self._parts

    @property
    def sequencingZmws(self):
        """The `sequencingZmws` of all parts, concatenated."""
        return self._sequencingZmws

    @property
    def allSequencingZmws(self):
        """The `allSequencingZmws` of all parts, concatenated."""
        return np.concatenate([ part.allSequencingZmws
                                for part in self._parts ])

    @property
    def hasConsensusBasecalls(self):
        """True if every part has consensus basecalls."""
        return all(part.hasConsensusBasecalls for part in self._parts)

    @property
    def hasRawBasecalls(self):
        """True if every part has raw basecalls."""
        return all(part.hasRawBasecalls for part in self._parts)

    #
    # Iterators
    #

    def __iter__(self):
        """
        Iterate over ZMWs
        """
        for holeNumber in self.sequencingZmws:
            yield self[holeNumber]

    def reads(self):
        for part in self._parts:
            for read in part.reads():
                yield read

    def subreads(self):
        for part in self._parts:
            for subread in part.subreads():
                yield subread

    def ccsReads(self):
        for part in self._parts:
            for ccsRead in part.ccsReads():
                yield ccsRead

    # ----------

    def __len__(self):
        return len(self.sequencingZmws)

    def _getitemScalar(self, holeNumber):
        # _holeLookup returns a 1-based part number.
        part = self.parts[self._holeLookup(holeNumber)-1]
        return part[holeNumber]

    def __getitem__(self, holeNumbers):
        """
        Index by a single hole number, a slice, a list/array of hole
        numbers, or a list/array of booleans.

        Raises `TypeError` for any other kind of index.
        """
        if isinstance(holeNumbers, (int, np.integer)):
            return self._getitemScalar(holeNumbers)
        elif isinstance(holeNumbers, slice):
            return [ self._getitemScalar(r)
                     for r in range(*holeNumbers.indices(len(self))) ]
        elif isinstance(holeNumbers, (list, np.ndarray)):
            if len(holeNumbers) == 0:
                return []
            else:
                # The type of the first entry decides how the whole
                # sequence is interpreted.
                entryType = type(holeNumbers[0])
                if entryType == int or issubclass(entryType, np.integer):
                    return [ self._getitemScalar(r) for r in holeNumbers ]
                elif entryType == bool or issubclass(entryType, np.bool_):
                    return [ self._getitemScalar(r) for r in np.flatnonzero(holeNumbers) ]
        raise TypeError("Invalid type for BasH5Reader slicing")

    @property
    def movieName(self):
        return self._parts[0].movieName

    @property
    def chemistryBarcodeTriple(self):
        return self._parts[0].chemistryBarcodeTriple

    @property
    def sequencingChemistry(self):
        return self._parts[0].sequencingChemistry

    def close(self):
        """Close this reader's own file (if any) and every part."""
        if hasattr(self, "file") and self.file is not None:
            self.file.close()
            self.file = None
        for part in self.parts:
            part.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def __repr__(self):
        return "<BasH5Reader: %s>" % self.movieName

    # Make cursor classes available
    Zmw        = Zmw
    ZmwRead    = ZmwRead
    CCSZmwRead = CCSZmwRead
+
def sniffMovieName(basFilename):
    """
    Guess the movie name from a bas/bax filename: the part of the
    file's base name that precedes the first ".".
    """
    # The clean way to do this is to get the movie name attribute from
    # the file, but unfortunately that approach is unusably slow.
    # Here we assume that the filename follows the standard PacBio
    # naming convention.
    baseName = op.basename(basFilename)
    return baseName.partition(".")[0]
+
class BasH5Collection(object):
    """
    Class representing a collection of base call (bas/bax) files.

    Can be initialized from a list of bas/bax files, or an input.fofn
    file containing a list of bas/bax files

    The collection owns one reader per movie; call `close()` (or use
    the collection as a context manager) to release them.
    """

    def __init__(self, *args):
        #
        # Implementation notes: find all the bas/bax files, and group
        # them together by movieName
        #
        basFilenames = []
        for arg in args:
            if arg.endswith(".fofn"):
                basFilenames.extend(readFofn(arg))
            else:
                basFilenames.append(arg)

        movieNames = map(sniffMovieName, basFilenames)
        # groupby only merges adjacent items, hence the sort.
        movieNamesAndFiles = sorted(zip(movieNames, basFilenames))

        self.readers = OrderedDict(
            [ (k, BasH5Reader(*[val[1] for val in v]))
              for k, v in groupby(movieNamesAndFiles, lambda t: t[0]) ])

    @property
    def movieNames(self):
        return self.readers.keys()

    def __getitem__(self, key):
        """
        Slice by movie name, zmw name, or zmw range name, using standard
        PacBio naming conventions.  Examples:

          - ["m110818_..._s1_p0"]             -> BasH5Reader
          - ["m110818_..._s1_p0/24480"]       -> Zmw
          - ["m110818_..._s1_p0/24480/20_67"] -> ZmwRead
          - ["m110818_..._s1_p0/24480/ccs"]   -> CCSZmwRead

        Raises `KeyError` if the movie name is not in the collection.
        """
        # str.split always returns at least one element, so there is
        # no empty-list case to guard against; an unknown movie name
        # raises KeyError from the lookup itself.
        indices = key.rstrip("/").split("/")

        result = self.readers[indices[0]]
        if len(indices) >= 2:
            result = result[int(indices[1])]
        if len(indices) >= 3:
            if indices[2] == "ccs":
                result = result.ccsRead
            else:
                start, end = map(int, indices[2].split("_"))
                result = result.read(start, end)
        return result

    #
    # Resource management
    #

    def close(self):
        """Close every reader in the collection."""
        for reader in self.readers.values():
            reader.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    #
    # Iterators over Zmw, ZmwRead objects
    #

    def __iter__(self):
        for reader in self.readers.values():
            for zmw in reader:
                yield zmw

    def reads(self):
        for reader in self.readers.values():
            for read in reader.reads():
                yield read

    def subreads(self):
        for reader in self.readers.values():
            for read in reader.subreads():
                yield read

    def ccsReads(self):
        for reader in self.readers.values():
            for read in reader.ccsReads():
                yield read
diff --git a/pbcore/io/FastaIO.py b/pbcore/io/FastaIO.py
new file mode 100644
index 0000000..64f1af4
--- /dev/null
+++ b/pbcore/io/FastaIO.py
@@ -0,0 +1,459 @@
+#################################################################################
+# Copyright (c) 2011-2015, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#################################################################################
+
+# Author: David Alexander
+
+"""
+Streaming I/O support for FASTA files.
+"""
+
+__all__ = [ "FastaRecord",
+ "FastaReader",
+ "FastaWriter",
+ "FastaTable",
+ "IndexedFastaReader",
+ "splitFastaHeader"]
+
+from .base import ReaderBase, WriterBase
+from ._utils import splitFileContents
+from pbcore import sequence
+from pbcore.util.decorators import deprecated
+
+import md5, mmap, numpy as np, re
+from collections import namedtuple, OrderedDict, Sequence
+from os.path import abspath, expanduser, isfile
+
+
def splitFastaHeader( name ):
    """
    Split a FASTA/FASTQ header into its id and comment components

    Returns a tuple (id, comment).  The id is everything before the
    first whitespace character; the comment is the remainder with
    surrounding whitespace stripped, or None if the header contains no
    whitespace.
    """
    # Raw string: in a plain literal "\s" is an invalid escape sequence.
    nameParts = re.split(r'\s', name, maxsplit=1)
    id_ = nameParts[0]
    if len(nameParts) > 1:
        comment = nameParts[1].strip()
    else:
        comment = None
    return (id_, comment)
+
class FastaRecord(object):
    """
    A FastaRecord object models a named sequence in a FASTA file.
    """
    DELIMITER = ">"
    COLUMNS   = 60

    def __init__(self, header, sequence):
        """
        Raises `ValueError` if `header` or `sequence` contains a
        newline, or if `sequence` contains the record delimiter.
        """
        # Explicit checks instead of `assert`, so that validation still
        # happens when Python runs with -O.
        if ("\n" in header or
                "\n" in sequence or
                self.DELIMITER in sequence):
            raise ValueError("Invalid FASTA record data")
        # hashlib replaces the deprecated `md5` module; imported here
        # because this block cannot change the module's import list.
        import hashlib
        self._header = header
        self._sequence = sequence
        # hashlib wants bytes; a Python 2 `str` already is bytes.
        if isinstance(sequence, bytes):
            digestInput = sequence
        else:
            digestInput = sequence.encode("utf-8")
        self._md5 = hashlib.md5(digestInput).hexdigest()
        self._id, self._comment = splitFastaHeader(header)

    @property
    def header(self):
        """
        The header of the sequence in the FASTA file, equal to the entire
        first line of the FASTA record following the '>' character.

        .. warning::

           You should almost certainly be using "id", not "header".
        """
        return self._header

    @property
    def name(self):
        """
        DEPRECATED: The name of the sequence in the FASTA file, equal to
        the entire FASTA header following the '>' character
        """
        return self._header

    @property
    def id(self):
        """
        The id of the sequence in the FASTA file, equal to the FASTA header
        up to the first whitespace.
        """
        return self._id

    @property
    def comment(self):
        """
        The comment associated with the sequence in the FASTA file, equal to
        the contents of the FASTA header following the first whitespace
        """
        return self._comment

    @property
    def sequence(self):
        """
        The sequence for the record as present in the FASTA file.
        (Newlines are removed but otherwise no sequence normalization
        is performed).
        """
        return self._sequence

    @property
    @deprecated
    def length(self):
        """
        Get the length of the FASTA sequence
        """
        return len(self._sequence)

    @property
    def md5(self):
        """
        The MD5 checksum (hex digest) of `sequence`
        """
        return self._md5

    @classmethod
    def fromString(cls, s):
        """
        Interprets a string as a FASTA record.  Does not make any
        assumptions about wrapping of the sequence string.

        Raises `ValueError` if `s` does not consist of a header line
        starting with the delimiter plus at least one sequence line.
        """
        lines = s.splitlines()
        # startswith (rather than lines[0][0]) so that an empty first
        # line is reported as ValueError instead of IndexError.
        if len(lines) < 2 or not lines[0].startswith(cls.DELIMITER):
            raise ValueError("String not recognized as a valid FASTA record")
        header = lines[0][1:]
        sequence = "".join(lines[1:])
        return FastaRecord(header, sequence)

    def reverseComplement(self, preserveHeader=False):
        """
        Return a new FastaRecord with the reverse-complemented DNA sequence.

        If `preserveHeader` is true the new record keeps this record's
        header; otherwise " [revcomp]" is appended to the stripped header.
        """
        rcSequence = sequence.reverseComplement(self.sequence)
        if preserveHeader:
            return FastaRecord(self.header, rcSequence)
        else:
            rcName = '{0} [revcomp]'.format(self.header.strip())
            return FastaRecord(rcName, rcSequence)

    def __len__(self):
        return len(self._sequence)

    def __eq__(self, other):
        if isinstance(other, self.__class__):
            return (self.header   == other.header and
                    self.sequence == other.sequence)
        else:
            return False

    def __ne__(self, other):
        return not self.__eq__(other)

    def __hash__(self):
        # Hash the same fields __eq__ compares, so that equal records
        # hash equal.
        return hash((self._header, self._sequence))

    def __repr__(self):
        return "<FastaRecord: %s>" % self.header

    def __str__(self):
        """
        Output a string representation of this FASTA record, observing
        standard conventions about sequence wrapping.
        """
        return (">%s\n" % self.header) + \
            wrap(self.sequence, self.COLUMNS)
+
+
class FastaReader(ReaderBase):
    """
    Streaming reader for FASTA files, useable as a one-shot iterator
    over FastaRecord objects.  Agnostic about line wrapping.

    Example:

    .. doctest::

        >>> from pbcore.io import FastaReader
        >>> from pbcore import data
        >>> filename = data.getTinyFasta()
        >>> r = FastaReader(filename)
        >>> for record in r:
        ...     print record.header, len(record.sequence), record.md5
        ref000001|EGFR_Exon_2 183 e3912e9ceacd6538ede8c1b2adda7423
        ref000002|EGFR_Exon_3 203 4bf218da37175a91869033024ac8f9e9
        ref000003|EGFR_Exon_4 215 245bc7a046aad0788c22b071ed210f4d
        ref000004|EGFR_Exon_5 157 c368b8191164a9d6ab76fd328e2803ca
        >>> r.close()

    """
    DELIMITER = ">"

    def __iter__(self):
        """
        Yield one FastaRecord per delimiter-separated chunk of the file.

        Raises `ValueError` if anything other than an empty string
        precedes the first delimiter.
        """
        try:
            parts = splitFileContents(self.file, self.DELIMITER)
            # The text before the first delimiter must be empty.  This
            # is a real check, not an `assert`: the chunk has to be
            # consumed even under -O.  The default argument keeps an
            # exhausted iterator from leaking StopIteration out of
            # this generator.
            leading = next(parts, None)
            if leading is None:
                return
            if leading != "":
                raise ValueError("Invalid FASTA file")
            for part in parts:
                yield FastaRecord.fromString(self.DELIMITER + part)
        except AssertionError:
            # NOTE(review): kept in case the helper signals malformed
            # input by assertion -- it is not visible from here.
            raise ValueError("Invalid FASTA file")
+
+
class FastaWriter(WriterBase):
    """
    Writer for FASTA files.

    Example:

    .. doctest::

        >>> from pbcore.io import FastaWriter
        >>> with FastaWriter("output.fasta.gz") as writer:
        ...     writer.writeRecord("dog", "GATTACA")
        ...     writer.writeRecord("cat", "CATTACA")

    (Notice that underlying file will be automatically closed after
    exit from the `with` block.)

    .. testcleanup::

        import os; os.unlink("output.fasta.gz")

    """
    def writeRecord(self, *args):
        """
        Write one FASTA record, followed by a newline, to the file.

        Call with a single ``FastaRecord``, or with two arguments that
        are taken to be the header and the sequence.  Any other number
        of arguments raises ``ValueError``.
        """
        argCount = len(args)
        if argCount == 1:
            record = args[0]
            assert isinstance(record, FastaRecord)
        elif argCount == 2:
            header, sequence = args
            record = FastaRecord(header, sequence)
        else:
            raise ValueError
        self.file.write(str(record))
        self.file.write("\n")
+
+
+##
+## Utility functions for FastaReader
+##
def wrap(s, columns):
    """
    Break `s` into chunks of at most `columns` characters, joined by
    newlines.  An empty `s` gives an empty string.
    """
    # `range` instead of the Python-2-only `xrange`; the number of
    # chunk starts is small, so the list built on Python 2 is harmless.
    return "\n".join(s[start:start+columns]
                     for start in range(0, len(s), columns))
+
+
+
+# ------------------------------------------------------------------------------
+# IndexedFastaReader: random access Fasta class
+#
+
# One entry of the FASTA index: the parsed header fields of a contig
# plus the numbers needed to locate its sequence bytes in the file.
FaiRecord = namedtuple(
    "FaiRecord",
    [ "id", "comment", "header",
      "length", "offset", "lineWidth", "stride" ])
+
def faiFilename(fastaFilename):
    """Return the index filename for `fastaFilename`: the same name plus ".fai"."""
    suffix = ".fai"
    return fastaFilename + suffix
+
def loadFastaIndex(faidxFilename, fastaView):
    """
    Load a FASTA index (.fai) file as a list of FaiRecord objects.

    `fastaView` is a sliceable view of the FASTA file's contents; it
    is used to recover each record's full header line.

    Raises `IOError` if `faidxFilename` is not an existing file.
    """
    if not isfile(faidxFilename): # os.path.isfile
        raise IOError("Companion FASTA index (.fai) file not found or "
                      "malformatted! Use 'samtools faidx' to generate FASTA "
                      "index.")

    tbl = []
    # NB: We have to look back in the FASTA to find the full header;
    # only "id" makes it into the fai.
    offsetEnd = 0
    # `with` closes the index file deterministically, including when a
    # line fails to parse.
    with open(faidxFilename) as faidxFile:
        for line in faidxFile:
            # The last four columns are numeric: length, offset, bases
            # per line, bytes per line.
            length, offset, lineWidth, blen = map(int, line.split()[-4:])
            newlineWidth = blen - lineWidth                                # 2 for DOS, 1 for UNIX
            # The header line lies between the end of the previous
            # record's sequence and the start of this one.
            header_ = fastaView[offsetEnd:offset]
            assert (header_[0] == ">" and header_[-1] == "\n")
            header = header_[1:-newlineWidth]
            id_, comment = splitFastaHeader(header)
            q, r = divmod(length, lineWidth)
            numNewlines = q + (r > 0)
            offsetEnd = offset + length + numNewlines*newlineWidth
            record = FaiRecord(id_, comment, header, length, offset, lineWidth, blen)
            tbl.append(record)
    return tbl
+
def fileOffset(faiRecord, pos):
    """
    Map position `pos` within a contig to a byte position in the FASTA
    file, using the contig's FASTA index record.
    """
    # Whole lines before `pos` each cost `stride` bytes; the remainder
    # is the column within the current line.
    fullLines, column = divmod(pos, faiRecord.lineWidth)
    return faiRecord.offset + fullLines*faiRecord.stride + column
+
class MmappedFastaSequence(Sequence):
    """
    A string-like view of a contig sequence that is backed by a file
    using mmap.
    """
    def __init__(self, view, faiRecord):
        self.view = view
        self.faiRecord = faiRecord

    def __getitem__(self, spec):
        """
        Return the bases selected by `spec`, an integer index or a
        slice, with line terminators removed.

        Raises `ValueError` for a slice whose step is not 1, and
        `IndexError` if the requested range is out of bounds.
        """
        if isinstance(spec, slice):
            start, stop, stride = spec.indices(len(self))
            if stride != 1:
                raise ValueError("Unsupported stride")
        elif spec < 0:
            # Negative indices count back from the end of the contig.
            start = self.faiRecord.length + spec
            stop = start + 1
        else:
            start = spec
            stop = start + 1
        if not (0 <= start <= stop <= self.faiRecord.length):
            raise IndexError("Out of bounds")
        startOffset = fileOffset(self.faiRecord, start)
        endOffset   = fileOffset(self.faiRecord, stop)
        # The byte range can span line breaks; delete them.  The b""
        # literal is a plain str on Python 2.
        snip = self.view[startOffset:endOffset].translate(None, b"\r\n")
        return snip

    def __len__(self):
        return self.faiRecord.length

    def __eq__(self, other):
        return (isinstance(other, MmappedFastaSequence) and
                self[:] == other[:])

    def __ne__(self, other):
        # Python 2 does not derive != from __eq__.
        return not self.__eq__(other)
+
class IndexedFastaRecord(object):
    """
    A FASTA record whose header fields come from a FASTA index entry
    and whose sequence is exposed as a `MmappedFastaSequence` over
    `view`.
    """
    def __init__(self, view, faiRecord):
        self.view = view
        self.faiRecord = faiRecord

    @property
    def name(self):
        """Alias for `header`."""
        return self.header

    @property
    def header(self):
        return self.faiRecord.header

    @property
    def id(self):
        return self.faiRecord.id

    @property
    def comment(self):
        return self.faiRecord.comment

    @property
    def sequence(self):
        """A new `MmappedFastaSequence` view of this record's sequence."""
        return MmappedFastaSequence(self.view, self.faiRecord)

    @property
    @deprecated
    def length(self):
        return self.faiRecord.length

    def __len__(self):
        return self.faiRecord.length

    def __repr__(self):
        return "<IndexedFastaRecord: %s>" % self.header

    def __eq__(self, other):
        return (isinstance(other, IndexedFastaRecord) and
                self.header == other.header and
                self.sequence == other.sequence)

    def __ne__(self, other):
        # Python 2 does not derive != from __eq__.
        return not self.__eq__(other)
+
class IndexedFastaReader(ReaderBase, Sequence):
    """
    Random-access FASTA file reader.

    Requires that the lines of the FASTA file be fixed-length and that
    there is a FASTA index file (generated by `samtools faidx`) with
    name `fastaFilename.fai` in the same directory.

    .. doctest::

        >>> from pbcore.io import IndexedFastaReader
        >>> from pbcore import data
        >>> filename = data.getFasta()
        >>> t = IndexedFastaReader(filename)
        >>> print t[:4] # doctest: +NORMALIZE_WHITESPACE
        [<IndexedFastaRecord: ref000001|EGFR_Exon_2>,
         <IndexedFastaRecord: ref000002|EGFR_Exon_3>,
         <IndexedFastaRecord: ref000003|EGFR_Exon_4>,
         <IndexedFastaRecord: ref000004|EGFR_Exon_5>]
        >>> t.close()

    """
    def __init__(self, filename):
        self.filename = abspath(expanduser(filename))
        self.faiFilename = faiFilename(self.filename)
        self.file = open(self.filename, "r")
        self.view = None
        try:
            # `access=` is the portable way to request a read-only map.
            self.view = mmap.mmap(self.file.fileno(), 0,
                                  access=mmap.ACCESS_READ)
            self.fai = loadFastaIndex(self.faiFilename, self.view)
        except Exception:
            # Do not leak the mapping or the file handle when the map
            # or the index cannot be set up.
            if self.view is not None:
                self.view.close()
            self.file.close()
            raise
        self.contigLookup = self._loadContigLookup()

    def _loadContigLookup(self):
        # Each contig is reachable by position, by id and by full header.
        contigLookup = dict()
        for (pos, faiRecord) in enumerate(self.fai):
            contigLookup[pos]              = faiRecord
            contigLookup[faiRecord.id]     = faiRecord
            contigLookup[faiRecord.header] = faiRecord
        return contigLookup

    def __getitem__(self, key):
        """
        Look up contigs by position (negative positions count from the
        end), by slice, by id or by header.  A slice gives a list of
        records; any other key gives a single record.

        Raises `IndexError` if the key does not name a contig.
        """
        # Slices first: ordering a slice against 0 is meaningless.
        if isinstance(key, slice):
            indices = range(*key.indices(len(self)))
            return [ IndexedFastaRecord(self.view, self.contigLookup[i])
                     for i in indices ]

        # Keys that cannot be ordered against 0 (such as strings on
        # Python 3) are simply not negative positions.
        try:
            isNegative = key < 0
        except TypeError:
            isNegative = False
        if isNegative:
            key = len(self) + key

        if key in self.contigLookup:
            return IndexedFastaRecord(self.view, self.contigLookup[key])
        else:
            raise IndexError("Contig not in FastaTable")

    def __iter__(self):
        return (self[i] for i in range(len(self)))

    def __len__(self):
        return len(self.fai)
+
+# old name for IndexedFastaReader was FastaTable
+FastaTable = IndexedFastaReader
diff --git a/pbcore/io/FastqIO.py b/pbcore/io/FastqIO.py
new file mode 100644
index 0000000..3a8706b
--- /dev/null
+++ b/pbcore/io/FastqIO.py
@@ -0,0 +1,259 @@
+#################################################################################
+# Copyright (c) 2011-2015, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#################################################################################
+
+# Author: David Alexander
+
+"""
+I/O support for FASTQ files
+"""
+
+__all__ = [ "FastqRecord",
+ "FastqReader",
+ "FastqWriter",
+ "qvsFromAscii",
+ "asciiFromQvs" ]
+import numpy as np
+from .base import ReaderBase, WriterBase
+from .FastaIO import splitFastaHeader
+from pbcore import sequence
+from pbcore.util.decorators import deprecated
+
+class FastqRecord(object):
+    """
+    A ``FastqRecord`` object models a named sequence and its quality
+    values in a FASTQ file.  For reference consult `Wikipedia's FASTQ
+    entry`_.  We adopt the Sanger encoding convention, allowing the
+    encoding of QV values in [0, 93] using ASCII 33 to 126.  We only
+    support FASTQ files in the four-line convention (unwrapped).
+    Wrapped FASTQ files are generally considered a bad idea as the @,
+    + delimiters can also appear in the quality string, thus parsing
+    cannot be done safely.
+
+    .. _Wikipedia's FASTQ entry: http://en.wikipedia.org/wiki/FASTQ_format
+    """
+    DELIMITER1 = "@"
+    DELIMITER2 = "+"
+
+    def __init__(self, header, sequence, quality=None, qualityString=None):
+        """
+        Build a record from a header, a sequence and exactly one of
+        `quality` (sequence of integer QVs) or `qualityString`
+        (ASCII-encoded QVs).
+
+        Raises ValueError if the header or sequence contains a newline,
+        if both or neither quality argument is given, or if the sequence
+        and quality lengths differ.
+        """
+        # Explicit checks instead of `assert`, so validation survives `python -O`
+        if "\n" in header or "\n" in sequence:
+            raise ValueError("Invalid FASTQ record data")
+        self._header = header
+        self._sequence = sequence
+        self._id, self._comment = splitFastaHeader(header)
+
+        # Only one of quality, qualityString should be provided
+        if (quality is None) == (qualityString is None):
+            raise ValueError("Invalid FASTQ record data")
+        if quality is not None:
+            self._quality = quality
+        else:
+            self._quality = qvsFromAscii(qualityString)
+        if len(self._sequence) != len(self._quality):
+            raise ValueError("Invalid FASTQ record data")
+
+    @property
+    def header(self):
+        """
+        The header of the sequence in the FASTQ file
+        """
+        return self._header
+
+    @property
+    def name(self):
+        """
+        DEPRECATED: The name of the sequence in the FASTQ file
+        """
+        return self._header
+
+    @property
+    def id(self):
+        """
+        The id of the sequence in the FASTQ file, equal to the FASTQ header
+        up to the first whitespace.
+        """
+        return self._id
+
+    @property
+    @deprecated
+    def length(self):
+        """
+        The length of the sequence
+        """
+        return len(self.sequence)
+
+    @property
+    def comment(self):
+        """
+        The comment associated with the sequence in the FASTQ file, equal to
+        the contents of the FASTQ header following the first whitespace
+        """
+        return self._comment
+
+    @property
+    def sequence(self):
+        """
+        The sequence for the record as present in the FASTQ file.
+        """
+        return self._sequence
+
+    @property
+    def quality(self):
+        """
+        The quality values, as an array of integers
+        """
+        return self._quality
+
+    @property
+    def qualityString(self):
+        """
+        The quality values as an ASCII-encoded string
+        """
+        return asciiFromQvs(self._quality)
+
+    @classmethod
+    def fromString(cls, s):
+        """
+        Interprets a string as a FASTQ record.  Only supports four-line
+        format, as wrapped FASTQs can't easily be safely parsed.
+
+        Raises ValueError if the string is not a well-formed record.
+        """
+        lines = s.rstrip().splitlines()
+        # startswith() also copes with an empty header or separator line,
+        # where indexing [0] would raise IndexError
+        if (len(lines) != 4 or
+                not lines[0].startswith(cls.DELIMITER1) or
+                not lines[2].startswith(cls.DELIMITER2) or
+                len(lines[1]) != len(lines[3])):
+            raise ValueError("String not recognized as a valid FASTQ record")
+        header = lines[0][1:]
+        quality = qvsFromAscii(lines[3])
+        return cls(header, lines[1], quality)
+
+    def reverseComplement(self, preserveHeader=False):
+        """
+        Return a new FastqRecord with the reverse-complemented DNA sequence
+        and the quality values reversed to match.  Unless `preserveHeader`
+        is true, " [revcomp]" is appended to the header of the new record.
+        """
+        rcSequence = sequence.reverseComplement(self.sequence)
+        rcQuality = sequence.reverse(self.quality)
+        if preserveHeader:
+            return FastqRecord(self.header, rcSequence, rcQuality)
+        else:
+            rcName = '{0} [revcomp]'.format(self.header.strip())
+            return FastqRecord(rcName, rcSequence, rcQuality)
+
+    def __len__(self):
+        return len(self._sequence)
+
+    def __eq__(self, other):
+        if isinstance(other, self.__class__):
+            return (self.header == other.header and
+                    self.sequence == other.sequence and
+                    np.array_equiv(self.quality, other.quality))
+        else:
+            return False
+
+    def __ne__(self, other):
+        return not self.__eq__(other)
+
+    def __str__(self):
+        """
+        Output a string representation of this FASTQ record, in
+        standard four-line format.
+        """
+        return "\n".join([self.DELIMITER1 + self.header,
+                          self.sequence,
+                          self.DELIMITER2,
+                          self.qualityString])
+
+class FastqReader(ReaderBase):
+    """
+    Reader for FASTQ files, useable as a one-shot iterator over
+    FastqRecord objects.  FASTQ files must follow the four-line
+    convention.
+    """
+    def __iter__(self):
+        """
+        One-shot iteration support.
+
+        Yields one FastqRecord per group of four lines.  Line endings
+        are stripped instead of chopping the last character, so a final
+        record without a trailing newline keeps its last quality value.
+        A trailing group of fewer than four lines ends the iteration
+        silently.
+        """
+        lineIter = iter(self.file)
+        while True:
+            try:
+                lines = [next(lineIter) for _ in xrange(4)]
+            except StopIteration:
+                # End of input: finish the generator explicitly
+                return
+            yield FastqRecord(lines[0].rstrip("\r\n")[1:],
+                              lines[1].rstrip("\r\n"),
+                              qualityString=lines[3].rstrip("\r\n"))
+
+
+class FastqWriter(WriterBase):
+    """
+    Writer producing four-line FASTQ output.
+
+    Example:
+
+    .. doctest::
+
+        >>> from pbcore.io import FastqWriter
+        >>> with FastqWriter("output.fq.gz") as writer:
+        ...     writer.writeRecord("dog", "GATTACA", [35]*7)
+        ...     writer.writeRecord("cat", "CATTACA", [35]*7)
+
+    .. testcleanup::
+
+        import os; os.unlink("output.fq.gz")
+
+    (Notice that underlying file will be automatically closed after
+    exit from the `with` block.)
+    """
+    def writeRecord(self, *args):
+        """
+        Append one FASTQ record to the file.
+
+        Call with a single ``FastqRecord``, or with three arguments
+        (name, sequence, quality) from which a record is built.  Any
+        other argument count raises ValueError.
+        """
+        argCount = len(args)
+        if argCount == 1:
+            (record,) = args
+            assert isinstance(record, FastqRecord)
+        elif argCount == 3:
+            name, bases, qvs = args
+            record = FastqRecord(name, bases, qvs)
+        else:
+            raise ValueError
+        self.file.write(str(record))
+        self.file.write("\n")
+
+
+##
+## Utility
+##
+def qvsFromAscii(s):
+    """
+    Decode an ASCII quality string (offset 33) into a uint8 array of QVs.
+    """
+    asciiCodes = np.fromstring(s, dtype=np.uint8)
+    return asciiCodes - 33
+
+def asciiFromQvs(a):
+    """
+    Encode QVs as an ASCII quality string (offset 33); values are first
+    clipped to the representable range [0, 93].
+    """
+    clipped = np.clip(a, 0, 93).astype(np.uint8)
+    return (clipped + 33).tostring()
diff --git a/pbcore/io/FofnIO.py b/pbcore/io/FofnIO.py
new file mode 100644
index 0000000..10d6bdb
--- /dev/null
+++ b/pbcore/io/FofnIO.py
@@ -0,0 +1,96 @@
+#################################################################################
+# Copyright (c) 2011-2015, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#################################################################################
+
+# Authors: David Alexander
+
+from pbcore.io.base import getFileHandle
+from os.path import dirname, isabs, join, abspath, expanduser
+import xml.etree.ElementTree as ET
+
+
+__all__ = [ "readFofn",
+ "readInputXML",
+ "enumeratePulseFiles" ]
+
+def readFofn(f):
+    """
+    Iterate over the filenames listed in a FOFN ("file-of-filenames"),
+    given either as a path or as a file-like object.
+
+    Blank lines are skipped.  When f is a path, relative entries are
+    resolved against the directory containing the FOFN.  When f is a
+    file-like object there is no such directory, so a relative entry
+    raises IOError.
+    """
+    fofnRoot = None
+    if isinstance(f, basestring):
+        fofnRoot = dirname(abspath(expanduser(f)))
+
+    for line in getFileHandle(f):
+        path = line.rstrip()
+        if not path:
+            continue # skip empty lines
+        if not isabs(path):
+            if fofnRoot is None:
+                raise IOError("Cannot handle relative paths in StringIO FOFN")
+            path = join(fofnRoot, path)
+        yield path
+
+def readInputXML(fname):
+    """
+    Iterate over the text of every ``location`` element found anywhere
+    in the XML document `fname`, in document order.
+    """
+    document = ET.parse(fname)
+    for locationElt in document.getroot().iter("location"):
+        yield locationElt.text
+
+def enumeratePulseFiles(fname):
+    """
+    A pulse file is a file with suffix .bax.h5, .plx.h5, or .bas.h5
+
+    fname is either a name of a pulse file, a list of names of pulse
+    files, a FOFN (file of file names) listing pulse files, or an
+    input.xml file.
+
+    This is a generalization of readFofn for the case where fname is
+    of type fofn|pulse, provided for convenience for tools that accept
+    such an argument.
+    """
+    # Pick the source of names first, then stream it out
+    if isinstance(fname, list):
+        pulseFiles = fname
+    elif fname.endswith(".fofn"):
+        pulseFiles = readFofn(fname)
+    elif fname.endswith(".xml"):
+        pulseFiles = readInputXML(fname)
+    else:
+        pulseFiles = (fname,)
+    for pls in pulseFiles:
+        yield pls
diff --git a/pbcore/io/GffIO.py b/pbcore/io/GffIO.py
new file mode 100644
index 0000000..1a3a2ce
--- /dev/null
+++ b/pbcore/io/GffIO.py
@@ -0,0 +1,233 @@
+#################################################################################
+# Copyright (c) 2011-2015, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#################################################################################
+
+# Author: David Alexander
+
+"""
+I/O support for GFF3 files.
+
+The specification for the GFF format is available at
+ http://www.sequenceontology.org/gff3.shtml
+"""
+
+__all__ = [ "Gff3Record",
+ "GffReader",
+ "GffWriter" ]
+
+from .base import ReaderBase, WriterBase
+from collections import OrderedDict
+from copy import copy as shallow_copy
+
+class Gff3Record(object):
+    """
+    Class for GFF record, providing uniform access to standard
+    GFF fields and attributes.
+
+    .. doctest::
+
+        >>> from pbcore.io import Gff3Record
+        >>> record = Gff3Record("chr1", 10, 11, "insertion",
+        ...                     attributes=[("foo", "1"), ("bar", "2")])
+        >>> record.start
+        10
+        >>> record.foo
+        '1'
+        >>> record.baz = 3
+        >>> del record.baz
+
+    Attribute access using record.fieldName notation raises
+    ``AttributeError`` if an attribute named fieldName doesn't exist.  Use::
+
+        >>> record.get(fieldName)
+
+    to fetch a field or attribute with None default or::
+
+        >>> record.get(fieldName, defaultValue)
+
+    to fetch the field or attribute with a custom default.
+    """
+    _GFF_COLUMNS = [ "seqid", "source", "type",
+                     "start", "end", "score",
+                     "strand", "phase", "attributes" ]
+
+    def __init__(self, seqid, start, end, type,
+                 score=".", strand=".", phase=".",
+                 source=".", attributes=()):
+        self.seqid = seqid
+        self.source = source
+        self.type = type
+        self.start = start
+        self.end = end
+        self.score = score
+        self.strand = strand
+        self.phase = phase
+        self.attributes = OrderedDict(attributes)
+
+    def copy(self):
+        """
+        Return a shallow copy.  The copy shares its ``attributes``
+        dictionary with the original record.
+        """
+        return shallow_copy(self)
+
+    @classmethod
+    def fromString(cls, s):
+        """
+        Parse a string as a GFF record.
+        Trailing whitespace is ignored.
+
+        Raises ValueError if the string does not have the nine
+        tab-separated GFF columns or a column cannot be interpreted.
+        """
+        columns = s.rstrip().rstrip(";").split("\t")
+        try:
+            # Explicit check instead of `assert`, which `python -O` strips
+            if len(columns) != len(cls._GFF_COLUMNS):
+                raise ValueError("wrong number of columns")
+            attributes = map(tupleFromGffAttribute, columns[-1].split(";"))
+            (_seqid, _source, _type, _start,
+             _end, _score, _strand, _phase) = columns[:-1]
+            return cls(_seqid, int(_start), int(_end), _type,
+                       _score, _strand, _phase, _source, attributes)
+        except ValueError:
+            raise ValueError("Could not interpret string as a Gff3Record: %s" % s)
+
+
+    @staticmethod
+    def _formatField(field):
+        # Floats are written with two decimals, everything else via str
+        if type(field) == float:
+            return "%.2f" % field
+        else:
+            return "%s" % field
+
+    def __str__(self):
+        formattedAttributes = ";".join(
+            "%s=%s" % (k, self._formatField(v))
+            for (k, v) in self.attributes.items())
+        formattedFixedColumns = "\t".join(
+            self._formatField(getattr(self, k))
+            for k in self._GFF_COLUMNS[:-1])
+        return "%s\t%s" % (formattedFixedColumns,
+                           formattedAttributes)
+
+    #
+    # Access to the attributes list using
+    # dot notation, providing a uniform
+    # interface.  Exception if attribute
+    # not found.
+    #
+    def __getattr__(self, name):
+        # Only reached when normal lookup fails.  "attributes" itself is
+        # guarded so that an instance whose dictionary is not populated yet
+        # (e.g. while it is being copied) fails cleanly instead of recursing.
+        if name != "attributes" and name in self.attributes:
+            return self.attributes[name]
+        raise AttributeError(name)
+
+    def __setattr__(self, name, value):
+        # Standard columns live on the instance, anything else is stored
+        # in the attributes dictionary
+        if name in self._GFF_COLUMNS:
+            object.__setattr__(self, name, value)
+        else:
+            self.attributes[name] = value
+
+    def __delattr__(self, name):
+        del self.attributes[name]
+
+    #
+    # Access without exceptions.
+    #
+    def get(self, name, default=None):
+        return getattr(self, name, default)
+
+    def put(self, name, value):
+        setattr(self, name, value)
+
+class GffReader(ReaderBase):
+    """
+    Reader for GFF files.  The leading ``##`` header lines are consumed
+    at construction time and kept in ``headers``; iterating yields one
+    Gff3Record per remaining line.
+    """
+    def _readHeaders(self):
+        """
+        Consume the leading ``##`` lines.  Returns the stripped header
+        lines plus the first non-header line (None if there is none).
+        """
+        headerLines = []
+        for line in self.file:
+            if not line.startswith("##"):
+                # This line is already consumed; hand it back to the caller
+                return headerLines, line
+            headerLines.append(line.rstrip())
+        return headerLines, None
+
+    def __init__(self, f):
+        super(GffReader, self).__init__(f)
+        self.headers, self.firstLine = self._readHeaders()
+
+    def __iter__(self):
+        # The first record line was read while scanning the headers
+        if self.firstLine:
+            yield Gff3Record.fromString(self.firstLine)
+            self.firstLine = None
+        for recordLine in self.file:
+            yield Gff3Record.fromString(recordLine)
+
+
+class GffWriter(WriterBase):
+    """
+    Writer for GFF files.  The ``##gff-version 3`` header is written as
+    soon as the writer is constructed.
+    """
+    def __init__(self, f):
+        super(GffWriter, self).__init__(f)
+        self.writeHeader("##gff-version 3")
+
+    def writeHeader(self, headerLine):
+        """
+        Write one header line; raises ValueError unless it starts with ``##``.
+        """
+        if headerLine.startswith("##"):
+            self.file.write("{0}\n".format(headerLine.rstrip()))
+        else:
+            raise ValueError("GFF headers must start with ##")
+
+    def writeRecord(self, record):
+        """
+        Write one Gff3Record as a line of the file.
+        """
+        assert isinstance(record, Gff3Record)
+        recordLine = "{0}\n".format(str(record))
+        self.file.write(recordLine)
+
+#
+# Utility functions
+#
+
+def floatValue(s):
+    """
+    Return float(s), or None if s cannot be converted to a float.
+    """
+    try:
+        return float(s)
+    except (TypeError, ValueError, OverflowError):
+        # Conversion failures only; KeyboardInterrupt etc. still propagate
+        return None
+
+def integerValue(s):
+    """
+    Return int(s), or None if s cannot be converted to an integer.
+    """
+    try:
+        return int(s)
+    except (TypeError, ValueError, OverflowError):
+        # Conversion failures only; KeyboardInterrupt etc. still propagate
+        return None
+
+def grok(s):
+    """
+    Interpret s as an integer if possible, else as a float if possible,
+    else return it unchanged.
+    """
+    for convert in (integerValue, floatValue):
+        parsed = convert(s)
+        if parsed is not None:
+            return parsed
+    return s
+
+def tupleFromGffAttribute(s):
+    """
+    Split one ``key=value`` attribute into a (key, value) tuple, with
+    the value converted by grok().
+
+    Only the first "=" separates key from value, so a value may itself
+    contain "=".  Raises ValueError if s contains no "=".
+    """
+    k, v = s.split("=", 1)
+    return k, grok(v)
diff --git a/pbcore/io/__init__.py b/pbcore/io/__init__.py
new file mode 100644
index 0000000..89a588f
--- /dev/null
+++ b/pbcore/io/__init__.py
@@ -0,0 +1,40 @@
+#################################################################################
+# Copyright (c) 2011-2015, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#################################################################################
+
+from .BasH5IO import *
+from .FastaIO import *
+from .FastqIO import *
+from .FofnIO import *
+from .GffIO import *
+from .base import *
+
+from .align import *
+
+from .opener import *
diff --git a/pbcore/io/_utils.py b/pbcore/io/_utils.py
new file mode 100644
index 0000000..a138cb3
--- /dev/null
+++ b/pbcore/io/_utils.py
@@ -0,0 +1,246 @@
+#################################################################################
+# Copyright (c) 2011-2015, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#################################################################################
+
+from __future__ import absolute_import
+import h5py, numpy as np
+from cStringIO import StringIO
+
+
+def arrayFromDataset(ds, offsetBegin, offsetEnd):
+    """
+    Read the elements [offsetBegin, offsetEnd) of a one-dimensional
+    HDF5 dataset into a newly allocated numpy array.
+    """
+    extent = (offsetEnd - offsetBegin,)
+    out = np.ndarray(shape=extent, dtype=ds.dtype)
+    memSpace = h5py.h5s.create_simple(extent)
+    fileSpace = ds.id.get_space()
+    # Select `extent` elements of the file starting at offsetBegin
+    fileSpace.select_hyperslab((offsetBegin,), extent, (1,))
+    ds.id.read(memSpace, fileSpace, out)
+    return out
+
+
+def splitFileContents(f, delimiter, BLOCKSIZE=8192):
+    """
+    Same semantics as f.read().split(delimiter), but with memory usage
+    determined by largest chunk rather than entire file size.
+
+    The file is read BLOCKSIZE characters at a time.  A delimiter of
+    more than one character may straddle two reads, so the last
+    len(delimiter) - 1 characters of each read are held back and
+    re-examined together with the next read.
+    """
+    remainder = StringIO()
+    # Tail of the previous read that could be the start of a delimiter
+    carry = ""
+    holdBack = len(delimiter) - 1
+    while True:
+        block = f.read(BLOCKSIZE)
+        if not block:
+            break
+        parts = (carry + block).split(delimiter)
+        if holdBack > 0:
+            lastPart = parts[-1]
+            carry = lastPart[-holdBack:]
+            parts[-1] = lastPart[:len(lastPart) - len(carry)]
+        remainder.write(parts[0])
+        for part in parts[1:]:
+            yield remainder.getvalue()
+            remainder = StringIO()
+            remainder.write(part)
+    # Whatever was held back belongs to the final chunk
+    remainder.write(carry)
+    yield remainder.getvalue()
+
+
+
+# For reasons that are obscure to me, the recarray outer join
+# functionality in numpy's lib.recfunctions is broken as of numpy
+# 1.6.1. Here is the implementation I found in matplotlib (BSD
+# compatible license; need to add license note to LICENSE), which
+# seems to work.
+# --DHA
+
+def is_string_like(obj):
+    'Return True if *obj* looks like a string'
+    # type(u"") is the unicode text type, named without a version-specific builtin
+    if isinstance(obj, (str, type(u""))): return True
+    # numpy strings are subclass of str, ma strings are not
+    # (np.ma is used because this module never imports a bare `ma`)
+    if np.ma.isMaskedArray(obj):
+        if obj.ndim == 0 and obj.dtype.kind in 'SU':
+            return True
+        else:
+            return False
+    # Anything that can be concatenated with a string counts as string-like
+    try: obj + ''
+    except Exception: return False
+    return True
+
+def rec_join(key, r1, r2, jointype='inner', defaults=None, r1postfix='1', r2postfix='2'):
+    """
+    Join record arrays *r1* and *r2* on *key*; *key* is a tuple of
+    field names -- if *key* is a string it is assumed to be a single
+    attribute name.  If *r1* and *r2* have equal values on all the keys
+    in the *key* tuple, then their fields will be merged into a new
+    record array containing the key fields followed by the remaining
+    fields of *r1* and of *r2*.
+
+    *r1* (also *r2*) must not have any duplicate keys.
+
+    The *jointype* keyword can be 'inner', 'outer', 'leftouter'.  To
+    do a rightouter join just reverse *r1* and *r2*.
+
+    The *defaults* keyword is a dictionary filled with
+    ``{column_name:default_value}`` pairs.
+
+    The keywords *r1postfix* and *r2postfix* are postfixed to column names
+    (other than keys) that are both in *r1* and *r2*.
+
+    Raises ValueError if a key field is missing from *r1* or *r2*.
+    The returned record array is sorted by *key*.
+    """
+    # Local import: this module does not import warnings at file level
+    import warnings
+
+    if is_string_like(key):
+        key = (key, )
+
+    for name in key:
+        if name not in r1.dtype.names:
+            raise ValueError('r1 does not have key field %s'%name)
+        if name not in r2.dtype.names:
+            raise ValueError('r2 does not have key field %s'%name)
+
+    def makekey(row):
+        return tuple([row[name] for name in key])
+
+    # Map each key tuple to the row index that carries it
+    r1d = dict([(makekey(row),i) for i,row in enumerate(r1)])
+    r2d = dict([(makekey(row),i) for i,row in enumerate(r2)])
+
+    r1keys = set(r1d.keys())
+    r2keys = set(r2d.keys())
+
+    common_keys = r1keys & r2keys
+
+    r1ind = np.array([r1d[k] for k in common_keys])
+    r2ind = np.array([r2d[k] for k in common_keys])
+
+    common_len = len(common_keys)
+    left_len = right_len = 0
+    if jointype == "outer" or jointype == "leftouter":
+        left_keys = r1keys.difference(r2keys)
+        left_ind = np.array([r1d[k] for k in left_keys])
+        left_len = len(left_ind)
+    if jointype == "outer":
+        right_keys = r2keys.difference(r1keys)
+        right_ind = np.array([r2d[k] for k in right_keys])
+        right_len = len(right_ind)
+
+    def key_desc(name):
+        'if name is a string key, use the larger size of r1 or r2 before merging'
+        dt1 = r1.dtype[name]
+        if dt1.kind != 'S':
+            return (name, dt1.descr[0][1])
+
+        # Compare against r2's field (not r1's again) so that the wider of
+        # the two string types is kept and no key value gets truncated
+        dt2 = r2.dtype[name]
+        if dt2.kind == 'S' and dt2.itemsize > dt1.itemsize:
+            return (name, dt2.descr[0][1])
+        return (name, dt1.descr[0][1])
+
+
+    keydesc = [key_desc(name) for name in key]
+
+    def mapped_r1field(name):
+        """
+        The column name in *newrec* that corresponds to the column in *r1*.
+        """
+        if name in key or name not in r2.dtype.names: return name
+        else: return name + r1postfix
+
+    def mapped_r2field(name):
+        """
+        The column name in *newrec* that corresponds to the column in *r2*.
+        """
+        if name in key or name not in r1.dtype.names: return name
+        else: return name + r2postfix
+
+    r1desc = [(mapped_r1field(desc[0]), desc[1]) for desc in r1.dtype.descr if desc[0] not in key]
+    r2desc = [(mapped_r2field(desc[0]), desc[1]) for desc in r2.dtype.descr if desc[0] not in key]
+    newdtype = np.dtype(keydesc + r1desc + r2desc)
+
+    # Row layout of the result: common rows, then left-only, then right-only
+    newrec = np.recarray((common_len + left_len + right_len,), dtype=newdtype)
+
+    if defaults is not None:
+        for thiskey in defaults:
+            if thiskey not in newdtype.names:
+                warnings.warn('rec_join defaults key="%s" not in new dtype names "%s"'%(
+                    thiskey, newdtype.names))
+
+    # np.recarray does not initialize memory; zero the numeric columns
+    for name in newdtype.names:
+        dt = newdtype[name]
+        if dt.kind in ('f', 'i'):
+            newrec[name] = 0
+
+    if jointype != 'inner' and defaults is not None: # fill in the defaults enmasse
+        newrec_fields = newrec.dtype.fields.keys()
+        for k, v in defaults.items():
+            if k in newrec_fields:
+                newrec[k] = v
+
+    for field in r1.dtype.names:
+        newfield = mapped_r1field(field)
+        if common_len:
+            newrec[newfield][:common_len] = r1[field][r1ind]
+        if (jointype == "outer" or jointype == "leftouter") and left_len:
+            newrec[newfield][common_len:(common_len+left_len)] = r1[field][left_ind]
+
+    for field in r2.dtype.names:
+        newfield = mapped_r2field(field)
+        # Key columns of the common rows were already filled from r1
+        if field not in key and common_len:
+            newrec[newfield][:common_len] = r2[field][r2ind]
+        if jointype == "outer" and right_len:
+            newrec[newfield][-right_len:] = r2[field][right_ind]
+
+    newrec.sort(order=key)
+
+    return newrec
+
+
+def drop_fields(rec, names):
+    """
+    Return a new numpy record array with fields in *names* dropped.
+
+    The remaining fields keep their order and dtypes; *rec* itself is
+    not modified.  Names that are not fields of *rec* are ignored.
+    """
+
+    names = set(names)
+
+    newdtype = np.dtype([(name, rec.dtype[name]) for name in rec.dtype.names
+                         if name not in names])
+
+    newrec = np.recarray(rec.shape, dtype=newdtype)
+    for field in newdtype.names:
+        newrec[field] = rec[field]
+
+    return newrec
+
+def print_rec_array(rec):
+ """
+ Pretty-print a recarray
+
+ NOTE(review): not implemented yet -- *rec* is ignored and the
+ literal string "foo" is printed instead.
+ """
+ print "foo"
+
+
+class CommonEqualityMixin(object):
+    """
+    Mixin giving value equality: two objects are equal when the other
+    is an instance of this object's class and both have equal
+    instance dictionaries.
+    """
+    def __eq__(self, other):
+        if not isinstance(other, self.__class__):
+            return False
+        return self.__dict__ == other.__dict__
+
+    def __ne__(self, other):
+        areEqual = self.__eq__(other)
+        return not areEqual
diff --git a/pbcore/io/align/BamAlignment.py b/pbcore/io/align/BamAlignment.py
new file mode 100644
index 0000000..f192ceb
--- /dev/null
+++ b/pbcore/io/align/BamAlignment.py
@@ -0,0 +1,571 @@
+#################################################################################
+# Copyright (c) 2011-2015, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#################################################################################
+
+# Author: David Alexander
+
+from functools import wraps
+from bisect import bisect_right, bisect_left
+
+from pbcore.sequence import reverseComplement
+from ._BamSupport import *
+from ._AlignmentMixin import AlignmentRecordMixin
+
+__all__ = [ "BamAlignment" ]
+
+
+ def _unrollCigar(cigar, exciseSoftClips=False):
+     """
+     Run-length decode the cigar (input is BAM packed CIGAR, i.e. a
+     sequence of (op, length) pairs, not a cigar string) into an array
+     holding one op code per position.
+
+     A hard clip op in the first and/or last slot is left out of the
+     output; soft clip ops are left out as well when `exciseSoftClips`
+     is true.
+     """
+     packed = np.array(cigar, dtype=int)
+     # Drop at most one hard clip op from each end before expanding
+     first = 1 if packed[0, 0] == BAM_CHARD_CLIP else 0
+     last = len(packed) - (1 if packed[-1, 0] == BAM_CHARD_CLIP else 0)
+     kept = packed[first:last]
+     ops = np.repeat(kept[:, 0], kept[:, 1])
+     if not exciseSoftClips:
+         return ops
+     return ops[ops != BAM_CSOFT_CLIP]
+
+ # Build a method that forwards to `self.pulseFeature` with `featureName`
+ # fixed.  Used in the BamAlignment class body to define the per-feature
+ # accessors (IPD, PulseWidth, InsertionQV, ...).
+ def _makePulseFeatureAccessor(featureName):
+ def f(self, aligned=True, orientation="native"):
+ return self.pulseFeature(featureName, aligned, orientation)
+ return f
+
+
+ def requiresReference(method):
+     """
+     Decorator for alignment methods that need the reference sequence.
+
+     The wrapped method raises UnavailableFeature unless the record's
+     reader (`bamAln.bam`) reports `isReferenceLoaded`.
+     """
+     @wraps(method)
+     def guarded(bamAln, *args, **kwargs):
+         if bamAln.bam.isReferenceLoaded:
+             return method(bamAln, *args, **kwargs)
+         raise UnavailableFeature("this feature requires loaded reference sequence")
+     return guarded
+
+ def requiresPbi(method):
+     """
+     Decorator for alignment methods that need the PacBio BAM index.
+
+     The wrapped method raises UnavailableFeature when the record has no
+     row number (`bamAln.rowNumber is None`).
+     """
+     @wraps(method)
+     def guarded(bamAln, *args, **kwargs):
+         if bamAln.rowNumber is not None:
+             return method(bamAln, *args, **kwargs)
+         raise UnavailableFeature("this feature requires a PacBio BAM index")
+     return guarded
+
+ def requiresMapping(method):
+     """
+     Decorator for alignment methods that only make sense for mapped
+     records.
+
+     The wrapped method raises UnavailableFeature when
+     `bamAln.isUnmapped` is true.
+     """
+     @wraps(method)
+     def guarded(bamAln, *args, **kwargs):
+         if not bamAln.isUnmapped:
+             return method(bamAln, *args, **kwargs)
+         raise UnavailableFeature("this feature requires a *mapped* BAM record")
+     return guarded
+
+
+class BamAlignment(AlignmentRecordMixin):
+ def __init__(self, bamReader, pysamAlignedRead, rowNumber=None):
+     """
+     Wrap the pysam record `pysamAlignedRead` obtained from `bamReader`.
+
+     `rowNumber` is the record's row in the reader's PacBio BAM index,
+     or None when the record was not obtained through such an index.
+     """
+     #TODO: make these __slot__
+     self.peer = pysamAlignedRead
+     self.bam = bamReader
+     self.rowNumber = rowNumber
+     self.tStart = self.peer.pos
+     self.tEnd = self.peer.aend
+     # Our terminology doesn't agree with pysam's terminology for
+     # "query", "read".  This makes this code confusing.
+     clipLeft = self.peer.qstart
+     clipRight = self.peer.rlen - self.peer.qend
+     if self.peer.is_reverse:
+         # The two clip lengths trade places for a reverse-strand record
+         clipLeft, clipRight = clipRight, clipLeft
+     self.aStart = self.qStart + clipLeft
+     self.aEnd = self.qEnd - clipRight
+
+     # Cache of unrolled cigar, in genomic orientation
+     self._unrolledCigar = None
+
+
+ # The reader this record was obtained from (same object as `self.bam`).
+ @property
+ def reader(self):
+ return self.bam
+
+ # `ID` field of this record's read group entry.
+ @property
+ def qId(self):
+ return self.readGroupInfo.ID
+
+ # Query name as stored on the pysam record.
+ @property
+ def qName(self):
+ return self.peer.qname
+
+ # Value of the record's "qs" tag.
+ @property
+ def qStart(self):
+ return self.peer.opt("qs")
+
+ # Value of the record's "qe" tag.
+ @property
+ def qEnd(self):
+ return self.peer.opt("qe")
+
+ # pysam's `query_length` for this record.
+ @property
+ def qLen(self):
+ return self.peer.query_length
+
+ # pysam's `tid` for this record.
+ @property
+ def tId(self):
+ return self.peer.tid
+
+ # Logical negation of `isUnmapped`.
+ @property
+ def isMapped(self):
+ return not self.isUnmapped
+
+ # pysam's `is_unmapped` flag for this record.
+ @property
+ def isUnmapped(self):
+ return self.peer.is_unmapped
+
+ # pysam's `is_reverse` flag for this record.
+ @property
+ def isReverseStrand(self):
+ return self.peer.is_reverse
+
+ # Logical negation of pysam's `is_reverse` flag.
+ @property
+ def isForwardStrand(self):
+ return not self.peer.is_reverse
+
+ # Value of the record's "zm" tag.
+ @property
+ def HoleNumber(self):
+ return self.peer.opt("zm")
+
+ # pysam's `mapq` for this record.
+ @property
+ def MapQV(self):
+ return self.peer.mapq
+
+ @requiresMapping
+ def clippedTo(self, refStart, refEnd):
+ """
+ Return a new `BamAlignment` that refers to a subalignment of
+ this alignment, as induced by clipping to reference
+ coordinates `refStart` to `refEnd`.
+
+ Raises IndexError if the requested window is empty or does not
+ overlap [tStart, tEnd).
+
+ .. warning::
+ This function takes time linear in the length of the alignment.
+ """
+ # Only an exact BamAlignment may be clipped; an instance of a subclass
+ # (such as the clipped alignment returned below) fails this assertion.
+ assert type(self) is BamAlignment
+ if (refStart >= refEnd or
+ refStart >= self.tEnd or
+ refEnd <= self.tStart):
+ raise IndexError, "Clipping query does not overlap alignment"
+
+ # The clipping region must intersect the alignment, though it
+ # does not have to be contained wholly within it.
+ # NOTE(review): `referenceStart`/`referenceEnd` are not defined in this
+ # class; presumably supplied by AlignmentRecordMixin -- confirm.
+ refStart = max(self.referenceStart, refStart)
+ refEnd = min(self.referenceEnd, refEnd)
+ refPositions = self.referencePositions(orientation="genomic")
+ readPositions = self.readPositions(orientation="genomic")
+ uc = self.unrolledCigar(orientation="genomic")
+
+ # Clipping positions within the alignment array
+ clipStart = bisect_right(refPositions, refStart) - 1
+ clipEnd = bisect_left(refPositions, refEnd)
+
+ tStart = refStart
+ tEnd = refEnd
+ cUc = uc[clipStart:clipEnd]
+ # Number of retained alignment columns that are not deletions
+ readLength = sum(cUc != BAM_CDEL)
+ if self.isForwardStrand:
+ aStart = readPositions[clipStart]
+ aEnd = aStart + readLength
+ else:
+ # In genomic orientation the read positions of a reverse-strand
+ # record run downwards, so the first retained column carries the
+ # highest read position of the clipped range.
+ aEnd = readPositions[clipStart] + 1
+ aStart = aEnd - readLength
+ return ClippedBamAlignment(self, tStart, tEnd, aStart, aEnd, cUc)
+
+ # Reference table entry for this record's reference id (mapped records only).
+ @property
+ @requiresMapping
+ def referenceInfo(self):
+ return self.bam.referenceInfo(self.referenceId)
+
+ # `FullName` field of the reference table entry (mapped records only).
+ @property
+ @requiresMapping
+ def referenceName(self):
+ return self.referenceInfo.FullName
+
+ # `MovieName` field of this record's read group entry.
+ @property
+ def movieName(self):
+ return self.readGroupInfo.MovieName
+
+ # Read group entry looked up in the reader by the record's "RG" tag,
+ # after conversion with `rgAsInt`.
+ @property
+ def readGroupInfo(self):
+ return self.bam.readGroupInfo(rgAsInt(self.peer.opt("RG")))
+
+ # `ReadType` field of this record's read group entry.
+ @property
+ def readType(self):
+ return self.readGroupInfo.ReadType
+
+ # `SequencingChemistry` field of this record's read group entry.
+ @property
+ def sequencingChemistry(self):
+ return self.readGroupInfo.SequencingChemistry
+
+ # Alias for `tId`.
+ @property
+ def referenceId(self):
+ return self.tId
+
+ # Alias for `qStart`.
+ @property
+ def queryStart(self):
+ return self.qStart
+
+ # Alias for `qEnd`.
+ @property
+ def queryEnd(self):
+ return self.qEnd
+
+
+ #TODO: provide this in cmp.h5 but throw "unsupported"
+ # Query name as stored on the pysam record (same value as `qName`).
+ @property
+ def queryName(self):
+ return self.peer.qname
+
+ # 1 - (nMM + nIns + nDel) / readLength, or 0 for a zero-length read.
+ # NOTE(review): nMM/nIns/nDel are not defined in this class; presumably
+ # they resolve through __getattr__ to pbi index columns -- confirm.
+ @property
+ @requiresPbi
+ def identity(self):
+ if self.readLength == 0:
+ return 0.
+ else:
+ return 1. - float(self.nMM + self.nIns + self.nDel)/self.readLength
+
+ # Value of the record's "np" tag.
+ @property
+ def numPasses(self):
+ return self.peer.opt("np")
+
+ # Always raises UnavailableFeature.
+ @property
+ def zScore(self):
+ raise UnavailableFeature("No ZScore in BAM")
+
+ # Not implemented: always raises Unimplemented.
+ @property
+ def barcode(self):
+ raise Unimplemented()
+
+ # Not implemented: always raises Unimplemented.
+ @property
+ def barcodeName(self):
+ raise Unimplemented()
+
+ @requiresReference
+ def transcript(self, orientation="native", style="gusfield"):
+     """
+     A text representation of the alignment moves (see Gusfield).
+     This can be useful in pretty-printing an alignment.
+
+     `style` selects the output alphabet: "exonerate+", "exonerate" or
+     "cigar"; any other value gives the "gusfield" alphabet.  The
+     result has one character per alignment column, in the requested
+     `orientation`.
+     """
+     # Work on a copy: unrolledCigar() hands back the cached array (or a
+     # reversed view of it), and the relabeling below must not leak
+     # into that cache.
+     cigarPlus = self.unrolledCigar(orientation).copy()
+     ref = np.fromstring(self.reference(aligned=True, orientation=orientation), dtype=np.int8)
+     read = np.fromstring(self.read(aligned=True, orientation=orientation), dtype=np.int8)
+     isMatch = (ref == read)
+
+     # Disambiguate the "M" op
+     isM = (cigarPlus == BAM_CMATCH)
+     cigarPlus[isM & ~isMatch] = BAM_CDIFF   # 'X'
+     cigarPlus[isM & isMatch] = BAM_CEQUAL   # '='
+
+     # One output character per op code, in MIDNSHP=X order, so every
+     # table must have exactly nine entries (I and D are blanks in the
+     # exonerate styles).
+     translations = {
+         "exonerate+": "Z  ZZZZ|*",
+         "exonerate":  "Z  ZZZZ| ",
+         "cigar":      "ZIDZZZZMM",
+     }
+     table = np.fromstring(translations.get(style, "ZIDZZZZMR"), dtype=np.int8)
+     return table[cigarPlus].tostring()
+
+
+ @requiresReference
+ def reference(self, aligned=True, orientation="native"):
+     """
+     Return the reference bases in [tStart, tEnd) of this record's
+     reference contig, as a string.
+
+     In "native" orientation the bases of a reverse-strand record are
+     reverse-complemented.  With `aligned` true, gap characters are
+     inserted so the result lines up with the alignment columns.
+     Raises ValueError for an unknown `orientation`.
+     """
+     if orientation not in ("native", "genomic"):
+         raise ValueError("Bad `orientation` value")
+     contig = self.bam.referenceFasta[self.referenceName]
+     tSeq = contig.sequence[self.tStart:self.tEnd]
+     if orientation == "native" and self.isReverseStrand:
+         tSeq = reverseComplement(tSeq)
+     if not aligned:
+         return tSeq
+     gapped = self._gapifyRef(np.fromstring(tSeq, dtype=np.int8), orientation)
+     return gapped.tostring()
+
+ @requiresMapping
+ def unrolledCigar(self, orientation="native"):
+     """
+     Run-length decode the CIGAR encoding, and orient.  Clipping ops are removed.
+
+     The decoded array is computed on first use and cached in genomic
+     orientation.  What is returned is the cached array itself or, for
+     a reverse-strand record in "native" orientation, a reversed view
+     of it, so callers should not modify the result in place.
+
+     Unmapped records never get here: `requiresMapping` raises
+     UnavailableFeature for them.
+     """
+     if self._unrolledCigar is None:
+         self._unrolledCigar = _unrollCigar(self.peer.cigar, exciseSoftClips=True)
+
+     if orientation == "native" and self.isReverseStrand:
+         return self._unrolledCigar[::-1]
+     return self._unrolledCigar
+
+ @requiresMapping
+ def referencePositions(self, aligned=True, orientation="native"):
+     """
+     Returns an array of reference positions.
+
+     If aligned is True, the array has the same length as the
+     alignment and referencePositions[i] = reference position of
+     the i'th column in the oriented alignment.
+
+     If aligned is False, the array has the same length as the read
+     and referencePositions[i] = reference position of the i'th
+     base in the oriented read.
+     """
+     assert (aligned in (True, False) and
+             orientation in ("native", "genomic"))
+
+     ucOriented = self.unrolledCigar(orientation)
+     # Every column other than an insertion advances the reference by one
+     advancesRef = (ucOriented != BAM_CINS)
+     offsets = np.hstack([0, np.cumsum(advancesRef[:-1])])
+
+     if self.isReverseStrand and orientation == "native":
+         # Native orientation of a reverse-strand record walks the
+         # reference downwards from its last aligned base
+         pos = (self.tEnd - 1) - offsets
+     else:
+         pos = self.tStart + offsets
+
+     return pos if aligned else pos[ucOriented != BAM_CDEL]
+
+ def readPositions(self, aligned=True, orientation="native"):
+     """
+     Returns an array of read positions.
+
+     If aligned is True, the array has the same length as the
+     alignment and readPositions[i] = read position of the i'th
+     column in the oriented alignment.
+
+     If aligned is False, the array has the same length as the
+     mapped reference segment and readPositions[i] = read position
+     of the i'th base in the oriented reference segment.
+     """
+     assert (aligned in (True, False) and
+             orientation in ("native", "genomic"))
+
+     ucOriented = self.unrolledCigar(orientation)
+     # Every column other than a deletion advances the read by one
+     advancesRead = (ucOriented != BAM_CDEL)
+     offsets = np.hstack([0, np.cumsum(advancesRead[:-1])])
+
+     if self.isReverseStrand and orientation == "genomic":
+         # Genomic orientation of a reverse-strand record walks the
+         # read downwards from its last aligned base
+         pos = (self.aEnd - 1) - offsets
+     else:
+         pos = self.aStart + offsets
+
+     return pos if aligned else pos[ucOriented != BAM_CINS]
+
+
+ def pulseFeature(self, featureName, aligned=True, orientation="native"):
+ """
+ Retrieve the pulse feature as indicated.
+ - `featureName`: a key of PULSE_FEATURE_TAGS
+ - `aligned` : whether gaps should be inserted to reflect the alignment
+ - `orientation`: "native" or "genomic"
+
+ Note that this function assumes the feature is stored in
+ native orientation in the file, so it is not appropriate to
+ use this method to fetch the read or the qual, which are
+ oriented genomically in the file.
+
+ Raises ValueError for an unknown `orientation`, and
+ UnavailableFeature when an aligned or genomic-oriented feature is
+ requested from an unmapped record.
+ """
+ if not (orientation == "native" or orientation == "genomic"):
+ raise ValueError, "Bad `orientation` value"
+ if self.isUnmapped and (orientation != "native" or aligned == True):
+ raise UnavailableFeature, \
+ "Cannot get genome oriented/aligned features from unmapped BAM record"
+ # 1. Extract in native orientation
+ tag, kind_, dtype_ = PULSE_FEATURE_TAGS[featureName]
+ data_ = self.peer.opt(tag)
+ if isinstance(data_, str):
+ data = np.fromstring(data_, dtype=dtype_)
+ else:
+ # This is about 300x slower than the fromstring above.
+ # Unless pysam exposes buffer or numpy interface,
+ # this is going to be very slow.
+ data = np.fromiter(data_, dtype=dtype_)
+ del data_
+ assert len(data) == self.peer.rlen
+
+ # 2. Decode
+ if kind_ == "qv":
+ # Undo the +33 offset applied to the stored values
+ data -= 33
+ elif kind_ == "time":
+ data = codeToFrames(data)
+
+ # 3. Clip
+ # [s, e) delimits the range, within the query, that is in the aligned read.
+ # This will be determined by the soft clips actually in the file as well as those
+ # imposed by the clipping API here.
+ s = self.aStart - self.qStart
+ e = self.aEnd - self.qStart
+ assert s >= 0 and e <= len(data)
+ clipped = data[s:e]
+
+ # 4. Orient
+ # Data is in native orientation here, so only a genomic request for a
+ # reverse-strand record needs flipping ("base" features are
+ # reverse-complemented, all others just reversed).
+ shouldReverse = self.isReverseStrand and orientation == "genomic"
+ if kind_ == "base":
+ ungapped = reverseComplementAscii(clipped) if shouldReverse else clipped
+ else:
+ ungapped = clipped[::-1] if shouldReverse else clipped
+
+ # 5. Gapify if requested
+ if aligned == False:
+ return ungapped
+ else:
+ return self._gapifyRead(ungapped, orientation)
+
+ # Spread read-side data over the alignment columns: gaps go where the
+ # unrolled cigar has a deletion.
+ def _gapifyRead(self, data, orientation):
+ return self._gapify(data, orientation, BAM_CDEL)
+
+ # Spread reference-side data over the alignment columns: gaps go where
+ # the unrolled cigar has an insertion.
+ def _gapifyRef(self, data, orientation):
+ return self._gapify(data, orientation, BAM_CINS)
+
+ # Return an array with one entry per alignment column: columns whose op
+ # equals `gapOp` hold the gap code, all other columns are filled, in
+ # order, from `data`.  For an unmapped record `data` is returned as is.
+ def _gapify(self, data, orientation, gapOp):
+ if self.isUnmapped: return data
+
+ # Precondition: data must already be *in* the specified orientation
+ # int8 data is treated as ASCII text and gapped with "-"; any other
+ # dtype is gapped with -1 converted to that dtype.
+ if data.dtype == np.int8:
+ gapCode = ord("-")
+ else:
+ gapCode = data.dtype.type(-1)
+ uc = self.unrolledCigar(orientation=orientation)
+ alnData = np.repeat(np.array(gapCode, dtype=data.dtype), len(uc))
+ gapMask = (uc == gapOp)
+ alnData[~gapMask] = data
+ return alnData
+
+ IPD = _makePulseFeatureAccessor("IPD")
+ PulseWidth = _makePulseFeatureAccessor("PulseWidth")
+ #QualityValue = _makePulseFeatureAccessor("QualityValue")
+ InsertionQV = _makePulseFeatureAccessor("InsertionQV")
+ DeletionQV = _makePulseFeatureAccessor("DeletionQV")
+ DeletionTag = _makePulseFeatureAccessor("DeletionTag")
+ MergeQV = _makePulseFeatureAccessor("MergeQV")
+ SubstitutionQV = _makePulseFeatureAccessor("SubstitutionQV")
+
+ def read(self, aligned=True, orientation="native"):
+     """
+     Return the read bases of this alignment as a string.
+
+     With `aligned` true, gap characters are inserted so the result
+     lines up with the alignment columns.  `orientation` is "native"
+     or "genomic"; anything else raises ValueError.  Asking an unmapped
+     record for aligned or genomic-oriented bases raises
+     UnavailableFeature.
+     """
+     if orientation not in ("native", "genomic"):
+         raise ValueError("Bad `orientation` value")
+     if self.isUnmapped and (orientation != "native" or aligned == True):
+         raise UnavailableFeature(
+             "Cannot get genome oriented/aligned features from unmapped BAM record")
+     bases = np.fromstring(self.peer.seq, dtype=np.int8)
+     begin = self.aStart - self.qStart
+     end = self.aEnd - self.qStart
+     total = self.qLen
+     # clip
+     assert total == len(bases) and begin >= 0 and end <= total
+     if self.isForwardStrand:
+         window = bases[begin:end]
+     else:
+         # The window is mirrored within the stored sequence for a
+         # reverse-strand record
+         window = bases[(total - end):(total - begin)]
+     # orient
+     if self.isReverseStrand and orientation == "native":
+         window = reverseComplementAscii(window)
+     # gapify
+     if aligned:
+         window = self._gapifyRead(window, orientation)
+     return window.tostring()
+
+ def __repr__(self):
+     """
+     One-line summary: the query name alone for an unmapped record,
+     otherwise query name, strand, reference id and reference interval.
+     """
+     if self.isUnmapped:
+         return "Unmapped BAM record: " + self.queryName
+     strand = "+" if self.isForwardStrand else "-"
+     return "BAM alignment: %s @ %s %3d %9d %9d" % (
+         self.queryName, strand, self.referenceId, self.tStart, self.tEnd)
+
+ # Multi-line rendering of the alignment.  When the reader has a reference
+ # loaded, the summary from repr() is followed by the read and reference
+ # names, the read length, and the alignment printed in chunks of COLUMNS
+ # columns (last digit of each reference position, aligned reference,
+ # "exonerate+" transcript, aligned read).  Otherwise this is just repr().
+ # NOTE(review): `readName` and `readLength` are not defined in this class;
+ # presumably supplied by AlignmentRecordMixin or the pbi index -- confirm.
+ def __str__(self):
+ if self.bam.isReferenceLoaded:
+ COLUMNS = 80
+ val = ""
+ val += repr(self) + "\n\n"
+ val += "Read: " + self.readName + "\n"
+ val += "Reference: " + self.referenceName + "\n\n"
+ val += "Read length: " + str(self.readLength) + "\n"
+ #val += "Identity: " + "%0.3f" % self.identity + "\n"
+
+ alignedRead = self.read()
+ alignedRef = self.reference()
+ transcript = self.transcript(style="exonerate+")
+ refPos = self.referencePositions()
+ # Only the last digit of each position is shown, one per column
+ refPosString = "".join([str(pos % 10) for pos in refPos])
+ for i in xrange(0, len(alignedRef), COLUMNS):
+ val += "\n"
+ val += " " + refPosString[i:i+COLUMNS] + "\n"
+ val += " " + alignedRef [i:i+COLUMNS] + "\n"
+ val += " " + transcript [i:i+COLUMNS] + "\n"
+ val += " " + alignedRead [i:i+COLUMNS] + "\n"
+ val += "\n"
+ return val
+ else:
+ return repr(self)
+
+ # Order alignments by (referenceId, tStart, tEnd).  `__cmp__` and `cmp`
+ # exist only in Python 2.
+ def __cmp__(self, other):
+ return cmp((self.referenceId, self.tStart, self.tEnd),
+ (other.referenceId, other.tStart, other.tEnd))
+
+ @requiresPbi
+ def __getattr__(self, key):
+     """
+     Fallback attribute lookup: resolve `key` as a column of the
+     reader's pbi index, evaluated at this record's row.
+
+     Raises AttributeError when `key` is not a pbi column, and
+     UnavailableFeature (via `requiresPbi`) when the record has no row
+     number.
+     """
+     if key not in self.bam.pbi.columnNames:
+         raise AttributeError("no such column in pbi index")
+     return self.bam.pbi[self.rowNumber][key]
+
+ def __dir__(self):
+     """
+     List attribute names for `dir()` and tab completion: the names
+     defined on the class and the instance, plus the pbi index column
+     names when the reader has a pbi index.
+
+     Always returns a list, so `dir()` also works on records from a
+     reader without a pbi index.
+     """
+     names = set(dir(type(self)))
+     names.update(self.__dict__)
+     # Go through the instance dict / getattr defaults so that a missing
+     # `bam` or `pbi` never falls into __getattr__
+     pbi = getattr(self.__dict__.get("bam"), "pbi", None)
+     if pbi is not None:
+         names.update(pbi.columnNames)
+     return sorted(names)
+
+ class ClippedBamAlignment(BamAlignment):
+     """
+     A sub-alignment of a mapped `BamAlignment`: it shares the parent's
+     pysam record, reader and row number, but carries its own reference
+     and read intervals and its own unrolled cigar.
+     """
+     def __init__(self, aln, tStart, tEnd, aStart, aEnd, unrolledCigar):
+         # Self-consistency checks
+         assert aln.isMapped
+         assert tStart <= tEnd
+         assert aStart <= aEnd
+         nonDeletionColumns = sum(unrolledCigar != BAM_CDEL)
+         assert nonDeletionColumns == (aEnd - aStart)
+
+         # Shared with the parent alignment
+         self.peer, self.bam, self.rowNumber = aln.peer, aln.bam, aln.rowNumber
+         # Clipped coordinates
+         self.tStart, self.tEnd = tStart, tEnd
+         self.aStart, self.aEnd = aStart, aEnd
+         self._unrolledCigar = unrolledCigar  # genomic orientation
+
+     def unrolledCigar(self, orientation="native"):
+         # The stored cigar is genomic; flip it for the native view of a
+         # reverse-strand record
+         flip = (orientation == "native") and self.isReverseStrand
+         return self._unrolledCigar[::-1] if flip else self._unrolledCigar
diff --git a/pbcore/io/align/BamIO.py b/pbcore/io/align/BamIO.py
new file mode 100644
index 0000000..01de9ce
--- /dev/null
+++ b/pbcore/io/align/BamIO.py
@@ -0,0 +1,394 @@
+#################################################################################
+# Copyright (c) 2011-2015, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#################################################################################
+
+# Author: David Alexander
+
+__all__ = [ "BamReader", "IndexedBamReader" ]
+
+from pysam import Samfile
+from pbcore.io import FastaTable
+from pbcore.chemistry import decodeTriple, ChemistryLookupError
+
+import numpy as np
+from itertools import groupby
+from functools import wraps
+from os.path import abspath, expanduser, exists
+
+from ..base import ReaderBase
+from .PacBioBamIndex import PacBioBamIndex
+from .BamAlignment import *
+from ._BamSupport import *
+from ._AlignmentMixin import AlignmentReaderMixin, IndexedAlignmentReaderMixin
+
+
+ def requiresBai(method):
+     """
+     Decorator for reader methods that need the standard BAM index.
+
+     The wrapped method raises UnavailableFeature unless the reader's
+     pysam handle reports that it has an index.
+     """
+     @wraps(method)
+     def guarded(bamReader, *args, **kwargs):
+         if bamReader.peer._hasIndex():
+             return method(bamReader, *args, **kwargs)
+         raise UnavailableFeature("this feature requires an standard BAM index file (bam.bai)")
+     return guarded
+
+
+class _BamReaderBase(ReaderBase):
+ """
+ The BamReader class provides a high-level interface to PacBio BAM
+ files. If a PacBio BAM index (bam.pbi file) is present and the
+ user instantiates the BamReader using the reference FASTA as the
+ second argument, the BamReader will provide an interface
+ compatible with CmpH5Reader.
+ """
+ # Build the reference table from the "SQ" records of the BAM header.
+ # Sets `self._referenceInfoTable` (a recarray with one row per reference)
+ # and `self._referenceDict` (maps both the reference id and the reference
+ # name to its row); both are None when the header lists no references.
+ def _loadReferenceInfo(self):
+ refRecords = self.peer.header["SQ"]
+ refNames = [r["SN"] for r in refRecords]
+ refLengths = [r["LN"] for r in refRecords]
+ # Every SQ record must carry an "M5" entry, else this raises KeyError
+ refMD5s = [r["M5"] for r in refRecords]
+ refIds = map(self.peer.gettid, refNames)
+ nRefs = len(refRecords)
+
+ if nRefs > 0:
+ # ID and RefInfoID both hold the reference id, Name and FullName both
+ # hold the SN value; StartRow and EndRow are filled with zeros.
+ self._referenceInfoTable = np.rec.fromrecords(zip(
+ refIds,
+ refIds,
+ refNames,
+ refNames,
+ refLengths,
+ refMD5s,
+ np.zeros(nRefs, dtype=np.uint32),
+ np.zeros(nRefs, dtype=np.uint32)),
+ dtype=[('ID', '<i8'), ('RefInfoID', '<i8'),
+ ('Name', 'O'), ('FullName', 'O'),
+ ('Length', '<i8'), ('MD5', 'O'),
+ ('StartRow', '<u4'), ('EndRow', '<u4')])
+ self._referenceDict = {}
+ self._referenceDict.update(zip(refIds, self._referenceInfoTable))
+ self._referenceDict.update(zip(refNames, self._referenceInfoTable))
+ else:
+ self._referenceInfoTable = None
+ self._referenceDict = None
+
+ # Build the read group table from the "RG" records of the BAM header.
+ # Sets `self._readGroupTable` (recarray), `self._readGroupDict` (read
+ # group ID -> row) and `self._pulseFeaturesAvailable` (the
+ # PULSE_FEATURE_TAGS names that appear as keys in the DS field of every
+ # read group).
+ def _loadReadGroupInfo(self):
+ rgs = self.peer.header["RG"]
+ readGroupTable_ = []
+ pulseFeaturesInAll_ = frozenset(PULSE_FEATURE_TAGS.keys())
+ for rg in rgs:
+ rgID = rgAsInt(rg["ID"])
+ rgName = rg["PU"]
+ # The DS field is a ";"-separated list of KEY=VALUE pairs
+ ds = dict([pair.split("=") for pair in rg["DS"].split(";") if pair != ""])
+ # spec: we only consider first two components of basecaller version
+ # in "chem" lookup
+ basecallerVersion = ".".join(ds["BASECALLERVERSION"].split(".")[0:2])
+ triple = ds["BINDINGKIT"], ds["SEQUENCINGKIT"], basecallerVersion
+ rgChem = decodeTriple(*triple)
+ rgReadType = ds["READTYPE"]
+ readGroupTable_.append((rgID, rgName, rgReadType, rgChem))
+ pulseFeaturesInAll_ = pulseFeaturesInAll_.intersection(ds.keys())
+
+ self._readGroupTable = np.rec.fromrecords(
+ readGroupTable_,
+ dtype=[("ID" , np.int32),
+ ("MovieName" , "O"),
+ ("ReadType" , "O"),
+ ("SequencingChemistry", "O")])
+ # The integer IDs produced by rgAsInt must not collide
+ assert len(set(self._readGroupTable.ID)) == len(self._readGroupTable), \
+ "First 8 chars of read group IDs must be unique!"
+
+ self._readGroupDict = { rg.ID : rg
+ for rg in self._readGroupTable }
+
+ self._pulseFeaturesAvailable = pulseFeaturesInAll_
+
+
+ def _loadProgramInfo(self):
+     """
+     Build `self._programTable` from the "PG" records of the BAM header:
+     one row per program with its ID, version ("VN") and command line
+     ("CL"), the latter two being None when absent.  The table is None
+     when the header has no "PG" records.
+     """
+     programs = self.peer.header.get("PG", [])
+     pgRecords = [ (pg["ID"], pg.get("VN", None), pg.get("CL", None))
+                   for pg in programs ]
+
+     if not pgRecords:
+         self._programTable = None
+         return
+
+     columns = [("ID"         , "O"),
+                ("Version"    , "O"),
+                ("CommandLine", "O")]
+     self._programTable = np.rec.fromrecords(pgRecords, dtype=columns)
+
+ def _loadReferenceFasta(self, referenceFastaFname):
+     """
+     Open `referenceFastaFname` as a FastaTable and attach it as
+     `self.referenceFasta`.
+
+     Raises ReferenceMismatch unless every (name, length) pair in the
+     BAM's reference table is also present in the FASTA.
+     """
+     ft = FastaTable(referenceFastaFname)
+     # Verify that this FASTA is in agreement with the BAM's
+     # reference table---BAM should be a subset.
+     fastaContigs = set((c.id, len(c)) for c in ft)
+     bamContigs = set((c.Name, c.Length) for c in self.referenceInfoTable)
+     if not (bamContigs <= fastaContigs):
+         raise ReferenceMismatch("FASTA file must contain superset of reference contigs in BAM")
+     self.referenceFasta = ft
+
+ def _checkFileCompatibility(self):
+     """
+     Verify that this is a "pacbio" BAM file of version at least 3.0b3.
+
+     The check made here is only that the `version` property can be
+     read from the header; the version value itself is not compared.
+     Raises IncompatibleFile when it cannot be read.
+     """
+     try:
+         _ = self.version
+     except Exception:
+         # `except Exception` rather than a bare `except`, so that
+         # KeyboardInterrupt and SystemExit are not converted into
+         # IncompatibleFile.
+         raise IncompatibleFile(
+             "This BAM file is incompatible with this API " +
+             "(only PacBio BAM files version >= 3.0b3 are supported)")
+
+ def __init__(self, fname, referenceFastaFname=None):
+     """
+     Open the BAM file `fname` and load its reference, read group and
+     program tables from the header.
+
+     If `referenceFastaFname` is given, the reference FASTA is loaded
+     too; passing one for a BAM without references raises ValueError.
+     """
+     fname = abspath(expanduser(fname))
+     self.filename = fname
+     self.peer = Samfile(fname, "rb", check_sq=False)
+     self._checkFileCompatibility()
+
+     self._loadReferenceInfo()
+     self._loadReadGroupInfo()
+     self._loadProgramInfo()
+
+     self.referenceFasta = None
+     if referenceFastaFname is None:
+         return
+     if self.isUnmapped:
+         raise ValueError("Unmapped BAM file--reference FASTA should not be given as argument to BamReader")
+     self._loadReferenceFasta(referenceFastaFname)
+
+ # NOTE(review): `index` is not assigned anywhere in this class; presumably
+ # it is supplied by a subclass or mixin -- confirm.
+ @property
+ def isIndexLoaded(self):
+ return self.index is not None
+
+ # True once a reference FASTA has been attached to this reader.
+ @property
+ def isReferenceLoaded(self):
+ return self.referenceFasta is not None
+
+ # Logical negation of `isMapped`.
+ @property
+ def isUnmapped(self):
+ return not(self.isMapped)
+
+ # True when the BAM header lists at least one "SQ" (reference) record.
+ @property
+ def isMapped(self):
+ return len(self.peer.header["SQ"]) > 0
+
+ # Always raises UnavailableFeature.
+ @property
+ def alignmentIndex(self):
+ raise UnavailableFeature("BAM has no alignment index")
+
+ # Set of the MovieName values found in the read group table.
+ @property
+ def movieNames(self):
+ return set([mi.MovieName for mi in self.readGroupTable])
+
+ # The read group table built by _loadReadGroupInfo.
+ @property
+ def readGroupTable(self):
+ return self._readGroupTable
+
+ # Row of the read group table for `readGroupId`; KeyError if unknown.
+ def readGroupInfo(self, readGroupId):
+ return self._readGroupDict[readGroupId]
+
+ @property
+ def sequencingChemistry(self):
+ """
+ List of the sequencing chemistries by movie. Order is
+ unspecified.
+ """
+ return list(self.readGroupTable.SequencingChemistry)
+
+ # The reference table built by _loadReferenceInfo (None when the header
+ # lists no references).
+ @property
+ def referenceInfoTable(self):
+ return self._referenceInfoTable
+
+ #TODO: standard? how about subread instead? why capitalize ccs?
+ # can we standardize this? is cDNA an additional possibility
+ @property
+ def readType(self):
+     """
+     Either "standard", "CCS", "mixed", or "unknown", to represent the
+     type of PacBio reads aligned in this BAM file.
+     """
+     readTypes = self.readGroupTable.ReadType
+     # Per-read-group flags
+     isSubread = (readTypes == "SUBREAD")
+     isCcs = (readTypes == "CCS")
+     if all(isSubread):
+         return "standard"
+     if all(isCcs):
+         return "CCS"
+     if all(isCcs | isSubread):
+         return "mixed"
+     return "unknown"
+
+ # Value of the "pb" entry of the header's "HD" record.
+ @property
+ def version(self):
+ return self.peer.header["HD"]["pb"]
+
+ # Not implemented: always raises Unimplemented.
+ def versionAtLeast(self, minimalVersion):
+ raise Unimplemented()
+
+ # Not implemented: always raises Unimplemented.
+ def softwareVersion(self, programName):
+ raise Unimplemented()
+
+ # True when the header's "HD" record declares sort order "coordinate".
+ @property
+ def isSorted(self):
+ return self.peer.header["HD"]["SO"] == "coordinate"
+
+ # Not implemented: always raises Unimplemented.
+ @property
+ def isBarcoded(self):
+ raise Unimplemented()
+
+ # True when len(self) is zero.
+ @property
+ def isEmpty(self):
+ return (len(self) == 0)
+
+ # Row of the reference table for `key`, which may be a reference id or a
+ # reference name.
+ def referenceInfo(self, key):
+ return self._referenceDict[key]
+
+ # Seek the pysam handle to `offset` and wrap the next record.
+ def atOffset(self, offset):
+ self.peer.seek(offset)
+ return BamAlignment(self, next(self.peer))
+
+ # Whether `featureName` is available in every read group of this file.
+ def hasPulseFeature(self, featureName):
+ return featureName in self._pulseFeaturesAvailable
+
+ # The set of pulse feature names available in every read group.
+ def pulseFeaturesAvailable(self):
+ return self._pulseFeaturesAvailable
+
+ # Not implemented: always raises Unimplemented.
+ @property
+ def barcode(self):
+ raise Unimplemented()
+
+ # Not implemented: always raises Unimplemented.
+ @property
+ def barcodeName(self):
+ raise Unimplemented()
+
+ # Not implemented: always raises Unimplemented.
+ @property
+ def barcodes(self):
+ raise Unimplemented()
+
+ # Total record count (mapped + unmapped) as reported by pysam; needs the
+ # standard BAM index (see requiresBai).
+ @requiresBai
+ def __len__(self):
+ return self.peer.mapped + self.peer.unmapped
+
+ def close(self):
+     """
+     Release the underlying pysam file handle (`self.peer`).
+
+     Also closes and clears a `file` attribute when one is present, as
+     the previous implementation did.  Does nothing for a handle that
+     was never opened (for instance when the constructor failed early).
+     """
+     # The BAM is opened as `self.peer` in __init__; this is the handle
+     # that actually needs closing.
+     if hasattr(self, "peer") and self.peer is not None:
+         self.peer.close()
+     if hasattr(self, "file") and self.file is not None:
+         self.file.close()
+         self.file = None
+
+ # Context-manager support: `with` yields the reader itself and calls
+ # close() on exit.  __exit__ returns None, so exceptions raised inside
+ # the `with` body propagate.
+ def __enter__(self):
+ return self
+
+ def __exit__(self, exc_type, exc_value, traceback):
+ self.close()
+
+
+ class BamReader(_BamReaderBase, AlignmentReaderMixin):
+     """
+     Reader for a BAM with a bam.bai (SAMtools) index, but not a
+     bam.pbi (PacBio) index.  Supports basic BAM operations.
+     """
+     def __init__(self, fname, referenceFastaFname=None):
+         super(BamReader, self).__init__(fname, referenceFastaFname)
+
+     def __iter__(self):
+         # Rewind the pysam handle, then wrap each record in turn
+         self.peer.reset()
+         for record in self.peer:
+             yield BamAlignment(self, record)
+
+     # TODO: cmp.h5 readsInRange only accepts int key, not string.
+     # that's just lame, fix it.
+     def readsInRange(self, winId, winStart, winEnd, justIndices=False):
+         """
+         Return a generator over the alignments pysam fetches for the
+         window (`winId`, `winStart`, `winEnd`).  `winId` may be a
+         reference name or a reference id.  `justIndices=True` is not
+         supported and raises UnavailableFeature.
+         """
+         # PYSAM BUG: fetch doesn't work if arg 1 is tid and not rname
+         if not isinstance(winId, str):
+             winId = self.peer.getrname(winId)
+         if justIndices == True:
+             raise UnavailableFeature("BAM is not random-access")
+         hits = self.peer.fetch(winId, winStart, winEnd, multiple_iterators=False)
+         return ( BamAlignment(self, hit) for hit in hits )
+
+     def __getitem__(self, rowNumbers):
+         raise UnavailableFeature("Use IndexedBamReader to get row-number based slicing.")
+
+
+
+ class IndexedBamReader(_BamReaderBase, IndexedAlignmentReaderMixin):
+     """
+     A `IndexedBamReader` is a BAM reader class that uses the
+     ``bam.pbi`` (PacBio BAM index) file to enable random access by
+     "row number" and to provide access to precomputed semantic
+     information about the BAM records
+
+     Columns of the pbi index are also exposed as attributes of the
+     reader (see `__getattr__`).
+     """
+     def __init__(self, fname, referenceFastaFname=None):
+         """
+         Open `fname` together with its companion index `fname + ".pbi"`.
+         Raises IOError when the index file does not exist.
+         """
+         super(IndexedBamReader, self).__init__(fname, referenceFastaFname)
+         self.pbi = None
+         pbiFname = self.filename + ".pbi"
+         if not exists(pbiFname):
+             raise IOError("IndexedBamReader requires bam.pbi index file")
+         self.pbi = PacBioBamIndex(pbiFname)
+
+     def atRowNumber(self, rn):
+         """Return the alignment stored at row `rn` of the pbi index."""
+         offset = self.pbi.virtualFileOffset[rn]
+         self.peer.seek(offset)
+         return BamAlignment(self, next(self.peer), rn)
+
+     def readsInRange(self, winId, winStart, winEnd, justIndices=False):
+         """
+         Query the pbi index for the window (`winId`, `winStart`,
+         `winEnd`); `winId` may be a reference name or a reference id.
+         Returns the matching row numbers when `justIndices` is true,
+         otherwise the corresponding alignments.
+         """
+         if isinstance(winId, str):
+             winId = self.referenceInfo(winId).ID
+         ix = self.pbi.rangeQuery(winId, winStart, winEnd)
+         if justIndices:
+             return ix
+         return self[ix]
+
+     def __iter__(self):
+         for rn in xrange(len(self.pbi)):
+             yield self.atRowNumber(rn)
+
+     def __len__(self):
+         return len(self.pbi)
+
+     def __getitem__(self, rowNumbers):
+         """
+         Row-number based access.  Accepts a single integer (one
+         alignment), a slice, or a list / numpy array of integers or of
+         booleans used as a mask (a list of alignments).  Anything else
+         raises TypeError.
+         """
+         if isinstance(rowNumbers, (int, np.integer)):
+             return self.atRowNumber(rowNumbers)
+         elif isinstance(rowNumbers, slice):
+             return [ self.atRowNumber(r)
+                      for r in xrange(*rowNumbers.indices(len(self))) ]
+         elif isinstance(rowNumbers, (list, np.ndarray)):
+             if len(rowNumbers) == 0:
+                 return []
+             # The first entry decides how the whole sequence is read
+             entryType = type(rowNumbers[0])
+             if entryType == int or issubclass(entryType, np.integer):
+                 return [ self.atRowNumber(r) for r in rowNumbers ]
+             elif entryType == bool or issubclass(entryType, np.bool_):
+                 return [ self.atRowNumber(r) for r in np.flatnonzero(rowNumbers) ]
+         raise TypeError("Invalid type for IndexedBamReader slicing")
+
+     def __getattr__(self, key):
+         """
+         Fallback attribute lookup: expose pbi index columns as
+         attributes of the reader.  Raises AttributeError for any other
+         name.
+         """
+         # Read `pbi` from the instance dict: going through `self.pbi`
+         # would re-enter __getattr__ without end whenever `pbi` has not
+         # been assigned yet (lookups during base-class construction,
+         # copying, unpickling).
+         pbi = self.__dict__.get("pbi")
+         if pbi is not None and key in pbi.columnNames:
+             return pbi[key]
+         raise AttributeError("no such column in pbi index")
+
+     def __dir__(self):
+         return self.pbi.columnNames
diff --git a/pbcore/io/align/BlasrIO.py b/pbcore/io/align/BlasrIO.py
new file mode 100644
index 0000000..0d80df0
--- /dev/null
+++ b/pbcore/io/align/BlasrIO.py
@@ -0,0 +1,116 @@
+#################################################################################
+# Copyright (c) 2011-2015, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#################################################################################
+
+# Author: David Alexander
+
+from pbcore.io.base import ReaderBase
+
+__all__ = [ "M4Record",
+ "M4Reader",
+ "M5Record",
+ "M5Reader" ]
+
+class MalformattedRecord(Exception): pass
+
+class M4Record(object):
+ """
+ Record for alignment summary record output from BLASR -m 4 option
+ """
+ @classmethod
+ def fromString(cls, s):
+ obj = cls()
+ try:
+ columns = s.strip().split()
+ obj.qName = columns[0]
+ obj.tName = columns[1]
+ obj.score = int(columns[2])
+ obj.percentSimilarity = float(columns[3])
+ obj.qStrand = int(columns[4])
+ obj.qStart = int(columns[5])
+ obj.qEnd = int(columns[6])
+ obj.qLength = int(columns[7])
+ obj.tStrand = int(columns[8])
+ obj.tStart = int(columns[9])
+ obj.tEnd = int(columns[10])
+ obj.tLength = int(columns[11])
+ obj.mapQV = int(columns[12])
+ return obj
+ except:
+ raise MalformattedRecord(s)
+
+class M4Reader(ReaderBase):
+ """
+ Reader for -m 4 formatted alignment summary information from BLASR
+ """
+ def __iter__(self):
+ for line in self.file:
+ yield M4Record.fromString(line)
+
+
+
+class M5Record(object):
+ """
+ Record for alignment summary record output from BLASR -m 5 option
+ """
+ @classmethod
+ def fromString(cls, s):
+ obj = cls()
+ try:
+ columns = s.strip().split()
+ obj.qName = columns[0]
+ obj.qLength = int(columns[1])
+ obj.qStart = int(columns[2])
+ obj.qEnd = int(columns[3])
+ obj.qStrand = columns[4]
+ obj.tName = columns[5]
+ obj.tLength = int(columns[6])
+ obj.tStart = int(columns[7])
+ obj.tEnd = int(columns[8])
+ obj.tStrand = columns[9]
+ obj.score = float(columns[10])
+ obj.numMatch = int(columns[11])
+ obj.numMismatch = int(columns[12])
+ obj.numIns = int(columns[13])
+ obj.numDel = int(columns[14])
+ obj.mapQV = int(columns[15])
+ obj.qAlignedSeq = columns[16]
+ obj.matchPattern = columns[17]
+ obj.tAlignedSeq = columns[18]
+ return obj
+ except:
+ raise MalformattedRecord(s)
+
+class M5Reader(ReaderBase):
+ """
+ Reader for -m 5 formatted alignment summary information from BLASR
+ """
+ def __iter__(self):
+ for line in self.file:
+ yield M5Record.fromString(line)
diff --git a/pbcore/io/align/CmpH5IO.py b/pbcore/io/align/CmpH5IO.py
new file mode 100755
index 0000000..b94126d
--- /dev/null
+++ b/pbcore/io/align/CmpH5IO.py
@@ -0,0 +1,1277 @@
+#################################################################################
+# Copyright (c) 2011-2015, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#################################################################################
+
+# Author: David Alexander
+__all__ = [ "CmpH5Reader",
+ "CmpH5Alignment",
+ "EmptyCmpH5Error" ]
+
+import h5py, numpy as np, warnings
+from bisect import bisect_left, bisect_right
+from collections import Counter, OrderedDict
+from itertools import groupby
+from os.path import abspath, expanduser
+from pbcore.io.rangeQueries import makeReadLocator
+from pbcore.io._utils import rec_join, arrayFromDataset
+from pbcore.io.FastaIO import splitFastaHeader
+from pbcore.io.base import ReaderBase
+from pbcore.chemistry import decodeTriple, ChemistryLookupError
+from pbcore.util.decorators import deprecated
+
+from ._AlignmentMixin import AlignmentRecordMixin, IndexedAlignmentReaderMixin
+
+# ========================================
+# Data manipulation routines.
+#
+GAP = 0b0000
+
+_basemap = { 0b0000 : ord("-"),
+ 0b0001 : ord("A"),
+ 0b0010 : ord("C"),
+ 0b0100 : ord("G"),
+ 0b1000 : ord("T"),
+ 0b1111 : ord("N") }
+
+_cBasemap = { 0b0000 : ord("-"),
+ 0b0001 : ord("T"),
+ 0b0010 : ord("G"),
+ 0b0100 : ord("C"),
+ 0b1000 : ord("A"),
+ 0b1111 : ord("N") }
+
+_basemapArray = np.ndarray(shape=(max(_basemap.keys()) + 1,), dtype=np.byte)
+_cBasemapArray = np.ndarray(shape=(max(_basemap.keys()) + 1,), dtype=np.byte)
+
+for (e, v) in _basemap.iteritems():
+ _basemapArray[e] = v
+for (e, v) in _cBasemap.iteritems():
+ _cBasemapArray[e] = v
+
+_baseEncodingToInt = np.array([-1]*16)
+_baseEncodingToInt[0b0000] = 0
+_baseEncodingToInt[0b0001] = 1
+_baseEncodingToInt[0b0010] = 2
+_baseEncodingToInt[0b0100] = 3
+_baseEncodingToInt[0b1000] = 4
+_baseEncodingToInt[0b1111] = 5 # 'N' base
+
+# These are 2D tables indexed by (readInt, refInt)
+# 'N' base is never considered a mismatch.
+_gusfieldTranscriptTable = \
+ np.fromstring("ZDDDDDZ"
+ "IMRRRMZ"
+ "IRMRRMZ"
+ "IRRMRMZ"
+ "IRRRMMZ"
+ "IMMMMMZ"
+ "ZZZZZZZ", dtype=np.uint8).reshape(7, 7)
+_cigarTranscriptTable = \
+ np.fromstring("ZDDDDDZ"
+ "IMMMMMZ"
+ "IMMMMMZ"
+ "IMMMMMZ"
+ "IMMMMMZ"
+ "IMMMMMZ"
+ "ZZZZZZZ", dtype=np.uint8).reshape(7, 7)
+_exonerateTranscriptTable = \
+ np.fromstring("Z Z"
+ " | |Z"
+ " | |Z"
+ " | |Z"
+ " ||Z"
+ " |||||Z"
+ "ZZZZZZZ", dtype=np.uint8).reshape(7, 7)
+_exoneratePlusTranscriptTable = \
+ np.fromstring("Z Z"
+ " |***|Z"
+ " *|**|Z"
+ " **|*|Z"
+ " ***||Z"
+ " |||||Z"
+ "ZZZZZZZ", dtype=np.uint8).reshape(7, 7)
+
+class EmptyCmpH5Error(Exception):
+ """An exception raised when CmpH5Reader tries to read from a
+ cmp.h5 with no alignments.
+ """
+ pass
+
+def readFromAlignmentArray(a, gapped=True, complement=False):
+ """
+ Decode the read component of an alignment array.
+ """
+ if complement:
+ r = _cBasemapArray[a >> 4]
+ else:
+ r = _basemapArray[a >> 4]
+ if not gapped:
+ r = r[r != ord("-")]
+ return r.tostring()
+
+def referenceFromAlignmentArray(a, gapped=True, complement=False):
+ """
+ Decode the reference component of an alignment array.
+ """
+ if complement:
+ r = _cBasemapArray[a & 0b1111]
+ else:
+ r = _basemapArray[a & 0b1111]
+ if not gapped:
+ r = r[r != ord("-")]
+ return r.tostring()
+
+def ungappedPulseArray(a):
+ """
+ Return a pulse array with encoded gaps removed.
+ """
+ dtype = a.dtype
+ if dtype == np.float32:
+ return a[~np.isnan(a)]
+ elif dtype == np.uint8:
+ return a[a != np.uint8(-1)]
+ elif dtype == np.uint16:
+ return a[a != np.uint16(-1)]
+ elif dtype == np.uint32:
+ return a[a != np.uint32(-1)]
+ elif dtype == np.int8:
+ return a[a != ord("-")]
+ else:
+ raise Exception, "Invalid pulse array type"
+
+
+
+# ========================================
+# Alignment record type
+#
+
+ALIGNMENT_INDEX_COLUMNS = ["AlnID", "AlnGroupID", "MovieID", "RefGroupID",
+ "tStart", "tEnd", "RCRefStrand", "HoleNumber",
+ "SetNumber", "StrobeNumber", "MoleculeID",
+ "rStart", "rEnd", "MapQV", "nM", "nMM", "nIns",
+ "nDel", "Offset_begin", "Offset_end", "nBackRead",
+ "nReadOverlap"]
+
+ALIGNMENT_INDEX_DTYPE = [(COLUMN_NAME, np.uint32)
+ for COLUMN_NAME in ALIGNMENT_INDEX_COLUMNS]
+
+
+OFFSET_TABLE_DTYPE = [ ("ID", np.uint32),
+ ("StartRow", np.uint32),
+ ("EndRow", np.uint32) ]
+
+
+def _makePulseFeatureAccessor(featureName):
+ def f(self, aligned=True, orientation="native"):
+ return self.pulseFeature(featureName, aligned, orientation)
+ return f
+
+class CmpH5Alignment(AlignmentRecordMixin):
+ """
+ A lightweight class representing a single alignment record in a
+ CmpH5 file, providing access to all columns of a single row of the
+ alignment index, and on-demand access to the corresponding
+ sequence and pulse features.
+
+ `CmpH5Alignment` objects are obtained by slicing a
+ `CmpH5Reader` object:
+
+ .. doctest::
+
+ >>> c[26]
+ CmpH5 alignment: + 1 7441 7699
+
+ >>> print c[26]
+ CmpH5 alignment: + 1 7441 7699
+ <BLANKLINE>
+ Read: m110818_075520_42141_c100129202555500000315043109121112_s2_p0/1009/44_322
+ Reference: lambda_NEB3011
+ <BLANKLINE>
+ Read length: 278
+ Concordance: 0.871
+ <BLANKLINE>
+ 12345678901234567890123456789001223456789012345678890112345678990112344567890011
+ AACTGGTCACGGTCGTGGCACTGGTGAAG-CT-GCATACTGATGCACTT-CAC-GCCACGCG-GG-ATG-AACCTG-T-G
+ ||||||| |||| ||||||||| |||| || ||||||||| ||||| ||| |||||||| || ||| |||||| | |
+ AACTGGT--CGGT--TGGCACTGG-GAAGCCTTGCATACTGA-GCACT-CCACGGCCACGCGGGGAATGAAACCTGGTGG
+ <BLANKLINE>
+ <BLANKLINE>
+ 23456789012345678900123456678901234456789012345678901234566789011234556789012345
+ GCATTTGTGCTGCCGGGA-ACGGCG-TTTCGTGT-CTCTGCCGGTGTGGCAGCCGAA-ATGAC-AGAG-CGCGGCCTGGC
+ |||||||||||||||||| |||||| |||||| | |||||||||||||||||||||| ||||| |||| |||||||||||
+ GCATTTGTGCTGCCGGGAAACGGCGTTTTCGT-TCCTCTGCCGGTGTGGCAGCCGAAAATGACAAGAGCCGCGGCCTGGC
+ <BLANKLINE>
+ <BLANKLINE>
+ 67899012345677890123456789901123456789901233456789012345678901234556778901223455
+ CAG-AATGCAAT-AACGGGAGGCGC-TG-TGGCTGAT-TTCG-ATAACCTGTTCGATGCTGCCAT-TG-CCCGC-GCC-G
+ ||| |||||||| |||||||||||| || |||||||| |||| |||||||||||||||||||||| || ||||| ||| |
+ CAGAAATGCAATAAACGGGAGGCGCTTGCTGGCTGATTTTCGAATAACCTGTTCGATGCTGCCATTTGACCCGCGGCCGG
+ <BLANKLINE>
+ <BLANKLINE>
+ 6678901234567889012345667890123456789012345678
+ -ATGAAACGATAC-GCGGGTAC-ATGGGAACGTCAGCCACCATTAC
+ |||||||||||| |||||||| |||||||||||||||||||||||
+ AATGAAACGATACGGCGGGTACAATGGGAACGTCAGCCACCATTAC
+ <BLANKLINE>
+ <BLANKLINE>
+
+ The `orientation` argument to some data access methods determines
+ how reverse-strand alignments are returned to the user. `.cmp.h5`
+ files natively encode reverse strand reads in read-order,
+ uncomplemented, with the *reference* reverse-complemented. Most
+ analysis applications will want to use the data in this order,
+ which we term the *NATIVE* orientation.
+
+ Some applications that involve collating or displaying the reads
+ aligned to the reference genome want the reference to be presented
+ in its genomic order, and the *read* to be reverse-complemented.
+ We term this *GENOMIC* orientation.
+
+ Methods prefixed with *aligned* return data (bases or features)
+ that include gaps, which are encoded according to the data type.
+ Methods not prefixed with *aligned* excise the gaps.
+
+ Sequence and pulse features are not cached.
+ """
+ __slots__ = ["cmpH5", "rowNumber"]
+
+ def __init__(self, cmph5, rowNumber):
+ self.cmpH5 = cmph5
+ self.rowNumber = rowNumber
+
+ @property
+ def reader(self):
+ return self.cmpH5
+
+ def clippedTo(self, refStart, refEnd):
+ """
+ Return a new `CmpH5Alignment` that refers to a subalignment of
+ this alignment, as induced by clipping to reference
+ coordinates `refStart` to `refEnd`.
+
+ .. warning::
+ This function takes time linear in the length of the alignment.
+ """
+ if (refStart >= refEnd or
+ refStart >= self.tEnd or
+ refEnd <= self.tStart):
+ raise IndexError, "Clipping query does not overlap alignment"
+ else:
+ return ClippedCmpH5Alignment(self, refStart, refEnd)
+
+ @property
+ def _alignmentGroup(self):
+ return self.cmpH5._alignmentGroup(self.AlnGroupID)
+
+ @property
+ def referenceInfo(self):
+ return self.cmpH5.referenceInfo(self.RefGroupID)
+
+ @property
+ def referenceName(self):
+ return self.referenceInfo.FullName
+
+ @property
+ def ReadGroupID(self):
+ return np.int32(self.MovieID)
+
+ @property
+ def qId(self):
+ # Forward compatibility with BAM API
+ return self.ReadGroupID
+
+ @property
+ def aStart(self):
+ # Forward compatibility with BAM API
+ return self.rStart
+
+ @property
+ def aEnd(self):
+ return self.rEnd
+
+ @property
+ def holeNumber(self):
+ # Forward compatibility with BAM API
+ return self.HoleNumber
+
+ @property
+ def mapQV(self):
+ # Forward compatibility with BAM API
+ return self.MapQV
+
+ @property
+ def readGroupInfo(self):
+ """
+ Returns the corresponding record from the `readGroupTable`.
+ """
+ # TODO: add doctest
+ return self.cmpH5.readGroupInfo(self.ReadGroupID)
+
+ @property
+ def movieInfo(self):
+ """
+ .. deprecated:: 0.9.2
+ Use :attr:`readGroupInfo`, which is compatible with BAM usage
+
+ Returns a record (extracted from the cmph5's `movieInfoTable`)
+ containing information about the movie that the read was
+ extracted from. This record should be accessed using
+ dot-notation, according to the column names documented in
+ `movieInfoTable`.
+ """
+ return self.cmpH5.movieInfo(self.MovieID)
+
+ @property
+ def movieName(self):
+ return self.cmpH5._movieInfo(self.MovieID).Name
+
+ @property
+ def isForwardStrand(self):
+ return self.RCRefStrand == 0
+
+ @property
+ def isReverseStrand(self):
+ return self.RCRefStrand == 1
+
+ @property
+ def referenceId(self):
+ return self.RefGroupID
+
+ @property
+ def identity(self):
+ """
+ Return the identity of this alignment, calculated as
+ 1 - (#mismatches + #insertions + #deletions) / read length
+
+ .. doctest::
+
+ >>> c[26].identity
+ 0.87050359712230219
+ """
+ if self.readLength == 0:
+ return 0.
+ else:
+ return 1. - float(self.nMM + self.nIns + self.nDel)/self.readLength
+
+ @property
+ def accuracy(self):
+ """
+ Return the identity of this alignment, calculated as
+ 1 - (#mismatches + #insertions + #deletions) / read length
+
+ .. deprecated:: 0.9.5
+ Use :attr:`identity`
+ """
+ return self.identity
+
+ @property
+ def similarity(self):
+ """
+ Replicates the pctsimilarity field from blasr, calculated as
+ #matches/mean(read_length, reference_span)
+ """
+ meanLength = (self.readLength + self.referenceSpan)/2.0
+
+ if meanLength == 0:
+ return 0.
+ else:
+ return float(self.nM/meanLength)
+
+ @property
+ def numPasses(self):
+ """
+ (CCS only) The number of subreads that were used to produce this CCS read.
+ """
+ return self.cmpH5.numPasses[self.rowNumber]
+
+ @property
+ def zScore(self):
+ """
+ (PacBio internal files only)
+
+ The z-score of the alignment, using a null model of a random
+ sequence alignment.
+ """
+ return self.cmpH5.zScore[self.rowNumber]
+
+ @property
+ def barcode(self):
+ """
+ The barcode ID (integer key) for this alignment's read.
+ Behavior undefined if file is not barcoded.
+ """
+ return self.cmpH5.barcodes[self.rowNumber]
+
+ @property
+ def barcodeName(self):
+ """
+ The barcode name (string) for this alignment's read.
+ Behavior undefined if file is not barcoded.
+ """
+ return self.cmpH5.barcodeName[self.barcode]
+
+ @property
+ def sequencingChemistry(self):
+ return self.cmpH5.sequencingChemistry[self.MovieID-1]
+
+ def alignmentArray(self, orientation="native"):
+ """
+ Direct access to the raw, encoded alignment array, which is a
+ packed representation of the aligned read and reference.
+ """
+ alnDs = self._alignmentGroup["AlnArray"]
+ alnArray = arrayFromDataset(alnDs, self.Offset_begin, self.Offset_end)
+ if self.RCRefStrand and (orientation == "genomic"):
+ return alnArray[::-1]
+ else:
+ return alnArray
+
+ def transcript(self, orientation="native", style="gusfield"):
+ """
+ A text representation of the alignment moves (see Gusfield).
+ This can be useful in pretty-printing an alignment.
+ """
+ if style == "exonerate+":
+ tbl = _exoneratePlusTranscriptTable
+ elif style == "exonerate":
+ tbl = _exonerateTranscriptTable
+ elif style == "cigar":
+ tbl = _cigarTranscriptTable
+ else:
+ tbl = _gusfieldTranscriptTable
+ alnArr = self.alignmentArray(orientation)
+ readBaseInts = _baseEncodingToInt[alnArr >> 4]
+ refBaseInts = _baseEncodingToInt[alnArr & 0b1111]
+ return tbl[readBaseInts, refBaseInts].tostring()
+
+ def read(self, aligned=True, orientation="native"):
+ """
+ Return the read portion of the alignment as a string.
+
+ If `aligned` is true, the aligned representation is returned,
+ including gaps; otherwise the unaligned read basecalls are
+ returned.
+
+ If `orientation` is "native", the returned read bases are
+ presented in the order they were read by the sequencing
+ machine. If `orientation` is "genomic", the returned read
+ bases are presented in such a way as to collate with the
+ forward strand of the reference---which requires reverse
+ complementation of reverse-strand reads.
+ """
+ return readFromAlignmentArray(self.alignmentArray(orientation),
+ gapped=aligned,
+ complement=(self.RCRefStrand and
+ orientation == "genomic"))
+
+ @property
+ def readType(self):
+ return self.cmpH5.readType
+
+ def reference(self, aligned=True, orientation="native"):
+ """
+ Return the reference portion of the alignment as a string.
+
+ If `aligned` is true, the aligned representation of the
+ reference is returned, including gaps; otherwise the unaligned
+ reference bases are returned.
+
+ If `orientation` is "native", the reference is presented in
+ the order it is stored in the cmp.h5 file---for reverse-strand
+ reads, the reference is reverse-complemented. If
+ `orientation` is "genomic", the forward strand reference is returned.
+ """
+ return referenceFromAlignmentArray(self.alignmentArray(orientation),
+ gapped=aligned,
+ complement=(self.RCRefStrand and
+ orientation == "genomic"))
+
+ def referencePositions(self, orientation="native"):
+ """
+ Returns an array of reference positions such that
+ referencePositions[i] = reference position of the i'th column
+ in the alignment. Insertions are grouped with the following
+ reference base, in the specified orientation.
+
+ Length of output array = length of alignment
+ """
+ referenceNonGapMask = (self.alignmentArray(orientation) & 0b1111) != GAP
+ if self.RCRefStrand and orientation == "native":
+ return self.tEnd - 1 - np.hstack([0, np.cumsum(referenceNonGapMask[:-1])])
+ else:
+ return self.tStart + np.hstack([0, np.cumsum(referenceNonGapMask[:-1])])
+
+ def readPositions(self, orientation="native"):
+ """
+ Returns an array of read positions such that
+ readPositions[i] = read position of the i'th column
+ in the alignment. Deletions are grouped with the following
+ read base, in the specified orientation.
+
+ Length of output array = length of alignment
+ """
+ readNonGapMask = (self.alignmentArray(orientation) >> 4) != GAP
+ if self.RCRefStrand and orientation == "genomic":
+ return self.rEnd - 1 - np.hstack([0, np.cumsum(readNonGapMask[:-1])])
+ else:
+ return self.rStart + np.hstack([0, np.cumsum(readNonGapMask[:-1])])
+
+ def pulseFeature(self, featureName, aligned=True, orientation="native"):
+ """
+ Access a pulse feature by name.
+ """
+ pulseDataset = self._alignmentGroup[featureName]
+ pulseArray = arrayFromDataset(pulseDataset, self.Offset_begin, self.Offset_end)
+ if self.RCRefStrand and orientation == "genomic":
+ alignedPulseArray = pulseArray[::-1]
+ else:
+ alignedPulseArray = pulseArray
+ if aligned:
+ return alignedPulseArray
+ else:
+ return ungappedPulseArray(alignedPulseArray)
+
+ IPD = _makePulseFeatureAccessor("IPD")
+ PulseWidth = _makePulseFeatureAccessor("PulseWidth")
+ QualityValue = _makePulseFeatureAccessor("QualityValue")
+ InsertionQV = _makePulseFeatureAccessor("InsertionQV")
+ DeletionQV = _makePulseFeatureAccessor("DeletionQV")
+ DeletionTag = _makePulseFeatureAccessor("DeletionTag")
+ MergeQV = _makePulseFeatureAccessor("MergeQV")
+ SubstitutionQV = _makePulseFeatureAccessor("SubstitutionQV")
+
+ def __getattr__(self, key):
+ return self.cmpH5.alignmentIndex[self.rowNumber][key]
+
+ def __repr__(self):
+ return "CmpH5 alignment: %s %3d %9d %9d" \
+ % (("+" if self.isForwardStrand else "-"),
+ self.RefGroupID, self.tStart, self.tEnd)
+
+ def __str__(self):
+ COLUMNS = 80
+ val = ""
+ val += repr(self) + "\n\n"
+ val += "Read: " + self.readName + "\n"
+ val += "Reference: " + self.referenceName + "\n\n"
+ val += "Read length: " + str(self.readLength) + "\n"
+ val += "Concordance: " + "%0.3f" % self.identity + "\n"
+
+ alignedRead = self.read()
+ alignedRef = self.reference()
+ transcript = self.transcript(style="exonerate+")
+ refPos = self.referencePositions()
+ refPosString = "".join([str(pos % 10) for pos in refPos])
+ for i in xrange(0, len(alignedRef), COLUMNS):
+ val += "\n"
+ val += " " + refPosString[i:i+COLUMNS] + "\n"
+ val += " " + alignedRef [i:i+COLUMNS] + "\n"
+ val += " " + transcript [i:i+COLUMNS] + "\n"
+ val += " " + alignedRead [i:i+COLUMNS] + "\n"
+ val += "\n"
+ return val
+
+ def __cmp__(self, other):
+ return cmp((self.RefGroupID, self.tStart, self.tEnd),
+ (other.RefGroupID, other.tStart, other.tEnd))
+
+ def __dir__(self):
+ # Special magic improving IPython completion
+ return ALIGNMENT_INDEX_COLUMNS
+
+class ClippedCmpH5Alignment(CmpH5Alignment):
+ """
+ An alignment from a cmp.h5 file that has been clipped to specified
+ reference bounds using the `CmpH5Alignment.clippedTo` method.
+ """
+ # We use these fields to shadow fields in the
+ # alignment index row.
+ __slots__ = [ "tStart",
+ "tEnd",
+ "rStart",
+ "rEnd",
+ "Offset_begin",
+ "Offset_end",
+ "nM",
+ "nMM",
+ "nIns",
+ "nDel" ]
+
+ def __init__(self, aln, refStart, refEnd):
+ # The clipping region must intersect the alignment, though it
+ # does not have to be contained wholly within it.
+ refStart = max(aln.referenceStart, refStart)
+ refEnd = min(aln.referenceEnd, refEnd)
+ assert refStart <= refEnd
+
+ super(ClippedCmpH5Alignment, self).__init__(aln.cmpH5, aln.rowNumber)
+ refPositions = aln.referencePositions(orientation="genomic")
+ readPositions = aln.readPositions(orientation="genomic")
+
+ # Clipping positions within the alignment array
+ clipStart = bisect_right(refPositions, refStart) - 1
+ clipEnd = bisect_left(refPositions, refEnd)
+
+ # Overlay the new bounds.
+ self.tStart = refStart
+ self.tEnd = refEnd
+ if aln.isForwardStrand:
+ self.Offset_begin = aln.Offset_begin + clipStart
+ self.Offset_end = aln.Offset_begin + clipEnd
+ self.rStart = readPositions[clipStart]
+ else:
+ self.Offset_begin = aln.Offset_end - clipEnd
+ self.Offset_end = aln.Offset_end - clipStart
+ self.rEnd = readPositions[clipStart] + 1
+ alnMoveCounts = Counter(self.transcript(style="gusfield"))
+ self.nM = alnMoveCounts["M"]
+ self.nMM = alnMoveCounts["R"]
+ self.nIns = alnMoveCounts["I"]
+ self.nDel = alnMoveCounts["D"]
+ readLength = self.nM + self.nMM + self.nIns
+ if aln.isForwardStrand:
+ self.rEnd = self.rStart + readLength
+ else:
+ self.rStart = self.rEnd - readLength
+ assert self.rStart <= self.rEnd
+
+
+# ========================================
+# CmpH5 reader class
+#
+class CmpH5Reader(ReaderBase, IndexedAlignmentReaderMixin):
+ """
+ The `CmpH5Reader` class is a lightweight and efficient API for
+ accessing PacBio ``cmp.h5`` alignment files. Alignment records
+ can be obtained via random access (via Python indexing/slicing),
+ iteration, or range queries (via readsInRange).
+
+ .. testsetup:: *
+
+ from pbcore import data
+ from pbcore.io import CmpH5Reader
+ filename = data.getCmpH5()
+ c = CmpH5Reader(filename)
+ a0 = c[0]
+ a1 = c[1]
+
+ .. doctest::
+
+ >>> import pbcore.data # For an example data file
+ >>> from pbcore.io import CmpH5Reader
+ >>> filename = pbcore.data.getCmpH5()
+ >>> c = CmpH5Reader(filename)
+ >>> c[0]
+ CmpH5 alignment: - 1 0 290
+ >>> c[0:2] # doctest: +NORMALIZE_WHITESPACE
+ [CmpH5 alignment: - 1 0 290,
+ CmpH5 alignment: + 1 0 365]
+ >>> sum(aln.readLength for aln in c)
+ 26103
+
+ """
+ def __init__(self, filenameOrH5File, sharedAlignmentIndex=None):
+
+ # The sharedAlignmentIndex is a copy of the /AlnInfo/AlnIndex dataset
+ # for the file indicated by filenameOrH5File that's already opened and
+ # held in memory by another process. When it isn't None, this process
+ # doesn't have to keep its own copy of the dataset, which can save
+ # memory. This is useful for quiver and kineticsTools where there's a
+ # master process that opens the cmph5 file and schedules slaves that
+ # only need a read-only copy of the reader.
+
+ # It is an unchecked runtime error to supply a sharedAlignmentIndex
+ # that is not identical to the AlnIndex in the filenameOrH5File
+
+ if isinstance(filenameOrH5File, h5py.File):
+ if filenameOrH5File.mode != "r":
+ raise ValueError("HDF5 files used by CmpH5Reader must be opened read-only!")
+ self.filename = filenameOrH5File.filename
+ self.file = filenameOrH5File
+ else:
+ try:
+ self.filename = abspath(expanduser(filenameOrH5File))
+ self.file = h5py.File(self.filename, "r")
+ except IOError:
+ raise IOError, ("Invalid or nonexistent cmp.h5 file %s" % filenameOrH5File)
+
+ self._loadAlignmentInfo(sharedAlignmentIndex)
+ self._loadMovieInfo()
+ self._loadReferenceInfo()
+ self._loadMiscInfo()
+
+ # These are loaded on demand
+ self._readGroupTable = None
+ self._readGroupDict = None
+
+ def _loadAlignmentInfo(self, sharedAlignmentIndex=None):
+ # If a sharedAlignmentIndex is not provided, read it from the file. If
+ # it is provided, don't read anything from the file or store anything
+ # else in memory
+ if sharedAlignmentIndex is None:
+ if len(self.file["/AlnInfo/AlnIndex"]) == 0:
+ raise EmptyCmpH5Error("Empty cmp.h5 file, cannot be read by CmpH5Reader")
+ rawAlignmentIndex = self.file["/AlnInfo/AlnIndex"].value
+ self._alignmentIndex = (rawAlignmentIndex.view(dtype = ALIGNMENT_INDEX_DTYPE)
+ .view(np.recarray)
+ .flatten())
+ else:
+ self._alignmentIndex = sharedAlignmentIndex
+ self._alignmentIndex.setflags(write=False)
+
+ # This is the only sneaky part of this whole class. We do not
+ # store the raw h5py group object; rather we cache a dict of {
+ # dataset_name -> dataset }. This way we avoid B-tree
+ # scanning in basic data access.
+ self._alignmentGroupById = {}
+ for (alnGroupId, alnGroupPath) in zip(self.file["/AlnGroup/ID"],
+ self.file["/AlnGroup/Path"]):
+ alnGroup = self.file[alnGroupPath]
+ self._alignmentGroupById[alnGroupId] = dict(alnGroup.items())
+
+
+ def _loadMovieInfo(self):
+ numMovies = len(self.file["/MovieInfo/ID"])
+
+ if "FrameRate" in self.file["/MovieInfo"]:
+ frameRate = self.file["/MovieInfo/FrameRate"].value
+ timeScale = 1.0/frameRate
+ else:
+ frameRate = [np.nan] * numMovies
+ timeScale = [1.0] * numMovies
+
+ self._movieInfoTable = np.rec.fromrecords(
+ zip(self.file["/MovieInfo/ID"],
+ self.file["/MovieInfo/Name"],
+ frameRate,
+ timeScale),
+ dtype=[("ID" , int),
+ ("Name" , object),
+ ("FrameRate" , float),
+ ("TimeScale" , float)])
+
+ self._movieDict = {}
+ for record in self._movieInfoTable:
+ assert record.ID not in self._movieDict
+ self._movieDict[record.ID] = record
+ self._movieDict[record.Name] = record
+
+ def _loadReadGroupInfo(self):
+ # This is invoked lazily to allow operation on cmp.h5s with
+ # missing chemistry info.
+ assert (self._readGroupTable is None) and (self._readGroupDict is None)
+ self._readGroupTable = np.rec.fromrecords(
+ zip(self._movieInfoTable.ID,
+ self._movieInfoTable.Name,
+ [self.readType] * len(self._movieInfoTable.ID),
+ self.sequencingChemistry),
+ dtype=[("ID" , np.int32),
+ ("MovieName" , "O"),
+ ("ReadType" , "O"),
+ ("SequencingChemistry", "O")])
+ self._readGroupDict = { rg.ID : rg
+ for rg in self._readGroupTable }
+
+ def _loadReferenceInfo(self):
+ _referenceGroupTbl = np.rec.fromrecords(
+ zip(self.file["/RefGroup/ID"],
+ self.file["/RefGroup/RefInfoID"],
+ [path[1:] for path in self.file["/RefGroup/Path"]]),
+ dtype=[("ID" , int),
+ ("RefInfoID", int),
+ ("Name" , object)])
+
+ _referenceInfoTbl = np.rec.fromrecords(
+ zip(self.file["/RefInfo/ID"],
+ self.file["/RefInfo/FullName"],
+ self.file["/RefInfo/Length"],
+ self.file["/RefInfo/MD5"]) ,
+ dtype=[("RefInfoID", int),
+ ("FullName" , object),
+ ("Length" , int),
+ ("MD5" , object)])
+
+ self._referenceInfoTable = \
+ rec_join("RefInfoID", _referenceGroupTbl, _referenceInfoTbl, jointype="inner")
+
+ if self.isSorted:
+ _offsetTable = self.file["/RefGroup/OffsetTable"].value \
+ .view(dtype=OFFSET_TABLE_DTYPE) \
+ .view(np.recarray) \
+ .flatten()
+ self._referenceInfoTable = rec_join("ID",
+ self._referenceInfoTable,
+ _offsetTable,
+ jointype="inner")
+ self._referenceDict = {}
+ self._readLocatorByKey = {}
+ for record in self._referenceInfoTable:
+ if record.ID != -1:
+ assert record.ID != record.Name
+ shortName = splitFastaHeader(record.FullName)[0]
+ if (shortName in self._referenceDict or
+ record.ID in self._referenceDict or
+ record.Name in self._referenceDict or
+ record.FullName in self._referenceDict or
+ record.MD5 in self._referenceDict):
+ raise ValueError, "Duplicate reference contig sequence or identifier"
+ else:
+ self._referenceDict[shortName] = record
+ self._referenceDict[record.ID] = record
+ self._referenceDict[record.Name] = record
+ self._referenceDict[record.FullName] = record
+ self._referenceDict[record.MD5] = record
+
+ if self.isSorted:
+ readLocator = makeReadLocator(self, record.ID)
+ self._readLocatorByKey[record.ID] = readLocator
+ self._readLocatorByKey[shortName] = readLocator
+
+ def _loadMiscInfo(self):
+ if "NumPasses" in self.file["/AlnInfo"]:
+ self.numPasses = self.file["/AlnInfo/NumPasses"].value
+
+ if "Barcode" in self.file["/AlnInfo"]:
+ # Build forward and backwards id<->label lookup tables
+ self._barcodeName = OrderedDict(zip(self.file["/BarcodeInfo/ID"],
+ self.file["/BarcodeInfo/Name"]))
+ self._barcode = OrderedDict(zip(self.file["/BarcodeInfo/Name"],
+ self.file["/BarcodeInfo/ID"]))
+ # Barcode ID per row
+ self._barcodes = self.file["/AlnInfo/Barcode"].value[:,1]
+
+ if "ZScore" in self.file["/AlnInfo"]:
+ self.zScore = self.file["/AlnInfo/ZScore"].value
+
+ self._sequencingChemistry = None
+
+
@property
def sequencingChemistry(self):
    """
    The sequencing chemistry recorded in ``/MovieInfo``.

    The value is computed on first access and cached.  When the
    ``BindingKit``, ``SequencingKit`` and ``SoftwareVersion`` datasets
    are all present, the result is a list with one `decodeTriple`
    result per entry; otherwise the contents of the older
    ``SequencingChemistry`` dataset are returned.

    Raises `ChemistryLookupError` when neither form is available.
    """
    if self._sequencingChemistry is None:
        mi = dict(self.file["/MovieInfo"])
        tripleFields = ("BindingKit", "SequencingKit", "SoftwareVersion")
        if all(field in mi for field in tripleFields):
            # New way
            self._sequencingChemistry = \
                [ decodeTriple(bk, sk, sv)
                  for (bk, sk, sv) in zip(mi["BindingKit"],
                                          mi["SequencingKit"],
                                          mi["SoftwareVersion"]) ]
        elif "SequencingChemistry" in mi:
            # Old way; `[()]` reads the whole dataset (replaces the
            # deprecated h5py `.value` accessor).
            self._sequencingChemistry = mi["SequencingChemistry"][()]
        else:
            raise ChemistryLookupError(
                "Chemistry information could not be found in cmp.h5!")
    return self._sequencingChemistry
+
+
@property
def alignmentIndex(self):
    """
    The alignment index, the central data structure of the cmp.h5
    file, as a numpy recarray.

    The recarray has the following columns::

        dtype([('AlnID', int),
               ('AlnGroupID', int),
               ('MovieID', int),
               ('RefGroupID', int),
               ('tStart', int),
               ('tEnd', int),
               ('RCRefStrand', int),
               ('HoleNumber', int),
               ('SetNumber', int),
               ('StrobeNumber', int),
               ('MoleculeID', int),
               ('rStart', int),
               ('rEnd', int),
               ('MapQV', int),
               ('nM', int),
               ('nMM', int),
               ('nIns', int),
               ('nDel', int),
               ('Offset_begin', int),
               ('Offset_end', int),
               ('nBackRead', int),
               ('nReadOverlap', int)])

    It is exposed so that vectorized computations can be run over
    all alignments in the file.

    .. doctest::

        >>> c.alignmentIndex.MapQV[0:10]
        array([254, 254,   0, 254, 254, 254, 254, 254, 254, 254], dtype=uint32)

    The columns are also reachable as attributes of the
    `CmpH5Reader` object itself, as a shorthand.

    .. doctest::

        >>> c.MapQV[0:10]
        array([254, 254,   0, 254, 254, 254, 254, 254, 254, 254], dtype=uint32)

    The index row of a single alignment is available through the
    fields of the corresponding `CmpH5Alignment` object.

    .. doctest::

        >>> c[26].MapQV
        254
    """
    index = self._alignmentIndex
    return index
+
@property
def movieInfoTable(self):
    """
    .. deprecated:: 0.9.2
       Use :attr:`readGroupTable`, which is compatible with BAM usage

    A numpy recarray summarizing the source movies of the reads in
    this file.

    The recarray has the following columns::

        dtype([('ID', 'int'),
               ('Name', 'string'),
               ('FrameRate', 'float'),
               ('TimeScale', 'float')])

    Multiply time values (IPD, PulseWidth) by `TimeScale` to obtain
    seconds.  Do *not* use `FrameRate` directly: it is NaN for
    pre-1.3 cmp.h5 files.
    """
    table = self._movieInfoTable
    return table
+
@property
def referenceInfoTable(self):
    """
    .. _referenceInfoTable:

    A numpy recarray summarizing the references that were aligned
    against.

    The recarray has the following columns::

        dtype([('RefInfoID', int),
               ('ID', int),
               ('Name', string),
               ('FullName', string),
               ('Length', int),
               ('MD5', string),
               ('StartRow', int),
               ('EndRow', int) ])

    (`StartRow` and `EndRow` are absent for unsorted `cmp.h5` files).
    """
    table = self._referenceInfoTable
    return table
+
@property
def readType(self):
    """
    The type of reads that were aligned to the reference, either
    "standard" or "CCS"; read from the file's ``ReadType`` attribute.

    .. doctest::

        >>> c.readType
        'standard'
    """
    fileAttributes = self.file.attrs
    return fileAttributes["ReadType"]
+
@property
def version(self):
    """
    The CmpH5 format version string, read from the file's
    ``Version`` attribute.

    .. doctest::

        >>> c.version
        '1.2.0.SF'
    """
    fileAttributes = self.file.attrs
    return fileAttributes["Version"]
+
def versionAtLeast(self, minimalVersion):
    """
    Compare the file version to `minimalVersion`.

    Only the first three dot-separated components are compared, as
    integers; anything after them (e.g. the ".SF" suffix) is
    ignored.  Versions with fewer than three components are padded
    with zeros, so "1.3" is treated like "1.3.0".

    Returns True if the file version is greater than or equal to
    `minimalVersion`.

    .. doctest::

        >>> c.versionAtLeast("1.3.0")
        False
    """
    def asTriple(versionString):
        parts = [int(part) for part in versionString.split(".")[:3]]
        # Tuples of ints compare lexicographically on Python 2 and 3,
        # unlike the lazy `map` objects of Python 3.
        return tuple(parts + [0] * (3 - len(parts)))

    return asTriple(self.version) >= asTriple(minimalVersion)
+
def softwareVersion(self, programName):
    """
    Look up the version of program `programName` in the file log
    (``/FileLog/Program`` paired with ``/FileLog/Version``).

    Returns None if the program is not listed.  If a program is
    listed more than once, its last entry is returned.
    """
    programs = self.file["/FileLog/Program"]
    versions = self.file["/FileLog/Version"]
    versionByProgram = {}
    for program, programVersion in zip(programs, versions):
        versionByProgram[program] = programVersion
    return versionByProgram.get(programName)
+
@property
def isSorted(self):
    """
    True if the ``/RefGroup`` group of the file contains an
    ``OffsetTable`` entry.
    """
    refGroup = self.file["/RefGroup"]
    return "OffsetTable" in refGroup
+
@property
def isBarcoded(self):
    """
    True if the ``/AlnInfo`` group of the file contains a
    ``Barcode`` entry.
    """
    alnInfo = self.file["/AlnInfo"]
    return "Barcode" in alnInfo
+
@property
def isEmpty(self):
    """
    True if the ``/AlnInfo/AlnIndex`` dataset has no rows.
    """
    rowCount = len(self.file["/AlnInfo/AlnIndex"])
    return rowCount == 0
+
+ def _alignmentGroup(self, alnGroupId):
+ return self._alignmentGroupById[alnGroupId]
+
@property
def movieNames(self):
    """
    The set of distinct movie names (the `Name` field of every
    entry of the movie dictionary).
    """
    names = set()
    for movieRecord in self._movieDict.values():
        names.add(movieRecord.Name)
    return names
+
@property
def ReadGroupID(self):
    """
    Alias for the `MovieID` attribute of this reader.
    """
    movieIds = self.MovieID
    return movieIds
+
@property
def readGroupTable(self):
    """
    The read group table, loaded on first access through
    `_loadReadGroupInfo` and cached afterwards.
    """
    # TODO: add doctest
    table = self._readGroupTable
    if table is None:
        self._loadReadGroupInfo()
        table = self._readGroupTable
    return table
+
def readGroupInfo(self, rgId):
    """
    Access information about a movie whose reads are represented
    in the file, identified by its read group id `rgId`.

    The read group information is loaded on first use.  The
    returned value is a record from the :attr:`readGroupTable`;
    an unknown `rgId` raises KeyError.
    """
    # TODO: add doctest
    recordsById = self._readGroupDict
    if recordsById is None:
        self._loadReadGroupInfo()
        recordsById = self._readGroupDict
    return recordsById[rgId]
+
+
+ def _movieInfo(self, movieId):
+ return self._movieDict[movieId]
+
def movieInfo(self, movieId):
    """
    .. deprecated:: 0.9.2
       Use :attr:`readGroupInfo`, which is compatible with BAM usage

    Access information about a movie whose reads are represented
    in the file.

    Delegates to `_movieInfo`; the returned value is a record from
    the :attr:`movieInfoTable`.
    """
    record = self._movieInfo(movieId)
    return record
+
def referenceInfo(self, key):
    """
    Access information about a reference that was aligned against.

    `key` may be any of:

    - the reference ID (integer),
    - the name ("ref000001"),
    - the full name (e.g. "lambda_NEB3011"),
    - the truncated full name (full name up to the first
      whitespace, following the samtools convention),
    - the MD5 sum hex string
      (e.g. "a1319ff90e994c8190a4fe6569d0822a").

    The returned value is a record from the
    :ref:`referenceInfoTable`.  An unknown key raises KeyError.

    .. doctest::

        >>> ri = c.referenceInfo("ref000001")
        >>> ri.FullName
        'lambda_NEB3011'
        >>> ri.MD5
        'a1319ff90e994c8190a4fe6569d0822a'

    """
    referencesByKey = self._referenceDict
    return referencesByKey[key]
+
def readsInRange(self, refKey, refStart, refEnd, justIndices=False):
    """
    Get a list of reads overlapping (i.e., intersecting---not
    necessarily spanning) a given reference window.

    If `justIndices` is ``False``, the list returned will contain
    `CmpH5Alignment` objects.

    If `justIndices` is ``True``, the list returned will contain
    row numbers in the alignment index table.  Slicing the
    `CmpH5Reader` object with these row numbers can be used to get
    the corresponding `CmpH5Alignment` objects.

    The contig key can be either the ``RefID``, or the short name
    (FASTA header up to first space).

    Raises `Exception` if the file is not sorted, and KeyError if
    `refKey` is not a known contig key.

    .. doctest::

        >>> c.readsInRange(1, 0, 1000) # doctest: +NORMALIZE_WHITESPACE
        [CmpH5 alignment: -    1          0        290,
         CmpH5 alignment: +    1          0        365]

        >>> rowNumbers = c.readsInRange(1, 0, 1000, justIndices=True)
        >>> rowNumbers
        array([0, 1], dtype=uint32)
    """
    if not self.isSorted:
        # Call form of `raise`: valid on both Python 2 and Python 3.
        raise Exception("CmpH5 is not sorted")
    readLocator = self._readLocatorByKey[refKey]
    rowNumbers = readLocator(refStart, refEnd, justIndices=True)
    if justIndices:
        return rowNumbers
    return self[rowNumbers]
+
def hasPulseFeature(self, featureName):
    """
    Is the dataset for pulse feature `featureName` present in
    *every* alignment group of this cmp.h5 (i.e. loaded for all
    movies)?

    .. doctest::

        >>> c.hasPulseFeature("InsertionQV")
        True
        >>> c.hasPulseFeature("MergeQV")
        False

    """
    for alnGroup in self._alignmentGroupById.values():
        if featureName not in alnGroup.keys():
            return False
    return True
+
def pulseFeaturesAvailable(self):
    """
    What pulse features are available in this cmp.h5 file?

    A feature counts as available when it is present in every
    alignment group; the "AlnArray" dataset is never reported.
    The order of the returned list is unspecified.  A file without
    alignment groups yields an empty list.

    .. doctest::

        >>> c.pulseFeaturesAvailable()
        [u'QualityValue', u'IPD', u'PulseWidth', u'InsertionQV', u'DeletionQV']

    """
    featureSets = [ set(alnGroup.keys())
                    for alnGroup in self._alignmentGroupById.values() ]
    if not featureSets:
        # `set.intersection()` needs at least one set; without this
        # guard a file with no alignment groups raised TypeError.
        return []
    available = set.intersection(*featureSets)
    available.discard("AlnArray")
    return list(available)
+
@property
def barcode(self):
    """
    A dict mapping barcode name to integer barcode id.

    Behavior undefined if file is not barcoded.
    """
    idByName = self._barcode
    return idByName
+
@property
def barcodeName(self):
    """
    A dict mapping integer barcode id to barcode name.

    Behavior undefined if file is not barcoded.
    """
    nameById = self._barcodeName
    return nameById
+
@property
def barcodes(self):
    """
    An array of integer barcode ids, one per row of the alignment
    array.

    Behavior undefined if file is not barcoded.
    """
    idsPerRow = self._barcodes
    return idsPerRow
+
@property
def qId(self):
    """
    Alias for `ReadGroupID`, for forward compatibility with the
    BAM API.
    """
    readGroupIds = self.ReadGroupID
    return readGroupIds
+
@property
def holeNumber(self):
    """
    Alias for `HoleNumber`, for forward compatibility with the BAM
    API.
    """
    holeNumbers = self.HoleNumber
    return holeNumbers
+
@property
def mapQV(self):
    """
    Alias for `MapQV`, for forward compatibility with the BAM API.
    """
    mapQVs = self.MapQV
    return mapQVs
+
+ def __getitem__(self, rowNumbers):
+ if (isinstance(rowNumbers, int) or
+ issubclass(type(rowNumbers), np.integer)):
+ return CmpH5Alignment(self, rowNumbers)
+ elif isinstance(rowNumbers, slice):
+ return [CmpH5Alignment(self, r)
+ for r in xrange(*rowNumbers.indices(len(self)))]
+ elif isinstance(rowNumbers, list) or isinstance(rowNumbers, np.ndarray):
+ if len(rowNumbers) == 0:
+ return []
+ else:
+ entryType = type(rowNumbers[0])
+ if entryType == int or issubclass(entryType, np.integer):
+ return [CmpH5Alignment(self, r) for r in rowNumbers]
+ elif entryType == bool or issubclass(entryType, np.bool_):
+ return [CmpH5Alignment(self, r) for r in np.flatnonzero(rowNumbers)]
+ raise TypeError, "Invalid type for CmpH5Reader slicing"
+
+ def __iter__(self):
+ return (self[i] for i in xrange(len(self)))
+
+ def __len__(self):
+ return len(self.alignmentIndex)
+
+ def __getattr__(self, key):
+ # Avoid infinite recursion in degenerate cases.
+ return self.__getattribute__("alignmentIndex")[key]
+
def close(self):
    """
    Close the underlying file, if there is one that is still open,
    and forget it.  Calling `close` again is a no-op.
    """
    if not hasattr(self, "file") or self.file is None:
        return
    self.file.close()
    self.file = None
+
+ def __enter__(self):
+ return self
+
+ def __exit__(self, exc_type, exc_value, traceback):
+ self.close()
+
def __dir__(self):
    """
    Report the alignment index column names as the attributes of
    the reader.
    """
    # Special magic improving IPython completion
    columnNames = ALIGNMENT_INDEX_COLUMNS
    return columnNames
diff --git a/pbcore/io/align/PacBioBamIndex.py b/pbcore/io/align/PacBioBamIndex.py
new file mode 100644
index 0000000..12e4750
--- /dev/null
+++ b/pbcore/io/align/PacBioBamIndex.py
@@ -0,0 +1,121 @@
+#################################################################################
+# Copyright (c) 2011-2015, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#################################################################################
+
+# Author: David Alexander
+
+import h5py
+import numpy as np
+from os.path import abspath, expanduser
+from functools import wraps
+from collections import namedtuple
+
+class PacBioBamIndex(object):
+ """
+ The PacBio BAM index is a companion file allowing modest
+ *semantic* queries on PacBio BAM files without iterating over the
+ entire file. By convention, the PacBio BAM index has extension
+ "bam.pbi".
+
+ The bam.pbi index is an HDF5 file containing two data frames
+ (groups containing arrays (frame columns) of common length):
+
+ - A table with a row per BAM record, columns reflecting
+ precomputed statistics per record
+
+ - A table with a row per reference contig (tid) in the BAM,
+ indicating the range of rows pertaining to the
+ """
+ def _loadColumns(self, f):
+ g = f["PacBioBamIndex/Columns"]
+ columnNamesAndColumns = sorted([ (k, v[:]) for (k, v) in g.iteritems() ])
+ columnNames, columns = zip(*columnNamesAndColumns)
+ return np.rec.fromarrays(columns, names=columnNames)
+
+ def _loadVersion(self, f):
+ return f["PacBioBamIndex"].attrs["Version"]
+
+ def _loadOffsets(self, f):
+ pass
+
+ def __init__(self, pbiFilename):
+ pbiFilename = abspath(expanduser(pbiFilename))
+ with h5py.File(pbiFilename, "r") as f:
+ try:
+ self._version = self._loadVersion(f)
+ self._columns = self._loadColumns(f)
+ self._offsets = self._loadOffsets(f)
+ except Exception as e:
+ raise IOError, "Malformed bam.pbi file: " + str(e)
+
+
+ @property
+ def version(self):
+ return self._version
+
+ @property
+ def columnNames(self):
+ return list(self._columns.dtype.names)
+
+ def __getattr__(self, columnName):
+ if columnName in self.columnNames:
+ return self._columns[columnName]
+ else:
+ raise AttributeError, "pbi has no column named '%s'" % columnName
+
+ def __getitem__(self, rowNumber):
+ return self._columns[rowNumber]
+
+ def __dir__(self):
+ # Special magic for IPython tab completion
+ return self.columnNames
+
+ def __len__(self):
+ return len(self._columns)
+
+ def __iter__(self):
+ for i in xrange(len(self)):
+ yield self[i]
+
+ def rangeQuery(self, winId, winStart, winEnd):
+ #
+ # A read overlaps the window if winId == tid and
+ #
+ # (tStart < winEnd) && (tEnd > winStart) (1)
+ #
+ # We are presently doing this naively right now, just
+ # computing the predicate over all rows. If/when we determine
+ # this is too slow, we can accelerate using the nBackread
+ # approach we use int he cmph5, doing binary search to
+ # identify a candidate range and then culling the range.
+ #
+ ix = np.flatnonzero((self.tId == winId) &
+ (self.tStart < winEnd) &
+ (self.tEnd > winStart))
+ return ix
diff --git a/pbcore/io/align/_AlignmentMixin.py b/pbcore/io/align/_AlignmentMixin.py
new file mode 100644
index 0000000..d5e3500
--- /dev/null
+++ b/pbcore/io/align/_AlignmentMixin.py
@@ -0,0 +1,210 @@
+#################################################################################
+# Copyright (c) 2011-2015, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#################################################################################
+
+# Author: David Alexander
+
+__all__ = [ "AlignmentReaderMixin",
+ "AlignmentRecordMixin",
+ "IndexedAlignmentReaderMixin" ]
+
+from pbcore.io import BasH5Collection
+import numpy as np
+
class AlignmentReaderMixin(object):
    """
    Mixin adding higher-level functionality to alignment file
    readers.
    """
    def attach(self, fofnFilename):
        """
        Attach the actual movie data files that were used to create
        this alignment file, by building a `BasH5Collection` from
        `fofnFilename`.
        """
        collection = BasH5Collection(fofnFilename)
        self.basH5Collection = collection

    @property
    def moviesAttached(self):
        """
        True once a movie collection has been attached to this
        reader.
        """
        if not hasattr(self, "basH5Collection"):
            return False
        return self.basH5Collection is not None
+
+
class IndexedAlignmentReaderMixin(AlignmentReaderMixin):
    """
    Mixin for alignment readers that have access to an alignment
    index.
    """
    def readsByName(self, query):
        """
        Identify reads by a name query, interpreted as follows:

          - "movieName/holeNumber[/[*]]"        => all records from the chosen movie and ZMW
          - "movieName/holeNumber/rStart_rEnd"  => all records *overlapping* the read range in that movie and ZMW
          - "movieName/holeNumber/ccs"          => the CCS records from the chosen movie and ZMW (zero or one)

        Records are returned in a list in ascending order of rStart.
        """
        def readGroupIdsFor(name):
            # Read group ids whose movie name equals `name`.
            table = self.readGroupTable
            return table.ID[table.MovieName == name]

        def intervalsOverlap(first, second):
            firstStart, firstEnd = first
            secondStart, secondEnd = second
            return (firstEnd > secondStart) and (secondEnd > firstStart)

        def matchesReadQuery(readName, readQuery):
            # Does the read called `readName` satisfy the third field
            # of the name query?
            if readQuery in ("*", ""):
                return True
            isCcsRead = readName.endswith("ccs")
            if readQuery == "ccs":
                return isCcsRead
            if isCcsRead:
                return False
            queried = [int(bound) for bound in readQuery.split("_")]
            actual = [int(bound)
                      for bound in readName.split("/")[-1].split("_")]
            return intervalsOverlap(queried, actual)

        parts = query.split("/")
        movieName = parts[0]
        holeNumber = int(parts[1])
        rQuery = parts[2] if len(parts) > 2 else "*"

        readGroupIds = readGroupIdsFor(movieName)
        rowMask = np.in1d(self.qId, readGroupIds) & \
                  (self.holeNumber == holeNumber)
        rowNumbers = np.flatnonzero(rowMask)
        hits = []
        for aln in self[rowNumbers]:
            if matchesReadQuery(aln.readName, rQuery):
                hits.append(aln)
        return sorted(hits, key=lambda aln: aln.readStart)
+
+
class AlignmentRecordMixin(object):
    """
    Mixin adding higher-level functionality to alignment records.

    It relies on the attributes `reader`, `tStart`, `tEnd`,
    `aStart`, `aEnd`, `movieName`, `HoleNumber` and `readType`
    being provided by the class it is mixed into.
    """
    @property
    def zmw(self):
        """
        The entry of the attached movie collection named by
        `zmwName`.  Raises ValueError if no movies are attached.
        """
        reader = self.reader
        if not reader.moviesAttached:
            raise ValueError("Movies not attached!")
        return self.reader.basH5Collection[self.zmwName]

    @property
    def zmwRead(self):
        """
        The entry of the attached movie collection named by
        `readName`.  Raises ValueError if no movies are attached.
        """
        reader = self.reader
        if not reader.moviesAttached:
            raise ValueError("Movies not attached!")
        return self.reader.basH5Collection[self.readName]

    @property
    def referenceStart(self):
        """
        The left bound of the alignment, in reference coordinates.
        """
        start = self.tStart
        return start

    @property
    def referenceEnd(self):
        """
        The right bound of the alignment, in reference coordinates.
        """
        end = self.tEnd
        return end

    @property
    def readStart(self):
        """
        The left bound of the alignment, in read coordinates (from the BAS.H5 file).
        """
        start = self.aStart
        return start

    @property
    def readEnd(self):
        """
        The right bound of the alignment, in read coordinates (from the BAS.H5 file).
        """
        end = self.aEnd
        return end

    @property
    def referenceSpan(self):
        """
        The length along the reference implied by this alignment
        (`tEnd - tStart`).
        """
        end, start = self.tEnd, self.tStart
        return end - start

    @property
    def readLength(self):
        """
        The length of the read (`aEnd - aStart`).
        """
        end, start = self.aEnd, self.aStart
        return end - start

    def __len__(self):
        """Same as `readLength`."""
        length = self.readLength
        return length

    @property
    def readName(self):
        """
        The name of the read that was aligned, in standard PacBio
        format: "<zmwName>/ccs" for CCS reads, otherwise
        "<zmwName>/<aStart>_<aEnd>".
        """
        zmwName = self.zmwName
        if self.readType != "CCS":
            return "%s/%d_%d" % (zmwName, self.aStart, self.aEnd)
        return "%s/ccs" % (zmwName,)

    @property
    def zmwName(self):
        """The ZMW name, "<movieName>/<HoleNumber>"."""
        nameParts = (self.movieName, self.HoleNumber)
        return "%s/%d" % nameParts

    def spansReferencePosition(self, pos):
        """
        Does the alignment span the given reference position?
        """
        covered = self.tStart <= pos < self.tEnd
        return covered

    def spansReferenceRange(self, start, end):
        """
        Does the alignment span the given reference range, in its entirety?
        """
        assert start <= end
        spanned = (self.tStart <= start <= end <= self.tEnd)
        return spanned

    def overlapsReferenceRange(self, start, end):
        """
        Does the alignment overlap the given reference interval?
        """
        assert start <= end
        if not (self.tStart < end):
            return False
        return self.tEnd > start

    def containedInReferenceRange(self, start, end):
        """
        Is the alignment wholly contained within a given reference interval?
        """
        assert start <= end
        contained = (start <= self.tStart <= self.tEnd <= end)
        return contained
diff --git a/pbcore/io/align/_BamSupport.py b/pbcore/io/align/_BamSupport.py
new file mode 100644
index 0000000..3175e49
--- /dev/null
+++ b/pbcore/io/align/_BamSupport.py
@@ -0,0 +1,127 @@
+#################################################################################
+# Copyright (c) 2011-2015, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#################################################################################
+
+# Author: David Alexander
+
+import numpy as np
+
class UnavailableFeature(Exception):
    """Error type named for a feature that is not available."""


class Unimplemented(Exception):
    """Error type named for functionality that is not implemented."""


class ReferenceMismatch(Exception):
    """Error type named for a mismatch involving a reference."""


class IncompatibleFile(Exception):
    """Error type named for a file that is incompatible."""
+
+
# Pulse feature name -> 3-tuple.
# NOTE(review): the tuple looks like (two-letter BAM tag, kind of
# encoding, numpy dtype) -- confirm against the code that consumes it.
PULSE_FEATURE_TAGS = {
    "InsertionQV":    ("iq", "qv",   np.uint8),
    "DeletionQV":     ("dq", "qv",   np.uint8),
    "DeletionTag":    ("dt", "base", np.int8),
    "SubstitutionQV": ("sq", "qv",   np.uint8),
    "MergeQV":        ("mq", "qv",   np.uint8),
    "IPD":            ("ip", "time", np.uint8),
    "PulseWidth":     ("pw", "time", np.uint8),
}
+
# Base -> complementary base.  "N" and the gap character "-" map to
# themselves.
COMPLEMENT_MAP = dict(zip("ATCGN-",
                          "TAGCN-"))
+
def complementAscii(a):
    """
    Complement a sequence given as an iterable of character codes.

    Each code is converted to its character, complemented through
    `COMPLEMENT_MAP` and converted back; the result is an int8
    array.  A character missing from `COMPLEMENT_MAP` raises
    KeyError.
    """
    complemented = []
    for code in a:
        base = chr(code)
        complemented.append(ord(COMPLEMENT_MAP[base]))
    return np.array(complemented, dtype=np.int8)
+
def reverseComplementAscii(a):
    """
    Reverse-complement a sequence given as an iterable of character
    codes: complement it with `complementAscii`, then reverse the
    result.
    """
    complemented = complementAscii(a)
    return complemented[::-1]
+
+
# Integer operation codes, numbered consecutively from 0.
(BAM_CMATCH,
 BAM_CINS,
 BAM_CDEL,
 BAM_CREF_SKIP,
 BAM_CSOFT_CLIP,
 BAM_CHARD_CLIP,
 BAM_CPAD,
 BAM_CEQUAL,
 BAM_CDIFF) = range(9)
+
+
+
+#
+# qId calculation from RG ID string
+#
def rgAsInt(rgIdString):
    """
    Convert a read group ID, given as a hexadecimal string, to a
    signed 32-bit integer (`np.int32`).

    The value is reinterpreted as two's complement, so ids at or
    above 0x80000000 become negative (e.g. "ffffffff" -> -1)
    instead of overflowing `np.int32`.
    """
    unsignedValue = int(rgIdString, 16) & 0xFFFFFFFF
    # Reinterpret the 32 bits as a signed integer rather than
    # converting the value, which raises OverflowError on current
    # NumPy for anything above 0x7fffffff.
    return np.uint32(unsignedValue).view(np.int32)
+
+#
+# Kinetics: decode the scheme we are using to encode approximate frame
+# counts in 8-bits.
+#
+def _makeFramepoints():
+ B = 2
+ t = 6
+ T = 2**t
+
+ framepoints = []
+ next = 0
+ for i in range(256/T):
+ grain = B**i
+ nextOnes = next + grain * np.arange(0, T)
+ next = nextOnes[-1] + grain
+ framepoints = framepoints + list(nextOnes)
+ return np.array(framepoints, dtype=int)
+
+def _makeLookup(framepoints):
+ # (frame -> code) involves some kind of rounding
+ # basic round-to-nearest
+ frameToCode = np.empty(shape=max(framepoints)+1, dtype=int)
+ for i, (fl, fu) in enumerate(zip(framepoints, framepoints[1:])):
+ if (fu > fl + 1):
+ m = (fl + fu)/2
+ for f in xrange(fl, m):
+ frameToCode[f] = i
+ for f in xrange(m, fu):
+ frameToCode[f] = i + 1
+ else:
+ frameToCode[fl] = i
+ # Extra entry for last:
+ frameToCode[fu] = i + 1
+ return frameToCode, fu
+
# Lookup tables shared by the functions below, built once at import
# time: code -> frames (`_framepoints`), frames -> code
# (`_frameToCode`) and the largest frame count that has a code
# (`_maxFramepoint`).
_framepoints = _makeFramepoints()
_frameToCode, _maxFramepoint = _makeLookup(_framepoints)
+
def framesToCode(nframes):
    """
    Encode frame counts as codes.

    Counts above the largest framepoint are clamped to it before
    the lookup in the frame -> code table.
    """
    clamped = np.minimum(_maxFramepoint, nframes)
    return _frameToCode[clamped]
+
def codeToFrames(code):
    """
    Decode codes into frame counts by indexing the framepoint
    table.
    """
    frames = _framepoints[code]
    return frames
+
def downsampleFrames(nframes):
    """
    Round frame counts to the values that survive an
    encode/decode round trip (`framesToCode` followed by
    `codeToFrames`).
    """
    codes = framesToCode(nframes)
    return codeToFrames(codes)
diff --git a/pbcore/io/align/__init__.py b/pbcore/io/align/__init__.py
new file mode 100644
index 0000000..8cf3971
--- /dev/null
+++ b/pbcore/io/align/__init__.py
@@ -0,0 +1,34 @@
+#################################################################################
+# Copyright (c) 2011-2015, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#################################################################################
+
+from CmpH5IO import *
+from BamIO import *
+from BamAlignment import *
+from BlasrIO import *
diff --git a/pbcore/io/base.py b/pbcore/io/base.py
new file mode 100644
index 0000000..292812c
--- /dev/null
+++ b/pbcore/io/base.py
@@ -0,0 +1,109 @@
+#################################################################################
+# Copyright (c) 2011-2015, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#################################################################################
+
+# Base classes for readers and writers.
+# Author: David Alexander
+
+from __future__ import absolute_import
+import gzip
+from os.path import abspath, expanduser
+
+__all__ = [ "ReaderBase", "WriterBase" ]
+
def isFileLikeObject(o):
    """
    True if `o` has both a `read` and a `write` attribute.
    """
    for attributeName in ("read", "write"):
        if not hasattr(o, attributeName):
            return False
    return True
+
def getFileHandle(filenameOrFile, mode="r"):
    """
    Given a filename not ending in ".gz", open the file with the
    appropriate mode.

    Given a filename ending in ".gz", return a filehandle to the
    unzipped stream.

    Given a file-like object (see `isFileLikeObject`), return it
    unchanged; the mode the object was opened with is not checked.

    `mode` must be "r" or "w".  Filenames are expanded ("~") and made
    absolute before opening.  Any other kind of argument raises
    `Exception`.
    """
    assert mode in ("r", "w")

    # `basestring` only exists on Python 2; fall back to `str` on
    # Python 3.
    try:
        stringTypes = basestring
    except NameError:
        stringTypes = str

    if isinstance(filenameOrFile, stringTypes):
        filename = abspath(expanduser(filenameOrFile))
        if filename.endswith(".gz"):
            return gzip.open(filename, mode)
        else:
            return open(filename, mode)
    elif isFileLikeObject(filenameOrFile):
        return filenameOrFile
    else:
        raise Exception("Invalid type to getFileHandle")
+
class ReaderBase(object):
    """
    Base class for readers: owns a file handle opened for reading
    and can be used as a context manager that closes it on exit.
    """
    def __init__(self, f):
        """
        Prepare for iteration through the records in the file

        `f` is a filename or a file-like object, as accepted by
        `getFileHandle`.
        """
        self.file = getFileHandle(f, "r")

    def close(self):
        """
        Close the underlying file
        """
        self.file.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def __repr__(self):
        # This class never sets `filename`; subclasses may.  Fall
        # back to the name of the underlying file object (or the
        # object itself) instead of failing with AttributeError.
        name = getattr(self, "filename", None)
        if name is None:
            handle = getattr(self, "file", None)
            name = getattr(handle, "name", handle)
        return "<%s for %s>" % (type(self).__name__, name)
+
class WriterBase(object):
    """
    Base class for writers: owns a file handle opened for writing
    and can be used as a context manager that closes it on exit.
    """
    def __init__(self, f):
        """
        Prepare for output to the file

        `f` is a filename or a file-like object, as accepted by
        `getFileHandle`.
        """
        self.file = getFileHandle(f, "w")

    def close(self):
        """
        Close the underlying file
        """
        self.file.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def __repr__(self):
        # This class never sets `filename`; subclasses may.  Fall
        # back to the name of the underlying file object (or the
        # object itself) instead of failing with AttributeError.
        name = getattr(self, "filename", None)
        if name is None:
            handle = getattr(self, "file", None)
            name = getattr(handle, "name", handle)
        return "<%s for %s>" % (type(self).__name__, name)
diff --git a/pbcore/io/opener.py b/pbcore/io/opener.py
new file mode 100644
index 0000000..094b173
--- /dev/null
+++ b/pbcore/io/opener.py
@@ -0,0 +1,134 @@
+#################################################################################
+# Copyright (c) 2011-2015, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#################################################################################
+
+
+# Names exported by a star-import of this module: the factory functions
+# plus the command-line entry point.
+__all__ = [ "openAlignmentFile",
+ "openIndexedAlignmentFile",
+ "openFasta",
+ "openIndexedFasta",
+ "entryPoint" ]
+
+from pbcore.io import (FastaTable, FastaReader,
+ BaxH5Reader, BasH5Reader, BasH5Collection,
+ CmpH5Reader, BamReader, IndexedBamReader,
+ GffReader, FastqReader)
+
+def openIndexedAlignmentFile(fname, referenceFasta=None):
+    """
+    Factory function returning a reader for an alignment file (cmp.h5
+    or BAM) with index capability.  The index is built in for cmp.h5;
+    for BAM a bam.pbi index is required.
+
+    Names ending in "cmp.h5" are opened with CmpH5Reader, names ending
+    in "bam" with IndexedBamReader; any other suffix raises ValueError.
+
+    The reference FASTA, if provided, must have a FASTA index
+    (fasta.fai).
+    """
+    if fname.endswith("cmp.h5"):
+        return CmpH5Reader(fname)
+    if fname.endswith("bam"):
+        return IndexedBamReader(fname, referenceFasta)
+    raise ValueError("Invalid alignment file suffix")
+
+def openAlignmentFile(fname, referenceFasta=None):
+    """
+    Factory function to get a handle to a reader for an alignment file
+    (cmp.h5 or BAM), not requiring index capability
+
+    Names ending in "cmp.h5" are opened with CmpH5Reader.  Names ending
+    in "bam" are first tried with IndexedBamReader; if that raises
+    IOError the file is opened with BamReader instead.
+
+    Raises ValueError for any other suffix, matching
+    openIndexedAlignmentFile.
+    """
+    if fname.endswith("cmp.h5"):
+        return CmpH5Reader(fname)
+    elif fname.endswith("bam"):
+        try:
+            return IndexedBamReader(fname, referenceFasta)
+        except IOError:
+            return BamReader(fname, referenceFasta)
+    else:
+        # Previously fell off the end and returned None, deferring the
+        # failure to the caller's first use of the "reader".
+        raise ValueError("Invalid alignment file suffix")
+
+def openIndexedFasta(fname):
+    """
+    Factory function returning a FASTA reader with random access
+    capability, which relies on the fasta.fai index.  The file is
+    opened with FastaTable.
+    """
+    table = FastaTable(fname)
+    return table
+
+def openFasta(fname):
+    """
+    Factory function returning a FASTA reader when only iteration over
+    the contigs is needed.
+
+    FastaTable is tried first; if constructing it raises IOError the
+    file is opened with FastaReader instead.
+    """
+    try:
+        reader = FastaTable(fname)
+    except IOError:
+        # TODO: would be better to have a more specific error type
+        reader = FastaReader(fname)
+    return reader
+
+def _openerFor(ext):
+    """
+    Map a file extension (as produced by _extension) to the class or
+    factory function used to open such a file.  Raises ValueError for
+    an extension with no known opener.
+    """
+    openers = {
+        "gff":    GffReader,
+        "fq":     FastqReader,
+        "fastq":  FastqReader,
+        "fa":     openFasta,
+        "fasta":  openFasta,
+        "cmp.h5": CmpH5Reader,
+        "bas.h5": BasH5Reader,
+        "bax.h5": BaxH5Reader,
+        "fofn":   BasH5Collection,
+        "bam":    openAlignmentFile,
+    }
+    if ext not in openers:
+        raise ValueError("No known opener class for extension %s" % ext)
+    return openers[ext]
+
+def _extension(fname):
+    """
+    Return the extension of `fname` without a leading dot.  For names
+    whose last dot-separated component is "h5" the last two components
+    are returned (e.g. "cmp.h5"); otherwise only the last one.
+    """
+    pieces = fname.split(".")
+    keep = 2 if pieces[-1] == "h5" else 1
+    return ".".join(pieces[-keep:])
+
+def _openAny(fname, *extraArgs):
+    """
+    Open `fname` with the opener selected by its extension, forwarding
+    any extra positional arguments to that opener.
+    """
+    return _openerFor(_extension(fname))(fname, *extraArgs)
+
+def entryPoint():
+    """
+    Command-line entry point (callable as ".open"): opens the data
+    file named by the first argument and drops into an interactive
+    session for inspecting it.  Remaining arguments are passed to the
+    opener.  Returns 1 when no filename was given.
+    """
+    import sys
+    import code
+
+    if len(sys.argv) < 2:
+        print("Requires at least one argument!")
+        return 1
+
+    # The local names below make up the interactive namespace, and the
+    # banner advertises 'f', so they are deliberately left as they are.
+    fname, extraArgs = sys.argv[1], sys.argv[2:]
+    f = _openAny(fname, *extraArgs)
+    banner = "Your file has been opened as object 'f'"
+    try:
+        from IPython import embed
+        embed(banner1=banner)
+    except ImportError:
+        # No IPython available: fall back to the stdlib console.
+        code.InteractiveConsole(locals=locals()).interact(banner=banner)
diff --git a/pbcore/io/rangeQueries.py b/pbcore/io/rangeQueries.py
new file mode 100644
index 0000000..0832696
--- /dev/null
+++ b/pbcore/io/rangeQueries.py
@@ -0,0 +1,182 @@
+#################################################################################
+# Copyright (c) 2011-2015, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#################################################################################
+
+import h5py as h
+import numpy as n
+import bisect
+
+def rightmostBinSearch(vec, val):
+    """
+    Return the rightmost position of val in the sorted vector vec.
+    If val is absent, return the position of the first element greater
+    than val, i.e. of min(vec[vec > val]).  If val is greater than all
+    elements in vec, return len(vec).
+
+    Both lookups are binary searches, so long runs of equal values do
+    not cost a linear walk.
+    """
+    assert(len(vec) > 0)
+
+    lo = bisect.bisect_left(vec, val)
+    # Searching from lo onward is enough: every copy of val sits at or
+    # after the leftmost insertion point.
+    hi = bisect.bisect_right(vec, val, lo)
+
+    # hi > lo exactly when val occurs; its last copy is at hi - 1.
+    if hi > lo:
+        return hi - 1
+    return lo
+
+def leftmostBinSearch(vec, val):
+    """
+    Return the leftmost position of val in the sorted vector vec.  If
+    val is absent, return the leftmost position of the largest element
+    smaller than val, i.e. of max(vec[vec < val]).  If no element is
+    smaller than val, return 0.
+
+    Both lookups are binary searches, so long runs of equal values do
+    not cost a linear backward walk.
+    """
+    assert(len(vec) > 0)
+    i = bisect.bisect_left(vec, val)
+
+    # Nothing lies to the left of val.
+    if i == 0:
+        return i
+
+    # val is present: bisect_left already points at its first copy.
+    if i < len(vec) and vec[i] == val:
+        return i
+
+    # val is absent: vec[i-1] is the largest element below val, and
+    # its first copy lies somewhere in vec[0:i].
+    return bisect.bisect_left(vec, vec[i - 1], 0, i)
+
+
+def getOverlappingRanges(tStart, tEnd, nBack, nOverlap, rangeStart, rangeEnd):
+    """
+    Return, as a uint32 array, the indices of the intervals that
+    overlap [rangeStart, rangeEnd).  tStart, tEnd, nBack and nOverlap
+    are equal-length vectors sorted according to tStart and tEnd; nBack
+    and nOverlap are typically produced by computeIndices[DP].
+    """
+    assert(rangeEnd > rangeStart and
+           len(tStart) == len(tEnd) == len(nBack) == len(nOverlap))
+
+    # Left edge: locate rangeStart among the start positions, then step
+    # back by the nBack entry stored at that position.
+    first = leftmostBinSearch(tStart, rangeStart)
+    first = first - nBack[first]
+    # Right edge: searching for rangeEnd - .5 keeps the range half-open.
+    last = rightmostBinSearch(tStart, rangeEnd - .5)
+
+    assert(last >= first and last >= 0 and first >= 0)
+
+    if first == last:
+        return n.array([], dtype = "uint32")
+
+    # Of the candidates first .. last, keep only those that really
+    # reach into the range, i.e. whose tEnd lies past rangeStart.
+    candidates = n.arange(first, last, dtype = "uint32")
+    return candidates[tEnd[candidates] > rangeStart]
+
+def projectIntoRange(tStart, tEnd, winStart, winEnd):
+    """
+    Compute per-position coverage over the window [winStart, winEnd)
+    implied by the interval vectors tStart and tEnd.  The result has
+    winEnd - winStart entries.
+
+    Coverage is most efficiently calculated by first obtaining the
+    reads overlapping the range with getOverlappingRanges and then
+    projecting them into the same or a smaller range.
+    """
+    assert(len(tStart) == len(tEnd))
+    coverage = n.zeros(shape=winEnd-winStart, dtype=n.uint)
+    # Clip to the window before translating to window coordinates, so
+    # the subtraction cannot underflow.
+    clippedStarts = n.clip(tStart, winStart, winEnd) - winStart
+    clippedEnds = n.clip(tEnd, winStart, winEnd) - winStart
+    for (begin, end) in zip(clippedStarts, clippedEnds):
+        coverage[begin:end] += 1
+    return coverage
+
+def makeReadLocator(cmpH5, refSeq):
+    """
+    Build and return a function f(rangeStart, rangeEnd, justIndices)
+    that can be called repeatedly to find the reads on reference
+    `refSeq` overlapping a range.  Raises an Exception if the cmp.h5
+    is not sorted.
+    """
+    if not cmpH5.isSorted:
+        raise Exception("CmpH5 is not sorted")
+    offsetTable = cmpH5.file["/RefGroup/OffsetTable"].value
+    offStart, offEnd = offsetTable[offsetTable[:,0] == refSeq, 1:3].ravel()
+
+    # With no rows for this reference, a placeholder slice is taken
+    # so that refAlignIdx still has its columns.
+    returnEmpty = not (offEnd - offStart > 0)
+    rowSlice = slice(1, 2) if returnEmpty else slice(offStart, offEnd)
+    refAlignIdx = cmpH5.alignmentIndex[rowSlice, ]
+
+    def f(rangeStart, rangeEnd, justIndices = False):
+        if returnEmpty:
+            ## This looks strange, but the idea is that a rowless matrix
+            ## still has columns and these are what I want to preserve --
+            ## h5py objects cannot be subset by a vector of length 0,
+            ## however, numpy allows this.
+            idxs = n.array([], dtype = 'uint32')
+        else:
+            idxs = getOverlappingRanges(refAlignIdx.tStart, refAlignIdx.tEnd,
+                                        refAlignIdx.nBackRead, refAlignIdx.nReadOverlap,
+                                        rangeStart, rangeEnd)
+        if justIndices:
+            # Shift by offStart to turn slice-relative positions into
+            # positions in the full alignment index.
+            return idxs + offStart
+        return refAlignIdx[idxs,]
+    return f
+
+def getReadsInRange(cmpH5, coords, justIndices = False):
+    """
+    Return the reads which overlap the range specified by coords, a
+    three-tuple (refSeqID, rangeStart, rangeEnd).  With justIndices
+    set, row indices are returned instead of the index rows
+    themselves.  cmpH5 must be a sorted cmp.h5 file; otherwise an
+    Exception is raised.
+    """
+    if not cmpH5.isSorted:
+        raise Exception("CmpH5 is not sorted")
+    locate = makeReadLocator(cmpH5, coords[0])
+    return locate(coords[1], coords[2], justIndices)
+
+def getCoverageInRange(cmpH5, coords, rowNumbers=None):
+    """
+    Return a vector of length coords[2] - coords[1] in which each
+    element is the number of reads overlapping that position in the
+    cmp.h5 file.  coords is (refSeqID, rangeStart, rangeEnd).
+
+    rowNumbers, when given, are the rows to project; when None they
+    are looked up with getReadsInRange.  Raises an Exception if the
+    cmp.h5 is not sorted.
+    """
+    if not cmpH5.isSorted:
+        raise Exception("CmpH5 is not sorted")
+    # Identity test on purpose: "== None" on an ndarray compares
+    # elementwise and does not yield a single truth value.
+    if rowNumbers is None:
+        rowNumbers = getReadsInRange(cmpH5, coords, justIndices=True)
+    if len(rowNumbers) == 0:
+        return n.array([0]*(coords[2] - coords[1]))
+    return projectIntoRange(cmpH5.tStart[rowNumbers], cmpH5.tEnd[rowNumbers],
+                            coords[1], coords[2])
+
diff --git a/pbcore/model/__init__.py b/pbcore/model/__init__.py
new file mode 100644
index 0000000..6861c47
--- /dev/null
+++ b/pbcore/model/__init__.py
@@ -0,0 +1,29 @@
+#################################################################################
+# Copyright (c) 2011-2015, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#################################################################################
diff --git a/pbcore/sequence.py b/pbcore/sequence.py
new file mode 100644
index 0000000..b3aa1b1
--- /dev/null
+++ b/pbcore/sequence.py
@@ -0,0 +1,62 @@
+#################################################################################
+# Copyright (c) 2011-2015, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#################################################################################
+
+# sequence.py: module of basic sequence methods
+# Authors: Brett Bowman, David Alexander
+
+# Names exported by a star-import of this module; reverse() is defined
+# below but not listed here.
+__all__ = [ "complement",
+ "reverseComplement"]
+
+from string import maketrans
+import re
+
+# Translation table used by complement(): swaps a<->t and g<->c in both
+# cases and maps '-' and 'N' to themselves.
+DNA_COMPLEMENT = maketrans('agctAGCT-N', 'tcgaTCGA-N')
+
+def reverse( sequence ):
+    """
+    Return `sequence` with its elements in reverse order; works on
+    anything that supports slicing.
+    """
+    flipped = sequence[::-1]
+    return flipped
+
+def complement( sequence ):
+    """
+    Return the complement of a DNA sequence, base by base and without
+    reversing it.  Raises ValueError if the sequence contains any
+    character other than A, G, C, T, N (either case) or '-'.
+    NOTE: This only currently supports DNA
+    """
+    hasInvalid = re.search('[^AGCTNagctn-]', sequence) is not None
+    if hasInvalid:
+        raise ValueError("Sequence contains invalid DNA characters - "
+                         "only [AGCTN-] allowed")
+    return sequence.translate( DNA_COMPLEMENT )
+
+def reverseComplement( sequence ):
+    """
+    Return the reverse-complement of a DNA sequence: the sequence is
+    complemented first and the result is then reversed.
+    NOTE: This only currently supports DNA
+    """
+    complemented = complement(sequence)
+    return complemented[::-1]
diff --git a/pbcore/util/Process.py b/pbcore/util/Process.py
new file mode 100644
index 0000000..bd57878
--- /dev/null
+++ b/pbcore/util/Process.py
@@ -0,0 +1,68 @@
+#################################################################################
+# Copyright (c) 2011-2015, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#################################################################################
+
+__doc__="""Useful functions for interacting with processes."""
+import sys
+import os
+import subprocess
+
+def backticks( cmd, merge_stderr=True ):
+    """
+    Simulates the perl backticks (``) command with error-handling support
+    Returns ( command output as sequence of strings, error code, error message )
+
+    `cmd` is run through the shell.  With merge_stderr set, stderr is
+    folded into the captured output; otherwise stderr is read and
+    discarded.
+
+    On a positive exit status the output list is empty and the
+    captured output is returned as the error message instead.
+    """
+    if merge_stderr:
+        _stderr = subprocess.STDOUT
+    else:
+        _stderr = subprocess.PIPE
+
+    # NOTE(review): shell=True means cmd is interpreted by the shell;
+    # callers must not build it from untrusted input.
+    p = subprocess.Popen( cmd, shell=True, stdin=subprocess.PIPE,
+                          stdout=subprocess.PIPE, stderr=_stderr,
+                          close_fds=True )
+
+    # communicate() closes stdin, drains stdout and stderr together
+    # and waits for the process, so a child that fills the stderr pipe
+    # or waits on stdin cannot deadlock us.
+    stdout, _ = p.communicate()
+    if not isinstance(stdout, str):
+        # Python 3 hands back bytes; Python 2 already gives str.
+        stdout = stdout.decode()
+
+    # Split on newlines only, and drop only the empty tail left by a
+    # trailing newline -- a final line without one is kept whole.
+    out = stdout.split("\n")
+    if out[-1] == "":
+        out.pop()
+
+    errCode = p.returncode
+    if errCode > 0:
+        errorMessage = os.linesep.join(out)
+        output = []
+    else:
+        errorMessage = ''
+        output = out
+
+    return output, errCode, errorMessage
+
diff --git a/pbcore/util/ToolRunner.py b/pbcore/util/ToolRunner.py
new file mode 100644
index 0000000..37ba71e
--- /dev/null
+++ b/pbcore/util/ToolRunner.py
@@ -0,0 +1,115 @@
+#################################################################################
+# Copyright (c) 2011-2015, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#################################################################################
+
+import argparse, cProfile, logging, pstats
+
+
+# Record layout passed to logging.basicConfig: timestamp, level name in
+# square brackets, then the message.
+LOG_FORMAT = "%(asctime)s [%(levelname)s] %(message)s"
+
+
+class PBToolRunner(object):
+    """
+    Skeleton for a command-line tool: builds an argument parser with
+    the common --verbose/--version/--profile/--debug options, sets up
+    logging, and dispatches to run().  Subclasses supply getVersion()
+    and run() and may override validateArgs().
+    """
+
+    #
+    # Interface to be overridden in subclasses (client code)
+    #
+    def getVersion(self):
+        """Return the tool's version string (shown by --version)."""
+        raise NotImplementedError()
+
+    def run(self):
+        """Do the tool's work; the return value is returned by start()."""
+        raise NotImplementedError()
+
+    def validateArgs(self):
+        '''
+        Method to validate args
+        '''
+        pass
+
+    #
+    # Methods below should not be overridden
+    #
+    def __init__(self, description):
+        self._setupParsers(description)
+        self.parser.add_argument(
+            "--verbose", "-v",
+            dest="verbosity", action="count",
+            help="Set the verbosity level")
+        self.parser.add_argument(
+            '--version',
+            action='version', version= '%(prog)s ' + self.getVersion())
+        self.parser.add_argument(
+            "--profile", action="store_true",
+            help="Print runtime profile at exit")
+        self.parser.add_argument(
+            "--debug", action="store_true",
+            help="Catch exceptions in debugger (requires ipdb)")
+
+    def _setupParsers(self, description):
+        self.parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+                                              description=description)
+
+    def _parseArgs(self):
+        self.args = self.parser.parse_args()
+
+    def _setupLogging(self):
+        # The "count" action leaves verbosity as None when -v is never
+        # given; normalise to 0 so the comparisons are between ints.
+        verbosity = self.args.verbosity or 0
+        if verbosity >= 2:
+            logLevel = logging.DEBUG
+        elif verbosity == 1:
+            logLevel = logging.INFO
+        else:
+            logLevel = logging.WARN
+        logging.basicConfig(level=logLevel, format=LOG_FORMAT)
+
+    def start(self):
+        """
+        Parse the command line, configure logging, validate the
+        arguments and call run(), returning its result.  Returns -1
+        when --debug is requested but ipdb cannot be imported.
+        """
+        self._parseArgs()
+        self._setupLogging()
+        self.validateArgs()
+
+        if self.args.debug:
+            try:
+                import ipdb
+            except ImportError:
+                print("--debug requires module 'ipdb'")
+                return -1
+            with ipdb.launch_ipdb_on_exception():
+                # Hand back run()'s result like the other branches do.
+                return self.run()
+
+        elif self.args.profile:
+            # runctx stores the result of run() under "_rv" in this dict.
+            namespace = {"self": self}
+            cProfile.runctx("_rv=self.run()", globals(), namespace, "profile.out")
+            pstats.Stats("profile.out").sort_stats("time").print_stats(20)
+            return namespace["_rv"]
+        else:
+            return self.run()
+
+class PBMultiToolRunner(PBToolRunner):
+    """
+    Tool runner for tools with sub-commands: in addition to the main
+    parser it creates self.subParsers, and the chosen sub-command is
+    stored in the parsed arguments as "subCommand".
+    """
+    def _setupParsers(self, description):
+        # The main parser is built exactly as in the base class ...
+        super(PBMultiToolRunner, self)._setupParsers(description)
+        # ... and sub-commands hang off it.
+        self.subParsers = self.parser.add_subparsers(dest="subCommand")
diff --git a/pbcore/util/__init__.py b/pbcore/util/__init__.py
new file mode 100644
index 0000000..6861c47
--- /dev/null
+++ b/pbcore/util/__init__.py
@@ -0,0 +1,29 @@
+#################################################################################
+# Copyright (c) 2011-2015, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#################################################################################
diff --git a/pbcore/util/decorators.py b/pbcore/util/decorators.py
new file mode 100644
index 0000000..660ed9f
--- /dev/null
+++ b/pbcore/util/decorators.py
@@ -0,0 +1,17 @@
+import warnings
+
+def deprecated(func):
+    '''Decorator marking a function as deprecated.  The first call to
+    the decorated function emits a warning naming it; later calls are
+    silent.  The wrapper carries over the original's name, docstring
+    and attributes.'''
+    def wrapper(*args, **kwargs):
+        if not wrapper.__called:
+            message = 'Call to deprecated function "{0}".'.format(func.__name__)
+            # stacklevel=2 attributes the warning to the caller.
+            warnings.warn(message, stacklevel=2)
+            wrapper.__called = True
+        return func(*args, **kwargs)
+
+    for attr in ("__name__", "__doc__"):
+        setattr(wrapper, attr, getattr(func, attr))
+    wrapper.__dict__.update(func.__dict__)
+    # Set after the copy so the flag always starts out False.
+    wrapper.__called = False
+    return wrapper
diff --git a/setup.py b/setup.py
new file mode 100755
index 0000000..4f5c568
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,32 @@
+from setuptools import setup, Extension, find_packages
+import sys
+
+# Refuse to install on interpreters older than Python 2.7.
+if ("install" in sys.argv) and sys.version_info < (2, 7, 0):
+    print("pbcore requires Python 2.7")
+    sys.exit(-1)
+
+# Run pbcore/__init__.py in a scratch namespace to pick up __VERSION__.
+# The namespace gets its own name so the builtin globals() is not shadowed.
+versionNamespace = {}
+execfile("pbcore/__init__.py", versionNamespace)
+__VERSION__ = versionNamespace["__VERSION__"]
+
+# Read the license text with the file closed promptly afterwards.
+with open('LICENSES.txt') as licenseFile:
+    licenseText = licenseFile.read()
+
+setup(
+    name = 'pbcore',
+    version=__VERSION__,
+    author='Pacific Biosciences',
+    author_email='devnet at pacificbiosciences.com',
+    license=licenseText,
+    packages = find_packages('.'),
+    package_dir = {'':'.'},
+    package_data = {'pbcore': ['data/*.h5', 'data/*.gff', 'data/*.fasta',
+                               'data/*.fasta.fai', 'data/*.fofn', 'data/*.m4',
+                               'data/*.fa', 'data/*.fa.fai',
+                               'data/*.m5', 'data/*.bam', 'data/*.bam.bai', "data/*.bam.pbi",
+                               'chemistry/resources/*.xml']
+                               },
+    zip_safe = False,
+    # Installs the ".open" console command, backed by pbcore.io.opener.
+    entry_points = { "console_scripts" : [ ".open = pbcore.io.opener:entryPoint" ] },
+    install_requires=[
+        'h5py >= 2.0.1',
+        'numpy >= 1.6.0',
+        'pysam == 0.8.1'
+        ])
diff --git a/tests/test_pbcore_data.py b/tests/test_pbcore_data.py
new file mode 100644
index 0000000..329ff36
--- /dev/null
+++ b/tests/test_pbcore_data.py
@@ -0,0 +1,12 @@
+from nose.tools import assert_equal
+from pbcore import data
+
+class TestGetCmpH5s:
+    def test_get_cmp_h5s(self):
+        # Every entry must carry both the cmp.h5 and the bas.h5 keys.
+        for entry in data.getCmpH5s():
+            for key in ('cmph5', 'bash5s'):
+                assert key in entry
+
+class TestGetCmpH5:
+    def test_get_cmp_h5(self):
+        # The returned path must name a cmp.h5 file.
+        path = data.getCmpH5()
+        assert path.endswith(".cmp.h5")
diff --git a/tests/test_pbcore_io_AlnFileReaders.py b/tests/test_pbcore_io_AlnFileReaders.py
new file mode 100644
index 0000000..2ac62c5
--- /dev/null
+++ b/tests/test_pbcore_io_AlnFileReaders.py
@@ -0,0 +1,375 @@
+from numpy.testing import (assert_array_almost_equal as ASIM,
+ assert_array_equal as AEQ)
+from nose.tools import (nottest,
+ assert_raises,
+ assert_equal as EQ)
+from nose import SkipTest
+
+import numpy as np
+import bisect
+import h5py
+from collections import Counter
+
+from pbcore import data
+from pbcore.io import CmpH5Reader, BamReader, IndexedBamReader
+from pbcore.sequence import reverseComplement as RC
+from pbcore.chemistry import ChemistryLookupError
+
+
+ class _BasicAlnFileReaderTests(object):
+ """
+ Abstract base class for tests of the basic reader
+ functionality---functionality not requiring the bam.pbi index.
+
+ The tests are pretty tailored to the BAM/cmp.h5 files in
+ pbcore.data.
+ """
+ # Concrete subclasses override these two; __init__ builds the reader as
+ # READER_CONSTRUCTOR(*CONSTRUCTOR_ARGS).
+ READER_CONSTRUCTOR = None
+ CONSTRUCTOR_ARGS = None
+ # Evaluated once, when the class body is executed.
+ BAX_FILE = data.getBaxForBam()
+
+ # Opens the reader, materializes every alignment into a list, and keeps
+ # records 70 and 71 as the forward- and reverse-strand examples used by
+ # most tests below.
+ def __init__(self):
+ self.f = self.READER_CONSTRUCTOR(*self.CONSTRUCTOR_ARGS)
+ self.alns = list(self.f)
+ self.fwdAln = self.alns[70]
+ self.revAln = self.alns[71]
+
+ def testBasicOperations(self):
+ EQ(False, self.f.isEmpty)
+ EQ(True, self.f.isSorted)
+ EQ(115, len(self.f))
+
+ def testStrandOrientation(self):
+ EQ(True, self.fwdAln.isForwardStrand)
+ EQ(False, self.fwdAln.isReverseStrand)
+ EQ(False, self.revAln.isForwardStrand)
+ EQ(True, self.revAln.isReverseStrand)
+
+ def testReadName(self):
+ EQ("m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/7957/9681_9727",
+ self.fwdAln.readName)
+ EQ("m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/7957/9561_9619",
+ self.revAln.readName)
+
+ # The assertions expect read() with no arguments to equal
+ # read(aligned=True), and orientation="genomic" to reverse-complement
+ # only the reverse-strand record.
+ def testAlignedRead(self):
+ expectedFwdNative = "TACGGTCATCATCTGACACTACAGACTCTGGCATCGCTGTGAAGAC"
+ EQ(expectedFwdNative, self.fwdAln.read(aligned=True))
+ EQ(expectedFwdNative, self.fwdAln.read())
+ EQ(expectedFwdNative, self.fwdAln.read(orientation="genomic"))
+ expectedRevNative = "CTTGTGAAAATGCTGAATTCT-GCGTCG-CTTCACCAGCGATGCCA-AGTCTGTAGTGTCA"
+ EQ(expectedRevNative, self.revAln.read(aligned=True))
+ EQ(expectedRevNative, self.revAln.read())
+ EQ(RC(expectedRevNative), self.revAln.read(orientation="genomic"))
+
+ # Same sequences as testAlignedRead, with the "-" gap columns removed.
+ def testUnalignedRead(self):
+ expectedFwdNative = "TACGGTCATCATCTGACACTACAGACTCTGGCATCGCTGTGAAGAC"
+ EQ(expectedFwdNative, self.fwdAln.read(aligned=False))
+ EQ(expectedFwdNative, self.fwdAln.read(aligned=False, orientation="genomic"))
+ expectedRevNative = "CTTGTGAAAATGCTGAATTCTGCGTCGCTTCACCAGCGATGCCAAGTCTGTAGTGTCA"
+ EQ(expectedRevNative, self.revAln.read(aligned=False))
+ EQ(RC(expectedRevNative), self.revAln.read(aligned=False, orientation="genomic"))
+
+ def testAlignedReference(self):
+ expectedFwdNative = "TACGGTCATCATCTGACACTACAGACTCTGGCATCGCTGTGAAGAC"
+ EQ(expectedFwdNative, self.fwdAln.reference(aligned=True))
+ EQ(expectedFwdNative, self.fwdAln.reference())
+ EQ(expectedFwdNative, self.fwdAln.reference(orientation="genomic"))
+ expectedRevNative = "CTTGTGAAAATGCTGAATT-TCGCGTCGTCTTCA-CAGCGATGCCAGAGTCTGTAGTGTCA"
+ EQ(expectedRevNative, self.revAln.reference(aligned=True))
+ EQ(expectedRevNative, self.revAln.reference())
+ EQ(RC(expectedRevNative), self.revAln.reference(orientation="genomic"))
+
+ def testUnalignedReference(self):
+ expectedFwdNative = "TACGGTCATCATCTGACACTACAGACTCTGGCATCGCTGTGAAGAC"
+ EQ(expectedFwdNative, self.fwdAln.reference(aligned=False))
+ EQ(expectedFwdNative, self.fwdAln.reference(aligned=False, orientation="genomic"))
+ expectedRevNative = "CTTGTGAAAATGCTGAATTTCGCGTCGTCTTCACAGCGATGCCAGAGTCTGTAGTGTCA"
+ EQ(expectedRevNative, self.revAln.reference(aligned=False))
+ EQ(RC(expectedRevNative), self.revAln.reference(aligned=False, orientation="genomic"))
+
+ # In the reverse-strand expectation the 255 entries sit at the same
+ # indices (21, 28, 46) as the "-" columns of the aligned read in
+ # testAlignedRead; genomic orientation is expected to reverse the array.
+ def testDeletionQV(self):
+ expectedFwdNative = [ 17, 17, 7, 17, 17, 6, 17, 17, 17, 17, 17, 17, 17,
+ 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17,
+ 17, 17, 17, 17, 17, 17, 7, 17, 17, 17, 17, 17, 17,
+ 17, 17, 17, 17, 17, 17, 17 ]
+ AEQ(expectedFwdNative, self.fwdAln.DeletionQV(aligned=True))
+ AEQ(expectedFwdNative, self.fwdAln.DeletionQV())
+ AEQ(expectedFwdNative, self.fwdAln.DeletionQV(orientation="genomic"))
+
+ expectedRevNative = [ 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17,
+ 17, 17, 17, 17, 17, 17, 17, 17, 255, 7, 17, 17, 17,
+ 17, 17, 255, 6, 17, 17, 17, 17, 17, 17, 17, 17, 17,
+ 17, 17, 17, 17, 17, 17, 17, 255, 17, 17, 17, 17, 17,
+ 17, 17, 17, 17, 17, 17, 17, 17, 17 ]
+ AEQ(expectedRevNative, self.revAln.DeletionQV(aligned=True))
+ AEQ(expectedRevNative, self.revAln.DeletionQV())
+ AEQ(expectedRevNative[::-1], self.revAln.DeletionQV(orientation="genomic"))
+
+
+ # def testInsertionQV(self):
+ # pass
+
+ # def testSubstitutionQV(self):
+ # pass
+
+ # def testIPD(self):
+ # pass
+
+ # NOTE(review): the expected values read as ASCII codes (78 == ord('N'),
+ # 45 == ord('-'), 65/67/84 == 'A'/'C'/'T') -- confirm against the reader.
+ def testDeletionTag(self):
+ expectedFwdNative = [78, 78, 84, 78, 78, 67, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78,
+ 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 65, 78,
+ 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78]
+ AEQ(expectedFwdNative, self.fwdAln.DeletionTag(aligned=True))
+ AEQ(expectedFwdNative, self.fwdAln.DeletionTag())
+ AEQ(expectedFwdNative, self.fwdAln.DeletionTag(orientation="genomic"))
+
+ expectedRevNative = [78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78,
+ 78, 78, 78, 78, 45, 67, 78, 78, 78, 78, 78, 45, 84, 78, 78, 78, 78,
+ 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 45, 78, 78, 78, 78,
+ 78, 78, 78, 78, 78, 78, 78, 78, 78, 78]
+ AEQ(expectedRevNative, self.revAln.DeletionTag(aligned=True))
+ AEQ(expectedRevNative, self.revAln.DeletionTag())
+
+ # TODO: what is the correct behavior here?
+ #AEQ(expectedRevNative[::-1], self.revAln.DeletionTag(orientation="genomic"))
+
+ # Each case first pins a window of (reference position, reference base,
+ # read base) triples from the full alignment, then checks what
+ # clippedTo(start, end) keeps of that window.
+ def testClippedAlignments(self):
+ # Get a more interesting (more gappy) fwd strand aln
+ a = self.alns[2]
+ EQ([(980, 'C', 'C'),
+ (981, 'C', 'C'),
+ (982, 'T', 'T'),
+ (983, 'A', '-'),
+ (984, 'C', 'C'),
+ (985, '-', 'G'),
+ (985, 'T', 'T'),
+ (986, 'T', 'T') ],
+ zip(a.referencePositions(), a.reference(), a.read())[308:316])
+
+ ac1 = a.clippedTo(983, 985)
+ EQ(983, ac1.referenceStart)
+ EQ(985, ac1.referenceEnd)
+ EQ([(983, 'A', '-'),
+ (984, 'C', 'C')],
+ zip(ac1.referencePositions(), ac1.reference(), ac1.read()))
+
+ ac2 = a.clippedTo(982, 986)
+ EQ(982, ac2.referenceStart)
+ EQ(986, ac2.referenceEnd)
+ EQ([(982, 'T', 'T'),
+ (983, 'A', '-'),
+ (984, 'C', 'C'),
+ (985, '-', 'G'),
+ (985, 'T', 'T')],
+ zip(ac2.referencePositions(), ac2.reference(), ac2.read()))
+
+ ac3 = a.clippedTo(984, 985)
+ EQ(984, ac3.referenceStart)
+ EQ(985, ac3.referenceEnd)
+ EQ([(984, 'C', 'C')],
+ zip(ac3.referencePositions(), ac3.reference(), ac3.read()))
+
+ # Get a more interesting (more gappy) rev strand aln
+ b = self.alns[3]
+ EQ([(2216, 'G', 'G'),
+ (2215, 'G', 'G'),
+ (2214, '-', 'C'),
+ (2214, 'C', 'C'),
+ (2213, 'A', 'A'),
+ (2212, 'T', 'T'),
+ (2211, 'G', 'G'),
+ (2210, 'C', 'C'),
+ (2209, 'T', 'T'),
+ (2208, 'G', '-'),
+ (2207, 'G', 'G'),
+ (2206, 'C', 'C')],
+ zip(b.referencePositions(), b.reference(), b.read())[188:200])
+
+ bc1 = b.clippedTo(2208, 2214)
+ EQ([(2213, 'A', 'A'),
+ (2212, 'T', 'T'),
+ (2211, 'G', 'G'),
+ (2210, 'C', 'C'),
+ (2209, 'T', 'T'),
+ (2208, 'G', '-')],
+ zip(bc1.referencePositions(), bc1.reference(), bc1.read()))
+
+ bc2 = b.clippedTo(2207, 2215)
+ EQ([(2214, 'C', 'C'),
+ (2213, 'A', 'A'),
+ (2212, 'T', 'T'),
+ (2211, 'G', 'G'),
+ (2210, 'C', 'C'),
+ (2209, 'T', 'T'),
+ (2208, 'G', '-'),
+ (2207, 'G', 'G')],
+ zip(bc2.referencePositions(), bc2.reference(), bc2.read()))
+
+ bc3 = b.clippedTo(2209, 2214)
+ EQ([(2213, 'A', 'A'),
+ (2212, 'T', 'T'),
+ (2211, 'G', 'G'),
+ (2210, 'C', 'C'),
+ (2209, 'T', 'T')],
+ zip(bc3.referencePositions(), bc3.reference(), bc3.read()))
+
+
+ # Test clipping in a large deletion
+ d = self.alns[52]
+ EQ([(16191, 'C', 'C'),
+ (16192, 'A', 'A'),
+ (16193, 'G', 'G'),
+ (16194, 'C', 'C'),
+ (16195, 'A', 'A'),
+ (16196, 'G', '-'),
+ (16197, 'G', '-'),
+ (16198, 'T', '-'),
+ (16199, 'G', 'G'),
+ (16200, 'A', 'A'),
+ (16201, 'G', 'G')],
+ zip(d.referencePositions(), d.reference(), d.read())[129:140])
+ # Only checks that clipping inside the deletion does not raise; the
+ # result is not inspected.
+ dc1 = d.clippedTo(16196, 16198)
+
+ # where's the test code?
+
+ def testBaxAttaching(self):
+ # Before attaching, should get sane exceptions
+ with assert_raises(ValueError):
+ self.fwdAln.zmw
+
+ with assert_raises(ValueError):
+ self.fwdAln.zmwRead
+
+ # Now attach
+ self.f.attach(self.BAX_FILE)
+ EQ('m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/7957/9681_9727',
+ self.fwdAln.readName)
+ EQ('m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/7957'
+ , self.fwdAln.zmwName)
+ EQ('<Zmw: m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/7957>',
+ repr(self.fwdAln.zmw))
+ EQ('<ZmwRead: m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/7957/9681_9727>',
+ repr(self.fwdAln.zmwRead))
+
+ # Check read contents, for every aln.
+ for aln in self.alns:
+ EQ(aln.read(aligned=False, orientation="native"), aln.zmwRead.basecalls())
+
+
+ # For two alignments, clips to short windows (up to 9 reference positions
+ # long, as the ranges below work out) and checks that the clipped read
+ # still matches the basecalls of the attached bax data.
+ def testClippingsVsBaxData(self):
+ self.f.attach(self.BAX_FILE)
+ for aln in [self.alns[52], self.alns[8]]:
+ for cS in xrange(aln.tStart, aln.tEnd + 1):
+ for cE in xrange(cS + 1, min(aln.tEnd, cS + 10)):
+ ca = aln.clippedTo(cS, cE)
+ EQ(ca.zmwRead.basecalls(),
+ ca.read(aligned=False, orientation="native"))
+
+ # Cross-checks readsInRange() against a brute-force filter of all
+ # alignments, over consecutive 1000-base windows from 0 to 50000.
+ def testReadsInRange(self):
+ wLen = 1000
+ for wStart in xrange(0, 50000, wLen):
+ wEnd = wStart + wLen
+ expectedNames = set([ a.readName for a in self.alns
+ if (a.referenceName == "lambda_NEB3011" and
+ a.overlapsReferenceRange(wStart, wEnd)) ])
+ EQ(expectedNames,
+ set([ a.readName for a in self.f.readsInRange("lambda_NEB3011", wStart, wEnd) ]))
+
+ def testReadGroupTable(self):
+ rgFwd = self.fwdAln.readGroupInfo
+ EQ([('ID', '<i4'), ('MovieName', 'O'), ('ReadType', 'O'), ('SequencingChemistry', 'O')], rgFwd.dtype)
+ EQ("P6-C4", rgFwd.SequencingChemistry)
+ EQ("m140905_042212_sidney_c100564852550000001823085912221377_s1_X0", rgFwd.MovieName)
+ #EQ("bar", rgFwd.ReadType)
+
+ # The reader reports a list of chemistries; each alignment reports one.
+ def testSequencingChemistry(self):
+ EQ(["P6-C4"], self.f.sequencingChemistry)
+ EQ("P6-C4", self.fwdAln.sequencingChemistry)
+ EQ("P6-C4", self.revAln.sequencingChemistry)
+
+
+
+ class _IndexedAlnFileReaderTests(_BasicAlnFileReaderTests):
+     """
+     Abstract base class adding tests for the reader features that
+     need an alignment index (or bam.pbi index).
+     """
+
+     def testMapQV(self):
+         # Every one of the 115 index entries should carry mapQV 254.
+         mapQVCounts = Counter(self.f.mapQV)
+         EQ(Counter({254: 115}), mapQVCounts)
+
+     def testHoleNumbers(self):
+         # The per-hole tallies must agree whether gathered from the
+         # individual records or from the index column.
+         countsFromRecords = Counter([aln.holeNumber for aln in self.f])
+         countsFromIndex = Counter(self.f.holeNumber)
+         expected = Counter({37134: 14, 6251: 10, 32861: 8, 14743: 4, 35858: 3,
+                             39571: 3, 13473: 3, 32560: 3, 46835: 3, 47698: 3, 16996: 3,
+                             30983: 2, 38025: 2, 36363: 2, 7957: 2, 49050: 2, 23454: 2,
+                             49194: 2, 24494: 2, 20211: 2, 50621: 2, 12736: 2, 19915: 2,
+                             6469: 2, 31174: 2, 32328: 2, 42827: 2, 7247: 2, 50257: 2,
+                             2771: 2, 1650: 2, 45203: 2, 24962: 1, 32901: 1, 36628: 1,
+                             26262: 1, 15641: 1, 19360: 1, 42165: 1, 44356: 1, 51534: 1,
+                             29843: 1, 38754: 1, 52206: 1, 49521: 1, 7670: 1, 54396: 1,
+                             19837: 1})
+         for observed in (countsFromRecords, countsFromIndex):
+             EQ(expected, observed)
+
+     def testAlignedIdentity(self):
+         # Placeholder: nothing is checked yet.
+         return None
+
+     def testReadsByName(self):
+         # The three query spellings (trailing "/*", no suffix, trailing "/")
+         # are all expected to select the same two reads of hole 2771.
+         queries = ["m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/2771/*",
+                    "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/2771",
+                    "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/2771/"]
+         hitsPerQuery = [self.f.readsByName(query) for query in queries]
+
+         expectedReadNames = ["m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/2771/8741_8874",
+                              "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/2771/8942_9480"]
+
+         for hits in hitsPerQuery:
+             EQ(expectedReadNames, [r.readName for r in hits])
+
+
+ class TestCmpH5(_IndexedAlnFileReaderTests):
+     """Runs the indexed reader tests through CmpH5Reader."""
+     READER_CONSTRUCTOR = CmpH5Reader
+     CONSTRUCTOR_ARGS = (data.getBamAndCmpH5()[1],)
+
+     #
+     # Test behaviors specific to CmpH5Reader, which should be few.
+     #
+     def testLazyChemistryResolution(self):
+         """
+         A cmp.h5 file lacking chemistry information must still open;
+         the ChemistryLookupError is raised only once the chemistry
+         is actually requested.  This behavior is kept for
+         compatibility. """
+         legacyReader = CmpH5Reader(data.getCmpH5())  # no exception here
+
+         # Both the file-level and the per-alignment lookups must fail.
+         lookups = (lambda: legacyReader.sequencingChemistry,
+                    lambda: legacyReader[0].sequencingChemistry)
+         for lookup in lookups:
+             with assert_raises(ChemistryLookupError):
+                 lookup()
+
+
+ class TestBasicBam(_BasicAlnFileReaderTests):
+     """Runs the basic (non-indexed) reader tests through BamReader."""
+     READER_CONSTRUCTOR = BamReader
+     CONSTRUCTOR_ARGS = (data.getBamAndCmpH5()[0], data.getLambdaFasta())
+
+     def testSpecVersion(self):
+         # The sample BAM is expected to declare this spec version.
+         reportedVersion = self.f.version
+         EQ("3.0b3", reportedVersion)
+
+     # def testNoLegacyBamTags(self):
+     #     # junk from older PacBio BAM spec versions doesn't belong
+     #     tagNames = [x[0] for x in self.fwdAln.peer.tags]
+     #     EQ(set(["RG",
+     #             "qs", "qe", "zm", "np", "rq",
+     #             "dq", "dt", "iq", "mq", "sq"]),
+     #        set(tagNames))
+
+
+ class TestIndexedBam(_IndexedAlnFileReaderTests):
+     """Runs the indexed reader tests through IndexedBamReader."""
+     READER_CONSTRUCTOR = IndexedBamReader
+     CONSTRUCTOR_ARGS = (
+         data.getBamAndCmpH5()[0],
+         data.getLambdaFasta(),
+     )
diff --git a/tests/test_pbcore_io_BarcodeH5Reader.py b/tests/test_pbcore_io_BarcodeH5Reader.py
new file mode 100644
index 0000000..eb26f89
--- /dev/null
+++ b/tests/test_pbcore_io_BarcodeH5Reader.py
@@ -0,0 +1,141 @@
+import nose.tools
+import numpy
+import numpy.testing
+
+import pbcore.data
+
+from pbcore.data import MOVIE_NAME_BC
+from pbcore.io.BarcodeH5Reader import BarcodeH5Reader, BarcodeH5Fofn, MPBarcodeH5Reader, LabeledZmw
+
+ class TestBarcodeH5Reader(object):
+     """Tests of BarcodeH5Reader against the three sample BarcodeH5 files
+     """
+
+     def __init__(self):
+         # One reader per sample file; exactly three files are expected.
+         bcFiles = pbcore.data.getBcH5s()
+         print bcFiles
+         self.bc1, self.bc2, self.bc3 = map(BarcodeH5Reader, bcFiles)
+
+     def test_BarcodeH5Reader_basicTest(self):
+         """Test that BcH5Reader correctly sets movie name, barcode labels, and hole numbers
+         """
+
+         # Each reader is paired with the hole numbers expected from it.
+         expectations = [
+             (self.bc1, [ 922, 1416, 1436, 1466, 1480, 1551,
+                          1561, 1564, 1765, 1902, 1925, 1982,
+                          2111, 2133, 2136, 2139, 2210, 2306]),
+             (self.bc2, [54505, 54506, 54507, 54516, 54535, 54542,
+                         54543, 54547, 54562, 54588, 54618, 54622,
+                         54632, 54633, 54645, 54650, 54653, 54658]),
+             (self.bc3, [108990, 109015, 109016, 109017, 109021, 109023,
+                         109029, 109031, 109032, 109033, 109036, 109040,
+                         109042, 109045, 109047, 109071, 109075, 109081]),
+         ]
+
+         for reader, expectedHoles in expectations:
+             nose.tools.assert_equal(MOVIE_NAME_BC, reader.movieName)
+             numpy.testing.assert_array_equal(["F3--R3", "F4--R4", "F6--R6", "F7--R7"],
+                                              reader.barcodeLabels)
+             numpy.testing.assert_array_equal(expectedHoles, reader.holeNumbers)
+
+     def test_BarcodeH5Reader_iterator(self):
+         """Test that BcH5Reader correctly iterates over its labeled ZMWs
+         """
+
+         # Iterating a reader should yield its labeled ZMWs ordered by
+         # hole number.
+         for reader in (self.bc1, self.bc2, self.bc3):
+             byHoleNumber = sorted(list(reader.labeledZmws.values()),
+                                   key=lambda z: z.holeNumber)
+             nose.tools.assert_equal(byHoleNumber, list(reader))
+
+ class TestBarcodeH5Fofn(object):
+     """Tests of BarcodeH5Fofn against the sample FOFN of BarcodeH5 files
+     """
+
+     # Barcode labels expected from the FOFN.
+     _BARCODE_LABELS = ["F3--R3", "F4--R4", "F6--R6", "F7--R7"]
+
+     # Hole numbers expected across the whole FOFN.
+     _ALL_HOLE_NUMBERS = [ 922, 1416, 1436, 1466, 1480, 1551,
+                           1561, 1564, 1765, 1902, 1925, 1982,
+                           2111, 2133, 2136, 2139, 2210, 2306,
+                           54505, 54506, 54507, 54516, 54535, 54542,
+                           54543, 54547, 54562, 54588, 54618, 54622,
+                           54632, 54633, 54645, 54650, 54653, 54658,
+                           108990, 109015, 109016, 109017, 109021, 109023,
+                           109029, 109031, 109032, 109033, 109036, 109040,
+                           109042, 109045, 109047, 109071, 109075, 109081]
+
+     # Hole numbers expected for the "F3--R3" barcode.
+     _F3_R3_HOLE_NUMBERS = [ 1416, 1551, 1561, 1765, 1902, 1925, 2133,
+                             54506, 54588, 54618, 109033, 109036, 109071, 109081]
+
+     def __init__(self):
+         bcFofn = pbcore.data.getBcFofn()
+         print bcFofn
+         self.bcFofn = BarcodeH5Fofn(bcFofn)
+         print self.bcFofn
+
+     def test_BasH5Fofn_basicTest(self):
+         """Test that BcH5Fofn correctly sets movie name, barcode labels, and hole numbers
+         """
+
+         fofn = self.bcFofn
+         nose.tools.assert_equal(1, len(fofn.movieNames))
+         numpy.testing.assert_array_equal(MOVIE_NAME_BC, fofn.movieNames[0])
+         numpy.testing.assert_array_equal(self._BARCODE_LABELS, fofn.barcodeLabels)
+         nose.tools.assert_equal("paired", fofn.scoreMode)
+
+         numpy.testing.assert_array_equal(self._ALL_HOLE_NUMBERS, fofn.holeNumbers)
+
+     def test_BcH5Fofn_iterator(self):
+         """Test that BcH5Fofn correctly iterates over its labeled ZMWs
+         """
+
+         # Iterating the FOFN should match iterating each underlying reader
+         # in turn.
+         expected = []
+         for reader in self.bcFofn._bcH5s:
+             for lZmw in reader:
+                 expected.append(lZmw)
+         nose.tools.assert_equal(expected, list(self.bcFofn))
+
+     def test_BcH5Fofn_indexing(self):
+         """Test that BcH5Fofn's indexing correctly slices and returns its contents
+         """
+
+         def holeNumbersOf(labeledZmws):
+             return [lzmw.holeNumber for lzmw in labeledZmws]
+
+         # Integer key: one labeled ZMW, looked up by hole number.
+         holeNumTest = self.bcFofn[922]
+         nose.tools.assert_true(isinstance(holeNumTest, LabeledZmw))
+         nose.tools.assert_equal(holeNumTest.holeNumber, 922)
+
+         # Barcode-label key: a list of labeled ZMWs.
+         barcodeTest = self.bcFofn["F3--R3"]
+         nose.tools.assert_true(isinstance(barcodeTest, list))
+         numpy.testing.assert_array_equal(self._F3_R3_HOLE_NUMBERS,
+                                          holeNumbersOf(barcodeTest))
+
+         # Movie-name key: a multi-part reader over every hole.
+         movieTest = self.bcFofn[MOVIE_NAME_BC]
+         nose.tools.assert_true(isinstance(movieTest, MPBarcodeH5Reader))
+         numpy.testing.assert_array_equal(self._ALL_HOLE_NUMBERS,
+                                          holeNumbersOf(movieTest))
+
+         # "<movie>/<barcode>" key.
+         movieBarcodeTest = self.bcFofn[MOVIE_NAME_BC + "/F3--R3"]
+         numpy.testing.assert_array_equal(self._F3_R3_HOLE_NUMBERS,
+                                          holeNumbersOf(movieBarcodeTest))
+
+         # "<movie>/<hole>" and "<movie>/<hole>/<start>_<end>" keys.
+         zmwTest = self.bcFofn[MOVIE_NAME_BC + "/922"]
+         nose.tools.assert_equal(zmwTest.holeNumber, 922)
+
+         subreadTest = self.bcFofn[MOVIE_NAME_BC + "/922/0_1000"]
+         nose.tools.assert_equal(subreadTest.holeNumber, 922)
\ No newline at end of file
diff --git a/tests/test_pbcore_io_BasH5Collection.py b/tests/test_pbcore_io_BasH5Collection.py
new file mode 100644
index 0000000..ef6554e
--- /dev/null
+++ b/tests/test_pbcore_io_BasH5Collection.py
@@ -0,0 +1,28 @@
+from nose.tools import assert_equal, assert_true, assert_false
+from numpy.testing import assert_array_equal
+from StringIO import StringIO
+
+from pbcore.io import BasH5Collection
+from pbcore import data
+
+ def lookupSomeReadsByName(bc):
+     """Placeholder: does nothing with *bc* and returns None."""
+     return None
+
+ def test():
+     """Look every ZMW up again by name in each sample FOFN collection."""
+     for fofn in data.getFofns():
+         collection = BasH5Collection(fofn)
+
+         for zmw in collection:
+             # Indexing by the ZMW's own name must return a ZMW of that name.
+             sameZmw = collection[zmw.zmwName]
+             assert_equal(zmw.zmwName, sameZmw.zmwName)
+
+
+
+ def test_read_iterators():
+     """Exhaust the three read iterators of each sample FOFN collection."""
+     for fofn in data.getFofns():
+         collection = BasH5Collection(fofn)
+
+         # TODO Add some meaningful tests here
+         # For now this only checks that each iterator can be consumed.
+         for readIterator in (collection.subreads, collection.reads, collection.ccsReads):
+             list(readIterator())
diff --git a/tests/test_pbcore_io_BasH5Reader.py b/tests/test_pbcore_io_BasH5Reader.py
new file mode 100644
index 0000000..a1d5e10
--- /dev/null
+++ b/tests/test_pbcore_io_BasH5Reader.py
@@ -0,0 +1,494 @@
+import inspect
+import os
+
+import h5py
+import nose.tools
+import numpy
+import numpy.testing
+
+import pbcore.data
+
+from pbcore.io.BasH5IO import BasH5Reader, Zmw, ZmwRead, CCSZmwRead
+from pbcore.chemistry import ChemistryLookupError
+
+ class TestBasH5Reader_14:
+     """Tests of BasH5Reader against a 1.4 bas.h5 file, no multipart with
+     CCS.
+     """
+
+     def __init__(self):
+         # Open the sample cmp.h5 and the two sample bas.h5 files.
+         self.cmpH5 = pbcore.io.CmpH5Reader(pbcore.data.getCmpH5())
+         basFiles = pbcore.data.getBasH5s()
+         self.bas1, self.bas2 = map(pbcore.io.BasH5Reader, basFiles)
+
+     def test_BasH5Reader_basicTest(self):
+         """Test that BasH5Reader correctly sets moviename, identifies the
+         sequencingZmws, and finds the subreads for each Zmw.
+         """
+
+         nose.tools.assert_equal(pbcore.data.MOVIE_NAME_14, self.bas1.movieName)
+         numpy.testing.assert_array_equal([   7,    8,    9, 1000, 1006, 1007,
+                                           2001, 2003, 2007, 2008, 3004, 3006,
+                                           3008, 4004, 4005, 4006, 4007, 4009],
+                                          self.bas1.sequencingZmws)
+         numpy.testing.assert_array_equal([   7,    8,    9, 1000, 1001, 1002,
+                                           1003, 1004, 1005, 1006, 1007, 1008,
+                                           1009, 2000, 2001, 2002, 2003, 2004,
+                                           2005, 2006, 2007, 2008, 2009, 3000,
+                                           3001, 3002, 3003, 3004, 3005, 3006,
+                                           3007, 3008, 3009, 4000, 4001, 4002,
+                                           4003, 4004, 4005, 4006, 4007, 4008,
+                                           4009],
+                                          self.bas1.allSequencingZmws)
+
+         for zmw in self.bas1:
+             nose.tools.assert_greater(len(zmw.subreads), 0)
+
+     def test_BasH5Reader_basecallsVsCmpH5(self):
+         """Compare datasets in the bas.h5 file against those in a corresponding
+         cmp.h5 file.
+         """
+
+         # Read names are "/"-separated identifiers, not filesystem paths, so
+         # the expected name is built with "/".join; os.path.join would use
+         # the platform separator (a backslash on Windows).
+         expectedName = "/".join([pbcore.data.MOVIE_NAME_14, "2001", "3580_3922"])
+
+         aln = self.cmpH5[2]
+         nose.tools.assert_equal(expectedName, aln.readName)
+
+         zmwRead = self.bas1[2001].read(3580, 3922)
+         nose.tools.assert_equal(expectedName, zmwRead.readName)
+
+         # Verify that the bases and a couple of quality values are the same
+         nose.tools.assert_equal(aln.read(aligned=False), zmwRead.basecalls())
+         numpy.testing.assert_array_equal(aln.InsertionQV(aligned=False),
+                                          zmwRead.InsertionQV())
+         numpy.testing.assert_array_equal(aln.DeletionQV(aligned=False),
+                                          zmwRead.DeletionQV())
+         numpy.testing.assert_array_equal(aln.QualityValue(aligned=False),
+                                          zmwRead.QualityValue())
+
+     def test_BasH5Reader_regionTableAccessors(self):
+         """Test that BasH5Reader can read the region table and find
+         HQ, insert, and adapter regions.
+         """
+
+         zmw = self.bas1[7]
+         # Rows are compared as five int32 columns; the first column is the
+         # hole number (7 for every row of this ZMW).
+         numpy.testing.assert_array_equal(
+             numpy.array([[   7,    1,    0,  299,   -1],
+                          [   7,    1,  343,  991,   -1],
+                          [   7,    1, 1032, 1840,   -1],
+                          [   7,    0,  299,  343,  681],
+                          [   7,    0,  991, 1032,  804],
+                          [   7,    2,    0, 1578,    0]], dtype=numpy.int32),
+             zmw.regionTable.view(dtype=(numpy.int32, 5)))
+
+         nose.tools.assert_equal((0, 1578), zmw.hqRegion)
+         nose.tools.assert_equal([(299, 343), (991, 1032)], zmw.adapterRegions)
+         nose.tools.assert_equal([(0, 299), (343, 991), (1032, 1578)],
+                                 zmw.insertRegions)
+
+     def test_BasH5Reader_ccs(self):
+         """Test that BasH5Reader can read the CCS bases."""
+
+         nose.tools.assert_equal(self.bas1[4006].ccsRead.basecalls(),
+                                 ''.join(['GGCGCACGGAGGAGCAAGCGTGACAGTCCCACGTCATGCCCGCCGACG',
+                                          'ATATCGAGCTCGCGCTCACCGCCAGGGTGTGAAGTGAATTCACGGTGC',
+                                          'CGCCGAAAGCTGGGCCGGCTTTCGTTCCTTCGCCGGTCAGGAGAAGGC',
+                                          'GGACCCCGTCGTGGGCCATTCCGAGCCTGGAGACAGCGGTCGAAAAAG',
+                                          'CCTTCGCCAAGCCGGTGGCCAAATGGTCGGCCAGCGAGAATCCGTGC']))
+
+     def test_BasH5Reader_productivity(self):
+         """Test the productivity value reported for ZMW 4006."""
+         nose.tools.assert_equal(1, self.bas1[4006].productivity)
+
+     def test_BasH5Reader_readScore(self):
+         """Test the read score reported for ZMW 4006."""
+         nose.tools.assert_almost_equal(0.7822426, self.bas1[4006].readScore)
+
+     @nose.tools.raises(ChemistryLookupError)
+     def test_14_missing_chemistry(self):
+         """Tests that we raise an exception when we can't find chemistry information"""
+         self.bas1.sequencingChemistry
+
+     def test_ZmwRead_len(self):
+         """Test that ZmwRead objects have the correct len."""
+         # len() of a read must agree with the length of its basecalls for
+         # the full read, the first subread, and the CCS read.
+         nose.tools.assert_equal(1126, len(self.bas1[4006].read().basecalls()))
+         nose.tools.assert_equal(1126, len(self.bas1[4006].read()))
+         nose.tools.assert_equal(464,
+                                 len(self.bas1[4006].subreads[0].basecalls()))
+         nose.tools.assert_equal(464, len(self.bas1[4006].subreads[0]))
+         nose.tools.assert_equal(239, len(self.bas1[4006].ccsRead.basecalls()))
+         nose.tools.assert_equal(239, len(self.bas1[4006].ccsRead))
+
+ class CommonTests(object):
+     """Checks shared by the bas.h5 test classes that mix this class in.
+
+     The mixing class must set ``self.bash5_filename``.
+     """
+
+     # Accessor names that every full read is expected to expose.
+     ZMW_ATTRIBUTES = ['QualityValue', 'InsertionQV', 'DeletionQV',
+                       'DeletionTag', 'SubstitutionQV', 'SubstitutionTag',
+                       'MergeQV', 'IPD', 'PreBaseFrames', 'PulseWidth',
+                       'WidthInFrames']
+
+     def test_all_fields_accessible(self):
+         # Test that zmws have correct pulse/quality attributes
+         reader = pbcore.io.BasH5Reader(self.bash5_filename)
+         try:
+             for zmw in reader.sequencingZmws:
+                 read = reader[zmw].read()
+                 for attribute in self.ZMW_ATTRIBUTES:
+                     nose.tools.assert_is_instance(getattr(read, attribute)(),
+                                                   numpy.ndarray)
+                 # IPD/PreBaseFrames and PulseWidth/WidthInFrames are expected
+                 # to return the same data under two names.
+                 numpy.testing.assert_array_equal(read.IPD(), read.PreBaseFrames())
+                 numpy.testing.assert_array_equal(read.PulseWidth(),
+                                                  read.WidthInFrames())
+         finally:
+             # Close the reader even when an assertion fails.
+             reader.close()
+
+     def test_zmw_region_table(self):
+         # Test that every region table has a sane HQ entry and sane bounds
+         reader = pbcore.io.BasH5Reader(self.bash5_filename)
+         try:
+             sequencing_zmws = set(reader.sequencingZmws)
+
+             for zmw in reader.allSequencingZmws:
+                 region_table = reader[zmw].regionTable.tolist()
+                 # Region type 2 is taken as the HQ region entry.
+                 hq_entry = [k for k in region_table if k[1] == 2][0]
+
+                 hq_size = hq_entry[3] - hq_entry[2]
+                 # Sequencing Zmws should have an HQ region
+                 if zmw not in sequencing_zmws:
+                     nose.tools.assert_equal(hq_size, 0)
+                 else:
+                     nose.tools.assert_greater(hq_size, 0)
+
+                 for entry in region_table:
+                     nose.tools.assert_equal(entry[0], zmw)
+                     nose.tools.assert_less_equal(entry[2], entry[3])
+         finally:
+             # Close the reader even when an assertion fails.
+             reader.close()
+
+ class ReadIteratorTests(object):
+     """Checks that the reader-level read iterators agree with per-ZMW access.
+
+     The mixing class must set ``self.bash5_filename`` and
+     ``self.baxh5_filenames``.
+     """
+
+     def test_read_iterators(self):
+         for fname in [self.bash5_filename] + self.baxh5_filenames:
+             # Open the file named by the loop variable, so the bax.h5 parts
+             # are tested as well as the bas.h5 file.
+             reader = pbcore.io.BasH5Reader(fname)
+             try:
+                 if reader.hasConsensusBasecalls:
+                     ccsReads = [ zmw.ccsRead
+                                  for zmw in reader
+                                  if zmw.ccsRead is not None ]
+                     nose.tools.assert_equal(ccsReads, list(reader.ccsReads()))
+                 else:
+                     nose.tools.assert_equal([], list(reader.ccsReads()))
+
+                 if reader.hasRawBasecalls:
+                     subreads = [ subread
+                                  for zmw in reader
+                                  for subread in zmw.subreads ]
+                     nose.tools.assert_equal(subreads, list(reader.subreads()))
+
+                     reads = [ zmw.read()
+                               for zmw in reader ]
+                     nose.tools.assert_equal(reads, list(reader.reads()))
+                 else:
+                     nose.tools.assert_equal([], list(reader.reads()))
+                     nose.tools.assert_equal([], list(reader.subreads()))
+             finally:
+                 # Release the file before opening the next one.
+                 reader.close()
+
+ class CommonMultiPartTests(object):
+     """Checks for a multi-part bas.h5 file and its bax.h5 parts.
+
+     The mixing class must set ``self.bash5_filename`` and
+     ``self.baxh5_filenames``.
+     """
+
+     def _check_zmw_listing(self, reader):
+         # Every sequencing ZMW must be listed among all ZMWs and be a Zmw.
+         for zmw in reader.sequencingZmws:
+             nose.tools.assert_in(zmw, reader.allSequencingZmws)
+             nose.tools.assert_is_instance(reader[zmw], Zmw)
+
+         nose.tools.assert_less_equal(len(reader.sequencingZmws),
+                                      len(reader.allSequencingZmws))
+
+     def test_multipart_constructor_bash5(self):
+         # Test the constructor of a multipart bas.h5 file
+         reader = pbcore.io.BasH5Reader(self.bash5_filename)
+         try:
+             nose.tools.assert_is_instance(reader.file, h5py.File)
+
+             # Should have three parts for v2.0 and v2.1
+             nose.tools.assert_equal(len(reader.parts), 3)
+             nose.tools.assert_list_equal(self.baxh5_filenames,
+                                          [k.filename for k in reader.parts])
+
+             # All bas.h5 files should have raw base calls. 2.1 bas.h5 files
+             # don't have consensus base calls
+             nose.tools.assert_true(reader.hasRawBasecalls)
+
+             self._check_zmw_listing(reader)
+         finally:
+             reader.close()
+
+     def test_multippart_constructor_baxh5(self):
+         # Test constructor of baxh5 files
+         for filename in self.baxh5_filenames:
+             reader = pbcore.io.BasH5Reader(filename)
+             try:
+                 nose.tools.assert_is_instance(reader.file, h5py.File)
+
+                 # A bax.h5 file opened on its own is a single part.
+                 nose.tools.assert_equal(len(reader.parts), 1)
+                 nose.tools.assert_true(reader.hasRawBasecalls)
+
+                 self._check_zmw_listing(reader)
+             finally:
+                 reader.close()
+
+     def test_multipart_hole_lookup(self):
+         # Test that multipart files look up files and hole numbers correctly
+         hole_number_to_filename = {}
+         for filename in self.baxh5_filenames:
+             with h5py.File(filename, 'r') as f:
+                 for hole_number in f['PulseData/BaseCalls/ZMW/HoleNumber']:
+                     hole_number_to_filename[hole_number] = filename
+
+         reader = pbcore.io.BasH5Reader(self.bash5_filename)
+         try:
+             for hole_number in hole_number_to_filename:
+                 zmw = reader[hole_number]
+                 nose.tools.assert_equal(zmw.baxH5.filename,
+                                         hole_number_to_filename[hole_number])
+                 nose.tools.assert_is_instance(zmw, Zmw)
+         finally:
+             reader.close()
+
+     def _clip_region(self, region, hq_region):
+         # Intersect the (start, end) pair *region* with *hq_region*;
+         # return None when the intersection is empty.
+         end = min(region[1], hq_region[1])
+         start = max(region[0], hq_region[0])
+         if start >= end:
+             return None
+         else:
+             return (start, end)
+
+     def _check_clipped_regions(self, true_regions, reported_regions, hq_region):
+         # Each raw region, once clipped to the HQ region, must be reported;
+         # and nothing beyond those may be reported.
+         region_count = 0
+         for region in true_regions:
+             bound = (region[2], region[3])
+             clipped_region = self._clip_region(bound, hq_region)
+             if clipped_region:
+                 nose.tools.assert_in(clipped_region, reported_regions)
+                 region_count += 1
+         nose.tools.assert_equal(region_count, len(reported_regions))
+
+     def test_zmw_multipart_regions(self):
+
+         regions = []
+
+         # First read in the regions from the h5 files directly
+         for filename in self.baxh5_filenames:
+             with h5py.File(filename, 'r') as f:
+                 region_table = f['PulseData/Regions']
+                 # Index with [...] to read the whole dataset; the .value
+                 # attribute is deprecated in h5py.
+                 regions.extend(region_table[...].tolist())
+
+         # Now see what BasH5Reader reports for regions
+         reader = pbcore.io.BasH5Reader(self.bash5_filename)
+         try:
+             for zmw in reader.allSequencingZmws:
+                 # Column 0 is the hole number, column 1 the region type
+                 # (0 adapter, 1 insert, 2 HQ, as used below), and columns
+                 # 2 and 3 the start and end.
+                 true_regions = [k for k in regions if k[0] == zmw]
+                 true_hq_region = [k for k in true_regions if k[1] == 2][0]
+
+                 reported_hq_region = reader[zmw].hqRegion
+                 nose.tools.assert_equal(reported_hq_region[0], true_hq_region[2])
+                 nose.tools.assert_equal(reported_hq_region[1], true_hq_region[3])
+
+                 # Check the reported adapter regions
+                 self._check_clipped_regions(
+                     [k for k in true_regions if k[1] == 0],
+                     reader[zmw].adapterRegions,
+                     reported_hq_region)
+
+                 # And the reported insert regions
+                 self._check_clipped_regions(
+                     [k for k in true_regions if k[1] == 1],
+                     reader[zmw].insertRegions,
+                     reported_hq_region)
+         finally:
+             reader.close()
+
+
+ class TestBasH5Reader_20(CommonTests, CommonMultiPartTests, ReadIteratorTests):
+     """Tests of BasH5Reader against a 2.0 ba[sx].h5 files, consisting of a
+     bas.h5 file and three bax.h5 files. The bax.h5 files also contain CCS.
+     """
+
+     def __init__(self):
+         """Get the full paths to the bas and bax.h5 files."""
+
+         self.bash5_filename = pbcore.data.getBasH5_v20()
+         self.baxh5_filenames = pbcore.data.getBaxH5_v20()
+
+     def _check_v20_reader(self, filename):
+         # A v2.0 file must report consensus basecalls and the 2.0 movie name.
+         reader = pbcore.io.BasH5Reader(filename)
+         try:
+             nose.tools.assert_true(reader.hasConsensusBasecalls)
+             nose.tools.assert_equal(reader.movieName, pbcore.data.MOVIE_NAME_20)
+         finally:
+             # Close the reader even when an assertion fails.
+             reader.close()
+
+     def test_20_constructor_bash5(self):
+         # Tests specific to the v2.0 bas.h5 constructor
+         self._check_v20_reader(self.bash5_filename)
+
+     def test_20_constructor_baxh5(self):
+         # Tests specific to the v2.0 bax.h5 constructor
+         for filename in self.baxh5_filenames:
+             self._check_v20_reader(filename)
+
+     @nose.tools.raises(ChemistryLookupError)
+     def test_20_missing_chemistry(self):
+         """Tests that we raise an exception when we can't find chemistry information"""
+         reader = pbcore.io.BasH5Reader(self.bash5_filename)
+         try:
+             reader.sequencingChemistry
+         finally:
+             # The expected exception still propagates after the close.
+             reader.close()
+
+     def test_productivity(self):
+         """Test that productivities are set correctly for the ZMW objects."""
+         # Collect hole number -> productivity straight from the bax.h5 files.
+         productivities = {}
+         for filename in self.baxh5_filenames:
+             with h5py.File(filename, 'r') as f:
+                 hn_to_prod = dict(zip(f["PulseData/BaseCalls/ZMW/HoleNumber"],
+                                       f["PulseData/BaseCalls/ZMWMetrics/Productivity"]))
+                 productivities.update(hn_to_prod)
+
+         reader = pbcore.io.BasH5Reader(self.bash5_filename)
+         try:
+             for hn in productivities:
+                 nose.tools.assert_equal(reader[hn].productivity,
+                                         productivities[hn])
+         finally:
+             reader.close()
+
+
+class TestBasH5Reader_21(CommonTests, CommonMultiPartTests, ReadIteratorTests):
+ """Tests of BasH5Reader against a 2.1 ba[sx].h5 files, consisting of a
+ bas.h5 file and three bax.h5 files. The bax.h5 files do not contain CCS.
+ """
+
+ def __init__(self):
+ """Get the full paths to the bas and bax.h5 files."""
+ self.bash5_filename = pbcore.data.getBasH5_v21()
+ self.baxh5_filenames = pbcore.data.getBaxH5_v21()
+
+ def test_21_constructor_bash5(self):
+ # Tests specific to the v2.1 bas.h5 constructor
+ reader = pbcore.io.BasH5Reader(self.bash5_filename)
+ nose.tools.assert_false(reader.hasConsensusBasecalls)
+ nose.tools.assert_equal(reader.movieName, pbcore.data.MOVIE_NAME_21)
+
+ reader.close()
+
+ def test_21_constructor_baxh5(self):
+ # Tests specific to the v2.1 bax.h5 constructor
+ for filename in self.baxh5_filenames:
+ reader = pbcore.io.BasH5Reader(filename)
+ nose.tools.assert_false(reader.hasConsensusBasecalls)
+ nose.tools.assert_equal(reader.movieName, pbcore.data.MOVIE_NAME_21)
+ reader.close()
+
+ def test_21_external_region_baxh5(self):
+ """Test the optional region file override"""
+ for baxfile in self.baxh5_filenames:
+ # Count of the subreads using internal region table
+ reader = pbcore.io.BaxH5Reader(baxfile)
+ bax_subread_count = len([x for x in reader.subreads()])
+
+ # Count of subreads using external region table
+ rgnfile = baxfile.replace('bax.h5','rgn.h5')
+ reader.loadExternalRegions(rgnfile)
+ rgn_subread_count = len([x for x in reader.subreads()])
+
+ nose.tools.assert_true(rgn_subread_count < bax_subread_count)
+
+ @nose.tools.raises(ChemistryLookupError)
+ def test_21_missing_chemistry(self):
+ """Tests that we raise an exception when we can't find chemistry information"""
+ reader = pbcore.io.BasH5Reader(self.bash5_filename)
+ reader.sequencingChemistry
+
+ @nose.tools.raises(IOError)
+ def test_21_bad_external_region_baxh5(self):
+ """Tests that we raise an exception when incorrect region file given"""
+ baxfiles = self.baxh5_filenames
+ baxfile = baxfiles[0]
+ rgnfile = baxfiles[1].replace('bax.h5','rgn.h5')
+ pbcore.io.BaxH5Reader(baxfile, regionH5Filename=rgnfile)
+
+class TestBasH5Reader_23(CommonTests, CommonMultiPartTests, ReadIteratorTests):
+ """Tests of BasH5Reader against a 2.3 ba[sx].h5 files, consisting of a
+ bas.h5 file and three bax.h5 files. The bax.h5 files do not contain CCS,
+ but do contain Chemistry information
+ """
+
+ def __init__(self):
+ """Get the full paths to the bas and bax.h5 files."""
+ self.bash5_filename = pbcore.data.getBasH5_v23()
+ self.baxh5_filenames = pbcore.data.getBaxH5_v23()
+
+ def test_23_constructor_bash5(self):
+ # Tests specific to the v2.3 bas.h5 constructor
+ reader = pbcore.io.BasH5Reader(self.bash5_filename)
+ nose.tools.assert_false(reader.hasConsensusBasecalls)
+ nose.tools.assert_equal(reader.movieName, pbcore.data.MOVIE_NAME_23)
+ nose.tools.assert_equal(reader.sequencingChemistry, 'P6-C4')
+
+ reader.close()
+
+ def test_23_constructor_baxh5(self):
+ # Tests specific to the v2.3 bax.h5 constructor
+ for filename in self.baxh5_filenames:
+ reader = pbcore.io.BasH5Reader(filename)
+ nose.tools.assert_false(reader.hasConsensusBasecalls)
+ nose.tools.assert_equal(reader.movieName, pbcore.data.MOVIE_NAME_23)
+ nose.tools.assert_equal(reader.sequencingChemistry, 'P6-C4')
+ reader.close()
+
+class TestBasH5Reader_CCS(ReadIteratorTests):
+ """Test BasH5Reader with a ccs.h5 file produced by P_CCS."""
+
+ def __init__(self):
+ """Get the full path to the ccs.h5 file; there are no bax.h5 parts."""
+ self.bash5_filename = pbcore.data.getCCSH5()
+ self.baxh5_filenames = []
+
+ def test_constructor_ccsh5(self):
+ # Test that BasH5Reader initializes correctly with a ccs.h5 file
+ reader = pbcore.io.BasH5Reader(self.bash5_filename)
+ nose.tools.assert_is_instance(reader.file, h5py.File)
+
+ nose.tools.assert_true(reader.hasConsensusBasecalls)
+ nose.tools.assert_false(reader.hasRawBasecalls)
+ nose.tools.assert_equal(reader.movieName, pbcore.data.MOVIE_NAME_CCS)
+
+ nose.tools.assert_equal(len(reader.parts), 1)
+
+ for zmw in reader.sequencingZmws:
+ nose.tools.assert_in(zmw, reader.allSequencingZmws)
+ nose.tools.assert_is_instance(reader[zmw], Zmw)
+
+ nose.tools.assert_less_equal(len(reader.sequencingZmws),
+ len(reader.allSequencingZmws))
+
+ reader.close()
+
+ def test_ccs_zmw(self):
+ # Test Zmw objects derived from a BasH5Reader reading a ccs.h5
+ reader = pbcore.io.BasH5Reader(self.bash5_filename)
+
+ sequencing_zmws = set(reader.sequencingZmws)
+ for zmw in reader.allSequencingZmws:
+ region_table = reader[zmw].regionTable
+ nose.tools.assert_equal(len(region_table), 1)
+ nose.tools.assert_equal(region_table[0][0], zmw)
+ nose.tools.assert_equal(region_table[0][1], 2)
+
+ nose.tools.assert_equal(len(reader[zmw].insertRegions), 0)
+ nose.tools.assert_equal(len(reader[zmw].adapterRegions), 0)
+
+ with nose.tools.assert_raises(ValueError):
+ reader[zmw].subreads
+
+ with nose.tools.assert_raises(ValueError):
+ reader[zmw].read()
+
+ if zmw in sequencing_zmws:
+ nose.tools.assert_is_instance(reader[zmw].ccsRead,
+ CCSZmwRead)
+ else:
+ nose.tools.assert_is_none(reader[zmw].ccsRead)
diff --git a/tests/test_pbcore_io_BlasrIO.py b/tests/test_pbcore_io_BlasrIO.py
new file mode 100644
index 0000000..abbb0bf
--- /dev/null
+++ b/tests/test_pbcore_io_BlasrIO.py
@@ -0,0 +1,10 @@
+
+from pbcore.io import M4Reader, M5Reader
+import pbcore.data as D
+
+
+def test_m4():
+ l = list(M4Reader(D.getBlasrM4()))
+
+def test_m5():
+ l = list(M5Reader(D.getBlasrM5()))
diff --git a/tests/test_pbcore_io_FastaIO.py b/tests/test_pbcore_io_FastaIO.py
new file mode 100644
index 0000000..ae14241
--- /dev/null
+++ b/tests/test_pbcore_io_FastaIO.py
@@ -0,0 +1,133 @@
+from nose.tools import assert_equal, assert_true, assert_false
+from pbcore import data
+from pbcore.io import FastaReader, FastaWriter, FastaRecord
+from StringIO import StringIO
+
+class TestFastaRecord:
+
+ def setup(self):
+ self.header = "chr1|blah|blah\tblah blah"
+ self.rc_header = "chr1|blah|blah\tblah blah [revcomp]"
+ self.id = "chr1|blah|blah"
+ self.comment = "blah blah"
+ self.sequence = "GATTACA" * 20
+ self.rc_sequence = "TGTAATC" * 20
+ self.length = 140
+ self.expected__str__ = (
+ ">chr1|blah|blah\tblah blah\n"
+ "GATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATT\n"
+ "ACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAG\n"
+ "ATTACAGATTACAGATTACA")
+ self.rc1_expected__str__ = (
+ ">chr1|blah|blah\tblah blah [revcomp]\n"
+ "TGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTA\n"
+ "ATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCT\n"
+ "GTAATCTGTAATCTGTAATC")
+ self.rc2_expected__str__ = (
+ ">chr1|blah|blah\tblah blah\n"
+ "TGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTA\n"
+ "ATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCT\n"
+ "GTAATCTGTAATCTGTAATC")
+ self.record = FastaRecord(self.header, self.sequence)
+ self.rc1_record = self.record.reverseComplement()
+ self.rc2_record = self.record.reverseComplement(True)
+
+ def test__init__(self):
+ assert_equal(self.header, self.record.header)
+ assert_equal(self.sequence, self.record.sequence)
+ assert_equal(self.id, self.record.id)
+ assert_equal(self.comment, self.record.comment)
+
+ def test__str__(self):
+ assert_equal(self.expected__str__, str(self.record))
+
+ def test_fromString(self):
+ recordFromString = FastaRecord.fromString(self.expected__str__)
+ assert_equal(self.header, recordFromString.header)
+ assert_equal(self.sequence, recordFromString.sequence)
+
+ def test_md5(self):
+ assert_equal("67fc75ce599ed0ca1fc8ed2dcbccc95d",
+ self.record.md5)
+
+ def test_reverse_complement1(self):
+ assert_equal(self.rc1_record.header, self.rc_header)
+ assert_equal(self.rc1_record.sequence, self.rc_sequence)
+ assert_equal(self.rc1_expected__str__, str(self.rc1_record))
+
+ def test_reverse_complement2(self):
+ assert_equal(self.rc2_record.header, self.header)
+ assert_equal(self.rc2_record.sequence, self.rc_sequence)
+ assert_equal(self.rc2_expected__str__, str(self.rc2_record))
+
+ def test_len(self):
+ assert_equal(self.length, len(self.record))
+ assert_equal(self.length, len(self.rc1_record))
+ assert_equal(self.length, len(self.rc2_record))
+
+ def test_eq(self):
+ header = 'r1'
+ seq = 'ACGT'
+ r1 = FastaRecord(header, seq)
+ r2 = FastaRecord(header, seq)
+ assert_true(r1 == r2)
+
+ def test_not_equal(self):
+ r1 = FastaRecord('r1', 'ACGT')
+ r2 = FastaRecord('r2', 'ACGT')
+ r3 = FastaRecord('r1', 'ACGT')
+ assert_true(r1 != r2)
+ assert_false(r1 != r3)
+
+
+class TestFastaReader:
+
+ def test_readFasta(self):
+ f = FastaReader(data.getFasta())
+ entries = list(f)
+ assert_equal(48, len(entries))
+ assert_equal("ref000001|EGFR_Exon_2", entries[0].header)
+ assert_equal("TTTCTTCCAGTTTGCCAAGGCACGAGTAACAAGCTCACGCAGTTGGGCACTTT"
+ "TGAAGATCATTTTCTCAGCCTCCAGAGGATGTTCAATAACTGTGAGGTGGTCC"
+ "TTGGGAATTTGGAAATTACCTATGTGCAGAGGAATTATGATCTTTCCTTCTTA"
+ "AAGGTTGGTGACTTTGATTTTCCT",
+ entries[0].sequence)
+ assert_equal("e3912e9ceacd6538ede8c1b2adda7423",
+ entries[0].md5)
+
+ def test_dosLineEndingsFasta(self):
+ f = FastaReader(data.getDosFormattedFasta())
+ entries = list(f)
+ for e in entries:
+ assert_true("\r" not in e.header)
+ assert_equal(16, len(e.sequence))
+
+
+
+class TestFastaWriter:
+
+ def setup(self):
+ self.fasta1 = StringIO(
+ ">chr1|blah|blah\n" \
+ "GATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATT\n" \
+ "ACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAG\n" \
+ "ATTACAGATTACAGATTACA\n")
+ self.fasta2 = StringIO(self.fasta1.getvalue() + "\n" + \
+ ">chr2|blah|blah\n" \
+ "GATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATT\n" \
+ "ACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAG\n" \
+ "ATTACAGATTACAGATTACA\n")
+
+ def test_writeFasta1(self):
+ f = StringIO()
+ w = FastaWriter(f)
+ for record in FastaReader(self.fasta1):
+ w.writeRecord(record)
+ assert_equal(self.fasta1.getvalue(), f.getvalue())
+
+ def test_writeFasta2(self):
+ f = StringIO()
+ w = FastaWriter(f)
+ for record in FastaReader(self.fasta1):
+ w.writeRecord(record.header, record.sequence)
+ assert_equal(self.fasta1.getvalue(), f.getvalue())
diff --git a/tests/test_pbcore_io_FastaTable.py b/tests/test_pbcore_io_FastaTable.py
new file mode 100644
index 0000000..7321f11
--- /dev/null
+++ b/tests/test_pbcore_io_FastaTable.py
@@ -0,0 +1,80 @@
+from nose.tools import assert_equal, assert_true, assert_false
+from pbcore import data
+from pbcore.io import FastaReader, FastaWriter, IndexedFastaReader
+
+
+class TestIndexedFastaReader:
+
+ def setup(self):
+ self.fastaPath = data.getFasta()
+
+ def testIteration(self):
+ ft = IndexedFastaReader(self.fastaPath)
+ fr = FastaReader(self.fastaPath)
+ ftContigs = list(ft)
+ frContigs = list(fr)
+ assert_equal(len(frContigs), len(ftContigs))
+ assert_equal(48, len(ftContigs))
+ for ftC, frC in zip(ftContigs, frContigs):
+ assert_equal(frC.header, ftC.header)
+ assert_equal(frC.sequence, ftC.sequence[:])
+
+ # Unlike FastaReader, IndexedFastaReader iteration is repeatable.
+ assert_equal(48, len(list(ft)))
+
+ def testAccessByName(self):
+ ft = IndexedFastaReader(self.fastaPath)
+ r000021 = ft["ref000021|EGFR_Exon_22\tMetadataTest"]
+ assert_equal("ref000021|EGFR_Exon_22\tMetadataTest", r000021.header)
+ assert_equal("ref000021|EGFR_Exon_22", r000021.id)
+ assert_equal("MetadataTest", r000021.comment)
+ assert_equal("CACTGCCTCATCTCTCACCATCCCAAGGTGCCTATCAAGTGGATGGCATTGGAATCAATT"
+ "TTACACAGAATCTATACCCACCAGAGTGATGTCTGGAGCTACGGTGAGTCATAATCCTGA"
+ "TGCTAATGAGTTTGTACTGAGGCCAAGCTGG",
+ r000021.sequence[:])
+
+ def testAccessById(self):
+ ft = IndexedFastaReader(self.fastaPath)
+ r000021 = ft["ref000021|EGFR_Exon_22"]
+ assert_equal("ref000021|EGFR_Exon_22\tMetadataTest", r000021.header)
+ assert_equal("ref000021|EGFR_Exon_22", r000021.id)
+ assert_equal("MetadataTest", r000021.comment)
+ assert_equal("CACTGCCTCATCTCTCACCATCCCAAGGTGCCTATCAAGTGGATGGCATTGGAATCAATT"
+ "TTACACAGAATCTATACCCACCAGAGTGATGTCTGGAGCTACGGTGAGTCATAATCCTGA"
+ "TGCTAATGAGTTTGTACTGAGGCCAAGCTGG",
+ r000021.sequence[:])
+
+ def testAccessByPosition(self):
+ ft = IndexedFastaReader(self.fastaPath)
+ r000001 = ft[0]
+ assert_equal("<IndexedFastaRecord: ref000001|EGFR_Exon_2>", repr(r000001))
+ firstTwo = ft[:2]
+ assert_equal([ft[0], ft[1]], firstTwo)
+ lastTwo = ft[-2:]
+ assert_equal([ft[-2], ft[-1]], lastTwo)
+
+ def testSlice(self):
+ ft = IndexedFastaReader(self.fastaPath)
+ r000021 = ft["ref000021|EGFR_Exon_22"]
+ sequence = r000021.sequence
+ assert_equal("CACTGCCTCA",
+ sequence[0:10])
+ assert_equal("GCCAAGCTGG",
+ sequence[-10:])
+ assert_equal("G", sequence[-1])
+ assert_equal("T", sequence[-3])
+ assert_equal("C", sequence[0])
+ assert_equal("A", sequence[1])
+
+
+ def test_dosLineEndingsFasta(self):
+ fr = FastaReader(data.getDosFormattedFasta())
+ frEntries = list(fr)
+
+ ft = IndexedFastaReader(data.getDosFormattedFasta())
+ ftEntries = list(ft)
+
+ assert_equal(len(frEntries), len(ftEntries))
+ for (frE, ftE) in zip(frEntries, ftEntries):
+ assert_equal(frE.header, ftE.header)
+ assert_equal(frE.sequence, ftE.sequence[:])
diff --git a/tests/test_pbcore_io_FastqIO.py b/tests/test_pbcore_io_FastqIO.py
new file mode 100644
index 0000000..15e68bc
--- /dev/null
+++ b/tests/test_pbcore_io_FastqIO.py
@@ -0,0 +1,183 @@
+from nose.tools import assert_equal, assert_true, assert_false
+from numpy.testing import assert_array_equal
+from pbcore import data
+from StringIO import StringIO
+
+from pbcore.io.FastqIO import *
+
+
+# Test QV <-> string conversion routines
+class TestQvConversion:
+ def setup(self):
+ self.ascii = \
+ "!\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`" + \
+ "abcdefghijklmnopqrstuvwxyz{|}~"
+ self.qvs = range(0, 94)
+
+ def testAsciiFromQvs(self):
+ assert_equal(self.ascii, asciiFromQvs(self.qvs))
+
+ def testQvsFromAscii(self):
+ assert_array_equal(self.qvs, qvsFromAscii(self.ascii))
+
+
+class TestFastqRecord:
+
+ def setup(self):
+ self.header = "chr1|blah|blah\tblah blah"
+ self.rc_header = "chr1|blah|blah\tblah blah [revcomp]"
+ self.id = "chr1|blah|blah"
+ self.comment = "blah blah"
+ self.sequence = "GATTACA" * 20
+ self.rc_sequence = "TGTAATC" * 20
+ self.length = 140
+ self.quality = [10,11,12,13,14,15,16] * 20
+ self.rc_quality = [16,15,14,13,12,11,10] * 20
+ self.qualityString = "+,-./01" * 20
+ self.rc_qualityString = "10/.-,+" * 20
+ self.expected__str__ = (
+ "@chr1|blah|blah\tblah blah\n"
+ "GATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATT"
+ "ACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAG"
+ "ATTACAGATTACAGATTACA\n"
+ "+\n"
+ "+,-./01+,-./01+,-./01+,-./01+,-./01+,-./01+,-./01+,-./01+,-."
+ "/01+,-./01+,-./01+,-./01+,-./01+,-./01+,-./01+,-./01+,-./01+"
+ ",-./01+,-./01+,-./01")
+ self.rc1_expected__str__ = (
+ "@chr1|blah|blah\tblah blah [revcomp]\n"
+ "TGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTA"
+ "ATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCT"
+ "GTAATCTGTAATCTGTAATC\n"
+ "+\n"
+ "10/.-,+10/.-,+10/.-,+10/.-,+10/.-,+10/.-,+10/.-,+10/.-,+10/."
+ "-,+10/.-,+10/.-,+10/.-,+10/.-,+10/.-,+10/.-,+10/.-,+10/.-,+1"
+ "0/.-,+10/.-,+10/.-,+")
+ self.rc2_expected__str__ = (
+ "@chr1|blah|blah\tblah blah\n"
+ "TGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTA"
+ "ATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCT"
+ "GTAATCTGTAATCTGTAATC\n"
+ "+\n"
+ "10/.-,+10/.-,+10/.-,+10/.-,+10/.-,+10/.-,+10/.-,+10/.-,+10/."
+ "-,+10/.-,+10/.-,+10/.-,+10/.-,+10/.-,+10/.-,+10/.-,+10/.-,+1"
+ "0/.-,+10/.-,+10/.-,+")
+ self.record = FastqRecord(self.header, self.sequence, self.quality)
+ self.record2 = FastqRecord(self.header, self.sequence,
+ qualityString=self.qualityString)
+ self.rc1_record = self.record.reverseComplement()
+ self.rc2_record = self.record.reverseComplement(True)
+
+ def test__init__(self):
+ assert_equal(self.header, self.record.header)
+ assert_equal(self.sequence, self.record.sequence)
+ assert_equal(self.id, self.record.id)
+ assert_equal(self.comment, self.record.comment)
+ assert_array_equal(self.quality, self.record.quality)
+ assert_equal(self.record, self.record2)
+
+ def test__str__(self):
+ assert_equal(self.expected__str__, str(self.record))
+
+ def test_fromString(self):
+ recordFromString = FastqRecord.fromString(self.expected__str__)
+ assert_equal(self.header, recordFromString.header)
+ assert_equal(self.sequence, recordFromString.sequence)
+ assert_array_equal(self.quality, recordFromString.quality)
+
+ def test_reverse_complement1(self):
+ assert_equal(self.rc1_record.header, self.rc_header)
+ assert_equal(self.rc1_record.sequence, self.rc_sequence)
+ assert_equal(self.rc1_record.quality, self.rc_quality)
+ assert_equal(self.rc1_record.qualityString, self.rc_qualityString)
+ assert_equal(str(self.rc1_record), self.rc1_expected__str__)
+
+ def test_reverse_complement2(self):
+ assert_equal(self.rc2_record.header, self.record.header)
+ assert_equal(self.rc2_record.sequence, self.rc_sequence)
+ assert_equal(self.rc2_record.quality, self.rc_quality)
+ assert_equal(self.rc2_record.qualityString, self.rc_qualityString)
+ assert_equal(str(self.rc2_record), self.rc2_expected__str__)
+
+ def test_len(self):
+ assert_equal(self.length, len(self.record))
+ assert_equal(self.length, len(self.rc1_record))
+ assert_equal(self.length, len(self.rc2_record))
+
+ def test_eq(self):
+ header = 'r1'
+ seq = 'ACGT'
+ qvs = list(xrange(10, 10 + len(seq)))
+ r1 = FastqRecord(header, seq, qvs)
+ r2 = FastqRecord(header, seq, qvs)
+ assert_true(r1 == r2)
+ assert_false(r1 != r2)
+
+ def test_not_equal(self):
+ header = 'r1'
+ seq = 'ACGT'
+ qvs = list(xrange(10, 10 + len(seq)))
+ r1 = FastqRecord(header, seq, qvs)
+ r2 = FastqRecord('r2', seq, qvs)
+ assert_true(r1 != r2)
+
+
+class TestFastqReader:
+
+ def setup(self):
+ self.fastq1 = StringIO("@seq1\n" +
+ "GATTACA\n" +
+ "+\n" +
+ "789:;<=\n")
+ self.fastq2 = StringIO(self.fastq1.getvalue() +
+ "@seq2\n" +
+ "CATTAGA\n" +
+ "+\n" +
+ "@@@@@@@\n")
+
+ def test_readFastq1(self):
+ r1 = FastqReader(self.fastq1)
+ l = list(r1)
+ assert_equal([FastqRecord("seq1", "GATTACA", range(22, 29))], l)
+
+ def test_readFastq2(self):
+ r2 = FastqReader(self.fastq2)
+ l = list(r2)
+ assert_equal([FastqRecord("seq1", "GATTACA", range(22, 29)),
+ FastqRecord("seq2", "CATTAGA", [31]*7) ],
+ l)
+
+
+class TestFastqWriter:
+
+ def setup(self):
+ self.fastq1 = StringIO("@seq1\n" +
+ "GATTACA\n" +
+ "+\n" +
+ "789:;<=\n")
+ self.fastq2 = StringIO(self.fastq1.getvalue() +
+ "@seq2\n" +
+ "CATTAGA\n" +
+ "+\n" +
+ "@@@@@@@\n")
+
+ def test_writeFastq1(self):
+ f = StringIO()
+ w = FastqWriter(f)
+ for record in FastqReader(self.fastq1):
+ w.writeRecord(record)
+ assert_equal(self.fastq1.getvalue(), f.getvalue())
+
+ def test_writeFastq2(self):
+ f = StringIO()
+ w = FastqWriter(f)
+ for record in FastqReader(self.fastq2):
+ w.writeRecord(record)
+ assert_equal(self.fastq2.getvalue(), f.getvalue())
+
+ def test_writeFastq3(self):
+ f = StringIO()
+ w = FastqWriter(f)
+ for record in FastqReader(self.fastq2):
+ w.writeRecord(record.header, record.sequence, record.quality)
+ assert_equal(self.fastq2.getvalue(), f.getvalue())
diff --git a/tests/test_pbcore_io_FofnIO.py b/tests/test_pbcore_io_FofnIO.py
new file mode 100644
index 0000000..1df14cd
--- /dev/null
+++ b/tests/test_pbcore_io_FofnIO.py
@@ -0,0 +1,22 @@
+from nose.tools import assert_equal, assert_true, assert_false
+from numpy.testing import assert_array_equal
+from StringIO import StringIO
+from os.path import isabs
+
+from pbcore import data
+from pbcore.io import readFofn
+
+def test_simple():
+ fofn = StringIO("/a/b\n/c/d")
+ lst = list(readFofn(fofn))
+ assert_array_equal(["/a/b", "/c/d"], lst)
+
+def test_empty_lines():
+ fofn = StringIO("/a/b\n \n/c/d\n ")
+ lst = list(readFofn(fofn))
+ assert_array_equal(["/a/b", "/c/d"], lst)
+
+def test_absolutifying():
+ for fofnPath in data.getFofns():
+ for filePath in readFofn(fofnPath):
+ assert_true(isabs(filePath))
diff --git a/tests/test_pbcore_io_GffIO.py b/tests/test_pbcore_io_GffIO.py
new file mode 100644
index 0000000..5f822e7
--- /dev/null
+++ b/tests/test_pbcore_io_GffIO.py
@@ -0,0 +1,100 @@
+from nose.tools import assert_equal, assert_raises
+from StringIO import StringIO
+from pbcore.io import GffWriter, Gff3Record, GffReader
+from pbcore import data
+
+class TestGff3Record:
+
+ def setup(self):
+ self.record = Gff3Record("chr1", 10, 11, "insertion",
+ attributes=[("cat", "1"), ("dog", "2")])
+
+ def test_str(self):
+ assert_equal("chr1\t.\tinsertion\t10\t11\t.\t.\t.\tcat=1;dog=2",
+ str(self.record))
+
+ def test_modification(self):
+ record = self.record.copy()
+ record.dog = 3
+ record.cat = 4
+ record.mouse = 5
+ record.start = 100
+ record.end = 110
+ assert_equal("chr1\t.\tinsertion\t100\t110\t.\t.\t.\tcat=4;dog=3;mouse=5",
+ str(record))
+
+ def test_fromString(self):
+ newRecord = Gff3Record.fromString(str(self.record))
+ assert_equal(str(self.record), str(newRecord))
+
+ def test_get(self):
+ """
+ Verify field access behavior
+ """
+ record = self.record
+ record.dog = 3
+ record.cat = 4
+ record.mouse = 5
+ record.start = 100
+ record.end = 110
+
+ assert_equal(3, record.dog)
+ assert_equal(100, record.start)
+ with assert_raises(AttributeError):
+ record.god
+
+ assert_equal(3, record.get("dog"))
+ assert_equal(None, record.get("god"))
+ assert_equal(100, record.get("start", 100))
+
+
+
+
+class TestGffReader:
+ def setup(self):
+ self.rawFile = open(data.getGff3())
+ self.reader = GffReader(data.getGff3())
+
+ def test_headers(self):
+ assert_equal(["##gff-version 3",
+ "##pacbio-variant-version 2.1",
+ "##date Sat Mar 22 12:16:13 2014",
+ "##feature-ontology http://song.cvs.sourceforge.net/*checkout*/song/ontology/sofa.obo?revision=1.12",
+ "##source GenomicConsensus 0.8.0",
+ "##source-commandline /Users/dalexander/.virtualenvs/VE/bin/variantCaller.py --algorithm=plurality -q20 -x5 pbcore/data/aligned_reads_1.cmp.h5 -r /Users/dalexander/Data/lambdaNEB.fa -o /tmp/v.gff",
+ "##source-alignment-file /Users/dalexander/Dropbox/Sources/git/pbcore/pbcore/data/aligned_reads_1.cmp.h5",
+ "##source-reference-file /Users/dalexander/Data/lambdaNEB.fa",
+ "##sequence-region lambda_NEB3011 1 48502"],
+ self.reader.headers)
+
+ def test__iter__(self):
+ records = list(self.reader)
+ rawLines = self.rawFile.readlines()[9:]
+ for record, rawLine in zip(records, rawLines):
+ # No leading/trailing newlines or whitespace allowed in records
+ assert_equal(str(record).strip(), str(record))
+ # Make sure record matches line
+ assert_equal(rawLine.strip(), str(record))
+
+
+class TestGffWriter:
+ def setup(self):
+ self.outfile = StringIO()
+ self.record1 = Gff3Record("chr1", 10, 11, "insertion",
+ attributes=[("cat", "1"), ("dog", "2")])
+ self.record2 = Gff3Record("chr1", 200, 201, "substitution",
+ attributes=[("mouse", "1"), ("moose", "2")])
+ self.gffWriter = GffWriter(self.outfile)
+
+ def test_writeHeader(self):
+ self.gffWriter.writeHeader("##foo bar")
+ assert_equal("##gff-version 3\n##foo bar\n",
+ self.outfile.getvalue())
+
+ def test_writeRecord(self):
+ self.gffWriter.writeRecord(self.record1)
+ self.gffWriter.writeRecord(self.record2)
+ expected = ("##gff-version 3\n" +
+ "chr1\t.\tinsertion\t10\t11\t.\t.\t.\tcat=1;dog=2\n" +
+ "chr1\t.\tsubstitution\t200\t201\t.\t.\t.\tmouse=1;moose=2\n")
+ assert_equal(expected, self.outfile.getvalue())
diff --git a/tests/test_pbcore_io_rangeQueries.py b/tests/test_pbcore_io_rangeQueries.py
new file mode 100644
index 0000000..6a5e1d8
--- /dev/null
+++ b/tests/test_pbcore_io_rangeQueries.py
@@ -0,0 +1,71 @@
+from nose.tools import assert_equal
+from numpy.testing import assert_array_equal
+
+import pbcore.io.rangeQueries as RQ
+from pbcore import data
+from pbcore.io import CmpH5Reader
+
+import bisect
+from numpy import *
+
+def brute_force_lm_search(vec, val):
+ if (val not in vec):
+ nvec = vec[ vec < val ]
+ if (len(nvec) == 0):
+ return(0)
+ val = max(nvec)
+ for i in range(0, len(vec)):
+ if (vec[i] == val):
+ break
+ return(i)
+
+def brute_force_rm_search(vec, val):
+ if (val not in vec):
+ nvec = vec[ vec > val ]
+ if (len(nvec) == 0):
+ return(len(vec))
+ val = min(nvec)
+ return(bisect.bisect_left(vec, val))
+ else:
+ return(bisect.bisect_right(vec, val) - 1)
+
+class TestProjectIntoRange:
+ def test_project_into_range(self):
+ tStart = array([1,1,1,1,1,2,2,2,2,10,20])
+ tEnd = array([2,3,4,5,6,3,4,5,6,15,25])
+ assert_equal(True, all(RQ.projectIntoRange(tStart, tEnd, 1, 6) == array([5, 8, 6, 4, 2])))
+ assert_equal(True, all(RQ.projectIntoRange(tStart, tEnd, 20, 26) == array([1, 1, 1, 1, 1, 0])))
+
+def brute_force_reads_in_range(rangeStart, rangeEnd, tStart, tEnd):
+ mask = ((tEnd > rangeStart) &
+ (tStart < rangeEnd))
+ return flatnonzero(mask)
+
+class TestGetReadsInRange:
+ def __init__(self):
+ self.h5FileName = data.getCmpH5()
+ self.cmpH5 = CmpH5Reader(self.h5FileName)
+
+ def test_get_reads_in_range(self):
+ assert(len(RQ.getReadsInRange(self.cmpH5, (1, 0, 100000), justIndices = True)) == 84)
+
+ def test_get_coverage_in_range(self):
+ assert(all(RQ.getCoverageInRange(self.cmpH5, (1, 0, 100)) == 2))
+
+ def test_reads_in_range2(self):
+ # This is a brute-force check that readsInRange returns the
+ # right answer for 50- and 77-base windows of lambda
+ for BLOCKSIZE in [50, 77]:
+ for winStart in xrange(0, 45000, BLOCKSIZE):
+ winEnd = winStart + BLOCKSIZE
+ assert_array_equal(brute_force_reads_in_range(winStart, winEnd, self.cmpH5.tStart, self.cmpH5.tEnd),
+ self.cmpH5.readsInRange(1, winStart, winEnd, justIndices=True))
+
+
+
+ def test_coverage_in_range2(self):
+ # Brute-force check of single-base coverage at every 50th position of lambda
+ for winStart in xrange(0, 45000, 50):
+ winEnd = winStart + 1
+ assert_array_equal([len(brute_force_reads_in_range(winStart, winEnd, self.cmpH5.tStart, self.cmpH5.tEnd))],
+ RQ.getCoverageInRange(self.cmpH5, (1, winStart, winEnd)))
diff --git a/tests/test_pbcore_io_unaligned_bam.py b/tests/test_pbcore_io_unaligned_bam.py
new file mode 100644
index 0000000..d9857a3
--- /dev/null
+++ b/tests/test_pbcore_io_unaligned_bam.py
@@ -0,0 +1,68 @@
+from numpy.testing import (assert_array_almost_equal as ASIM,
+ assert_array_equal as AEQ)
+from nose.tools import (nottest,
+ assert_raises,
+ assert_equal as EQ)
+from nose import SkipTest
+
+import numpy as np
+import bisect
+import h5py
+from collections import Counter
+
+from pbcore import data
+from pbcore.io import BamReader, BaxH5Reader
+from pbcore.io.align._BamSupport import UnavailableFeature
+
+from pbcore.sequence import reverseComplement as RC
+
+class TestUnalignedBam(object):
+
+ def __init__(self):
+ self.bam = BamReader (data.getUnalignedBam())
+ self.bax = BaxH5Reader(data.getBaxForBam())
+
+ self.baxRead0 = next(self.bax.subreads())
+ self.bamRead0 = next(iter(self.bam))
+
+ def testInvalidOperations(self):
+
+ # These kinds of things presently work. Do we want them to
+ # fail?
+
+ # with assert_raises(UnavailableFeature):
+ # self.bamRead0.isForwardStrand
+ # with assert_raises(UnavailableFeature):
+ # self.bamRead0.tStart
+
+ # attempts to get read aligned or oriented
+ with assert_raises(UnavailableFeature):
+ self.bamRead0.read(aligned=True, orientation="native")
+ with assert_raises(UnavailableFeature):
+ self.bamRead0.read(aligned=False, orientation="genomic")
+ with assert_raises(UnavailableFeature):
+ self.bamRead0.read()
+ with assert_raises(UnavailableFeature):
+ self.bamRead0.InsertionQV(aligned=True, orientation="native")
+ with assert_raises(UnavailableFeature):
+ self.bamRead0.InsertionQV(aligned=False, orientation="genomic")
+ with assert_raises(UnavailableFeature):
+ self.bamRead0.InsertionQV()
+
+ def testReadAccess(self):
+ EQ(self.bamRead0.read(aligned=False, orientation="native"), self.baxRead0.basecalls())
+
+ def testQvAccess(self):
+ AEQ(self.bamRead0.SubstitutionQV(aligned=False, orientation="native"), self.baxRead0.SubstitutionQV())
+ AEQ(self.bamRead0.InsertionQV(aligned=False, orientation="native"), self.baxRead0.InsertionQV())
+ AEQ(self.bamRead0.DeletionTag(aligned=False, orientation="native"), self.baxRead0.DeletionTag())
+
+ def testZmwInfo(self):
+ # NOTE: accessor names differ between BAM and bax reads (HoleNumber vs.
+ # holeNumber, qStart vs. readStart, qEnd vs. readEnd); make them uniform.
+ EQ(self.bamRead0.HoleNumber, self.baxRead0.holeNumber)
+ EQ(self.bamRead0.qStart, self.baxRead0.readStart)
+ EQ(self.bamRead0.qEnd, self.baxRead0.readEnd)
+
+ def testNames(self):
+ EQ(self.bamRead0.queryName, self.baxRead0.readName)
diff --git a/tests/test_pbcore_util_sequences.py b/tests/test_pbcore_util_sequences.py
new file mode 100644
index 0000000..7c475f2
--- /dev/null
+++ b/tests/test_pbcore_util_sequences.py
@@ -0,0 +1,48 @@
+import nose
+from nose.tools import assert_equal, assert_true, assert_false
+from pbcore import sequence
+
+class TestReverseComplement:
+
+ def setup(self):
+ self.sequence = "GATTACA" * 20
+ self.reverse = "ACATTAG" * 20
+ self.complement = "CTAATGT" * 20
+ self.reverse_complement = "TGTAATC" * 20
+ self.bad_sequence = "AGCTR" * 20
+
+ def test_reverse(self):
+ assert_equal(self.sequence,
+ sequence.reverse(sequence.reverse(self.sequence)))
+ assert_equal(self.reverse,
+ sequence.reverse(self.sequence))
+ assert_equal(self.complement,
+ sequence.reverse(self.reverse_complement))
+
+ def test_complement(self):
+ assert_equal(self.sequence,
+ sequence.complement(self.complement))
+ assert_equal(self.complement,
+ sequence.complement(self.sequence))
+ assert_equal(self.reverse,
+ sequence.complement(self.reverse_complement))
+
+ def test_reverseComplement(self):
+ assert_equal(self.reverse_complement,
+ sequence.reverseComplement(self.sequence))
+ assert_equal(self.sequence,
+ sequence.reverseComplement(self.reverse_complement))
+
+ @nose.tools.raises(ValueError)
+ def test_complement_error(self):
+ sequence.complement(self.bad_sequence)
+
+ @nose.tools.raises(ValueError)
+ def test_reverse_complement_error(self):
+ sequence.reverseComplement(self.bad_sequence)
+
+
+class TestSplitRecordName:
+
+ def setup(self):
+ pass
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/python-pbcore.git
More information about the debian-med-commit
mailing list