[med-svn] [pbbarcode] 01/02: Imported Upstream version 0.8.0

Afif Elghraoui afif-guest at moszumanska.debian.org
Tue Dec 15 08:13:47 UTC 2015


This is an automated email from the git hooks/post-receive script.

afif-guest pushed a commit to branch master
in repository pbbarcode.

commit 266491af26faabf51816b3a5314c77c8f525b06d
Author: Afif Elghraoui <afif at ghraoui.name>
Date:   Sun Nov 29 01:36:48 2015 -0800

    Imported Upstream version 0.8.0
---
 Makefile                                 |  47 ++
 README.rst                               |  36 ++
 doc/Makefile                             | 153 +++++++
 doc/PbbarcodeFunctionalSpecification.rst | 405 +++++++++++++++++
 doc/conf.py                              | 242 ++++++++++
 doc/index.rst                            |  16 +
 etc/barcode.fasta                        |   8 +
 etc/barcode_complete.fasta               | 192 ++++++++
 etc/pacbio_barcodes_paired.fasta         | 192 ++++++++
 setup.py                                 |  37 ++
 src/C/Makefile                           |  11 +
 src/C/sw.c                               |  56 +++
 src/python/pbbarcode/BarcodeLabeler.py   | 225 +++++++++
 src/python/pbbarcode/SWaligner.py        |  69 +++
 src/python/pbbarcode/__init__.py         |   0
 src/python/pbbarcode/_version.py         |   1 +
 src/python/pbbarcode/main.py             | 751 +++++++++++++++++++++++++++++++
 tests/cram/consensus.t.disabled          |  88 ++++
 tests/cram/sanity.t                      |  55 +++
 tests/test_basic.py                      |  32 ++
 20 files changed, 2616 insertions(+)

diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..982f5ce
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,47 @@
+.PHONY: all build bdist install develop test clean doc doc-clean pip-install
+# Every target is a command, not a file. setup.py leaves a build/ directory behind, so an undeclared 'build' would be reported as up to date.
+SHELL = /bin/bash -e
+# Default goal: build the package, then install it.
+all: build install
+
+build:  # Run the setup.py build step.
+	python setup.py build --executable="/usr/bin/env python"
+
+bdist:  # Run the setup.py build step, then produce a binary distribution in egg format.
+	python setup.py build --executable="/usr/bin/env python"
+	python setup.py bdist --formats=egg
+
+install:  # Install the package by running 'setup.py install'.
+	python setup.py install
+
+develop:  # Run 'setup.py develop'.
+	python setup.py develop
+
+test:  # Run nosetests on every *.py file under tests/, then cram on every *.t file under tests/cram whose path does not match consensus.t.
+	find tests -name "*.py" | xargs nosetests
+	find tests/cram -name "*.t" | grep -v consensus.t | xargs cram --verbose 
+
+clean: doc-clean  # Clean the docs first, then remove build/, dist/, *.egg-info and *.pyc, and finally clean src/C through a sub-make.
+	rm -rf build/;\
+	find . -name "*.egg-info" | xargs rm -rf;\
+	find . -name "*.pyc" | xargs rm -rf;\
+	rm -rf dist/
+	$(MAKE) -C src/C clean
+
+doc-clean:  # Delegate to the 'clean' target of doc/Makefile.
+	$(MAKE) -C doc clean
+
+doc:  # Delegate to the 'html' target of doc/Makefile.
+	$(MAKE) -C doc html
+
+pip-install:  # Require pip, uninstall pbtools.barcode and pbbarcode if 'pip freeze' lists them, then pip-install this directory with scripts under PREFIX/bin. PREFIX is not assigned in this Makefile, so it must come from the command line or environment.
+	@which pip > /dev/null
+	@pip freeze|grep 'pbtools.barcode=='>/dev/null \
+      && pip uninstall -y pbtools.barcode \
+      || true
+	@pip freeze|grep 'pbbarcode=='>/dev/null \
+      && pip uninstall -y pbbarcode \
+      || true
+	@pip install --no-index \
+          --install-option="--install-scripts=$(PREFIX)/bin" \
+          ./
diff --git a/README.rst b/README.rst
new file mode 100644
index 0000000..42610c7
--- /dev/null
+++ b/README.rst
@@ -0,0 +1,36 @@
+Overview of the pbbarcode package
+=================================
+
+The *pbbarcode* package provides tools for annotating PacBio
+sequencing reads with barcode information. Typically, *pbbarcode*
+is called in context of a SMRTPipe workflow as opposed to directly on
+the command line, however, users are encouraged to utilize the
+command-line utility directly, as more options are available.  
+
+The *pbbarcode* package provides a multi-command line tool
+*pbbarcode* which currently has the following sub-commands:  
+
+* labelZmws
+* labelAlignments
+* emitFastqs
+* consensus
+
+The first three sub-commands depend on only *pbcore* and its
+dependencies, the fourth, *consensus*, depends on the *pbdagcon*
+package and is considered experimental.  
+
+For more details on the package, please see the documentation in
+doc/index.rst.
+
+Installation
+============
+
+Typically, the *pbbarcode* package is installed within an installation
+of SMRTPipe, however, it can be installed by itself using::
+
+   make install
+
+To test that everything is installed correctly, one should
+additionally issue a::
+
+   make test
diff --git a/doc/Makefile b/doc/Makefile
new file mode 100644
index 0000000..a37efe5
--- /dev/null
+++ b/doc/Makefile
@@ -0,0 +1,153 @@
+# Makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line.
+SPHINXOPTS    =
+SPHINXBUILD   = sphinx-build
+PAPER         =
+BUILDDIR      = _build
+
+# Internal variables.
+PAPEROPT_a4     = -D latex_paper_size=a4
+PAPEROPT_letter = -D latex_paper_size=letter
+ALLSPHINXOPTS   = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
+# the i18n builder cannot share the environment and doctrees with the others
+I18NSPHINXOPTS  = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
+# Every target in this file is a command, not an output file, so all of them are declared phony.
+.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man texinfo info changes linkcheck doctest gettext
+
+help:  # Print a one-line description of each documentation target.
+	@echo "Please use \`make <target>' where <target> is one of"
+	@echo "  html       to make standalone HTML files"
+	@echo "  dirhtml    to make HTML files named index.html in directories"
+	@echo "  singlehtml to make a single large HTML file"
+	@echo "  pickle     to make pickle files"
+	@echo "  json       to make JSON files"
+	@echo "  htmlhelp   to make HTML files and a HTML help project"
+	@echo "  qthelp     to make HTML files and a qthelp project"
+	@echo "  devhelp    to make HTML files and a Devhelp project"
+	@echo "  epub       to make an epub"
+	@echo "  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
+	@echo "  latexpdf   to make LaTeX files and run them through pdflatex"
+	@echo "  text       to make text files"
+	@echo "  man        to make manual pages"
+	@echo "  texinfo    to make Texinfo files"
+	@echo "  info       to make Texinfo files and run them through makeinfo"
+	@echo "  gettext    to make PO message catalogs"
+	@echo "  changes    to make an overview of all changed/added/deprecated items"
+	@echo "  linkcheck  to check all external links for integrity"
+	@echo "  doctest    to run all doctests embedded in the documentation (if enabled)"
+
+clean:  # Remove everything under the build directory; abort if BUILDDIR was overridden to an empty value, which would otherwise expand to a delete of the filesystem root.
+	-rm -rf $(if $(strip $(BUILDDIR)),$(BUILDDIR),$(error BUILDDIR is empty; refusing to run rm -rf on /))/*
+
+html:  # In this group the Sphinx builder name and the output subdirectory both equal the target name.
+	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
+	@echo
+	@echo "Build finished. The HTML pages are in $(BUILDDIR)/$@."
+
+dirhtml:  # HTML pages named index.html inside per-page directories.
+	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
+	@echo
+	@echo "Build finished. The HTML pages are in $(BUILDDIR)/$@."
+
+singlehtml:  # One large HTML page.
+	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
+	@echo
+	@echo "Build finished. The HTML page is in $(BUILDDIR)/$@."
+
+pickle:  # Pickle files for further processing.
+	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
+	@echo
+	@echo "Build finished; now you can process the pickle files."
+
+json:  # JSON files for further processing.
+	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
+	@echo
+	@echo "Build finished; now you can process the JSON files."
+
+htmlhelp:  # HTML files plus an HTML Help project.
+	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
+	@echo
+	@echo "Build finished; now you can run HTML Help Workshop with the" \
+	      ".hhp project file in $(BUILDDIR)/$@."
+
+qthelp:  # HTML files plus a qthelp project; prints the follow-up commands.
+	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
+	@echo
+	@echo "Build finished; now you can run "qcollectiongenerator" with the" \
+	      ".qhcp project file in $(BUILDDIR)/$@, like this:"
+	@echo "# qcollectiongenerator $(BUILDDIR)/$@/pbbarcode.qhcp"
+	@echo "To view the help file:"
+	@echo "# assistant -collectionFile $(BUILDDIR)/$@/pbbarcode.qhc"
+
+devhelp:  # HTML files plus a Devhelp project; prints the follow-up commands.
+	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
+	@echo
+	@echo "Build finished."
+	@echo "To view the help file:"
+	@echo "# mkdir -p $$HOME/.local/share/devhelp/pbbarcode"
+	@echo "# ln -s $(BUILDDIR)/$@ $$HOME/.local/share/devhelp/pbbarcode"
+	@echo "# devhelp"
+
+epub:  # Builder name and output subdirectory equal the target name here and in latex, text, man and texinfo below.
+	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
+	@echo
+	@echo "Build finished. The epub file is in $(BUILDDIR)/$@."
+
+latex:  # LaTeX sources only; see latexpdf for the PDF step.
+	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
+	@echo
+	@echo "Build finished; the LaTeX files are in $(BUILDDIR)/$@."
+	@echo "Run \`make' in that directory to run these through (pdf)latex" \
+	      "(use \`make latexpdf' here to do that automatically)."
+
+latexpdf:  # Uses the latex builder (not the target name), then runs the all-pdf target of the generated Makefile.
+	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+	@echo "Running LaTeX files through pdflatex..."
+	$(MAKE) -C $(BUILDDIR)/latex all-pdf
+	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
+
+text:  # Plain-text output.
+	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
+	@echo
+	@echo "Build finished. The text files are in $(BUILDDIR)/$@."
+
+man:  # Manual pages.
+	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
+	@echo
+	@echo "Build finished. The manual pages are in $(BUILDDIR)/$@."
+
+texinfo:  # Texinfo sources only; see info for the makeinfo step.
+	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
+	@echo
+	@echo "Build finished. The Texinfo files are in $(BUILDDIR)/$@."
+	@echo "Run \`make' in that directory to run these through makeinfo" \
+	      "(use \`make info' here to do that automatically)."
+
+info:  # Build Texinfo sources with the texinfo builder, then run the 'info' target of the generated Makefile through a sub-make.
+	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
+	@echo "Running Texinfo files through makeinfo..."
+	$(MAKE) -C $(BUILDDIR)/texinfo info
+	@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
+
+gettext:  # Uses the i18n option set and writes to the locale subdirectory rather than one named after the target.
+	$(SPHINXBUILD) -b $@ $(I18NSPHINXOPTS) $(BUILDDIR)/locale
+	@echo
+	@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
+
+changes:  # Builder name and output subdirectory equal the target name here and in the two targets below.
+	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
+	@echo
+	@echo "The overview file is in $(BUILDDIR)/$@."
+
+linkcheck:  # Check external links; results are written to output.txt.
+	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
+	@echo
+	@echo "Link check complete; look for any errors in the above output " \
+	      "or in $(BUILDDIR)/$@/output.txt."
+
+doctest:  # Run the doctests embedded in the sources; results are written to output.txt.
+	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
+	@echo "Testing of doctests in the sources finished, look at the " \
+	      "results in $(BUILDDIR)/$@/output.txt."
diff --git a/doc/PbbarcodeFunctionalSpecification.rst b/doc/PbbarcodeFunctionalSpecification.rst
new file mode 100644
index 0000000..b00fe1f
--- /dev/null
+++ b/doc/PbbarcodeFunctionalSpecification.rst
@@ -0,0 +1,405 @@
+.. pbbarcode Functional Specification
+.. =======================================
+
+.. Version
+
+
+Introduction
+````````````
+This document describes the interface and input/output formats of the
+``pbbarcode`` package command line tools. The package provides
+utilities for annotating individual ZMWs directly from a bas.h5 file,
+emitting fast[a|q] files for each barcode, labeling alignments stored
+in a cmp.h5 file, and calling consensus on small amplicons (requires
+``pbdagcon``)
+
+At the moment, Barcodes can be scored in two different ways:
+``symmetric`` and ``paired``. Symmetric mode supports barcode designs
+with two identical barcodes on both sides of a SMRTbell, e.g., for
+barcodes (A, B), molecules are labeled as A--A or B--B. The ``paired``
+mode supports designs with two distinct barcodes on each side of the
+molecule, but neither barcode appears without its mate. The minimum
+example is given with the following barcodes: (ALeft, ARight, BLeft,
+BRight), where the following barcode sets are checked: ALeft--ARight,
+BLeft--BRight.
+
+It is important to highlight that a barcode FASTA file specifies a
+list of available barcodes to evaluate. Depending on the scoring mode,
+the barcodes are grouped together in different ways. For instance, in
+the ``symmetric`` case, the number of possible barcode outcomes are
+simply the number of barcodes that are supplied to the routine in the
+FASTA file (see below for usage) plus an additional ``NULL`` barcode
+indicating that no barcode could be evaluated (denoted by:
+'--'). Labels like this (A--A) are used in the final outputs. In the
+``paired`` mode, the number of possible barcode outcomes are half the
+number of the sequences in the FASTA file plus the ``NULL``
+barcode. The ``NULL`` barcode indicates that no attempt was made to
+score the molecule or it was filtered out by the user's criteria. The
+majority of cases when a molecule is not scored are related to not
+observing any adapters. If a user has executed a "hot-start" run, the
+user can try the '--scoreFirst' parameter to attempt to label the
+first adapter's barcode. This increases the yield of the labeling
+procedure at the expense of some probable false positives.
+
+The software is implemented as a standard python package. Barcodes are
+labeled according to the following high-level logic. For each
+molecule, all adapters are found. For each adapter, we align (using
+standard Smith-Waterman alignment) each barcode and its reverse
+complement to flanking sequence of the adapter. If two complete
+flanking sequences are available, we divide by 2, else 1 if only one
+flanking sequence was available (average score at adapter). This
+allows the scores across adapters to be on the same scale (chimera
+detection). Depending on the ``mode``, we then determine which
+barcode(s) are maximally scoring. We store the two maximally scoring
+barcodes, the sum of their alignment scores across the adapters. The
+average barcode score then can be given approximately by:
+total-score/number-of-adapters. At the moment, the alignment
+parameters are fixed at:
+
+.. table:: SW Match Parameters
+
+   +----------+----------+
+   |type      |score     |
+   |          |          |
+   +----------+----------+
+   |insertion |-1        |
+   |          |          |
+   +----------+----------+
+   |deletion  |-1        |
+   |          |          |
+   +----------+----------+
+   |mismatch  |-2        |
+   |          |          |
+   +----------+----------+
+   |match     |2         |
+   |          |          |
+   +----------+----------+
+
+Input and output
+````````````````
+
+labelZmws
+---------
+  usage: pbbarcode labelZmws [-h] [--outDir OUTDIR] [--outFofn OUTFOFN]
+                                [--adapterSidePad ADAPTERSIDEPAD]
+                                [--insertSidePad INSERTSIDEPAD]
+                                [--scoreMode {symmetric,paired}]
+                                [--maxAdapters MAXADAPTERS] [--scoreFirst]
+                                [--startTimeCutoff STARTTIMECUTOFF]
+                                [--nZmws NZMWS] [--nProcs NPROCS]
+                                [--saveExtendedInfo]
+                                barcode.fasta input.fofn
+
+  Creates a barcode.h5 file from base h5 files.
+
+  positional arguments:
+    barcode.fasta         Input barcode fasta file
+    input.fofn            Input base fofn
+
+  optional arguments:
+    -h, --help            show this help message and exit
+    --outDir OUTDIR       Where to write the newly created barcode.h5 files.
+                          (default: /home/UNIXHOME/jbullard/projects/software/bi
+                         oinformatics/tools/pbbarcode/doc)
+    --outFofn OUTFOFN     Write to outFofn (default: barcode.fofn)
+    --adapterSidePad ADAPTERSIDEPAD
+                          Pad with adapterSidePad bases (default: 4)
+    --insertSidePad INSERTSIDEPAD
+                          Pad with insertSidePad bases (default: 4)
+    --scoreMode {symmetric,paired}
+                          The mode in which the barcodes should be scored.
+                          (default: symmetric)
+    --maxAdapters MAXADAPTERS
+                          Only score the first maxAdapters (default: 20)
+    --scoreFirst          Whether to try to score the leftmost barcode in a
+                          trace. (default: False)
+    --startTimeCutoff STARTTIMECUTOFF
+                          Reads must start before this value in order to be
+                          included when scoreFirst is set. (default: 10.0)
+    --nZmws NZMWS         Use the first n ZMWs for testing (default: -1)
+    --nProcs NPROCS       How many processes to use (default: 8)
+    --saveExtendedInfo    Whether to save extended information tothe barcode.h5
+                          files; this information is useful for debugging and
+                                                  chimera detection (default: False)
+
+The ``labelZmws`` command takes an input.fofn representing a set of
+bas.h5 files to operate on. Additionally, it takes a barcode.fasta
+file. Depending on ``scoreMode``, the FASTA file will be processed in
+different ways. Specifically, in ``paired`` mode, each two consecutive
+barcodes in the file are considered a set.
+
+The parameters ``adapterSidePad`` and ``insertSidePad`` represent
+how many bases should be considered on each side of the putative
+barcode. These parameters are constrained such that:
+``|adapterSidePad| + |insertSidePad| + |barcode| < 65``.
+
+Users have the option to specify a different output location
+for the various outputs. Specifically, for each bas.h5 file in
+input.fofn, a bc.h5 (barcode hdf5) file is generated. These files are
+listed in the file ``outFofn`` which is typically just called
+``barcode.fofn``. See below for a description of the barcode hdf5
+file.
+
+
+labelAlignments
+---------------
+  usage: pbbarcode labelAlignments [-h]
+                                      [--minAvgBarcodeScore MINAVGBARCODESCORE]
+                                      [--minNumBarcodes MINNUMBARCODES]
+                                      [--minScoreRatio MINSCORERATIO]
+                                      barcode.fofn aligned_reads.cmp.h5
+
+  Adds information about barcode alignments to a cmp.h5 file from a previous
+  call to "labelZmws".
+
+  positional arguments:
+    barcode.fofn          input barcode fofn file
+    aligned_reads.cmp.h5  cmp.h5 file to add barcode labels
+
+  optional arguments:
+    -h, --help            show this help message and exit
+    --minAvgBarcodeScore MINAVGBARCODESCORE
+                          ZMW Filter: exclude ZMW if average barcode score is
+                          less than this value (default: 0.0)
+    --minNumBarcodes MINNUMBARCODES
+                          ZMW Filter: exclude ZMW if number of barcodes observed
+                          is less than this value (default: 1)
+    --minScoreRatio MINSCORERATIO
+                          ZMW Filter: exclude ZMWs whose best score divided by
+                          the 2nd best score is less than this ratio (default:
+                          1.0)
+                          
+
+The ``labelAlignments`` command takes as input a barcode.fofn computed
+from a call to ``labelZmws`` and a cmp.h5 file where the barcode
+information is written to. See below for a description of the cmp.h5
+file additions.  
+
+
+
+emitFastqs
+----------
+  usage: pbbarcode emitFastqs [-h] [--outDir output.dir] [--subreads]
+                                 [--unlabeledZmws] [--trim TRIM] [--fasta]
+                                 [--minMaxInsertLength MINMAXINSERTLENGTH]
+                                 [--hqStartTime HQSTARTTIME]
+                                 [--minReadScore MINREADSCORE]
+                                 [--minAvgBarcodeScore MINAVGBARCODESCORE]
+                                 [--minNumBarcodes MINNUMBARCODES]
+                                 [--minScoreRatio MINSCORERATIO]
+                                 input.fofn barcode.fofn
+
+  Takes a bas.h5 fofn and a barcode.h5 fofn and produces a fast[a|q] file for
+  each barcode.
+
+  positional arguments:
+    input.fofn            input base or CCS fofn file
+    barcode.fofn          input barcode.h5 fofn file
+
+  optional arguments:
+    -h, --help            show this help message and exit
+    --outDir output.dir   output directory to write fastq files (default: /home/
+                          UNIXHOME/jbullard/projects/software/bioinformatics/too
+                          ls/pbbarcode/doc)
+    --subreads            whether to produce fastq files for the subreads;the
+                          default is to use the CCS reads. This option
+                          onlyapplies when input.fofn has both consensus and raw
+                          reads,otherwise the read type from input.fofn will be
+                          returned. (default: False)
+    --unlabeledZmws       whether to emit a fastq file for the unlabeled ZMWs.
+                          These are the ZMWs where no adapters are found
+                          typically (default: False)
+    --trim TRIM           trim off barcodes and any excess constant sequence
+                          (default: 20)
+    --fasta               whether the files produced should be FASTA files
+                          asopposed to FASTQ (default: False)
+    --minMaxInsertLength MINMAXINSERTLENGTH
+                          ZMW Filter: exclude ZMW if the longest subreadis less
+                          than this amount (default: 0)
+    --hqStartTime HQSTARTTIME
+                          ZMW Filter: exclude ZMW if start time of HQ
+                          regiongreater than this value (seconds) (default: inf)
+    --minReadScore MINREADSCORE
+                          ZMW Filter: exclude ZMW if readScore is less thanthis
+                          value (default: 0)
+    --minAvgBarcodeScore MINAVGBARCODESCORE
+                          ZMW Filter: exclude ZMW if average barcode score is
+                          less than this value (default: 0.0)
+    --minNumBarcodes MINNUMBARCODES
+                          ZMW Filter: exclude ZMW if number of barcodes observed
+                          is less than this value (default: 1)
+    --minScoreRatio MINSCORERATIO
+                          ZMW Filter: exclude ZMWs whose best score divided by
+                          the 2nd best score is less than this ratio (default:
+                          1.0)
+                          
+
+The ``emitFastqs`` command takes as input both an input.fofn for the
+bas.h5 files as well as a barcode.fofn from a call to labelZmws. The
+optional parameter ``outDir`` dictates where the files will be
+written. For each detected barcode, a fast[a|q] file will be emitted
+with all of the reads for that barcode. The ``trim`` parameter
+dictates how much of the read should be trimmed off. The default
+parameter for ``trim`` is the length of the barcode (which is stored
+in the barcode hdf5 files). At the moment, all barcodes in the barcode
+FASTA file must be the same length, therefore only a constant trim
+value is supported. In practice, one can aggressively trim in order to
+ensure that extra bases aren't left on the ends of reads. Finally, the
+``subreads`` parameter dictates whether subreads or CCS reads should
+be returned with the default being the appropriate reads according to
+the input file type, either CCS or subreads. This parameter is only
+inspected if the input.fofn contains both CCS and subread data, if the
+input.fofn contains only subread or CCS data then that is returned
+irrespective of the state of the ``subreads`` parameter and a
+warning is issued.
+
+consensus
+---------
+  usage: pbbarcode consensus [-h] [--subsample SUBSAMPLE] [--nZmws NZMWS]
+                                [--outDir OUTDIR] [--keepTmpDir]
+                                [--ccsFofn CCSFOFN] [--nProcs NPROCS]
+                                [--noQuiver]
+                                [--minMaxInsertLength MINMAXINSERTLENGTH]
+                                [--hqStartTime HQSTARTTIME]
+                                [--minReadScore MINREADSCORE]
+                                [--minAvgBarcodeScore MINAVGBARCODESCORE]
+                                [--minNumBarcodes MINNUMBARCODES]
+                                [--minScoreRatio MINSCORERATIO]
+                                [--barcode BARCODE [BARCODE ...]]
+                                input.fofn barcode.fofn
+
+  Compute consensus sequences for each barcode.
+
+  positional arguments:
+    input.fofn            input bas.h5 fofn file
+    barcode.fofn          input bc.h5 fofn file
+
+  optional arguments:
+    -h, --help            show this help message and exit
+    --subsample SUBSAMPLE
+                          Subsample ZMWs (default: 1)
+    --nZmws NZMWS         Take n ZMWs (default: -1)
+    --outDir OUTDIR       Use this directory to output results (default: .)
+    --keepTmpDir
+    --ccsFofn CCSFOFN     Obtain CCS data from ccsFofn instead of input.fofn
+                          (default: )
+    --nProcs NPROCS       Use nProcs to execute. (default: 16)
+    --noQuiver
+    --minMaxInsertLength MINMAXINSERTLENGTH
+                          ZMW Filter: exclude ZMW if the longest subreadis less
+                          than this amount (default: 0)
+    --hqStartTime HQSTARTTIME
+                          ZMW Filter: exclude ZMW if start time of HQ
+                          regiongreater than this value (seconds) (default: inf)
+    --minReadScore MINREADSCORE
+                          ZMW Filter: exclude ZMW if readScore is less thanthis
+                          value (default: 0)
+    --minAvgBarcodeScore MINAVGBARCODESCORE
+                          ZMW Filter: exclude ZMW if average barcode score is
+                          less than this value (default: 0.0)
+    --minNumBarcodes MINNUMBARCODES
+                          ZMW Filter: exclude ZMW if number of barcodes observed
+                          is less than this value (default: 1)
+    --minScoreRatio MINSCORERATIO
+                          ZMW Filter: exclude ZMWs whose best score divided by
+                          the 2nd best score is less than this ratio (default:
+                          1.0)
+    --barcode BARCODE [BARCODE ...]
+                          Use this to extract consensus for just one barcode.
+                          (default: None)
+
+The ``consensus`` command takes as input both an input.fofn for the
+bas.h5 files as well as a barcode.fofn from a call to labelZmws. The
+results are a FASTA file with an entry for each barcode containing the
+consensus amplicon sequence. This mode utilizes ``Quiver`` and
+``pbdagcon`` to compute consensus.   
+
+In cases where the amplicon is fewer than 2.5k bases, using CCS data
+is quite helpful. The ``--ccsFofn`` allows one to pass directly the
+ccs files. In many cases, both the CCS and raw basecalls are in the
+same file so you can check by passing the same parameter to input.fofn
+as to ccsFofn. 
+
+Dependencies
+````````````
+
+The pbbarcode package depends on a standard pbcore installation
+(https://github.com/PacificBiosciences/pbcore). If one wishes to use
+the ``consensus`` tool, ``pbdagcon`` needs to be installed
+(https://github.com/PacificBiosciences/pbdagcon).
+
+
+Barcode HDF5 File
+`````````````````
+
+The barcode hdf5 file, ``bc.h5``, represents a simple data store for
+barcode calls and their scores for each ZMW. Generally, a user need
+not interact with barcode hdf5 files, but can use the results stored in
+either the resulting cmp.h5 file or fast[a|q] files. The barcode hdf5
+file contains the following structure:
+
+/BarcodeCalls/best - (nZMWs, 6)[32-bit integer] dataset with the
+following columns: 
+
+    ``holeNumber,nAdapters,barcodeIdx1,barcodeScore1,barcodeIdx2,barcodeScore2``
+
+Additionally, the ``best`` dataset has the following attributes:
+
++-----------+------------------------------------------------------------------------+
+|movieName  |m120408_042614_richard_c100309392550000001523011508061222_s1_p0         |
+|           |                                                                        |
++-----------+------------------------------------------------------------------------+
+|columnNames|holeNumber,nAdapters,barcodeIdx1,barcodeScore1,barcodeIdx2,             |
+|           |barcodeScore2                                                           |
++-----------+------------------------------------------------------------------------+
+|scoreMode  |[symmetric|paired]                                                      |
+|           |                                                                        |
++-----------+------------------------------------------------------------------------+
+|barcodes   |'bc_1', 'bc_2', ...., 'bc_N'                                            |
+|           |                                                                        |
++-----------+------------------------------------------------------------------------+
+
+The barcodeIdx1 and barcodeIdx2 columns are indices into the
+``barcodes`` attribute. The ``scoreMode`` is the scoring mode used to
+align the barcodes. The ``barcodes`` attribute corresponds to the
+barcode.fasta sequence names.
+
+Additionally, in some circumstances, it is useful to retain the entire
+history of the scoring, i.e., each barcode scored to each adapter
+across all ZMWs. In order to retain this information, one must call:
+
+    ``pbbarcode labelZmws --saveExtendedInfo ...``
+
+In this mode, the resultant HDF5 file will have an additional dataset
+under the BarcodeCalls group, named: ``all``. This dataset has the
+following format:
+
+/BarcodeCalls/all - (nbarcodes * nadapters[zmw_i], 4) \forall i in 1 ... nZMWs 
+
+    ``holeNumber, adapterIdx, barcodeIdx, score``
+
+The ``adapterIdx`` is the index of the adapter along the molecule,
+i.e., adapterIdx 1 is the first adapter scored.
+
+Additions to the compare HDF5 (cmp.h5) File
+```````````````````````````````````````````
+
+In addition to the barcode hdf5 file, a call to ``labelAlignments``
+will annotate a cmp.h5 file. This annotation is stored in ways
+consistent with the cmp.h5 file format. Specifically, a new group: 
+
+| /BarcodeInfo/
+|   ID   (nBarcodeLabels + 1, 1)[32-bit integer] 
+|   Name (nBarcodeLabels + 1, 1)[variable length string]
+
+In addition to the /BarcodeInfo/ group, the key dataset which assigns
+alignments to barcodes is located at:
+
+/AlnInfo/Barcode (nAlignments, 3)[32-bit integer] with the following
+columns:
+
+     ``index,count,bestIndex,bestScore,secondBestIndex,secondBestScore``
+
+Here index refers to the index into the ``Name`` vector, score
+corresponds to the sum of the scores for the barcodes, and finally,
+count refers to the number of adapters found in the molecule.
diff --git a/doc/conf.py b/doc/conf.py
new file mode 100755
index 0000000..7a1cd30
--- /dev/null
+++ b/doc/conf.py
@@ -0,0 +1,242 @@
+# -*- coding: utf-8 -*-
+#
+# pbbarcode documentation build configuration file, created by
+# sphinx-quickstart on Mon Apr 30 18:28:57 2012.
+#
+# This file is execfile()d with the current directory set to its containing dir.
+#
+# Note that not all possible configuration values are present in this
+# autogenerated file.
+#
+# All configuration values have a default; values that are commented out
+# serve to show the default.
+
+import sys, os
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#sys.path.insert(0, os.path.abspath('.'))
+
+# -- General configuration -----------------------------------------------------
+
+# If your documentation needs a minimal Sphinx version, state it here.
+#needs_sphinx = '1.0'
+
+# Add any Sphinx extension module names here, as strings. They can be extensions
+# coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
+extensions = ['sphinx.ext.autodoc', 'sphinx.ext.intersphinx', 'sphinx.ext.todo', 'sphinx.ext.coverage', 'sphinx.ext.viewcode']
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# The suffix of source filenames.
+source_suffix = '.rst'
+
+# The encoding of source files.
+#source_encoding = 'utf-8-sig'
+
+# The master toctree document.
+master_doc = 'index'
+
+# General information about the project.
+project = u'pbbarcode'
+copyright = u'2012, PacBio'
+
+# The version info for the project you're documenting, acts as replacement for
+# |version| and |release|, also used in various other places throughout the
+# built documents.
+#
+# The short X.Y version.
+version = '.1'
+# The full version, including alpha/beta/rc tags.
+release = '.1'
+
+# The language for content autogenerated by Sphinx. Refer to documentation
+# for a list of supported languages.
+#language = None
+
+# There are two options for replacing |today|: either, you set today to some
+# non-false value, then it is used:
+#today = ''
+# Else, today_fmt is used as the format for a strftime call.
+#today_fmt = '%B %d, %Y'
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+exclude_patterns = ['_build']
+
+# The reST default role (used for this markup: `text`) to use for all documents.
+#default_role = None
+
+# If true, '()' will be appended to :func: etc. cross-reference text.
+#add_function_parentheses = True
+
+# If true, the current module name will be prepended to all description
+# unit titles (such as .. function::).
+#add_module_names = True
+
+# If true, sectionauthor and moduleauthor directives will be shown in the
+# output. They are ignored by default.
+#show_authors = False
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = 'sphinx'
+
+# A list of ignored prefixes for module index sorting.
+#modindex_common_prefix = []
+
+
+# -- Options for HTML output ---------------------------------------------------
+
+# The theme to use for HTML and HTML Help pages.  See the documentation for
+# a list of builtin themes.
+html_theme = 'default'
+
+# Theme options are theme-specific and customize the look and feel of a theme
+# further.  For a list of options available for each theme, see the
+# documentation.
+#html_theme_options = {}
+
+# Add any paths that contain custom themes here, relative to this directory.
+#html_theme_path = []
+
+# The name for this set of Sphinx documents.  If None, it defaults to
+# "<project> v<release> documentation".
+#html_title = None
+
+# A shorter title for the navigation bar.  Default is the same as html_title.
+#html_short_title = None
+
+# The name of an image file (relative to this directory) to place at the top
+# of the sidebar.
+#html_logo = None
+
+# The name of an image file (within the static path) to use as favicon of the
+# docs.  This file should be a Windows icon file (.ico) being 16x16 or 32x32
+# pixels large.
+#html_favicon = None
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['_static']
+
+# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
+# using the given strftime format.
+#html_last_updated_fmt = '%b %d, %Y'
+
+# If true, SmartyPants will be used to convert quotes and dashes to
+# typographically correct entities.
+#html_use_smartypants = True
+
+# Custom sidebar templates, maps document names to template names.
+#html_sidebars = {}
+
+# Additional templates that should be rendered to pages, maps page names to
+# template names.
+#html_additional_pages = {}
+
+# If false, no module index is generated.
+#html_domain_indices = True
+
+# If false, no index is generated.
+#html_use_index = True
+
+# If true, the index is split into individual pages for each letter.
+#html_split_index = False
+
+# If true, links to the reST sources are added to the pages.
+#html_show_sourcelink = True
+
+# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
+#html_show_sphinx = True
+
+# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
+#html_show_copyright = True
+
+# If true, an OpenSearch description file will be output, and all pages will
+# contain a <link> tag referring to it.  The value of this option must be the
+# base URL from which the finished HTML is served.
+#html_use_opensearch = ''
+
+# This is the file name suffix for HTML files (e.g. ".xhtml").
+#html_file_suffix = None
+
+# Output file base name for HTML help builder.
+htmlhelp_basename = 'pbbarcodedoc'
+
+
+# -- Options for LaTeX output --------------------------------------------------
+
+latex_elements = {
+# The paper size ('letterpaper' or 'a4paper').
+#'papersize': 'letterpaper',
+
+# The font size ('10pt', '11pt' or '12pt').
+#'pointsize': '10pt',
+
+# Additional stuff for the LaTeX preamble.
+#'preamble': '',
+}
+
+# Grouping the document tree into LaTeX files. List of tuples
+# (source start file, target name, title, author, documentclass [howto/manual]).
+latex_documents = [
+  ('index', 'pbbarcode.tex', u'pbbarcode Documentation',
+   u'PacBio', 'manual'),
+]
+
+# The name of an image file (relative to this directory) to place at the top of
+# the title page.
+#latex_logo = None
+
+# For "manual" documents, if this is true, then toplevel headings are parts,
+# not chapters.
+#latex_use_parts = False
+
+# If true, show page references after internal links.
+#latex_show_pagerefs = False
+
+# If true, show URL addresses after external links.
+#latex_show_urls = False
+
+# Documents to append as an appendix to all manuals.
+#latex_appendices = []
+
+# If false, no module index is generated.
+#latex_domain_indices = True
+
+
+# -- Options for manual page output --------------------------------------------
+
+# One entry per manual page. List of tuples
+# (source start file, name, description, authors, manual section).
+man_pages = [
+    ('index', 'pbbarcode', u'pbbarcode Documentation',
+     [u'PacBio'], 1)
+]
+
+# If true, show URL addresses after external links.
+#man_show_urls = False
+
+
+# -- Options for Texinfo output ------------------------------------------------
+
+# Grouping the document tree into Texinfo files. List of tuples
+# (source start file, target name, title, author,
+#  dir menu entry, description, category)
+texinfo_documents = [
+  ('index', 'pbbarcode', u'pbbarcode Documentation',
+   u'PacBio', 'pbbarcode', 'One line description of project.',
+   'Miscellaneous'),
+]
+
+# Documents to append as an appendix to all manuals.
+#texinfo_appendices = []
+
+# If false, no module index is generated.
+#texinfo_domain_indices = True
+
+# How to display URL addresses: 'footnote', 'no', or 'inline'.
+#texinfo_show_urls = 'footnote'
diff --git a/doc/index.rst b/doc/index.rst
new file mode 100644
index 0000000..ea69335
--- /dev/null
+++ b/doc/index.rst
@@ -0,0 +1,16 @@
+.. pbbarcode documentation master file, created by
+   sphinx-quickstart on Mon Apr 30 18:28:57 2012.
+   You can adapt this file completely to your liking, but it should at least
+   contain the root `toctree` directive.
+
+pbbarcode
+=========
+
+Contents:
+
+.. toctree::
+   :maxdepth: 2
+
+   PbbarcodeFunctionalSpecification
+
+
diff --git a/etc/barcode.fasta b/etc/barcode.fasta
new file mode 100644
index 0000000..e524539
--- /dev/null
+++ b/etc/barcode.fasta
@@ -0,0 +1,8 @@
+>bc3
+tatctatcgtatacgc
+>bc4
+atcacactgcatctga
+>bc5
+acgtacgctcgtcata
+>bc10
+tcatgcacgtctcgct
diff --git a/etc/barcode_complete.fasta b/etc/barcode_complete.fasta
new file mode 100644
index 0000000..9803f37
--- /dev/null
+++ b/etc/barcode_complete.fasta
@@ -0,0 +1,192 @@
+>bc_1
+GCGCTCTGTGTGCAGC
+>bc_2
+TCATGAGTCGACACTA
+>bc_3
+TATCTATCGTATACGC
+>bc_4
+ATCACACTGCATCTGA
+>bc_5
+ACGTACGCTCGTCATA
+>bc_6
+TGTGAGTCAGTACGCG
+>bc_7
+AGAGACACGATACTCA
+>bc_8
+CTGCTAGAGTCTACAG
+>bc_9
+AGCACTCGCGTCAGTG
+>bc_10
+TCATGCACGTCTCGCT
+>bc_11
+AGAGCATCTCTGTACT
+>bc_12
+CGCATCGACTACGCTA
+>bc_13
+CGTAGCGTGCTATCAC
+>bc_14
+ATGCTGATGACTGCGA
+>bc_15
+TGCGTGAGCTGTACAT
+>bc_16
+CGATCATCTATAGACA
+>bc_17
+CGACGTATCTGACAGT
+>bc_18
+CACGTCACTAGAGCGA
+>bc_19
+TGTCGCAGCTACTAGT
+>bc_20
+CATACGCTGTGTAGCA
+>bc_21
+AGTCGCATGACTGTGT
+>bc_22
+CAGTACTGCACGATCG
+>bc_23
+GTGCTGAGCATCAGAC
+>bc_24
+CACTGATCGATATGCA
+>bc_25
+TACAGTGTCTGCTGCG
+>bc_26
+TACAGATAGTGTAGCG
+>bc_27
+TCGTAGAGCTCGAGAC
+>bc_28
+GAGCTGCGCACTCGAT
+>bc_29
+GCGATGTCGCTATGTG
+>bc_30
+CGAGAGTCAGCGCATA
+>bc_31
+TCACGATGAGCACGTA
+>bc_32
+GACTGAGATCATGATC
+>bc_33
+ACGACATGATACTGCT
+>bc_34
+ATACAGCACAGATGTG
+>bc_35
+ACAGTCGATATCTCTC
+>bc_36
+GCTCGATCACATGACG
+>bc_37
+GTCGTACACGTGCGAC
+>bc_38
+ACTCATATCTAGAGTG
+>bc_39
+ACTGATCTGTCGCGCT
+>bc_40
+CACTAGCTCTGACTAC
+>bc_41
+GCTGTCATGTACTAGC
+>bc_42
+TATACATACACGCACT
+>bc_43
+TGTGACGACGCGTCTC
+>bc_44
+GACGTGAGCATGCACT
+>bc_45
+CTCGATACGTGTAGCT
+>bc_46
+GTGTCTAGACAGCTGT
+>bc_47
+GATGCATGCGTACGCA
+>bc_48
+TATCAGAGCAGCGATG
+>bc_49
+TCATATGTAGTACTCT
+>bc_50
+GCGATCTATGCACACG
+>bc_51
+TGCAGTCGAGATACAT
+>bc_52
+GACTCTGCGTCGAGTC
+>bc_53
+TACAGCGACGTCATCG
+>bc_54
+GCGCAGACTACGTGTG
+>bc_55
+GTCTCTGCGATACAGC
+>bc_56
+AGTATGAGATAGCTCG
+>bc_57
+GCGACGAGTACTCATG
+>bc_58
+AGTATCACAGTCGCTG
+>bc_59
+ATCATATGATGCGACA
+>bc_60
+AGACGTAGATCACAGC
+>bc_61
+CGTGTCATGCTACTCA
+>bc_62
+TGTGAGACTGCATGTC
+>bc_63
+GCTCAGTGCGCTACTG
+>bc_64
+ACTATCGCGCACGCAG
+>bc_65
+TGACACTCTGCACGCG
+>bc_66
+CAGACGTGACTGATAT
+>bc_67
+GCACTGTAGTGATCGT
+>bc_68
+CAGTGCGAGACAGTAG
+>bc_69
+AGTAGTGCTACTCGAC
+>bc_70
+ATGCGAGATCTGCTCA
+>bc_71
+TGAGACATACTGAGTG
+>bc_72
+ATGTGCACTAGTGTAC
+>bc_73
+TCAGCTGACGATGTGA
+>bc_74
+ACTGATGCGCACATGT
+>bc_75
+CTACTCTCAGCAGTGA
+>bc_76
+ATCTACATCACGACTC
+>bc_77
+ATATAGTACAGCGTCT
+>bc_78
+GACACGACTAGATCGC
+>bc_79
+TACGAGTCTGTCATAC
+>bc_80
+ACTCAGCTACATAGTG
+>bc_81
+ACGTATCATAGTGAGA
+>bc_82
+GAGTCGTATCGCTCAT
+>bc_83
+GCGATCACGAGTAGAC
+>bc_84
+CTAGACGTACATGTCG
+>bc_85
+TAGCAGTCACTGTGCG
+>bc_86
+GCTCATGCGATAGCTA
+>bc_87
+GCGCAGTCGTCTGTAT
+>bc_88
+ATGAGCTACGTACAGA
+>bc_89
+GTCGCGAGTCTATCAG
+>bc_90
+ACATCGATCTGCACTA
+>bc_91
+AGTATAGCATAGACGC
+>bc_92
+GTGAGAGCGTGACTCT
+>bc_93
+TGTCAGTAGATGACTC
+>bc_94
+TCGTACGAGATCGACA
+>bc_95
+CTACATGTGACTCGAG
+>bc_96
+GCGCTATAGTGCTCGT
diff --git a/etc/pacbio_barcodes_paired.fasta b/etc/pacbio_barcodes_paired.fasta
new file mode 100755
index 0000000..1ba2e7a
--- /dev/null
+++ b/etc/pacbio_barcodes_paired.fasta
@@ -0,0 +1,192 @@
+>F_1
+GGTAGGCGCTCTGTGTGCAGC
+>R_1
+AGAGTACTACATATGAGATGG
+>F_2
+GGTAGTCATGAGTCGACACTA
+>R_2
+CGTGTGCATAGATCGCGATGG
+>F_3
+GGTAGTATCTATCGTATACGC
+>R_3
+ATGTATCTCGACTGCAGATGG
+>F_4
+GGTAGATCACACTGCATCTGA
+>R_4
+GACTCGACGCAGAGTCGATGG
+>F_5
+GGTAGACGTACGCTCGTCATA
+>R_5
+CGATGACGTCGCTGTAGATGG
+>F_6
+GGTAGTGTGAGTCAGTACGCG
+>R_6
+CACACGTAGTCTGCGCGATGG
+>F_7
+GGTAGAGAGACACGATACTCA
+>R_7
+GCTGTATCGCAGAGACGATGG
+>F_8
+GGTAGCTGCTAGAGTCTACAG
+>R_8
+CGAGCTATCTCATACTGATGG
+>F_9
+GGTAGAGCACTCGCGTCAGTG
+>R_9
+CATGAGTACTCGTCGCGATGG
+>F_10
+GGTAGTCATGCACGTCTCGCT
+>R_10
+CAGCGACTGTGATACTGATGG
+>F_11
+GGTAGAGAGCATCTCTGTACT
+>R_11
+TGTCGCATCATATGATGATGG
+>F_12
+GGTAGCGCATCGACTACGCTA
+>R_12
+GCTGTGATCTACGTCTGATGG
+>F_13
+GGTAGCGTAGCGTGCTATCAC
+>R_13
+TGAGTAGCATGACACGGATGG
+>F_14
+GGTAGATGCTGATGACTGCGA
+>R_14
+GACATGCAGTCTCACAGATGG
+>F_15
+GGTAGTGCGTGAGCTGTACAT
+>R_15
+CAGTAGCGCACTGAGCGATGG
+>F_16
+GGTAGCGATCATCTATAGACA
+>R_16
+CTGCGTGCGCGATAGTGATGG
+>F_17
+GGTAGCGACGTATCTGACAGT
+>R_17
+CGCGTGCAGAGTGTCAGATGG
+>F_18
+GGTAGCACGTCACTAGAGCGA
+>R_18
+ATATCAGTCACGTCTGGATGG
+>F_19
+GGTAGTGTCGCAGCTACTAGT
+>R_19
+ACGATCACTACAGTGCGATGG
+>F_20
+GGTAGCATACGCTGTGTAGCA
+>R_20
+CTACTGTCTCGCACTGGATGG
+>F_21
+GGTAGAGTCGCATGACTGTGT
+>R_21
+GTCGAGTAGCACTACTGATGG
+>F_22
+GGTAGCAGTACTGCACGATCG
+>R_22
+TGAGCAGATCTCGCATGATGG
+>F_23
+GGTAGGTGCTGAGCATCAGAC
+>R_23
+CACTCAGTATGTCTCAGATGG
+>F_24
+GGTAGCACTGATCGATATGCA
+>R_24
+GTACACTAGTGCACATGATGG
+>F_25
+GGTAGTACAGTGTCTGCTGCG
+>R_25
+TCACATCGTCAGCTGAGATGG
+>F_26
+GGTAGTACAGATAGTGTAGCG
+>R_26
+ACATGTGCGCATCAGTGATGG
+>F_27
+GGTAGTCGTAGAGCTCGAGAC
+>R_27
+TCACTGCTGAGAGTAGGATGG
+>F_28
+GGTAGGAGCTGCGCACTCGAT
+>R_28
+GAGTCGTGATGTAGATGATGG
+>F_29
+GGTAGGCGATGTCGCTATGTG
+>R_29
+AGACGCTGTACTATATGATGG
+>F_30
+GGTAGCGAGAGTCAGCGCATA
+>R_30
+GCGATCTAGTCGTGTCGATGG
+>F_31
+GGTAGTCACGATGAGCACGTA
+>R_31
+GTATGACAGACTCGTAGATGG
+>F_32
+GGTAGGACTGAGATCATGATC
+>R_32
+CACTATGTAGCTGAGTGATGG
+>F_33
+GGTAGACGACATGATACTGCT
+>R_33
+TCTCACTATGATACGTGATGG
+>F_34
+GGTAGATACAGCACAGATGTG
+>R_34
+ATGAGCGATACGACTCGATGG
+>F_35
+GGTAGACAGTCGATATCTCTC
+>R_35
+GTCTACTCGTGATCGCGATGG
+>F_36
+GGTAGGCTCGATCACATGACG
+>R_36
+CGACATGTACGTCTAGGATGG
+>F_37
+GGTAGGTCGTACACGTGCGAC
+>R_37
+CGCACAGTGACTGCTAGATGG
+>F_38
+GGTAGACTCATATCTAGAGTG
+>R_38
+TAGCTATCGCATGAGCGATGG
+>F_39
+GGTAGACTGATCTGTCGCGCT
+>R_39
+ATACAGACGACTGCGCGATGG
+>F_40
+GGTAGCACTAGCTCTGACTAC
+>R_40
+TCTGTACGTAGCTCATGATGG
+>F_41
+GGTAGGCTGTCATGTACTAGC
+>R_41
+CTGATAGACTCGCGACGATGG
+>F_42
+GGTAGTATACATACACGCACT
+>R_42
+TAGTGCAGATCGATGTGATGG
+>F_43
+GGTAGTGTGACGACGCGTCTC
+>R_43
+GCGTCTATGCTATACTGATGG
+>F_44
+GGTAGGACGTGAGCATGCACT
+>R_44
+AGAGTCACGCTCTCACGATGG
+>F_45
+GGTAGCTCGATACGTGTAGCT
+>R_45
+GAGTCATCTACTGACAGATGG
+>F_46
+GGTAGGTGTCTAGACAGCTGT
+>R_46
+TGTCGATCTCGTACGAGATGG
+>F_47
+GGTAGGATGCATGCGTACGCA
+>R_47
+CTCGAGTCACATGTAGGATGG
+>F_48
+GGTAGTATCAGAGCAGCGATG
+>R_48
+ACGAGCACTATAGCGCGATGG
diff --git a/setup.py b/setup.py
new file mode 100755
index 0000000..5a017e3
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,37 @@
+from setuptools import setup, Extension, find_packages
+import os
+import sys
+
+vFile = 'src/python/pbbarcode/_version.py'
+
+if os.path.exists(vFile):
+    lines = open(vFile, 'r').read().splitlines()
+    for line in lines:
+        elts = line.split('=')
+        elts = [e.strip() for e in elts]
+        if len(elts) == 2 and elts[0] == '__version__':
+            _ReadVersion = elts[1].replace('\'', '').replace('\"', '')
+            break
+else:
+    _ReadVersion = '0.0.0'
+    
+# Package definition. The Python sources live under src/python, and the C
+# alignment helper (src/C/sw.c) is declared as an Extension named
+# 'pbbarcode/sw' so that the compiled library ends up inside the pbbarcode
+# package directory, which is where SWaligner.py looks for sw.so.
+setup(
+    name = 'pbbarcode',
+    version=_ReadVersion,
+    author='pbiDevNet',
+    author_email='pbiDevNet at pacificbiosciences.com',
+    license='LICENSE.txt',
+    packages = find_packages('src/python'),  
+    package_dir = {'':'src/python'},
+    ext_modules=[Extension('pbbarcode/sw', ['src/C/sw.c'], extra_compile_args=["-O3","-shared"])], 
+    zip_safe = False,
+    # Installs a `pbbarcode` command that dispatches to pbbarcode.main:main.
+    entry_points={
+        'console_scripts': [
+            'pbbarcode = pbbarcode.main:main']
+    },
+    install_requires=[
+        'pbcore >= 0.6.3',
+        'numpy >= 1.6.0',
+        'h5py >= 1.3.0'
+        ]
+    )
diff --git a/src/C/Makefile b/src/C/Makefile
new file mode 100644
index 0000000..4913cf3
--- /dev/null
+++ b/src/C/Makefile
@@ -0,0 +1,11 @@
+.PHONY: clean all
+SHELL = /bin/bash -e
+
+# Compiler used to build the shared library. Defaults to gcc as before and
+# can be overridden from the command line, e.g. `make CC=clang`.
+CC = gcc
+
+all: build/sw.so
+
+# Build the alignment helper as a position-independent shared object.
+# -O4 is not a documented GCC optimization level (GCC handles it as -O3),
+# so -O3 is spelled out explicitly. $(@D) is the target's directory, $< the
+# source file and $@ the target, which avoids repeating the paths.
+build/sw.so: sw.c
+	mkdir -p $(@D)
+	$(CC) -O3 -DGETPROB -shared -fPIC $< -o $@
+
+# Remove everything produced by the build.
+clean:
+	rm -rf build
+
diff --git a/src/C/sw.c b/src/C/sw.c
new file mode 100644
index 0000000..4cdffdb
--- /dev/null
+++ b/src/C/sw.c
@@ -0,0 +1,56 @@
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+
+#define M 64
+#define N 64
+#define MAX(x,y) (((x) > (y)) ? (x) : (y))
+
+/*
+ * Allocate the zero-initialised N*M int matrix used as scratch space by the
+ * alignment routines. Returns whatever calloc returns, i.e. NULL when the
+ * allocation fails. Nothing in this file frees the matrix.
+ */
+int* allocate_dp_mat() {
+    int* dp_mat = (int*) calloc(N*M, sizeof(int));
+    return dp_mat;
+}
+
+/*
+ * Compute the best local-alignment score between tSeq and qSeq.
+ *
+ * Scoring: match +2, mismatch -2, insertion/deletion -1, and no cell is
+ * allowed to drop below 0. The return value is the maximum cell of the
+ * dynamic programming matrix (0 if nothing aligns).
+ *
+ * dp_mat must point to at least N*M ints (see allocate_dp_mat); it is
+ * cleared on entry and left holding the filled matrix. Rows are indexed by
+ * tSeq position and columns by qSeq position, with a row stride of M.
+ *
+ * Because the matrix has a fixed size, at most the first N-1 characters of
+ * tSeq and the first M-1 characters of qSeq are aligned. Longer inputs are
+ * truncated instead of being allowed to write past the end of dp_mat.
+ */
+int compute_align_score(int* dp_mat, char* tSeq, char* qSeq) {
+    const int ipenalty = -1;
+    const int dpenalty = -1;
+    const int match    =  2;
+    const int mpenalty = -2;
+    int best_score = 0;
+    int iscore, dscore, mscore;
+    int i, j;
+
+    /* Measure each sequence once rather than on every loop test. */
+    size_t tLen = strlen(tSeq);
+    size_t qLen = strlen(qSeq);
+
+    /* Row 0 and column 0 are the boundary, so N-1 / M-1 are usable. */
+    if (tLen > N - 1)
+        tLen = N - 1;
+    if (qLen > M - 1)
+        qLen = M - 1;
+
+    memset(dp_mat, 0, M*N*sizeof(int));
+
+    for (i = 1; i <= (int) tLen; i++) {
+        for (j = 1; j <= (int) qLen; j++) {
+            iscore = dp_mat[i*M + j-1] + ipenalty;
+            dscore = dp_mat[(i-1)*M + j] + dpenalty;
+            mscore = dp_mat[(i-1)*M + j-1] +
+                ((tSeq[i-1] == qSeq[j-1]) ? match : mpenalty);
+            /* Clamp at 0 so an alignment may start at any cell. */
+            dp_mat[i*M + j] = MAX(MAX(0, iscore), MAX(dscore, mscore));
+            if (dp_mat[i*M + j] > best_score)
+                best_score = dp_mat[i*M + j];
+        }
+    }
+    return best_score;
+}
+
+/*
+ * Score tSeq against each of the n sequences in qSeqs, storing the result
+ * for qSeqs[k] in scores[k]. dp_mat is the scratch matrix handed to
+ * compute_align_score for every alignment.
+ */
+void compute_align_scores(int* scores, int n, int* dp_mat, char* tSeq, 
+                          char** qSeqs) {
+    int k;
+    for (k = 0; k < n; k++) {
+        scores[k] = compute_align_score(dp_mat, tSeq, qSeqs[k]);
+    }
+}
+
+
+/*
+ * Debugging helper: print the part of dp_mat that covers tSeq and qSeq to
+ * stdout, one line per qSeq position (including the boundary row/column),
+ * with values separated by spaces.
+ *
+ * Only cells inside the N x M matrix are printed, so sequences longer than
+ * the matrix never cause reads past the end of dp_mat.
+ */
+void print_dp_mat(int* dp_mat, char* tSeq, char* qSeq) {
+    int i, j;
+
+    /* Measure each sequence once rather than on every loop test. */
+    size_t tLen = strlen(tSeq);
+    size_t qLen = strlen(qSeq);
+
+    if (tLen > N - 1)
+        tLen = N - 1;
+    if (qLen > M - 1)
+        qLen = M - 1;
+
+    for (j = 0; j <= (int) qLen; j++) {
+        for (i = 0; i <= (int) tLen; i++) {
+            printf("%d ", dp_mat[i*M + j]);
+        }
+        printf("\n");
+    }
+}
diff --git a/src/python/pbbarcode/BarcodeLabeler.py b/src/python/pbbarcode/BarcodeLabeler.py
new file mode 100755
index 0000000..354338b
--- /dev/null
+++ b/src/python/pbbarcode/BarcodeLabeler.py
@@ -0,0 +1,225 @@
+#################################################################################$$
+# Copyright (c) 2011,2012, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without 
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright notice, this 
+#   list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright notice, 
+#   this list of conditions and the following disclaimer in the documentation 
+#   and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its contributors 
+#   may be used to endorse or promote products derived from this software 
+#   without specific prior written permission.
+# 
+# THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS CONTRIBUTORS 
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 
+# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS 
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 
+# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#################################################################################$$
+import logging
+
+from pbcore.io import BasH5Reader, BaxH5Reader
+from pbcore.io.FastaIO import *
+import pbbarcode.SWaligner as Aligner
+import numpy as n
+
+from pbcore.io.BarcodeH5Reader import LabeledZmw, \
+    BARCODE_DELIMITER
+
+__RC_MAP__ = dict(zip('ACGTacgt-N','TGCAtgca-N'))
+
+class BarcodeScorer(object):
+    def __init__(self, basH5, barcodeFasta,  
+                 adapterSidePad = 0, insertSidePad = 4, 
+                 scoreMode = 'symmetric', maxHits = 10, 
+                 scoreFirst = False, startTimeCutoff = 1):
+        """A BarcodeScorer object scores ZMWs and produces summaries
+        of the scores. Various parameters control the behavior of the
+        object, specifically the padding allows the user to add a
+        little extra on each side of the adapter find for safety. The
+        most relevant parameter is the scoreMode which dictates how
+        the barcodes are scored, either paired or symmetric.
+
+        Arguments:
+          basH5           -- reader that is indexed by hole number in
+                             labelZmws and provides ``movieName``.
+          barcodeFasta    -- iterable of records with ``name`` and
+                             ``sequence``; all sequences must have the
+                             same length.
+          adapterSidePad  -- extra bases taken on the adapter side of
+                             each flanking window.
+          insertSidePad   -- extra bases taken on the insert side of
+                             each flanking window.
+          scoreMode       -- 'symmetric' or 'paired'.
+          maxHits         -- maximum number of adapter regions scored
+                             per ZMW.
+          scoreFirst      -- when true and a ZMW has no adapters, the
+                             start of the read may be scored instead.
+          startTimeCutoff -- upper bound on 'HQRegionStartTime' for
+                             scoreFirst to apply.
+
+        Raises Exception if the barcodes differ in length or if
+        scoreMode is not one of the two supported values."""
+
+        self.basH5 = basH5
+        self.barcodeFasta = list(barcodeFasta)
+        self.aligner = Aligner.SWaligner()
+        # Collect the distinct barcode lengths; more than one is an error.
+        self.barcodeLength = n.unique(map(lambda x : len(x.sequence), 
+                                          self.barcodeFasta))
+        if len(self.barcodeLength) > 1:
+            raise Exception("Currently, all barcodes must be the same length.")
+        else:
+            self.barcodeLength = int(self.barcodeLength)
+
+        # One (forward, reverse-complement) pair per barcode, upper-cased.
+        self.barcodeSeqs = [(barcode.sequence.upper(), 
+                             self._rc(barcode.sequence.upper())) 
+                            for barcode in self.barcodeFasta]
+
+        self.adapterSidePad = adapterSidePad
+        self.insertSidePad = insertSidePad
+        self.maxHits = maxHits
+
+        if scoreMode not in ['symmetric', 'paired']:
+            raise Exception("scoreMode must either be symmetric or paired")
+        self._scoreMode = scoreMode
+
+        self.scoreFirst = scoreFirst
+        self.startTimeCutoff = startTimeCutoff
+
+        # Scorers for all forward barcodes and all reverse-complemented
+        # barcodes; each takes a query sequence and returns one score per
+        # barcode.
+        self.forwardScorer = self.aligner.makeScorer([x[0] for x in self.barcodeSeqs])
+        self.reverseScorer = self.aligner.makeScorer([x[1] for x in self.barcodeSeqs])
+        
+        logging.debug(("Constructed BarcodeScorer with scoreMode: %s," + \
+                           "adapterSidePad: %d, insertSidePad: %d, and scoreFirst: %r") \
+                          % (scoreMode, adapterSidePad, insertSidePad, scoreFirst))
+    
+    @property
+    def movieName(self):
+        """The movie name reported by the underlying basH5 reader."""
+        return self.basH5.movieName
+    
+    def makeBCLabel(self, s1, s2):
+        """Join two barcode names into one label using BARCODE_DELIMITER."""
+        return BARCODE_DELIMITER.join((s1, s2))
+
+    @property
+    def barcodeLabels(self):
+        """The barcode labels are function of the barcodeNames and the
+        scoreMode, they represent the user-visible names."""
+        if self.scoreMode == 'paired':
+            # Barcodes are taken two at a time: entries i and i+1 form a pair.
+            return n.array([self.makeBCLabel(self.barcodeFasta[i].name,
+                                             self.barcodeFasta[i+1].name) for i
+                            in xrange(0, len(self.barcodeSeqs), 2)])
+        else:
+            # Symmetric mode labels each barcode with its own name twice.
+            return n.array([self.makeBCLabel(x.name, x.name) for x in self.barcodeFasta])
+
+    @property
+    def barcodeNames(self):
+        """The barcode names are the FASTA names"""
+        return n.array([x.name for x in self.barcodeFasta])
+
+    @property
+    def scoreMode(self):
+        """The scoring mode given at construction ('symmetric' or 'paired')."""
+        return self._scoreMode
+
+    def _rc(self, s):
+        """Return the reverse complement of s using __RC_MAP__."""
+        return "".join([__RC_MAP__[c] for c in s[::-1]])
+    
+    def _flankingSeqs(self, zmw):
+        """Return (seqs, scoredFirst) for a ZMW.
+
+        seqs holds one (left, right) tuple of flanking sequences per
+        adapter region, limited to the first maxHits regions; an entry
+        is None when reading that window raised IndexError. scoredFirst
+        is True only when no adapters were found and the start of the
+        read was added instead (see scoreFirst)."""
+        def fromRange(rStart, rEnd):
+            # Left window: barcodeLength + insertSidePad bases before the
+            # adapter start, plus adapterSidePad bases into the adapter.
+            try:
+                qSeqLeft = zmw.read(rStart - (self.barcodeLength + self.insertSidePad), 
+                                    rStart + self.adapterSidePad).basecalls()
+            except IndexError:
+                qSeqLeft = None
+            # Right window: the mirror image, starting adapterSidePad bases
+            # before the adapter end.
+            try:
+                qSeqRight = zmw.read(rEnd - self.adapterSidePad, 
+                                     rEnd + self.barcodeLength + 
+                                     self.insertSidePad).basecalls()
+            except IndexError:
+                qSeqRight = None
+
+            return (qSeqLeft, qSeqRight)
+
+        adapterRegions = zmw.adapterRegions
+        if len(adapterRegions) > self.maxHits:
+            adapterRegions = adapterRegions[0:self.maxHits]
+        
+        seqs = [fromRange(start, end) for (start, end) in adapterRegions]
+
+        # We only score the first barcode if we don't find any adapters
+        # *and* the start time is less than the threshold. 
+        scoredFirst = False
+        if self.scoreFirst and not len(seqs):
+            s = zmw.zmwMetric('HQRegionStartTime')
+            e = zmw.zmwMetric('HQRegionEndTime')
+            # s<e => has HQ. 
+            if s < e and s <= self.startTimeCutoff:
+                # Window length, capped at the value of zmw.hqRegion[1].
+                l = self.barcodeLength + self.insertSidePad
+                l = l if zmw.hqRegion[1] > l else zmw.hqRegion[1]
+                try:
+                    bc = zmw.read(0, l).basecalls()
+                    # Only use the window if it can hold a whole barcode.
+                    if len(bc) >= self.barcodeLength:
+                        seqs.insert(0, (bc, None))
+                        scoredFirst = True
+                except IndexError:
+                    pass
+        
+        return (seqs, scoredFirst)
+
+    def labelZmws(self, holeNumbers):
+        """Return a list of LabeledZmws for input holeNumbers"""
+        def scoreZmw(zmw):
+            # Returns (holeNumber, number of adapters, summed per-barcode
+            # scores, list of per-adapter score arrays, scoredFirst).
+            adapters, scoredFirst = self._flankingSeqs(zmw)
+            adapterScores = [[]]*len(adapters)
+            barcodeScores = n.zeros(len(self.barcodeSeqs))
+
+            for i,adapter in enumerate(adapters):
+                # adapter[0] is the left flank and adapter[1] the right
+                # flank; each is scored against the forward barcodes and
+                # against the reverse-complemented barcodes.
+                fscores  = self.forwardScorer(adapter[0])
+                rscores  = self.reverseScorer(adapter[0])
+                ffscores = self.forwardScorer(adapter[1])
+                rrscores = self.reverseScorer(adapter[1])
+
+                # Number of flanks that were actually available.
+                scored = 2.0 if adapter[0] and adapter[1] else \
+                    1.0 if adapter[0] or adapter[1] else 0
+                
+                # An adapter score is the average barcode score for
+                # each barcode -- that way, you can compare across
+                # adapters even if the different adapters have
+                # different numbers of flanking sequence. 
+                if scored == 0:
+                    # Neither flank could be read: contribute all zeros.
+                    adapterScores[i] = barcodeScores
+                else:
+                    # Take the better of the two possible orientations.
+                    adapterScores[i] = n.maximum((fscores + rrscores)/scored, 
+                                                 (rscores + ffscores)/scored)
+
+            # Sum the per-adapter arrays into one score per barcode.
+            barcodeScores = reduce(lambda x, y: x + y, adapterScores) if adapterScores \
+                else n.zeros(len(self.barcodeSeqs))
+
+            return (zmw.holeNumber, len(adapters), barcodeScores, adapterScores,
+                    scoredFirst)
+
+        # o here is the record immediately above.
+        def chooseSymmetric(o):
+            # Sort barcode indices by descending summed score and report
+            # the best and second-best index together with their scores.
+            p = n.argsort(-o[2])
+            return LabeledZmw(o[0], o[1], p[0], o[2][p[0]], p[1], o[2][p[1]], o[3])
+        def choosePaired(o):
+            if o[1] == 1:
+                # A single adapter: score each pair by the better of its
+                # two members.
+                s = n.array([max(o[2][i], o[2][i + 1]) for i in \
+                                 xrange(0, len(self.barcodeSeqs), 2)])
+                p = n.argsort(-s)
+                s = s[p]
+            else:
+                # score the pairs by scoring the two alternate
+                # ways they could have been put on the molecule. A
+                # missed adapter will confuse this computation.
+                scores  = o[3]
+                results = n.zeros(len(self.barcodeSeqs)/2)
+                for i in xrange(0, len(self.barcodeSeqs), 2):
+                    # pths[0] and pths[1] accumulate the two alternating
+                    # assignments of barcodes i and i+1 to successive
+                    # adapters.
+                    pths = [0,0]
+                    for j in xrange(0, len(scores)):
+                        pths[j % 2] += scores[j][i]
+                        pths[1 - j % 2] += scores[j][i + 1]
+                    results[i/2] = max(pths)
+                        
+                p = n.argsort(-results)
+                s = results[p]
+
+            return LabeledZmw(o[0], o[1], p[0], s[0], p[1], s[1], o[3])
+         
+        if self.scoreMode == 'symmetric':
+            choose = chooseSymmetric
+        elif self.scoreMode == 'paired':
+            choose = choosePaired
+        else:
+            raise Exception("Unsupported scoring mode in BarcodeLabeler.py")
+
+        # ZMWs with no scored adapters (scoreTup[1] == 0) are left out.
+        scored = [scoreZmw(self.basH5[zmw]) for zmw in holeNumbers] 
+        return [choose(scoreTup) for scoreTup in scored if scoreTup[1]]
diff --git a/src/python/pbbarcode/SWaligner.py b/src/python/pbbarcode/SWaligner.py
new file mode 100755
index 0000000..d6dae46
--- /dev/null
+++ b/src/python/pbbarcode/SWaligner.py
@@ -0,0 +1,69 @@
+#################################################################################$$
+# Copyright (c) 2011,2012, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without 
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright notice, this 
+#   list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright notice, 
+#   this list of conditions and the following disclaimer in the documentation 
+#   and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its contributors 
+#   may be used to endorse or promote products derived from this software 
+#   without specific prior written permission.
+# 
+# THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS CONTRIBUTORS 
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 
+# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS 
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 
+# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#################################################################################$$
+from ctypes import *
+import os
+import numpy
+import pkg_resources
+
+class SWaligner(object):
+    def __init__(self):
+        # setup.py should put sw.so in the following path.
+        self.SW_DLL_PATH = os.path.dirname(os.path.abspath(__file__)) + os.path.sep + "sw.so" 
+        self._dll        = CDLL(self.SW_DLL_PATH)
+        self.dpMat       = self._dll.allocate_dp_mat()
+    
+    def score(self, tSeq, qSeq):
+        return self._dll.compute_align_score(self.dpMat, tSeq, qSeq)
+    
+    def makeScorer(self, targets):
+        ScoreType = c_int * len(targets)
+        scores = ScoreType()
+        for i in range(0, len(scores)):
+            scores[i] = 0
+        
+        TargetType = c_char_p * len(targets)
+        targetSeqs = TargetType()
+        for i in range(0, len(targetSeqs)):
+            targetSeqs[i] = targets[i]
+
+        targetLen = len(targets)
+
+        def scorer(query):
+            if not query:
+                return numpy.zeros(len(targets))
+
+            self._dll.compute_align_scores(scores, 
+                                           targetLen, 
+                                           self.dpMat, 
+                                           query,
+                                           targetSeqs)
+            return numpy.array([scores[i] for i in xrange(0, len(scores))])
+        return scorer
+
+        
diff --git a/src/python/pbbarcode/__init__.py b/src/python/pbbarcode/__init__.py
new file mode 100755
index 0000000..e69de29
diff --git a/src/python/pbbarcode/_version.py b/src/python/pbbarcode/_version.py
new file mode 100755
index 0000000..4f0196d
--- /dev/null
+++ b/src/python/pbbarcode/_version.py
@@ -0,0 +1 @@
+__version__='0.8.0'
diff --git a/src/python/pbbarcode/main.py b/src/python/pbbarcode/main.py
new file mode 100755
index 0000000..0b6ddd0
--- /dev/null
+++ b/src/python/pbbarcode/main.py
@@ -0,0 +1,751 @@
+#!/usr/bin/env python
+#################################################################################$$
+# Copyright (c) 2011,2012, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without 
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright notice, this 
+#   list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright notice, 
+#   this list of conditions and the following disclaimer in the documentation 
+#   and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its contributors 
+#   may be used to endorse or promote products derived from this software 
+#   without specific prior written permission.
+# 
+# THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS CONTRIBUTORS 
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 
+# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS 
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 
+# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#################################################################################$$
+import os
+import sys
+import argparse
+import logging
+import tempfile
+import shutil
+import pkg_resources
+import re
+import subprocess
+import random
+import shutil
+
+from multiprocessing import Pool
+
+import h5py as h5
+import numpy as n
+
+from pbcore.util.ToolRunner import PBMultiToolRunner
+from pbcore.io import BaxH5Reader, BasH5Reader
+from pbcore.io import CmpH5Reader, CmpH5Alignment
+from pbcore.io.BarcodeH5Reader import *
+from pbcore.io import FastaReader, FastqWriter, FastqRecord, \
+    FastaWriter, FastaRecord
+
+from pbbarcode.BarcodeLabeler import *
+from pbbarcode._version import __version__
+
+from pbh5tools.CmpH5Utils import copyAttributes
+
+# Paths to the Barcode Datasets in the cmp.h5 file.
+BC_ALN_INFO_DS = "AlnInfo/Barcode"
+BC_INFO_NAME   = "BarcodeInfo/Name"
+BC_INFO_ID     = "BarcodeInfo/ID"
+
+# Values accepted by the `--scoreMode` option of `labelZmws`.
+SCORE_MODES    = ['symmetric', 'paired']
+
+# Suffixes of base/pulse/CCS H5 files. The classes are `[xs]`, not
+# `[x|s]`: inside a character class `|` is a literal, so the latter
+# would also accept names such as "movie.ba|.h5".
+BAS_PLS_REGEX = r'\.ba[xs]\.h5$|\.pl[xs]\.h5$|\.cc[xs]\.h5$'
+BARCODE_EXT   = '.bc.h5'
+BC_REGEX      = r'\.bc\.h5'
+
+def movieNameFromFile(fn):
+    """Return the basename of `fn` with every match of the barcode or
+    base/pulse/CCS H5 suffix patterns removed."""
+    suffixPattern = '|'.join((BC_REGEX, BAS_PLS_REGEX))
+    baseName = os.path.basename(fn)
+    return re.sub(suffixPattern, '', baseName)
+
+def makeBarcodeH5FromBasH5(basH5):
+    """Label the ZMWs of one open base H5 reader and write the result
+    to a barcode H5 file under `--outDir`.
+
+    Scoring options are taken from the module-level `runner.args`.
+    Returns the path of the file that was written.
+    """
+    args = runner.args
+    barcodes = FastaReader(args.barcodeFile)
+    labeler = BarcodeScorer(basH5, barcodes,
+                            args.adapterSidePad, args.insertSidePad,
+                            scoreMode = args.scoreMode,
+                            maxHits = args.maxAdapters,
+                            scoreFirst = args.scoreFirst,
+                            startTimeCutoff = args.startTimeCutoff)
+
+    # A negative nZmws means "use every sequencing ZMW".
+    zmws = basH5.sequencingZmws
+    if args.nZmws >= 0:
+        zmws = zmws[0:args.nZmws]
+
+    logging.debug("Labeling %d ZMWs from: %s" % (len(zmws), basH5.filename))
+    labeledZmws = labeler.labelZmws(zmws)
+    logging.debug("Labeled %d ZMWs" % len(labeledZmws))
+
+    # Same basename as the input, with the bas/pls/ccs suffix swapped
+    # for the barcode extension.
+    inBase = os.path.basename(basH5.filename)
+    outBase = re.sub(BAS_PLS_REGEX, BARCODE_EXT, inBase)
+    outFile = '/'.join((args.outDir, outBase))
+    logging.debug("Writing to: %s" % outFile)
+
+    writeBarcodeH5(labeledZmws, labeler, outFile, args.saveExtendedInfo)
+    return outFile
+
+def mpWrapper(f):
+    """Open the base H5 file at path `f` and label it; this is the
+    function mapped over the input FOFN by makeBarcodeFofnFromBasFofn."""
+    reader = BasH5Reader(f)
+    return makeBarcodeH5FromBasH5(reader)
+
+def makeBarcodeFofnFromBasFofn():
+    """Label every file listed in the input FOFN and write the paths
+    of the resulting barcode H5 files, one per line, to `--outFofn`.
+
+    Files are processed serially when `--nProcs` is 1 or less and
+    through a multiprocessing Pool otherwise.
+
+    Raises IOError if any path listed in the input FOFN does not exist.
+    """
+    inputFofn = runner.args.inputFile
+    with open(inputFofn) as fofn:
+        inFiles = fofn.read().splitlines()
+
+    if not all(map(os.path.exists, inFiles)):
+        raise IOError("All files in input.fofn must exist.")
+
+    logging.debug("Using %d processes." % runner.args.nProcs)
+    if runner.args.nProcs <= 1:
+        newFiles = map(mpWrapper, inFiles)
+    else:
+        pool = Pool(runner.args.nProcs)
+        try:
+            newFiles = pool.map(mpWrapper, inFiles)
+        finally:
+            # Release the worker processes whether or not labeling
+            # succeeded.
+            pool.close()
+            pool.join()
+
+    with open(runner.args.outFofn, 'w') as oFile:
+        for nF in newFiles:
+            oFile.write(nF + "\n")
+
+def labelAlignments():
+    """Annotate the cmp.h5 file with one barcode row per alignment.
+
+    For each alignment the labeled ZMW is looked up in the barcode
+    FOFN. Alignments whose ZMW is missing, or fails the
+    minNumBarcodes / minAvgBarcodeScore / minScoreRatio thresholds,
+    get count 0 and an index one past the last barcode label. The
+    BarcodeInfo and AlnInfo/Barcode datasets are then (re)written.
+    """
+    logging.info("Labeling alignments using: %s" % runner.args.inputFofn)
+    bcFofn = BarcodeH5Fofn(runner.args.inputFofn)
+
+    with CmpH5Reader(runner.args.cmpH5) as cmpH5:
+        bcDS = n.zeros((len(cmpH5), 5), dtype = "int32")
+
+        for (i, aln) in enumerate(cmpH5):
+            bcReader = bcFofn.readerForMovie(aln.movieInfo.Name)
+            try:
+                lZmw = bcReader.labeledZmwFromHoleNumber(aln.HoleNumber)
+                if lZmw.nScored < runner.args.minNumBarcodes or \
+                        lZmw.averageScore < runner.args.minAvgBarcodeScore or \
+                        lZmw.scoreRatio < runner.args.minScoreRatio:
+                    lZmw = None
+            except KeyError:
+                lZmw = None
+
+            if lZmw:
+                bcDS[i,:] = n.array([lZmw.nScored, lZmw.bestIdx, lZmw.bestScore,
+                                     lZmw.secondBestIdx, lZmw.secondBestScore])
+            else:
+                # either no barcode was found for this guy or they got
+                # filtered, hence the NULL_BARCODE
+                nullIdx = len(bcReader.barcodeLabels)
+                bcDS[i,:] = n.array([0, nullIdx, 0, nullIdx, 0])
+
+    # write to the cmp.h5 file.
+    H5 = h5.File(runner.args.cmpH5, 'r+')
+    try:
+        # Drop datasets left over from a previous labeling run.
+        for stale in (BC_INFO_ID, BC_INFO_NAME, BC_ALN_INFO_DS):
+            if stale in H5:
+                del H5[stale]
+
+        # we use the first one to get the labels, if somehow they
+        # don't have all of the same stuff that will be an issue.
+        # NOTE(review): the extra trailing label is BARCODE_DELIMITER,
+        # while the rows above call this slot NULL_BARCODE -- confirm.
+        bcLabels = n.concatenate((bcFofn.barcodeLabels,
+                                  n.array([BARCODE_DELIMITER])))
+        H5.create_dataset(BC_INFO_ID, data = n.array(range(0, len(bcLabels))),
+                          dtype = 'int32')
+        H5.create_dataset(BC_INFO_NAME, data = bcLabels,
+                          dtype = h5.new_vlen(str))
+        alnDS = H5.create_dataset(BC_ALN_INFO_DS, data = bcDS, dtype = 'int32')
+        alnDS.attrs['ColumnNames'] = n.array(['count', 'index1', 'score1',
+                                              'index2', 'score2'])
+        #force BarcodeMode to have numpy dtype for CmpH5Sort 'extra datasets' routine
+        alnDS.attrs['BarcodeMode'] = n.array( bcFofn.scoreMode )
+    finally:
+        # Close the handle even if a dataset write fails.
+        H5.close()
+
+def zipFofns(*inFofns):
+    """Take inputFofns and return n tuples of length len(inFofns)
+    where n is the number of entries in each FOFN.
+
+    Each FOFN is sorted by movie name first, so entry i of every FOFN
+    refers to the same movie.
+
+    Raises Exception if the FOFNs differ in length or if the movie
+    names at some position disagree.
+    """
+    def readAndSort(inFile):
+        with open(inFile) as fh:
+            lines = n.array(fh.read().splitlines())
+        order = n.argsort([movieNameFromFile(fofnLine) for
+                           fofnLine in lines])
+        return lines[order]
+
+    sortedFofns = [readAndSort(inFofn) for inFofn in inFofns]
+    lengths = [len(sortedFofn) for sortedFofn in sortedFofns]
+    if len(set(lengths)) != 1:
+        raise Exception("Fofns don't match, unequal number of inputs.")
+
+    # Every FOFN has the same length here; use it as a plain int
+    # rather than handing a numpy array to xrange.
+    for i in xrange(0, lengths[0]):
+        movies = set(movieNameFromFile(sortedFofn[i]) for
+                     sortedFofn in sortedFofns)
+        if len(movies) != 1:
+            raise Exception("Fofn elements don't match, movies differ.")
+
+    # need to un-arrayify these guys
+    return zip(*map(list, sortedFofns))
+
+def filterZmws(zmwsForBCs):
+    """Apply various filterings passed by the user. There are somewhat
+    different semantics for CCS filtering and subread filtering in
+    terms of the raw primary metrics available, e.g.,
+    HQRegionStartTime is unavailable for the CCS data and somewhat
+    irrelevant.
+
+    `zmwsForBCs` maps a barcode label to a list of (zmw, labeledZmw)
+    pairs; the result has the same keys with only the passing pairs.
+    Thresholds are read from `runner.args`.
+    """
+    def getHQStart(zmw):
+        # Best effort: fall back to 0 when the metric cannot be read.
+        # Only Exception is caught, so KeyboardInterrupt and
+        # SystemExit still propagate.
+        try:
+            return zmw.zmwMetric('HQRegionStartTime')
+        except Exception:
+            return 0
+
+    def getReadScore(zmw):
+        return zmw.zmwMetric("ReadScore")
+
+    def molLenGuess(zmw):
+        # Longest subread when raw basecalls exist, otherwise the
+        # length of the CCS read; 0 when there is nothing to measure.
+        if zmw.baxH5.hasRawBasecalls:
+            return max(map(len, zmw.subreads)) if zmw.subreads else 0
+        else:
+            return len(zmw.ccsRead) if zmw.ccsRead else 0
+
+    def zmwFilterFx(tup):
+        zmw, lZmw = tup
+
+        mlGuess = molLenGuess(zmw)
+        if not mlGuess:
+            return False
+
+        avgScore    = lZmw.averageScore
+        numScored   = lZmw.nScored
+        scoreRatio  = lZmw.scoreRatio
+        hqStart     = getHQStart(zmw)
+        readScore   = getReadScore(zmw)
+
+        ## XXX : still need to detect the chimeras
+        rejected = (mlGuess < runner.args.minMaxInsertLength or
+                    hqStart > runner.args.hqStartTime or
+                    readScore < runner.args.minReadScore or
+                    avgScore < runner.args.minAvgBarcodeScore or
+                    numScored < runner.args.minNumBarcodes or
+                    scoreRatio < runner.args.minScoreRatio)
+        return not rejected
+
+    return { k:[tup for tup in v if zmwFilterFx(tup)]
+             for k,v in zmwsForBCs.items() }
+
+def _warnOnce():
+    """Return a function that logs its message as a warning on the
+    first call and does nothing on every later call."""
+    # One-element list so the nested function can flip the flag
+    # (Python 2 has no `nonlocal`).
+    fired = [False]
+    def warnOnce(msg):
+        if not fired[0]:
+            logging.warning(msg)
+        fired[0] = True
+    return warnOnce
+warnOnce = _warnOnce()
+
+def getFastqRecords(zmw, lZmw = None):
+    """Return a list of FastqRecords for the reads of `zmw`.
+
+    When the source file has both raw and consensus basecalls, the
+    `--subreads` option chooses between the subreads and the CCS
+    read; otherwise whichever read type the file has is used and the
+    option is ignored with a one-time warning. If `lZmw` is given,
+    the read score and average barcode score are appended to each
+    record name.
+    """
+    if zmw.baxH5.hasRawBasecalls and zmw.baxH5.hasConsensusBasecalls:
+        # Only examine this parameter when passed both.
+        if runner.args.subreads:
+            reads = zmw.subreads
+        else:
+            reads = [zmw.ccsRead]
+    elif zmw.baxH5.hasRawBasecalls:
+        if runner.args.subreads:
+            warnOnce("`subreads` argument is ignored when using >= 2.1 " +
+                     "bas.h5 data as input.")
+        reads = zmw.subreads
+    else:
+        if runner.args.subreads:
+            warnOnce("`subreads` argument is ignored when using >= 2.1 " +
+                     "ccs.h5 data as input.")
+        reads = [zmw.ccsRead]
+
+    extra = (" %g %g" % (round(zmw.zmwMetric("ReadScore"), 2),
+                         round(lZmw.averageScore, 2))) if lZmw else ""
+
+    # Falsy entries (e.g. a missing CCS read) are skipped.
+    return [FastqRecord(read.readName + extra,
+                        read.basecalls(),
+                        read.QualityValue()) for read in reads if read]
+
+def getFastqs():
+    """Return a dict mapping each barcode label to a flat list of
+    FastqRecords for the ZMWs that pass the user's filters."""
+    def avgZmwsPerBarcode(byBarcode):
+        # n.mean([]) is NaN, which "%d" cannot format; report 0 when
+        # no barcode has any ZMWs.
+        if not byBarcode:
+            return 0
+        return n.mean([len(v) for v in byBarcode.values()])
+
+    zmwsByBarcode = getZmwsForBarcodes()
+    logging.debug("Pre-filter: Average number of ZMWs per barcode: %d" %
+                  avgZmwsPerBarcode(zmwsByBarcode))
+
+    zmwsByBarcode = filterZmws(zmwsByBarcode)
+    logging.debug("Post-filter: Average number of ZMWs per barcode: %d" %
+                  avgZmwsPerBarcode(zmwsByBarcode))
+
+    def getReadData(zmws):
+        # Flatten the per-ZMW record lists into one list.
+        return [rec for zmw, lZmw in zmws
+                for rec in getFastqRecords(zmw, lZmw)]
+
+    return {k:getReadData(zmws) for k, zmws in zmwsByBarcode.iteritems()}
+
+def emitFastqs():
+    """Write one FASTA or FASTQ file per barcode into `--outDir`.
+
+    With `--unlabeledZmws`, reads of ZMWs that have no barcode label
+    go to an extra file named UNLABELED. `--trim` bases are removed
+    from both ends of every sequence and quality string. Barcodes
+    with no records produce no file.
+    """
+    args = runner.args
+    recordsByName = getFastqs()
+
+    if args.unlabeledZmws:
+        recordsByName['UNLABELED'] = getUnlabeledZmws()
+
+    if not os.path.exists(args.outDir):
+        os.makedirs(args.outDir)
+
+    if args.fasta:
+        writer, suffix = FastaWriter, 'a'
+        def record(name, seq, qv):
+            # FASTA output has no use for the quality values.
+            return FastaRecord(name, seq)
+    else:
+        writer, suffix = FastqWriter, 'q'
+        record = FastqRecord
+
+    for label, entries in recordsByName.items():
+        if not entries:
+            continue
+        with writer("%s/%s.fast%s" % (args.outDir, label, suffix)) as w:
+            for e in entries:
+                stop = len(e.sequence) - args.trim
+                r = record(e.name, e.sequence[args.trim:stop],
+                           e.quality[args.trim:stop])
+                if r:
+                    w.writeRecord(r)
+
+def getUnlabeledZmws():
+    """Return FASTQ records for ZMWs which do not have a barcode label.
+
+    The result is a flat list; it is empty when every sequencing ZMW
+    has a label.
+    """
+    unlabeledZmws = []
+
+    for basFile, barcodeFile in zipFofns(runner.args.inputFofn,
+                                         runner.args.barcodeFofn):
+        basH5 = BasH5Reader(basFile)
+        bcH5  = BarcodeH5Reader(barcodeFile)
+        # Sequencing hole numbers that are absent from the barcode file.
+        sdiff = basH5.sequencingZmws[~n.in1d(basH5.sequencingZmws,
+                                             bcH5.labeledZmws.keys())]
+        for hn in sdiff:
+            unlabeledZmws.append(basH5[hn])
+
+    # Flatten in one pass. A reduce() without an initial value raises
+    # TypeError on an empty list, and repeated list concatenation is
+    # quadratic.
+    return [rec for unlabeledZmw in unlabeledZmws
+            for rec in getFastqRecords(unlabeledZmw)]
+
+def getZmwsForBarcodes(labels = None):
+    """Collect (pbcore.io.Zmw, LabeledZmw) pairs per barcode label.
+
+    Walks the input and barcode FOFNs in parallel. When `labels` is
+    given, only barcode labels contained in it are processed. A label
+    appears in the returned dict only if it has at least one ZMW.
+    """
+    byLabel = {}
+    pairs = zipFofns(runner.args.inputFofn, runner.args.barcodeFofn)
+    for basFile, barcodeFile in pairs:
+        basH5 = BasH5Reader(basFile)
+        bcH5 = BarcodeH5Reader(barcodeFile)
+
+        wanted = bcH5.barcodeLabels
+        if labels:
+            wanted = [x for x in wanted if x in labels]
+            logging.info("Processing only: %s" % ",".join(wanted))
+
+        for label in wanted:
+            for lZmw in bcH5.labeledZmwsFromBarcodeLabel(label):
+                zmw = basH5[lZmw.holeNumber]
+                byLabel.setdefault(label, []).append((zmw, lZmw))
+
+    return byLabel
+
+def gconFunc(tp):
+    """Build the consensus for one barcode directory.
+
+    `tp` is a (rootDir, barcode) tuple; it is a single argument
+    because this function is mapped over a list of such tuples, in
+    one case through a multiprocessing Pool. Runs gcon.py and, unless
+    `--noQuiver` is set, blasr, samtoh5, loadPulses, cmph5tools.py
+    sort and quiver.
+
+    Returns a FastaRecord named after the barcode, or None when no
+    consensus sequence was produced.
+    """
+    rootDir, barcode = tp
+    bcdir = "/".join((rootDir, barcode))
+
+    def run(cmd):
+        # NOTE(review): paths are interpolated unquoted into a shell
+        # command line; directories with spaces or shell
+        # metacharacters will break it.
+        logging.debug(cmd)
+        status = subprocess.call(cmd, shell = True)
+        if status != 0:
+            logging.warning("Command exited with status %d: %s" %
+                            (status, cmd))
+
+    def firstSequence(fastaPath):
+        # Sequence of the first record, or None when the file is
+        # missing or has no records.
+        if not os.path.exists(fastaPath):
+            return None
+        records = list(FastaReader(fastaPath))
+        return records[0].sequence if records else None
+
+    ## call gcon
+    logging.info("In gconFunc for: %s" % barcode)
+
+    run("gcon.py r --min_cov 3 %s/subreads.fasta %s/seed_read.fasta -d %s" %
+        (bcdir, bcdir, bcdir))
+
+    ## check to see if the file is empty
+    if not firstSequence("%s/g_consensus.fa" % bcdir):
+        return None
+
+    ## check to see if we are going to run quiver
+    if not runner.args.noQuiver:
+        # setup the blasr / sam / quiver stuff.
+        logging.info("Setup regions file, now running blasr through quiver.")
+
+        run(('blasr %s %s/g_consensus.fa -nproc 1 -sam -regionTable %s/region.fofn -out ' +
+             '%s/aligned_reads.sam') % (runner.args.inputFofn, bcdir, bcdir, bcdir))
+
+        run('samtoh5 %s/aligned_reads.sam %s/g_consensus.fa %s/aligned_reads.cmp.h5' %
+            (bcdir, bcdir, bcdir))
+
+        run(('loadPulses %s %s/aligned_reads.cmp.h5 -byread -metrics ' +
+             'QualityValue,InsertionQV,MergeQV,DeletionQV,DeletionTag,SubstitutionTag,' +
+             'SubstitutionQV') % (runner.args.inputFofn, bcdir))
+
+        run('cmph5tools.py sort --inPlace %s/aligned_reads.cmp.h5' % bcdir)
+
+        run(('quiver -vv --algorithm quiver -p P4-C2.AllQVsMergingByChannelModel '
+             '%s/aligned_reads.cmp.h5 --outputFilename %s/q_consensus.fasta ' +
+             '--referenceFilename %s/g_consensus.fa') % (bcdir, bcdir, bcdir))
+        cFilename = 'q_consensus.fasta'
+    else:
+        cFilename = 'g_consensus.fa'
+
+    ## append results to output file.
+    bcCons = "%s/%s/%s" % (rootDir, barcode, cFilename)
+    sequence = firstSequence(bcCons)
+    if sequence is None:
+        return None
+    return FastaRecord(barcode, sequence)
+
+def subsampleReads(e):
+    """Return a random subset of the list `e`.
+
+    A positive `--nZmws` caps the subset at that many entries;
+    otherwise a `--subsample` below 1 keeps that fraction (rounded
+    down). With neither, all entries are returned in random order.
+    """
+    total = len(e)
+    logging.debug("starting with %d zmws" % total)
+
+    args = runner.args
+    if args.nZmws > 0:
+        k = min(args.nZmws, total)
+    elif args.subsample < 1:
+        k = int(total * args.subsample)
+    else:
+        k = total
+
+    chosen = random.sample(range(0, total), k)
+    logging.debug("subsampled down to: %d" % len(chosen))
+    return [e[j] for j in chosen]
+
+def callConsensus():
+    """Compute one consensus sequence per barcode.
+
+    Steps: gather, subsample and filter the ZMWs of each barcode; pick
+    a seed read (a CCS read when one is usable, otherwise a subread);
+    write seed, subreads and per-movie region files into a directory
+    per barcode under `--outDir`; run gconFunc on each directory; and
+    collect the results in `<outDir>/consensus.fa`. The per-barcode
+    directories are removed afterwards unless `--keepTmpDir` is set.
+
+    Raises ImportError if `pbtools.pbdagcon` cannot be imported.
+    """
+    def makeReadAndReads(zmwsForBC):
+        # zmwsForBC holds (zmw, labeledZmw, ccsZmw-or-None) triples.
+        ccsData = filter(lambda x:x, [zmw.ccsRead for _,_,zmw in zmwsForBC if zmw])
+        srData  = reduce(lambda x,y : x+y, [zmw.subreads for zmw,_,_ in
+                                            zmwsForBC if zmw], [])
+        if not srData and not ccsData:
+            return (None,None)
+
+        def getSeedRead(reads, lq = 80, uq = 90,
+                        sLambda = lambda x : -x.zmw.readScore):
+            # Best read (by sLambda) among those whose length lies
+            # between the lq-th and uq-th length percentiles.
+            if not reads:
+                # n.percentile cannot handle an empty list.
+                return None
+            lens = map(len, reads)
+            candidateRange = (n.percentile(lens, lq),
+                              n.percentile(lens, uq))
+            pfReads = [read for read,l in zip(reads, lens) if
+                       l >= candidateRange[0] and l <= candidateRange[1]]
+            pfReads.sort(key = sLambda)
+            return pfReads[0] if len(pfReads) else None
+
+        if ccsData:
+            ## all CCS reads should be the *same* length for an
+            ## amplicon. Let's take the middle ones
+            seedRead = getSeedRead(ccsData, lq = 30, uq = 70,
+                                   sLambda = lambda x: -x.zmw.numPasses)
+            if not seedRead:
+                seedRead = getSeedRead(srData)
+                logging.info("Unable to use a CCS read for the seed read.")
+            else:
+                logging.info("Using a CCS read for the seed read.")
+        else:
+            logging.info("Using a raw read for the seed read")
+            seedRead = getSeedRead(srData)
+
+        return (seedRead, srData)
+
+    # check to make sure that you have the necessary dependencies,
+    # i.e., hgap script, blasr, etc.
+    try:
+        import pbtools.pbdagcon
+    except ImportError:
+        raise ImportError("Unable to find dependency `pbdagcon` - please install.")
+
+    # retrieve ZMWs by barcode
+    if runner.args.barcode:
+        zmwsForBCs = getZmwsForBarcodes(runner.args.barcode)
+    else:
+        zmwsForBCs = getZmwsForBarcodes()
+
+    # subsample
+    zmwsForBCs = {k:subsampleReads(v) for k,v in zmwsForBCs.items()}
+
+    logging.info("unfiltered average zmws per barcode: %g" %
+                 n.round(n.mean(map(len, zmwsForBCs.values()))))
+
+    # filter ZMWs
+    zmwsForBCs = filterZmws(zmwsForBCs)
+
+    logging.info("filtered average zmws per barcode: %g" %
+                 n.round(n.mean(map(len, zmwsForBCs.values()))))
+
+    # now choose the best subread to seed the assembly
+    if runner.args.ccsFofn:
+        # XXX: This part depends on the filenames of the ccs and input
+        # fofns, this is essentially a workaround to the fact the the
+        # part isn't part of the API
+        with open(runner.args.ccsFofn) as ccsFofn:
+            ccsFiles = ccsFofn.read().splitlines()
+        ccsReaders = {movieNameFromFile(l):BasH5Reader(l) for l in ccsFiles}
+
+        # fill in the CCS spot.
+        for k,v in zmwsForBCs.items():
+            l = []
+            for zmw,lZmw in v:
+                r = ccsReaders[movieNameFromFile(zmw.baxH5.file.filename)]
+                l.append((zmw,lZmw,r[zmw.holeNumber]))
+            zmwsForBCs[k] = l
+    else:
+        # add none to the CCS spot.
+        zmwsForBCs = {k:[(zmw,lZmw,None) for zmw,lZmw in v]
+                      for k,v in zmwsForBCs.iteritems()}
+
+    readAndReads = { k:makeReadAndReads(v) for k,v in zmwsForBCs.items() }
+
+    # remove barcodes that don't have a seed read and a set of useable reads.
+    readAndReads = { k:v for k,v in readAndReads.items() if v[0] and v[1] }
+
+    # generate FASTA files
+    outDir = runner.args.outDir
+
+    for barcode, reads in readAndReads.items():
+        bcdir = '/'.join((outDir, barcode))
+        if not os.path.exists(bcdir):
+            os.makedirs(bcdir)
+
+        # emit the seeds to separate files
+        with FastaWriter("%s/seed_read.fasta" % bcdir) as w:
+            w.writeRecord(FastaRecord(reads[0].readName, reads[0].basecalls()))
+
+        subreads = reads[1]
+
+        # emit the subreads to a single file
+        with FastaWriter("%s/subreads.fasta" % bcdir) as w:
+            for r in subreads:
+                w.writeRecord(FastaRecord(r.readName, r.basecalls()))
+
+        # construct the region file by subsetting the ZMWs that you
+        # are interested in.
+        nfofn = []
+        for inFof, in zipFofns(runner.args.inputFofn):
+            bh5 = BaxH5Reader(inFof)
+            reg = bh5.file['/PulseData/Regions']
+            inMovie = filter(lambda z : z.baxH5.movieName == bh5.movieName,
+                             subreads)
+            # Rows of the region table whose first column is one of
+            # this barcode's hole numbers.
+            holes = n.in1d(reg[:,0], n.array([a.holeNumber for a in inMovie]))
+            if any(holes):
+                nreg = reg[holes,:]
+            else:
+                nreg = n.empty(shape = (0, reg.shape[1]), dtype = 'int32')
+
+            fname = "%s/%s.rgn.h5" % (bcdir, movieNameFromFile(inFof))
+            nfile = h5.File(fname, 'w')
+            try:
+                ndset = nfile.create_dataset('/PulseData/Regions', data = nreg,
+                                             maxshape = (None, None))
+                copyAttributes(reg, ndset)
+            finally:
+                nfile.close()
+            nfofn.append(fname)
+
+        with open('%s/region.fofn' % bcdir, 'w') as ofile:
+            ofile.write("\n".join(nfofn))
+
+    ## call gcon
+    outDirs  = [ (outDir, k) for k in readAndReads.keys() ]
+    # `<= 1` matches the labelZmws code path and keeps a process
+    # count of zero or less from reaching Pool().
+    if runner.args.nProcs <= 1:
+        outFasta = filter(lambda z: z, map(gconFunc, outDirs))
+    else:
+        pool = Pool(runner.args.nProcs)
+        try:
+            outFasta = filter(lambda z : z, pool.map(gconFunc, outDirs))
+        finally:
+            pool.close()
+            pool.join()
+
+    ## write the results
+    with FastaWriter('/'.join((outDir, "consensus.fa"))) as w:
+        for r in outFasta:
+            w.writeRecord(r)
+
+    ## optionally cleanup
+    if not runner.args.keepTmpDir:
+        for barcode, reads in readAndReads.items():
+             bcdir = '/'.join((outDir, barcode))
+             shutil.rmtree(bcdir)
+        
+
+class Pbbarcode(PBMultiToolRunner):
+    """Command-line front end with four sub-commands: labelZmws,
+    labelAlignments, emitFastqs and consensus."""
+    def __init__(self):
+        """Register the sub-commands and their options."""
+        desc = ['Utilities for labeling and annotating reads with barcode information.']
+        super(Pbbarcode, self).__init__('\n'.join(desc))
+        subparsers = self.subParsers
+
+        desc = ['Creates a barcode.h5 file from base h5 files.']
+        parser_m = subparsers.add_parser('labelZmws', description = "\n".join(desc),
+                                         help = 'Label zmws with barcode annotation',
+                                         formatter_class = \
+                                             argparse.ArgumentDefaultsHelpFormatter)
+        parser_m.add_argument('--outDir',
+                              help = 'Where to write the newly created barcode.h5 files.',
+                              default = os.getcwd())
+        parser_m.add_argument('--outFofn', help = 'Write to outFofn',
+                              default = 'barcode.fofn')
+        parser_m.add_argument('--adapterSidePad', help = 'Pad with adapterSidePad bases',
+                              default = 4, type = int)
+        parser_m.add_argument('--insertSidePad', help = 'Pad with insertSidePad bases',
+                              default = 4, type = int)
+        parser_m.add_argument('--scoreMode',
+                              help = 'The mode in which the barcodes should be scored.',
+                              choices = SCORE_MODES, default = 'symmetric', type = str)
+        parser_m.add_argument('--maxAdapters', type = int, default = 20,
+                              help = 'Only score the first maxAdapters')
+        parser_m.add_argument('--scoreFirst', action = 'store_true', default = False,
+                              help = 'Whether to try to score the leftmost barcode in a trace.')
+        parser_m.add_argument('--startTimeCutoff',
+                              help = 'Reads must start before this value in order to be ' + \
+                                  'included when scoreFirst is set.', type = float,
+                              default = 10.0)
+        parser_m.add_argument('--nZmws', type = int, default = -1,
+                              help = 'Use the first n ZMWs for testing')
+        parser_m.add_argument('--nProcs', type = int, default = 8,
+                              help = 'How many processes to use')
+        parser_m.add_argument('--saveExtendedInfo', action = 'store_true', default = False,\
+                                  help = 'Whether to save extended information to ' + \
+                                  'the barcode.h5 files; this information is useful for ' + \
+                                  'debugging and chimera detection')
+        parser_m.add_argument('barcodeFile', metavar = 'barcode.fasta',
+                              help = 'Input barcode fasta file')
+        parser_m.add_argument('inputFile', metavar = 'input.fofn',
+                              help = 'Input base fofn')
+
+        def addFilteringOpts(parser, justBarcode = False):
+            """Add the ZMW filter options to `parser`; with
+            `justBarcode` only the barcode-score filters are added."""
+            ## These are independent of the barcode scoring
+            if not justBarcode:
+                parser.add_argument('--minMaxInsertLength', default = 0, type = int,
+                                    help = "ZMW Filter: exclude ZMW if the longest subread " + \
+                                        "is less than this amount")
+                parser.add_argument('--hqStartTime', default = float("inf"), type = float,
+                                    help = "ZMW Filter: exclude ZMW if start time of HQ region " + \
+                                        "is greater than this value (seconds)")
+                parser.add_argument('--minReadScore', default = 0, type = float,
+                                    help = "ZMW Filter: exclude ZMW if readScore is less than " + \
+                                        "this value")
+
+            ## These obviously need the barcode score
+            parser.add_argument('--minAvgBarcodeScore', default = 0.0, type = float,
+                                help = "ZMW Filter: exclude ZMW if average barcode score " + \
+                                    "is less than this value")
+            parser.add_argument('--minNumBarcodes', default = 1, type = int,
+                                help = "ZMW Filter: exclude ZMW if number of barcodes observed " + \
+                                "is less than this value")
+            parser.add_argument('--minScoreRatio', default = 1.0, type = float,
+                                help = "ZMW Filter: exclude ZMWs whose best score divided by " + \
+                                    "the 2nd best score is less than this ratio")
+
+            # Not yet implemented
+            # parser.add_argument('--filterChimeras', default = False, action = 'store_true',
+            #                     help = "ZMW Filter: exclude ZMWs that appear to be chimeric")
+
+
+        desc = ['Adds information about barcode alignments to a cmp.h5 file',
+                'from a previous call to "labelZmws".']
+        parser_s = subparsers.add_parser('labelAlignments', description = "\n".join(desc),
+                                         help = "Label reads from a barcode or region h5 file",
+                                         formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+        addFilteringOpts(parser_s, justBarcode = True)
+        parser_s.add_argument('inputFofn', metavar = 'barcode.fofn',
+                              help = 'input barcode fofn file')
+        parser_s.add_argument('cmpH5', metavar = 'aligned_reads.cmp.h5',
+                              help = 'cmp.h5 file to add barcode labels')
+
+        desc = ['Takes a bas.h5 fofn and a barcode.h5 fofn and produces',
+                'a fast[a|q] file for each barcode.']
+        parser_s = subparsers.add_parser('emitFastqs', description = "\n".join(desc),
+                                         help = "Write fastq files",
+                                         formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+        parser_s.add_argument('--outDir', metavar = 'output.dir',
+                              help = 'output directory to write fastq files',
+                              default = os.getcwd())
+
+        parser_s.add_argument('--subreads',
+                              help = 'whether to produce fastq files for the subreads; ' + \
+                                  'the default is to use the CCS reads. This option only ' + \
+                                  'applies when input.fofn has both consensus and raw reads, ' + \
+                                  'otherwise the read type from input.fofn will be returned.',
+                              action = 'store_true',
+                              default = False)
+        parser_s.add_argument('--unlabeledZmws',
+                              help = 'whether to emit a fastq file for the unlabeled ZMWs.' + \
+                              ' These are the ZMWs where no adapters are found typically',
+                              action = 'store_true',
+                              default = False)
+
+        parser_s.add_argument('--trim', help = 'trim off barcodes and any excess constant sequence',
+                              default = 20, type = int)
+        parser_s.add_argument('--fasta', help = ('whether the files produced should be FASTA files as ' +
+                                                 'opposed to FASTQ'),
+                              action = 'store_true',
+                              default = False)
+        addFilteringOpts(parser_s)
+        parser_s.add_argument('inputFofn', metavar = 'input.fofn',
+                              help = 'input base or CCS fofn file')
+        parser_s.add_argument('barcodeFofn', metavar = 'barcode.fofn',
+                              help = 'input barcode.h5 fofn file')
+
+        desc = ['Compute consensus sequences for each barcode.']
+        parser_s = subparsers.add_parser('consensus', description = "\n".join(desc),
+                                         help = "Compute a consensus sequence for each barcode. " + \
+                                             "This command relies on the presence of pbdagcon",
+                                         formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+        parser_s.add_argument('--subsample', default = 1, type = float,
+                              help = "Subsample ZMWs")
+        parser_s.add_argument('--nZmws', default = -1, type = int,
+                              help = "Take n ZMWs")
+        parser_s.add_argument('--outDir', default = '.', type = str,
+                              help = "Use this directory to output results")
+        parser_s.add_argument('--keepTmpDir', action = 'store_true', default = False)
+        parser_s.add_argument('--ccsFofn', default = '', type = str,
+                              help = 'Obtain CCS data from ccsFofn instead of input.fofn')
+        parser_s.add_argument('--nProcs', default = 16, type = int,
+                              help = 'Use nProcs to execute.')
+        parser_s.add_argument('--noQuiver', action = 'store_true',
+                              default = False)
+        addFilteringOpts(parser_s)
+
+        parser_s.add_argument('inputFofn', metavar = 'input.fofn',
+                              help = 'input bas.h5 fofn file')
+        parser_s.add_argument('barcodeFofn', metavar = 'barcode.fofn',
+                              help = 'input bc.h5 fofn file')
+
+        parser_s.add_argument('--barcode', default = None, type = str, nargs = "+",
+                              help = "Use this to extract consensus for just the given barcode(s).")
+
+    def getVersion(self):
+        """Return the package version string."""
+        return  __version__
+
+    def run(self):
+        """Dispatch to the function implementing the chosen
+        sub-command; exit with status 1 for an unknown one."""
+        logging.debug("Arguments" + str(self.args))
+
+        if self.args.subCommand == 'labelZmws':
+            makeBarcodeFofnFromBasFofn()
+        elif self.args.subCommand == 'labelAlignments':
+            labelAlignments()
+        elif self.args.subCommand == 'emitFastqs':
+            emitFastqs()
+        elif self.args.subCommand == 'consensus':
+            callConsensus()
+        else:
+            sys.exit(1)
+
+# Module-level tool instance; the functions above read their options
+# from `runner.args`.
+runner = Pbbarcode()
+
+def main():
+    """The entry point for pbbarcode: run the tool and exit with the
+    value that `runner.start()` returns."""
+    exitCode = runner.start()
+    sys.exit(exitCode)
+          
+#if __name__ == '__main__':    
+#    runner = Pbbarcode()
+#    sys.exit(runner.start())
diff --git a/tests/cram/consensus.t.disabled b/tests/cram/consensus.t.disabled
new file mode 100644
index 0000000..5bd9285
--- /dev/null
+++ b/tests/cram/consensus.t.disabled
@@ -0,0 +1,88 @@
+  $ export INH5=`python -c "from pbcore import data ; print data.getCmpH5()"`
+  $ export INBH51=`python -c "from pbcore import data ; print data.getBasH5s()[0]"`
+  $ export INBH52=`python -c "from pbcore import data ; print data.getBasH5s()[1]"`
+  $ export BARCODE_FASTA=$TESTDIR/../../etc/barcode.fasta
+  $ echo $INBH51 > bas.fofn
+  $ echo $INBH52 >> bas.fofn
+  $ pbbarcode labelZmws $BARCODE_FASTA bas.fofn
+  $ pbbarcode consensus bas.fofn barcode.fofn
+  [INFO] 2013-08-02T00:28:46 [blasr] started.
+  [INFO] 2013-08-02T00:28:46[INFO]  [blasr] started.
+  2013-08-02T00:28:46 [blasr] started.
+  [INFO] 2013-08-02T00:28:46 [blasr] ended.
+  [INFO] 2013-08-02T00:28:46 [blasr] ended.
+  [INFO] 2013-08-02T00:28:46 [blasr] ended.
+  [INFO] 2013-08-02T00:28:46 [blasr] started.
+  [INFO] 2013-08-02T00:28:46 [blasr] ended.
+  [INFO] 2013-08-02T00:28:46 [blasr] started.
+  [INFO] 2013-08-02T00:28:46 [blasr] started.
+  [INFO] 2013-08-02T00:28:46 [blasr] ended.
+  [INFO] 2013-08-02T00:28:46 [blasr] ended.
+  [INFO] 2013-08-02T00:28:46 [blasr] started.
+  [INFO] 2013-08-02T00:28:46 [blasr] started.
+  [INFO] 2013-08-02T00:28:46 [blasr] started.
+  [INFO] 2013-08-02T00:28:46 [blasr] ended.
+  [INFO] 2013-08-02T00:28:46 [blasr] ended.
+  [INFO] 2013-08-02T00:28:46 [blasr] ended.
+  [INFO] 2013-08-02T00:28:46 [blasr] started.
+  [INFO] 2013-08-02T00:28:46 [blasr] ended.
+  [INFO] 2013-08-02T00:28:46 [blasr] started.
+  [INFO] 2013-08-02T00:28:46 [blasr] started.
+  [INFO] 2013-08-02T00:28:46 [blasr] started.
+  [INFO] 2013-08-02T00:28:46 [blasr] ended.
+  [INFO] 2013-08-02T00:28:46 [blasr] ended.
+  [INFO] 2013-08-02T00:28:46 [blasr] ended.
+  [INFO] 2013-08-02T00:28:46 [blasr] started.
+  [INFO] 2013-08-02T00:28:46 [blasr] started.
+  [INFO] 2013-08-02T00:28:46 [blasr] started.
+  [INFO] 2013-08-02T00:28:46 [blasr] ended.
+  [INFO] 2013-08-02T00:28:46 [blasr] ended.
+  [INFO] 2013-08-02T00:28:46 [blasr] ended.
+  [INFO] 2013-08-02T00:28:46 [blasr] started.
+  [INFO] 2013-08-02T00:28:46 [blasr] started.
+  [INFO] 2013-08-02T00:28:46 [blasr] started.
+  [INFO] 2013-08-02T00:28:46 [blasr] started.
+  [INFO] 2013-08-02T00:28:46 [blasr] ended.
+  [INFO] 2013-08-02T00:28:46 [blasr] ended.
+  [INFO] 2013-08-02T00:28:46 [blasr] ended.
+  [INFO] 2013-08-02T00:28:46 [blasr] ended.
+  [INFO] [INFO] 2013-08-02T00:28:462013-08-02T00:28:46 [blasr] started. [blasr] started.
+  
+  [INFO] 2013-08-02T00:28:46 [blasr] started.
+  [INFO] 2013-08-02T00:28:46 [blasr] started.
+  [INFO] 2013-08-02T00:28:46 [blasr] ended.
+  [INFO] 2013-08-02T00:28:46 [blasr] ended.
+  [INFO] 2013-08-02T00:28:46 [blasr] ended.
+  [INFO] 2013-08-02T00:28:46 [blasr] ended.
+  [INFO] 2013-08-02T00:28:46 [blasr] started.
+  [INFO] 2013-08-02T00:28:46 [blasr] started.
+  [INFO] 2013-08-02T00:28:46 [blasr] started.
+  [INFO] 2013-08-02T00:28:46 [blasr] started.
+  [INFO] 2013-08-02T00:28:46 [blasr] ended.
+  [INFO] 2013-08-02T00:28:46 [blasr] ended.
+  [INFO] 2013-08-02T00:28:46 [blasr] ended.
+  [INFO] 2013-08-02T00:28:46 [blasr] ended.
+  [INFO] 2013-08-02T00:28:46 [blasr] started.
+  [INFO] 2013-08-02T00:28:46 [blasr] started.
+  [INFO] 2013-08-02T00:28:46 [blasr] started.
+  [INFO] 2013-08-02T00:28:46 [blasr] started.
+  [INFO] 2013-08-02T00:28:46 [blasr] ended.
+  [INFO] 2013-08-02T00:28:46 [blasr] ended.
+  [INFO] 2013-08-02T00:28:46 [blasr] ended.
+  [INFO] 2013-08-02T00:28:46 [blasr] ended.
+  [INFO] 2013-08-02T00:28:46 [blasr] started.
+  [INFO] 2013-08-02T00:28:46 [blasr] started.
+  [INFO] 2013-08-02T00:28:46 [blasr] started.
+  [INFO] 2013-08-02T00:28:46 [blasr] started.
+  [INFO] 2013-08-02T00:28:46 [blasr] ended.
+  [INFO] 2013-08-02T00:28:46 [blasr] ended.
+  [INFO] 2013-08-02T00:28:46 [blasr] ended.
+  [INFO] 2013-08-02T00:28:46 [blasr] ended.
+  [INFO] 2013-08-02T00:28:46 [blasr] started.
+  [INFO] 2013-08-02T00:28:46 [blasr] started.
+  [INFO] 2013-08-02T00:28:46 [blasr] ended.
+  [INFO] 2013-08-02T00:28:46 [blasr] ended.
+  [INFO] 2013-08-02T00:28:46 [blasr] started.
+  [INFO] 2013-08-02T00:28:46 [blasr] ended.
+  [INFO] 2013-08-02T00:28:46 [blasr] started.
+  [INFO] 2013-08-02T00:28:46 [blasr] ended.
diff --git a/tests/cram/sanity.t b/tests/cram/sanity.t
new file mode 100644
index 0000000..310c4db
--- /dev/null
+++ b/tests/cram/sanity.t
@@ -0,0 +1,55 @@
+  $ export INH5=`python -c "from pbcore import data ; print data.getCmpH5()"`
+  $ export INBH51=`python -c "from pbcore import data ; print data.getBasH5s()[0]"`
+  $ export INBH52=`python -c "from pbcore import data ; print data.getBasH5s()[1]"`
+  $ export BARCODE_FASTA=$TESTDIR/../../etc/barcode.fasta
+  $ echo $INBH51 > bas.fofn
+  $ echo $INBH52 >> bas.fofn
+  $ pbbarcode labelZmws $BARCODE_FASTA bas.fofn
+  $ pbbarcode labelZmws --scoreMode paired $BARCODE_FASTA bas.fofn
+  $ pbbarcode labelZmws --scoreMode paired --scoreFirst $BARCODE_FASTA bas.fofn
+  $ pbbarcode labelZmws --scoreMode paired --scoreFirst --adapterSidePad 0 --insertSidePad 0 $BARCODE_FASTA bas.fofn
+  $ pbbarcode emitFastqs --fasta bas.fofn barcode.fofn
+  $ pbbarcode emitFastqs --trim 20 bas.fofn barcode.fofn
+  $ pbbarcode emitFastqs --subreads --trim 20 bas.fofn barcode.fofn
+  $ cp $INH5 ./aligned_reads.cmp.h5         
+  $ chmod 766 ./aligned_reads.cmp.h5
+  $ pbbarcode labelAlignments barcode.fofn aligned_reads.cmp.h5  
+Check that same holes get the same barcode (consistent scoring)
+  $ cmph5tools.py stats --what "(Movie,HoleNumber,Barcode,AverageBarcodeScore)" aligned_reads.cmp.h5 | uniq
+                         Movie                      Barcode                    AverageBarcodeScore                    HoleNumber
+  m110818_075520_42141_c100129202555500000315043109121112_s2_p0                    bc5--bc10                                  13.00                    3008                          
+  m110818_075520_42141_c100129202555500000315043109121112_s1_p0                    bc5--bc10                                  12.50                    2001                          
+  m110818_075520_42141_c100129202555500000315043109121112_s1_p0                    bc5--bc10                                  12.00                    4009                          
+  m110818_075520_42141_c100129202555500000315043109121112_s1_p0                     bc3--bc4                                  12.57                    2008                          
+  m110818_075520_42141_c100129202555500000315043109121112_s2_p0                    bc5--bc10                                  14.33                    3006                          
+  m110818_075520_42141_c100129202555500000315043109121112_s2_p0                     bc3--bc4                                  12.00                    1000                          
+  m110818_075520_42141_c100129202555500000315043109121112_s2_p0                    bc5--bc10                                  12.00                    4004                          
+  m110818_075520_42141_c100129202555500000315043109121112_s1_p0                    bc5--bc10                                  14.50                    1006                          
+  m110818_075520_42141_c100129202555500000315043109121112_s1_p0                    bc5--bc10                                  12.00                    4006                          
+  m110818_075520_42141_c100129202555500000315043109121112_s2_p0                    bc5--bc10                                  13.33                    2006                          
+  m110818_075520_42141_c100129202555500000315043109121112_s2_p0                    bc5--bc10                                  12.67                    3002                          
+  m110818_075520_42141_c100129202555500000315043109121112_s2_p0                    bc5--bc10                                  13.33                    2006                          
+  m110818_075520_42141_c100129202555500000315043109121112_s2_p0                    bc5--bc10                                  12.67                    1009                          
+  m110818_075520_42141_c100129202555500000315043109121112_s2_p0                    bc5--bc10                                  12.67                    3002                          
+  m110818_075520_42141_c100129202555500000315043109121112_s2_p0                    bc5--bc10                                  12.67                    1009                          
+  m110818_075520_42141_c100129202555500000315043109121112_s1_p0                    bc5--bc10                                  13.33                    1000                          
+  m110818_075520_42141_c100129202555500000315043109121112_s1_p0                    bc5--bc10                                  12.33                    1007                          
+  m110818_075520_42141_c100129202555500000315043109121112_s2_p0                    bc5--bc10                                  12.50                    9                             
+  m110818_075520_42141_c100129202555500000315043109121112_s2_p0                     bc3--bc4                                  13.00                    1004                          
+  m110818_075520_42141_c100129202555500000315043109121112_s2_p0                    bc5--bc10                                  12.00                    2002                          
+  m110818_075520_42141_c100129202555500000315043109121112_s2_p0                     bc3--bc4                                  12.80                    2004                          
+  m110818_075520_42141_c100129202555500000315043109121112_s2_p0                     bc3--bc4                                  12.00                    4007                          
+  m110818_075520_42141_c100129202555500000315043109121112_s2_p0                     bc3--bc4                                  12.80                    2004                          
+  m110818_075520_42141_c100129202555500000315043109121112_s1_p0                     bc3--bc4                                  12.00                    3008                          
+  m110818_075520_42141_c100129202555500000315043109121112_s2_p0                     bc3--bc4                                  14.33                    2009                          
+  m110818_075520_42141_c100129202555500000315043109121112_s1_p0                    bc5--bc10                                  14.50                    2007                          
+  m110818_075520_42141_c100129202555500000315043109121112_s1_p0                     bc3--bc4                                  12.57                    2008                          
+  m110818_075520_42141_c100129202555500000315043109121112_s2_p0                     bc3--bc4                                  16.00                    1002                          
+  m110818_075520_42141_c100129202555500000315043109121112_s2_p0                     bc3--bc4                                  13.33                    1008                          
+  m110818_075520_42141_c100129202555500000315043109121112_s2_p0                    bc5--bc10                                  12.50                    9                             
+  m110818_075520_42141_c100129202555500000315043109121112_s2_p0                     bc3--bc4                                  14.00                    2000                          
+  m110818_075520_42141_c100129202555500000315043109121112_s1_p0                     bc3--bc4                                  11.67                    9                             
+  m110818_075520_42141_c100129202555500000315043109121112_s2_p0                     bc3--bc4                                  14.00                    2000                          
+  m110818_075520_42141_c100129202555500000315043109121112_s1_p0                     bc3--bc4                                  11.67                    9                             
+  m110818_075520_42141_c100129202555500000315043109121112_s2_p0                     bc3--bc4                                  14.33                    8                             
+  m110818_075520_42141_c100129202555500000315043109121112_s1_p0                     bc3--bc4                                  14.33                    2003                          
diff --git a/tests/test_basic.py b/tests/test_basic.py
new file mode 100755
index 0000000..064bf71
--- /dev/null
+++ b/tests/test_basic.py
@@ -0,0 +1,32 @@
+import logging
+import unittest
+
+# this is purely for the coverage to not fail when it's generated
+import pbbarcode
+
+log = logging.getLogger(__name__)
+
+
+class TestBasic(unittest.TestCase):
+    def test_01(self):
+        """Placeholder test that always passes, so Jenkins will generate a coverage report."""
+        self.assertTrue(True)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/pbbarcode.git



More information about the debian-med-commit mailing list