[med-svn] [pbbarcode] 01/02: Imported Upstream version 0.8.0
Afif Elghraoui
afif-guest at moszumanska.debian.org
Tue Dec 15 08:13:47 UTC 2015
This is an automated email from the git hooks/post-receive script.
afif-guest pushed a commit to branch master
in repository pbbarcode.
commit 266491af26faabf51816b3a5314c77c8f525b06d
Author: Afif Elghraoui <afif at ghraoui.name>
Date: Sun Nov 29 01:36:48 2015 -0800
Imported Upstream version 0.8.0
---
Makefile | 47 ++
README.rst | 36 ++
doc/Makefile | 153 +++++++
doc/PbbarcodeFunctionalSpecification.rst | 405 +++++++++++++++++
doc/conf.py | 242 ++++++++++
doc/index.rst | 16 +
etc/barcode.fasta | 8 +
etc/barcode_complete.fasta | 192 ++++++++
etc/pacbio_barcodes_paired.fasta | 192 ++++++++
setup.py | 37 ++
src/C/Makefile | 11 +
src/C/sw.c | 56 +++
src/python/pbbarcode/BarcodeLabeler.py | 225 +++++++++
src/python/pbbarcode/SWaligner.py | 69 +++
src/python/pbbarcode/__init__.py | 0
src/python/pbbarcode/_version.py | 1 +
src/python/pbbarcode/main.py | 751 +++++++++++++++++++++++++++++++
tests/cram/consensus.t.disabled | 88 ++++
tests/cram/sanity.t | 55 +++
tests/test_basic.py | 32 ++
20 files changed, 2616 insertions(+)
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..982f5ce
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,47 @@
+.PHONY: all build bdist install develop test clean doc doc-clean pip-install
+# All targets are commands; "build" must be phony because setup.py creates build/.
+SHELL = /bin/bash -e
+
+all: build install
+
+build:  ## build the package with setup.py
+	python setup.py build --executable="/usr/bin/env python"
+
+bdist:  ## build, then package a binary egg
+	python setup.py build --executable="/usr/bin/env python"
+	python setup.py bdist --formats=egg
+
+install:  ## install with setup.py
+	python setup.py install
+
+develop:  ## setuptools development-mode install
+	python setup.py develop
+
+test:  ## run the nose tests, then the cram tests (consensus.t filtered out)
+	find tests -name "*.py" | xargs nosetests
+	find tests/cram -name "*.t" | grep -v consensus.t | xargs cram --verbose
+
+clean: doc-clean  ## remove build products; $(MAKE) keeps -j/-n flags in sub-makes
+	rm -rf build/;\
+	find . -name "*.egg-info" | xargs rm -rf;\
+	find . -name "*.pyc" | xargs rm -rf;\
+	rm -rf dist/
+	$(MAKE) -C src/C clean
+
+doc-clean:  ## remove generated documentation
+	$(MAKE) -C doc clean
+
+doc:  ## build the HTML documentation
+	$(MAKE) -C doc html
+
+pip-install:  ## reinstall with pip; an empty PREFIX would put scripts in /bin
+	$(if $(strip $(PREFIX)),,$(error PREFIX is empty; set it so scripts install to PREFIX/bin))
+	@which pip > /dev/null
+	@pip freeze|grep 'pbtools.barcode=='>/dev/null \
+		&& pip uninstall -y pbtools.barcode \
+		|| true
+	@pip freeze|grep 'pbbarcode=='>/dev/null \
+		&& pip uninstall -y pbbarcode \
+		|| true
+	@pip install --no-index \
+		--install-option="--install-scripts=$(PREFIX)/bin" ./
diff --git a/README.rst b/README.rst
new file mode 100644
index 0000000..42610c7
--- /dev/null
+++ b/README.rst
@@ -0,0 +1,36 @@
+Overview of the pbbarcode package
+=================================
+
+The *pbbarcode* package provides tools for annotating PacBio
+sequencing reads with barcode information. Typically, *pbbarcode*
+is called in the context of a SMRTPipe workflow rather than directly on
+the command line; however, users are encouraged to use the
+command-line utility directly, as more options are available.
+
+The *pbbarcode* package provides a multi-command line tool
+*pbbarcode* which currently has the following sub-commands:
+
+* labelZmws
+* labelAlignments
+* emitFastqs
+* consensus
+
+The first three sub-commands depend on only *pbcore* and its
+dependencies, the fourth, *consensus*, depends on the *pbdagcon*
+package and is considered experimental.
+
+For more details on the package, please see
+doc/index.rst.
+
+Installation
+============
+
+Typically, the *pbbarcode* package is installed within an installation
+of SMRTPipe, however, it can be installed by itself using::
+
+ make install
+
+To test that everything is installed correctly, one should
+additionally issue a::
+
+ make test
diff --git a/doc/Makefile b/doc/Makefile
new file mode 100644
index 0000000..a37efe5
--- /dev/null
+++ b/doc/Makefile
@@ -0,0 +1,153 @@
+# Makefile for Sphinx documentation
+# Recursive invocations use $(MAKE) so that flags such as -j and -n propagate.
+
+# You can set these variables from the command line.
+SPHINXOPTS =
+SPHINXBUILD = sphinx-build
+PAPER =
+BUILDDIR = _build
+
+# Internal variables.
+PAPEROPT_a4 = -D latex_paper_size=a4
+PAPEROPT_letter = -D latex_paper_size=letter
+ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
+# the i18n builder cannot share the environment and doctrees with the others
+I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
+
+.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man texinfo info changes linkcheck doctest gettext
+
+help:
+	@echo "Please use \`make <target>' where <target> is one of"
+	@echo " html to make standalone HTML files"
+	@echo " dirhtml to make HTML files named index.html in directories"
+	@echo " singlehtml to make a single large HTML file"
+	@echo " pickle to make pickle files"
+	@echo " json to make JSON files"
+	@echo " htmlhelp to make HTML files and a HTML help project"
+	@echo " qthelp to make HTML files and a qthelp project"
+	@echo " devhelp to make HTML files and a Devhelp project"
+	@echo " epub to make an epub"
+	@echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
+	@echo " latexpdf to make LaTeX files and run them through pdflatex"
+	@echo " text to make text files"
+	@echo " man to make manual pages"
+	@echo " texinfo to make Texinfo files"
+	@echo " info to make Texinfo files and run them through makeinfo"
+	@echo " gettext to make PO message catalogs"
+	@echo " changes to make an overview of all changed/added/deprecated items"
+	@echo " linkcheck to check all external links for integrity"
+	@echo " doctest to run all doctests embedded in the documentation (if enabled)"
+
+clean:
+	-rm -rf $(BUILDDIR)/*
+
+html:
+	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
+	@echo
+	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
+
+dirhtml:
+	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
+	@echo
+	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
+
+singlehtml:
+	$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
+	@echo
+	@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
+
+pickle:
+	$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
+	@echo
+	@echo "Build finished; now you can process the pickle files."
+
+json:
+	$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
+	@echo
+	@echo "Build finished; now you can process the JSON files."
+
+htmlhelp:
+	$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
+	@echo
+	@echo "Build finished; now you can run HTML Help Workshop with the" \
+	      ".hhp project file in $(BUILDDIR)/htmlhelp."
+
+qthelp:
+	$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
+	@echo
+	@echo "Build finished; now you can run \"qcollectiongenerator\" with the" \
+	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
+	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/pbbarcode.qhcp"
+	@echo "To view the help file:"
+	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/pbbarcode.qhc"
+
+devhelp:
+	$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
+	@echo
+	@echo "Build finished."
+	@echo "To view the help file:"
+	@echo "# mkdir -p $$HOME/.local/share/devhelp/pbbarcode"
+	@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/pbbarcode"
+	@echo "# devhelp"
+
+epub:
+	$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
+	@echo
+	@echo "Build finished. The epub file is in $(BUILDDIR)/epub."
+
+latex:
+	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+	@echo
+	@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
+	@echo "Run \`make' in that directory to run these through (pdf)latex" \
+	      "(use \`make latexpdf' here to do that automatically)."
+
+latexpdf:
+	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+	@echo "Running LaTeX files through pdflatex..."
+	$(MAKE) -C $(BUILDDIR)/latex all-pdf
+	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
+
+text:
+	$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
+	@echo
+	@echo "Build finished. The text files are in $(BUILDDIR)/text."
+
+man:
+	$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
+	@echo
+	@echo "Build finished. The manual pages are in $(BUILDDIR)/man."
+
+texinfo:
+	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
+	@echo
+	@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
+	@echo "Run \`make' in that directory to run these through makeinfo" \
+	      "(use \`make info' here to do that automatically)."
+
+info:
+	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
+	@echo "Running Texinfo files through makeinfo..."
+	$(MAKE) -C $(BUILDDIR)/texinfo info
+	@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
+
+gettext:
+	$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
+	@echo
+	@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
+
+changes:
+	$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
+	@echo
+	@echo "The overview file is in $(BUILDDIR)/changes."
+
+linkcheck:
+	$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
+	@echo
+	@echo "Link check complete; look for any errors in the above output " \
+	      "or in $(BUILDDIR)/linkcheck/output.txt."
+
+doctest:
+	$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
+	@echo "Testing of doctests in the sources finished, look at the " \
+	      "results in $(BUILDDIR)/doctest/output.txt."
diff --git a/doc/PbbarcodeFunctionalSpecification.rst b/doc/PbbarcodeFunctionalSpecification.rst
new file mode 100644
index 0000000..b00fe1f
--- /dev/null
+++ b/doc/PbbarcodeFunctionalSpecification.rst
@@ -0,0 +1,405 @@
+.. pbbarcode Functional Specification
+.. =======================================
+
+.. Version
+
+
+Introduction
+````````````
+This document describes the interface and input/output formats of the
+``pbbarcode`` package command line tools. The package provides
+utilities for annotating individual ZMWs directly from a bas.h5 file,
+emitting fast[a|q] files for each barcode, labeling alignments stored
+in a cmp.h5 file, and calling consensus on small amplicons (requires
+``pbdagcon``)
+
+At the moment, Barcodes can be scored in two different ways:
+``symmetric`` and ``paired``. Symmetric mode supports barcode designs
+with two identical barcodes on both sides of a SMRTbell, e.g., for
+barcodes (A, B), molecules are labeled as A--A or B--B. The ``paired``
+mode supports designs with two distinct barcodes on each side of the
+molecule, but neither barcode appears without its mate. The minimum
+example is given with the following barcodes: (ALeft, ARight, BLeft,
+BRight), where the following barcode sets are checked: ALeft--ARight,
+BLeft--BRight.
+
+It is important to highlight that a barcode FASTA file specifies a
+list of available barcodes to evaluate. Depending on the scoring mode,
+the barcodes are grouped together in different ways. For instance, in
+the ``symmetric`` case, the number of possible barcode outcomes is
+simply the number of barcodes that are supplied to the routine in the
+FASTA file (see below for usage) plus an additional ``NULL`` barcode
+indicating that no barcode could be evaluated (denoted by:
+'--'). Labels like this (A--A) are used in the final outputs. In the
+``paired`` mode, the number of possible barcode outcomes is half the
+number of the sequences in the FASTA file plus the ``NULL``
+barcode. The ``NULL`` barcode indicates that no attempt was made to
+score the molecule or it was filtered out by the user's criteria. The
+majority of cases when a molecule is not scored are related to not
+observing any adapters. If a user has executed a "hot-start" run, the
+user can try the '--scoreFirst' parameter to attempt to label the
+first adapter's barcode. This increases the yield of the labeling
+procedure at the expense of some probable false positives.
+
+The software is implemented as a standard python package. Barcodes are
+labeled according to the following high-level logic. For each
+molecule, all adapters are found. For each adapter, we align (using
+standard Smith-Waterman alignment) each barcode and its reverse
+complement to the flanking sequence of the adapter. If two complete
+flanking sequences are available, we divide by 2, else 1 if only one
+flanking sequence was available (average score at adapter). This
+allows the scores across adapters to be on the same scale (chimera
+detection). Depending on the ``mode``, we then determine which
+barcode(s) are maximally scoring. We store the two maximally scoring
+barcodes and the sum of their alignment scores across the adapters. The
+average barcode score then can be given approximately by:
+total-score/number-of-adapters. At the moment, the alignment
+parameters are fixed at:
+
+
+.. table:: SW Match Parameters
++----------+----------+
+|type |score |
+| | |
++----------+----------+
+|insertion |-1 |
+| | |
++----------+----------+
+|deletion |-1 |
+| | |
++----------+----------+
+|mismatch |-2 |
+| | |
++----------+----------+
+|match |2 |
+| | |
++----------+----------+
+
+Input and output
+````````````````
+
+labelZmws
+---------
+ usage: pbbarcode labelZmws [-h] [--outDir OUTDIR] [--outFofn OUTFOFN]
+ [--adapterSidePad ADAPTERSIDEPAD]
+ [--insertSidePad INSERTSIDEPAD]
+ [--scoreMode {symmetric,paired}]
+ [--maxAdapters MAXADAPTERS] [--scoreFirst]
+ [--startTimeCutoff STARTTIMECUTOFF]
+ [--nZmws NZMWS] [--nProcs NPROCS]
+ [--saveExtendedInfo]
+ barcode.fasta input.fofn
+
+ Creates a barcode.h5 file from base h5 files.
+
+ positional arguments:
+ barcode.fasta Input barcode fasta file
+ input.fofn Input base fofn
+
+ optional arguments:
+ -h, --help show this help message and exit
+ --outDir OUTDIR Where to write the newly created barcode.h5 files.
+ (default: /home/UNIXHOME/jbullard/projects/software/bi
+ oinformatics/tools/pbbarcode/doc)
+ --outFofn OUTFOFN Write to outFofn (default: barcode.fofn)
+ --adapterSidePad ADAPTERSIDEPAD
+ Pad with adapterSidePad bases (default: 4)
+ --insertSidePad INSERTSIDEPAD
+ Pad with insertSidePad bases (default: 4)
+ --scoreMode {symmetric,paired}
+ The mode in which the barcodes should be scored.
+ (default: symmetric)
+ --maxAdapters MAXADAPTERS
+ Only score the first maxAdapters (default: 20)
+ --scoreFirst Whether to try to score the leftmost barcode in a
+ trace. (default: False)
+ --startTimeCutoff STARTTIMECUTOFF
+ Reads must start before this value in order to be
+ included when scoreFirst is set. (default: 10.0)
+ --nZmws NZMWS Use the first n ZMWs for testing (default: -1)
+ --nProcs NPROCS How many processes to use (default: 8)
+ --saveExtendedInfo Whether to save extended information tothe barcode.h5
+ files; this information is useful for debugging and
+ chimera detection (default: False)
+
+The ``labelZmws`` command takes an input.fofn representing a set of
+bas.h5 files to operate on. Additionally, it takes a barcode.fasta
+file. Depending on ``scoreMode``, the FASTA file will be processed in
+different ways. Specifically, in ``paired`` mode, each two consecutive
+barcodes in the file are considered a set.
+
+The parameters ``adapterSidePad`` and ``insertSidePad`` represent
+how many bases should be considered on each side of the putative
+barcode. These parameters are constrained such that:
+``|adapterSidePad| + |insertSidePad| + |barcode| < 65``.
+
+Users have the option to specify a different output location
+for the various outputs. Specifically, for each bas.h5 file in
+input.fofn, a bc.h5 (barcode hdf5) file is generated. These files are
+listed in the file ``outFofn`` which is typically just called
+``barcode.fofn``. See below for a description of the barcode hdf5
+file.
+
+
+labelAlignments
+---------------
+ usage: pbbarcode labelAlignments [-h]
+ [--minAvgBarcodeScore MINAVGBARCODESCORE]
+ [--minNumBarcodes MINNUMBARCODES]
+ [--minScoreRatio MINSCORERATIO]
+ barcode.fofn aligned_reads.cmp.h5
+
+ Adds information about barcode alignments to a cmp.h5 file from a previous
+ call to "labelZmws".
+
+ positional arguments:
+ barcode.fofn input barcode fofn file
+ aligned_reads.cmp.h5 cmp.h5 file to add barcode labels
+
+ optional arguments:
+ -h, --help show this help message and exit
+ --minAvgBarcodeScore MINAVGBARCODESCORE
+ ZMW Filter: exclude ZMW if average barcode score is
+ less than this value (default: 0.0)
+ --minNumBarcodes MINNUMBARCODES
+ ZMW Filter: exclude ZMW if number of barcodes observed
+ is less than this value (default: 1)
+ --minScoreRatio MINSCORERATIO
+ ZMW Filter: exclude ZMWs whose best score divided by
+ the 2nd best score is less than this ratio (default:
+ 1.0)
+
+
+The ``labelAlignments`` command takes as input a barcode.fofn computed
+from a call to ``labelZmws`` and a cmp.h5 file where the barcode
+information is written to. See below for a description of the cmp.h5
+file additions.
+
+
+
+emitFastqs
+----------
+ usage: pbbarcode emitFastqs [-h] [--outDir output.dir] [--subreads]
+ [--unlabeledZmws] [--trim TRIM] [--fasta]
+ [--minMaxInsertLength MINMAXINSERTLENGTH]
+ [--hqStartTime HQSTARTTIME]
+ [--minReadScore MINREADSCORE]
+ [--minAvgBarcodeScore MINAVGBARCODESCORE]
+ [--minNumBarcodes MINNUMBARCODES]
+ [--minScoreRatio MINSCORERATIO]
+ input.fofn barcode.fofn
+
+ Takes a bas.h5 fofn and a barcode.h5 fofn and produces a fast[a|q] file for
+ each barcode.
+
+ positional arguments:
+ input.fofn input base or CCS fofn file
+ barcode.fofn input barcode.h5 fofn file
+
+ optional arguments:
+ -h, --help show this help message and exit
+ --outDir output.dir output directory to write fastq files (default: /home/
+ UNIXHOME/jbullard/projects/software/bioinformatics/too
+ ls/pbbarcode/doc)
+ --subreads whether to produce fastq files for the subreads;the
+ default is to use the CCS reads. This option
+ onlyapplies when input.fofn has both consensus and raw
+ reads,otherwise the read type from input.fofn will be
+ returned. (default: False)
+ --unlabeledZmws whether to emit a fastq file for the unlabeled ZMWs.
+ These are the ZMWs where no adapters are found
+ typically (default: False)
+ --trim TRIM trim off barcodes and any excess constant sequence
+ (default: 20)
+ --fasta whether the files produced should be FASTA files
+ asopposed to FASTQ (default: False)
+ --minMaxInsertLength MINMAXINSERTLENGTH
+ ZMW Filter: exclude ZMW if the longest subreadis less
+ than this amount (default: 0)
+ --hqStartTime HQSTARTTIME
+ ZMW Filter: exclude ZMW if start time of HQ
+ regiongreater than this value (seconds) (default: inf)
+ --minReadScore MINREADSCORE
+ ZMW Filter: exclude ZMW if readScore is less thanthis
+ value (default: 0)
+ --minAvgBarcodeScore MINAVGBARCODESCORE
+ ZMW Filter: exclude ZMW if average barcode score is
+ less than this value (default: 0.0)
+ --minNumBarcodes MINNUMBARCODES
+ ZMW Filter: exclude ZMW if number of barcodes observed
+ is less than this value (default: 1)
+ --minScoreRatio MINSCORERATIO
+ ZMW Filter: exclude ZMWs whose best score divided by
+ the 2nd best score is less than this ratio (default:
+ 1.0)
+
+
+The ``emitFastqs`` command takes as input both an input.fofn for the
+bas.h5 files as well as a barcode.fofn from a call to labelZmws. The
+optional parameter ``outDir`` dictates where the files will be
+written. For each detected barcode, a fast[a|q] file will be emitted
+with all of the reads for that barcode. The ``trim`` parameter
+dictates how much of the read should be trimmed off. The default
+parameter for ``trim`` is the length of the barcode (which is stored
+in the barcode hdf5 files). At the moment, all barcodes in the barcode
+FASTA file must be the same length, therefore only a constant trim
+value is supported. In practice, one can aggressively trim in order to
+ensure that extra bases aren't left on the ends of reads. Finally, the
+``subreads`` parameter dictates whether subreads or CCS reads should
+be returned with the default being the appropriate reads according to
+the input file type, either CCS or subreads. This parameter is only
+inspected if the input.fofn contains both CCS and subread data, if the
+input.fofn contains only subread or CCS data then that is returned
+irrespective of the state of the ``subreads`` parameter and a
+warning is issued.
+
+consensus
+---------
+ usage: pbbarcode consensus [-h] [--subsample SUBSAMPLE] [--nZmws NZMWS]
+ [--outDir OUTDIR] [--keepTmpDir]
+ [--ccsFofn CCSFOFN] [--nProcs NPROCS]
+ [--noQuiver]
+ [--minMaxInsertLength MINMAXINSERTLENGTH]
+ [--hqStartTime HQSTARTTIME]
+ [--minReadScore MINREADSCORE]
+ [--minAvgBarcodeScore MINAVGBARCODESCORE]
+ [--minNumBarcodes MINNUMBARCODES]
+ [--minScoreRatio MINSCORERATIO]
+ [--barcode BARCODE [BARCODE ...]]
+ input.fofn barcode.fofn
+
+ Compute consensus sequences for each barcode.
+
+ positional arguments:
+ input.fofn input bas.h5 fofn file
+ barcode.fofn input bc.h5 fofn file
+
+ optional arguments:
+ -h, --help show this help message and exit
+ --subsample SUBSAMPLE
+ Subsample ZMWs (default: 1)
+ --nZmws NZMWS Take n ZMWs (default: -1)
+ --outDir OUTDIR Use this directory to output results (default: .)
+ --keepTmpDir
+ --ccsFofn CCSFOFN Obtain CCS data from ccsFofn instead of input.fofn
+ (default: )
+ --nProcs NPROCS Use nProcs to execute. (default: 16)
+ --noQuiver
+ --minMaxInsertLength MINMAXINSERTLENGTH
+ ZMW Filter: exclude ZMW if the longest subreadis less
+ than this amount (default: 0)
+ --hqStartTime HQSTARTTIME
+ ZMW Filter: exclude ZMW if start time of HQ
+ regiongreater than this value (seconds) (default: inf)
+ --minReadScore MINREADSCORE
+ ZMW Filter: exclude ZMW if readScore is less thanthis
+ value (default: 0)
+ --minAvgBarcodeScore MINAVGBARCODESCORE
+ ZMW Filter: exclude ZMW if average barcode score is
+ less than this value (default: 0.0)
+ --minNumBarcodes MINNUMBARCODES
+ ZMW Filter: exclude ZMW if number of barcodes observed
+ is less than this value (default: 1)
+ --minScoreRatio MINSCORERATIO
+ ZMW Filter: exclude ZMWs whose best score divided by
+ the 2nd best score is less than this ratio (default:
+ 1.0)
+ --barcode BARCODE [BARCODE ...]
+ Use this to extract consensus for just one barcode.
+ (default: None)
+
+The ``consensus`` command takes as input both an input.fofn for the
+bas.h5 files as well as a barcode.fofn from a call to labelZmws. The
+results are a FASTA file with an entry for each barcode containing the
+consensus amplicon sequence. This mode utilizes ``Quiver`` and
+``pbdagcon`` to compute consensus.
+
+In cases where the amplicon is fewer than 2.5k bases, using CCS data
+is quite helpful. The ``--ccsFofn`` allows one to pass directly the
+ccs files. In many cases, both the CCS and raw basecalls are in the
+same file so you can check by passing the same parameter to input.fofn
+as to ccsFofn.
+
+Dependencies
+````````````
+
+The pbbarcode package depends on a standard pbcore installation
+(https://github.com/PacificBiosciences/pbcore). If one wishes to use
+the ``consensus`` tool, ``pbdagcon`` needs to be installed
+(https://github.com/PacificBiosciences/pbdagcon).
+
+
+Barcode HDF5 File
+`````````````````
+
+The barcode hdf5 file, ``bc.h5``, represents a simple data store for
+barcode calls and their scores for each ZMW. Generally, a user need
+not interact with barcode hdf5 files, but can use the results stored in
+either the resulting cmp.h5 file or fast[a|q] files. The barcode hdf5
+file contains the following structure:
+
+/BarcodeCalls/best - (nZMWs, 6)[32-bit integer] dataset with the
+following columns:
+
+ ``holeNumber,nAdapters,barcodeIdx1,barcodeScore1,barcodeIdx2,barcodeScore2``
+
+Additionally, the ``best`` dataset has the following attributes:
+
++-----------+------------------------------------------------------------------------+
+|movieName |m120408_042614_richard_c100309392550000001523011508061222_s1_p0 |
+| | |
++-----------+------------------------------------------------------------------------+
+|columnNames|holeNumber,nAdapters,barcodeIdx1,barcodeScore1,barcodeIdx2, |
+| |barcodeScore2 |
++-----------+------------------------------------------------------------------------+
+|scoreMode |[symmetric|paired] |
+| | |
++-----------+------------------------------------------------------------------------+
+|barcodes |'bc_1', 'bc_2', ...., 'bc_N' |
+| | |
++-----------+------------------------------------------------------------------------+
+
+The barcodeIdx1 and barcodeIdx2 columns are indices into the
+``barcodes`` attribute. The ``scoreMode`` is the scoring mode used to
+align the barcodes. The ``barcodes`` attribute corresponds to the
+barcode.fasta sequence names.
+
+Additionally, in some circumstances, it is useful to retain the entire
+history of the scoring, i.e., each barcode scored to each adapter
+across all ZMWs. In order to retain this information, one must call:
+
+ ``pbbarcode labelZmws --saveExtendedInfo ...``
+
+In this mode, the resultant HDF5 file will have an additional dataset
+under the BarcodeCalls group, named: ``all``. This dataset has the
+following format:
+
+/BarcodeCalls/all - (nbarcodes * nadapters[zmw_i], 4) \forall i in 1 ... nZMWs
+
+ ``holeNumber, adapterIdx, barcodeIdx, score``
+
+The ``adapterIdx`` is the index of the adapter along the molecule,
+i.e., adapterIdx 1 is the first adapter scored.
+
+Additions to the compare HDF5 (cmp.h5) File
+```````````````````````````````````````````
+
+In addition to the barcode hdf5 file, a call to ``labelAlignments``
+will annotate a cmp.h5 file. This annotation is stored in ways
+consistent with the cmp.h5 file format. Specifically, a new group:
+
+| /BarcodeInfo/
+| ID (nBarcodeLabels + 1, 1)[32-bit integer]
+| Name (nBarcodeLabels + 1, 1)[variable length string]
+
+In addition to the /BarcodeInfo/ group, the key dataset which assigns
+alignments to barcodes is located at:
+
+/AlnInfo/Barcode (nAlignments, 3)[32-bit integer] with the following
+columns:
+
+ ``index,count,bestIndex,bestScore,secondBestIndex,secondBestScore``
+
+Here index refers to the index into the ``Name`` vector, score
+corresponds to the sum of the scores for the barcodes, and finally,
+count refers to the number of adapters found in the molecule.
diff --git a/doc/conf.py b/doc/conf.py
new file mode 100755
index 0000000..7a1cd30
--- /dev/null
+++ b/doc/conf.py
@@ -0,0 +1,242 @@
+# -*- coding: utf-8 -*-
+#
+# pbbarcode documentation build configuration file, created by
+# sphinx-quickstart on Mon Apr 30 18:28:57 2012.
+#
+# This file is execfile()d with the current directory set to its containing dir.
+#
+# Note that not all possible configuration values are present in this
+# autogenerated file.
+#
+# All configuration values have a default; values that are commented out
+# serve to show the default.
+
+import sys, os
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#sys.path.insert(0, os.path.abspath('.'))
+
+# -- General configuration -----------------------------------------------------
+
+# If your documentation needs a minimal Sphinx version, state it here.
+#needs_sphinx = '1.0'
+
+# Add any Sphinx extension module names here, as strings. They can be extensions
+# coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
+extensions = ['sphinx.ext.autodoc', 'sphinx.ext.intersphinx', 'sphinx.ext.todo', 'sphinx.ext.coverage', 'sphinx.ext.viewcode']
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# The suffix of source filenames.
+source_suffix = '.rst'
+
+# The encoding of source files.
+#source_encoding = 'utf-8-sig'
+
+# The master toctree document.
+master_doc = 'index'
+
+# General information about the project.
+project = u'pbbarcode'
+copyright = u'2012, PacBio'
+
+# The version info for the project you're documenting, acts as replacement for
+# |version| and |release|, also used in various other places throughout the
+# built documents.
+#
+# The short X.Y version.
+version = '.1'
+# The full version, including alpha/beta/rc tags.
+release = '.1'
+
+# The language for content autogenerated by Sphinx. Refer to documentation
+# for a list of supported languages.
+#language = None
+
+# There are two options for replacing |today|: either, you set today to some
+# non-false value, then it is used:
+#today = ''
+# Else, today_fmt is used as the format for a strftime call.
+#today_fmt = '%B %d, %Y'
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+exclude_patterns = ['_build']
+
+# The reST default role (used for this markup: `text`) to use for all documents.
+#default_role = None
+
+# If true, '()' will be appended to :func: etc. cross-reference text.
+#add_function_parentheses = True
+
+# If true, the current module name will be prepended to all description
+# unit titles (such as .. function::).
+#add_module_names = True
+
+# If true, sectionauthor and moduleauthor directives will be shown in the
+# output. They are ignored by default.
+#show_authors = False
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = 'sphinx'
+
+# A list of ignored prefixes for module index sorting.
+#modindex_common_prefix = []
+
+
+# -- Options for HTML output ---------------------------------------------------
+
+# The theme to use for HTML and HTML Help pages. See the documentation for
+# a list of builtin themes.
+html_theme = 'default'
+
+# Theme options are theme-specific and customize the look and feel of a theme
+# further. For a list of options available for each theme, see the
+# documentation.
+#html_theme_options = {}
+
+# Add any paths that contain custom themes here, relative to this directory.
+#html_theme_path = []
+
+# The name for this set of Sphinx documents. If None, it defaults to
+# "<project> v<release> documentation".
+#html_title = None
+
+# A shorter title for the navigation bar. Default is the same as html_title.
+#html_short_title = None
+
+# The name of an image file (relative to this directory) to place at the top
+# of the sidebar.
+#html_logo = None
+
+# The name of an image file (within the static path) to use as favicon of the
+# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
+# pixels large.
+#html_favicon = None
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['_static']
+
+# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
+# using the given strftime format.
+#html_last_updated_fmt = '%b %d, %Y'
+
+# If true, SmartyPants will be used to convert quotes and dashes to
+# typographically correct entities.
+#html_use_smartypants = True
+
+# Custom sidebar templates, maps document names to template names.
+#html_sidebars = {}
+
+# Additional templates that should be rendered to pages, maps page names to
+# template names.
+#html_additional_pages = {}
+
+# If false, no module index is generated.
+#html_domain_indices = True
+
+# If false, no index is generated.
+#html_use_index = True
+
+# If true, the index is split into individual pages for each letter.
+#html_split_index = False
+
+# If true, links to the reST sources are added to the pages.
+#html_show_sourcelink = True
+
+# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
+#html_show_sphinx = True
+
+# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
+#html_show_copyright = True
+
+# If true, an OpenSearch description file will be output, and all pages will
+# contain a <link> tag referring to it. The value of this option must be the
+# base URL from which the finished HTML is served.
+#html_use_opensearch = ''
+
+# This is the file name suffix for HTML files (e.g. ".xhtml").
+#html_file_suffix = None
+
+# Output file base name for HTML help builder.
+htmlhelp_basename = 'pbbarcodedoc'
+
+
+# -- Options for LaTeX output --------------------------------------------------
+
+latex_elements = {
+# The paper size ('letterpaper' or 'a4paper').
+#'papersize': 'letterpaper',
+
+# The font size ('10pt', '11pt' or '12pt').
+#'pointsize': '10pt',
+
+# Additional stuff for the LaTeX preamble.
+#'preamble': '',
+}
+
+# Grouping the document tree into LaTeX files. List of tuples
+# (source start file, target name, title, author, documentclass [howto/manual]).
+latex_documents = [
+ ('index', 'pbbarcode.tex', u'pbbarcode Documentation',
+ u'PacBio', 'manual'),
+]
+
+# The name of an image file (relative to this directory) to place at the top of
+# the title page.
+#latex_logo = None
+
+# For "manual" documents, if this is true, then toplevel headings are parts,
+# not chapters.
+#latex_use_parts = False
+
+# If true, show page references after internal links.
+#latex_show_pagerefs = False
+
+# If true, show URL addresses after external links.
+#latex_show_urls = False
+
+# Documents to append as an appendix to all manuals.
+#latex_appendices = []
+
+# If false, no module index is generated.
+#latex_domain_indices = True
+
+
+# -- Options for manual page output --------------------------------------------
+
+# One entry per manual page. List of tuples
+# (source start file, name, description, authors, manual section).
+man_pages = [
+ ('index', 'pbbarcode', u'pbbarcode Documentation',
+ [u'PacBio'], 1)
+]
+
+# If true, show URL addresses after external links.
+#man_show_urls = False
+
+
+# -- Options for Texinfo output ------------------------------------------------
+
+# Grouping the document tree into Texinfo files. List of tuples
+# (source start file, target name, title, author,
+# dir menu entry, description, category)
+texinfo_documents = [
+ ('index', 'pbbarcode', u'pbbarcode Documentation',
+ u'PacBio', 'pbbarcode', 'One line description of project.',
+ 'Miscellaneous'),
+]
+
+# Documents to append as an appendix to all manuals.
+#texinfo_appendices = []
+
+# If false, no module index is generated.
+#texinfo_domain_indices = True
+
+# How to display URL addresses: 'footnote', 'no', or 'inline'.
+#texinfo_show_urls = 'footnote'
diff --git a/doc/index.rst b/doc/index.rst
new file mode 100644
index 0000000..ea69335
--- /dev/null
+++ b/doc/index.rst
@@ -0,0 +1,16 @@
+.. pbbarcode documentation master file, created by
+ sphinx-quickstart on Mon Apr 30 18:28:57 2012.
+ You can adapt this file completely to your liking, but it should at least
+ contain the root `toctree` directive.
+
+pbbarcode
+=========
+
+Contents:
+
+.. toctree::
+ :maxdepth: 2
+
+ PbbarcodeFunctionalSpecification
+
+
diff --git a/etc/barcode.fasta b/etc/barcode.fasta
new file mode 100644
index 0000000..e524539
--- /dev/null
+++ b/etc/barcode.fasta
@@ -0,0 +1,8 @@
+>bc3
+tatctatcgtatacgc
+>bc4
+atcacactgcatctga
+>bc5
+acgtacgctcgtcata
+>bc10
+tcatgcacgtctcgct
diff --git a/etc/barcode_complete.fasta b/etc/barcode_complete.fasta
new file mode 100644
index 0000000..9803f37
--- /dev/null
+++ b/etc/barcode_complete.fasta
@@ -0,0 +1,192 @@
+>bc_1
+GCGCTCTGTGTGCAGC
+>bc_2
+TCATGAGTCGACACTA
+>bc_3
+TATCTATCGTATACGC
+>bc_4
+ATCACACTGCATCTGA
+>bc_5
+ACGTACGCTCGTCATA
+>bc_6
+TGTGAGTCAGTACGCG
+>bc_7
+AGAGACACGATACTCA
+>bc_8
+CTGCTAGAGTCTACAG
+>bc_9
+AGCACTCGCGTCAGTG
+>bc_10
+TCATGCACGTCTCGCT
+>bc_11
+AGAGCATCTCTGTACT
+>bc_12
+CGCATCGACTACGCTA
+>bc_13
+CGTAGCGTGCTATCAC
+>bc_14
+ATGCTGATGACTGCGA
+>bc_15
+TGCGTGAGCTGTACAT
+>bc_16
+CGATCATCTATAGACA
+>bc_17
+CGACGTATCTGACAGT
+>bc_18
+CACGTCACTAGAGCGA
+>bc_19
+TGTCGCAGCTACTAGT
+>bc_20
+CATACGCTGTGTAGCA
+>bc_21
+AGTCGCATGACTGTGT
+>bc_22
+CAGTACTGCACGATCG
+>bc_23
+GTGCTGAGCATCAGAC
+>bc_24
+CACTGATCGATATGCA
+>bc_25
+TACAGTGTCTGCTGCG
+>bc_26
+TACAGATAGTGTAGCG
+>bc_27
+TCGTAGAGCTCGAGAC
+>bc_28
+GAGCTGCGCACTCGAT
+>bc_29
+GCGATGTCGCTATGTG
+>bc_30
+CGAGAGTCAGCGCATA
+>bc_31
+TCACGATGAGCACGTA
+>bc_32
+GACTGAGATCATGATC
+>bc_33
+ACGACATGATACTGCT
+>bc_34
+ATACAGCACAGATGTG
+>bc_35
+ACAGTCGATATCTCTC
+>bc_36
+GCTCGATCACATGACG
+>bc_37
+GTCGTACACGTGCGAC
+>bc_38
+ACTCATATCTAGAGTG
+>bc_39
+ACTGATCTGTCGCGCT
+>bc_40
+CACTAGCTCTGACTAC
+>bc_41
+GCTGTCATGTACTAGC
+>bc_42
+TATACATACACGCACT
+>bc_43
+TGTGACGACGCGTCTC
+>bc_44
+GACGTGAGCATGCACT
+>bc_45
+CTCGATACGTGTAGCT
+>bc_46
+GTGTCTAGACAGCTGT
+>bc_47
+GATGCATGCGTACGCA
+>bc_48
+TATCAGAGCAGCGATG
+>bc_49
+TCATATGTAGTACTCT
+>bc_50
+GCGATCTATGCACACG
+>bc_51
+TGCAGTCGAGATACAT
+>bc_52
+GACTCTGCGTCGAGTC
+>bc_53
+TACAGCGACGTCATCG
+>bc_54
+GCGCAGACTACGTGTG
+>bc_55
+GTCTCTGCGATACAGC
+>bc_56
+AGTATGAGATAGCTCG
+>bc_57
+GCGACGAGTACTCATG
+>bc_58
+AGTATCACAGTCGCTG
+>bc_59
+ATCATATGATGCGACA
+>bc_60
+AGACGTAGATCACAGC
+>bc_61
+CGTGTCATGCTACTCA
+>bc_62
+TGTGAGACTGCATGTC
+>bc_63
+GCTCAGTGCGCTACTG
+>bc_64
+ACTATCGCGCACGCAG
+>bc_65
+TGACACTCTGCACGCG
+>bc_66
+CAGACGTGACTGATAT
+>bc_67
+GCACTGTAGTGATCGT
+>bc_68
+CAGTGCGAGACAGTAG
+>bc_69
+AGTAGTGCTACTCGAC
+>bc_70
+ATGCGAGATCTGCTCA
+>bc_71
+TGAGACATACTGAGTG
+>bc_72
+ATGTGCACTAGTGTAC
+>bc_73
+TCAGCTGACGATGTGA
+>bc_74
+ACTGATGCGCACATGT
+>bc_75
+CTACTCTCAGCAGTGA
+>bc_76
+ATCTACATCACGACTC
+>bc_77
+ATATAGTACAGCGTCT
+>bc_78
+GACACGACTAGATCGC
+>bc_79
+TACGAGTCTGTCATAC
+>bc_80
+ACTCAGCTACATAGTG
+>bc_81
+ACGTATCATAGTGAGA
+>bc_82
+GAGTCGTATCGCTCAT
+>bc_83
+GCGATCACGAGTAGAC
+>bc_84
+CTAGACGTACATGTCG
+>bc_85
+TAGCAGTCACTGTGCG
+>bc_86
+GCTCATGCGATAGCTA
+>bc_87
+GCGCAGTCGTCTGTAT
+>bc_88
+ATGAGCTACGTACAGA
+>bc_89
+GTCGCGAGTCTATCAG
+>bc_90
+ACATCGATCTGCACTA
+>bc_91
+AGTATAGCATAGACGC
+>bc_92
+GTGAGAGCGTGACTCT
+>bc_93
+TGTCAGTAGATGACTC
+>bc_94
+TCGTACGAGATCGACA
+>bc_95
+CTACATGTGACTCGAG
+>bc_96
+GCGCTATAGTGCTCGT
diff --git a/etc/pacbio_barcodes_paired.fasta b/etc/pacbio_barcodes_paired.fasta
new file mode 100755
index 0000000..1ba2e7a
--- /dev/null
+++ b/etc/pacbio_barcodes_paired.fasta
@@ -0,0 +1,192 @@
+>F_1
+GGTAGGCGCTCTGTGTGCAGC
+>R_1
+AGAGTACTACATATGAGATGG
+>F_2
+GGTAGTCATGAGTCGACACTA
+>R_2
+CGTGTGCATAGATCGCGATGG
+>F_3
+GGTAGTATCTATCGTATACGC
+>R_3
+ATGTATCTCGACTGCAGATGG
+>F_4
+GGTAGATCACACTGCATCTGA
+>R_4
+GACTCGACGCAGAGTCGATGG
+>F_5
+GGTAGACGTACGCTCGTCATA
+>R_5
+CGATGACGTCGCTGTAGATGG
+>F_6
+GGTAGTGTGAGTCAGTACGCG
+>R_6
+CACACGTAGTCTGCGCGATGG
+>F_7
+GGTAGAGAGACACGATACTCA
+>R_7
+GCTGTATCGCAGAGACGATGG
+>F_8
+GGTAGCTGCTAGAGTCTACAG
+>R_8
+CGAGCTATCTCATACTGATGG
+>F_9
+GGTAGAGCACTCGCGTCAGTG
+>R_9
+CATGAGTACTCGTCGCGATGG
+>F_10
+GGTAGTCATGCACGTCTCGCT
+>R_10
+CAGCGACTGTGATACTGATGG
+>F_11
+GGTAGAGAGCATCTCTGTACT
+>R_11
+TGTCGCATCATATGATGATGG
+>F_12
+GGTAGCGCATCGACTACGCTA
+>R_12
+GCTGTGATCTACGTCTGATGG
+>F_13
+GGTAGCGTAGCGTGCTATCAC
+>R_13
+TGAGTAGCATGACACGGATGG
+>F_14
+GGTAGATGCTGATGACTGCGA
+>R_14
+GACATGCAGTCTCACAGATGG
+>F_15
+GGTAGTGCGTGAGCTGTACAT
+>R_15
+CAGTAGCGCACTGAGCGATGG
+>F_16
+GGTAGCGATCATCTATAGACA
+>R_16
+CTGCGTGCGCGATAGTGATGG
+>F_17
+GGTAGCGACGTATCTGACAGT
+>R_17
+CGCGTGCAGAGTGTCAGATGG
+>F_18
+GGTAGCACGTCACTAGAGCGA
+>R_18
+ATATCAGTCACGTCTGGATGG
+>F_19
+GGTAGTGTCGCAGCTACTAGT
+>R_19
+ACGATCACTACAGTGCGATGG
+>F_20
+GGTAGCATACGCTGTGTAGCA
+>R_20
+CTACTGTCTCGCACTGGATGG
+>F_21
+GGTAGAGTCGCATGACTGTGT
+>R_21
+GTCGAGTAGCACTACTGATGG
+>F_22
+GGTAGCAGTACTGCACGATCG
+>R_22
+TGAGCAGATCTCGCATGATGG
+>F_23
+GGTAGGTGCTGAGCATCAGAC
+>R_23
+CACTCAGTATGTCTCAGATGG
+>F_24
+GGTAGCACTGATCGATATGCA
+>R_24
+GTACACTAGTGCACATGATGG
+>F_25
+GGTAGTACAGTGTCTGCTGCG
+>R_25
+TCACATCGTCAGCTGAGATGG
+>F_26
+GGTAGTACAGATAGTGTAGCG
+>R_26
+ACATGTGCGCATCAGTGATGG
+>F_27
+GGTAGTCGTAGAGCTCGAGAC
+>R_27
+TCACTGCTGAGAGTAGGATGG
+>F_28
+GGTAGGAGCTGCGCACTCGAT
+>R_28
+GAGTCGTGATGTAGATGATGG
+>F_29
+GGTAGGCGATGTCGCTATGTG
+>R_29
+AGACGCTGTACTATATGATGG
+>F_30
+GGTAGCGAGAGTCAGCGCATA
+>R_30
+GCGATCTAGTCGTGTCGATGG
+>F_31
+GGTAGTCACGATGAGCACGTA
+>R_31
+GTATGACAGACTCGTAGATGG
+>F_32
+GGTAGGACTGAGATCATGATC
+>R_32
+CACTATGTAGCTGAGTGATGG
+>F_33
+GGTAGACGACATGATACTGCT
+>R_33
+TCTCACTATGATACGTGATGG
+>F_34
+GGTAGATACAGCACAGATGTG
+>R_34
+ATGAGCGATACGACTCGATGG
+>F_35
+GGTAGACAGTCGATATCTCTC
+>R_35
+GTCTACTCGTGATCGCGATGG
+>F_36
+GGTAGGCTCGATCACATGACG
+>R_36
+CGACATGTACGTCTAGGATGG
+>F_37
+GGTAGGTCGTACACGTGCGAC
+>R_37
+CGCACAGTGACTGCTAGATGG
+>F_38
+GGTAGACTCATATCTAGAGTG
+>R_38
+TAGCTATCGCATGAGCGATGG
+>F_39
+GGTAGACTGATCTGTCGCGCT
+>R_39
+ATACAGACGACTGCGCGATGG
+>F_40
+GGTAGCACTAGCTCTGACTAC
+>R_40
+TCTGTACGTAGCTCATGATGG
+>F_41
+GGTAGGCTGTCATGTACTAGC
+>R_41
+CTGATAGACTCGCGACGATGG
+>F_42
+GGTAGTATACATACACGCACT
+>R_42
+TAGTGCAGATCGATGTGATGG
+>F_43
+GGTAGTGTGACGACGCGTCTC
+>R_43
+GCGTCTATGCTATACTGATGG
+>F_44
+GGTAGGACGTGAGCATGCACT
+>R_44
+AGAGTCACGCTCTCACGATGG
+>F_45
+GGTAGCTCGATACGTGTAGCT
+>R_45
+GAGTCATCTACTGACAGATGG
+>F_46
+GGTAGGTGTCTAGACAGCTGT
+>R_46
+TGTCGATCTCGTACGAGATGG
+>F_47
+GGTAGGATGCATGCGTACGCA
+>R_47
+CTCGAGTCACATGTAGGATGG
+>F_48
+GGTAGTATCAGAGCAGCGATG
+>R_48
+ACGAGCACTATAGCGCGATGG
diff --git a/setup.py b/setup.py
new file mode 100755
index 0000000..5a017e3
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,37 @@
+from setuptools import setup, Extension, find_packages
+import os
+import sys
+
+vFile = 'src/python/pbbarcode/_version.py'
+
+if os.path.exists(vFile):
+ lines = open(vFile, 'r').read().splitlines()
+ for line in lines:
+ elts = line.split('=')
+ elts = [e.strip() for e in elts]
+ if len(elts) == 2 and elts[0] == '__version__':
+ _ReadVersion = elts[1].replace('\'', '').replace('\"', '')
+ break
+else:
+ _ReadVersion = '0.0.0'
+
+setup(
+ name = 'pbbarcode',
+ version=_ReadVersion,
+ author='pbiDevNet',
+ author_email='pbiDevNet at pacificbiosciences.com',
+ license='LICENSE.txt',
+ packages = find_packages('src/python'),
+ package_dir = {'':'src/python'},
+ ext_modules=[Extension('pbbarcode/sw', ['src/C/sw.c'], extra_compile_args=["-O3","-shared"])],
+ zip_safe = False,
+ entry_points={
+ 'console_scripts': [
+ 'pbbarcode = pbbarcode.main:main']
+ },
+ install_requires=[
+ 'pbcore >= 0.6.3',
+ 'numpy >= 1.6.0',
+ 'h5py >= 1.3.0'
+ ]
+ )
diff --git a/src/C/Makefile b/src/C/Makefile
new file mode 100644
index 0000000..4913cf3
--- /dev/null
+++ b/src/C/Makefile
@@ -0,0 +1,11 @@
+.PHONY: clean all
+SHELL = /bin/bash -e
+
+# Remove a half-written library if the compile fails, so a broken file is
+# never mistaken for an up-to-date target.
+.DELETE_ON_ERROR:
+
+# gcc stays the default compiler, but `make CC=clang` or an exported CC now
+# takes effect (make's own built-in default for CC is `cc`).
+ifeq ($(origin CC),default)
+CC := gcc
+endif
+
+all: build/sw.so
+
+# Compile the alignment scorer into a position-independent shared library.
+# `build` is order-only: it must exist, but its timestamp never forces a
+# rebuild.
+build/sw.so: sw.c | build
+	$(CC) -O4 -DGETPROB -shared -fPIC $< -o $@
+
+build:
+	mkdir -p $@
+
+clean:
+	rm -rf build
+
diff --git a/src/C/sw.c b/src/C/sw.c
new file mode 100644
index 0000000..4cdffdb
--- /dev/null
+++ b/src/C/sw.c
@@ -0,0 +1,56 @@
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+
+/* The DP matrix is N rows (target positions) by M columns (query
+ * positions), stored row-major with a row stride of M. Row 0 and column 0
+ * are the all-zero border, so at most N-1 target and M-1 query characters
+ * can be scored. */
+#define M 64
+#define N 64
+#define MAX(x,y) (((x) > (y)) ? (x) : (y))
+
+/* Length of `s`, but never more than `limit`; a NULL pointer counts as an
+ * empty string. Used to keep every matrix index inside the allocation. */
+static int bounded_length(const char* s, int limit) {
+  int len = 0;
+  if (s == NULL)
+    return 0;
+  while (len < limit && s[len] != '\0')
+    len++;
+  return len;
+}
+
+/* Allocate a zeroed N*M matrix for use with the functions below.
+ * Returns NULL if the allocation fails; the caller owns the memory. */
+int* allocate_dp_mat() {
+  return (int*) calloc(N*M, sizeof(int));
+}
+
+/* Local (Smith-Waterman style) alignment score of tSeq against qSeq.
+ *
+ * Scoring: match +2, mismatch -2, insertion -1, deletion -1, with cell
+ * scores floored at 0. `dp_mat` must come from allocate_dp_mat(); it is
+ * cleared on entry and holds the filled matrix on return.
+ *
+ * Sequences longer than the matrix can hold are scored on their first
+ * N-1 (target) / M-1 (query) characters instead of writing past the end
+ * of the buffer. Returns the best cell score, or 0 when `dp_mat` is NULL
+ * or either sequence is empty/NULL. */
+int compute_align_score(int* dp_mat, char* tSeq, char* qSeq) {
+  const int ipenalty = -1;
+  const int dpenalty = -1;
+  const int match = 2;
+  const int mpenalty = -2;
+  /* Lengths are computed once and clamped to the matrix dimensions. */
+  const int tLen = bounded_length(tSeq, N - 1);
+  const int qLen = bounded_length(qSeq, M - 1);
+  int best_score = 0;
+  int iscore = 0;
+  int dscore = 0;
+  int mscore = 0;
+  int i,j;
+
+  if (dp_mat == NULL)
+    return 0;
+
+  memset(dp_mat, 0, M*N*sizeof(int));
+
+  for (i = 1; i <= tLen; i++) {
+    for (j = 1; j <= qLen; j++) {
+      iscore = dp_mat[i*M + j-1] + ipenalty;
+      dscore = dp_mat[(i-1)*M + j] + dpenalty;
+      mscore = dp_mat[(i-1)*M + j-1] + ((tSeq[i-1] == qSeq[j-1]) ? match : mpenalty);
+      dp_mat[i*M + j] = MAX(MAX(0, iscore), MAX(dscore, mscore));
+      if (dp_mat[i*M + j] >= best_score)
+        best_score = dp_mat[i*M + j];
+    }
+  }
+  return best_score;
+}
+
+/* Score `tSeq` against each of the `n` strings in `qSeqs`, writing the
+ * result for qSeqs[i] to scores[i]. `dp_mat` is reused as scratch space
+ * for every comparison. */
+void compute_align_scores(int* scores, int n, int* dp_mat, char* tSeq,
+                          char** qSeqs) {
+  int i;
+  for (i = 0; i < n; i++) {
+    scores[i] = compute_align_score(dp_mat, tSeq, qSeqs[i]);
+  }
+}
+
+
+/* Debug helper: print the matrix left by compute_align_score, one query
+ * position per output line and one target position per column. Uses the
+ * same length clamping as the scorer so it never reads out of bounds. */
+void print_dp_mat(int* dp_mat, char* tSeq, char* qSeq) {
+  const int tLen = bounded_length(tSeq, N - 1);
+  const int qLen = bounded_length(qSeq, M - 1);
+  int i,j;
+  for (j = 0; j <= qLen; j++) {
+    for (i = 0; i <= tLen; i++) {
+      printf("%d ", dp_mat[i*M + j]);
+    }
+    printf("\n");
+  }
+}
diff --git a/src/python/pbbarcode/BarcodeLabeler.py b/src/python/pbbarcode/BarcodeLabeler.py
new file mode 100755
index 0000000..354338b
--- /dev/null
+++ b/src/python/pbbarcode/BarcodeLabeler.py
@@ -0,0 +1,225 @@
+#################################################################################$$
+# Copyright (c) 2011,2012, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#################################################################################$$
+import logging
+
+from pbcore.io import BasH5Reader, BaxH5Reader
+from pbcore.io.FastaIO import *
+import pbbarcode.SWaligner as Aligner
+import numpy as n
+
+from pbcore.io.BarcodeH5Reader import LabeledZmw, \
+ BARCODE_DELIMITER
+
+__RC_MAP__ = dict(zip('ACGTacgt-N','TGCAtgca-N'))
+
+class BarcodeScorer(object):
+ def __init__(self, basH5, barcodeFasta,
+ adapterSidePad = 0, insertSidePad = 4,
+ scoreMode = 'symmetric', maxHits = 10,
+ scoreFirst = False, startTimeCutoff = 1):
+ """A BarcodeScorer object scores ZMWs and produces summaries
+ of the scores. Various parameters control the behavior of the
+ object, specifically the padding allows the user to add a
+ little extra on each side of the adapter find for safety. The
+ most relevant parameter is the scoreMode which dictates how
+ the barcodes are scored, either paired or symmetric."""
+
+ self.basH5 = basH5
+ self.barcodeFasta = list(barcodeFasta)
+ self.aligner = Aligner.SWaligner()
+ self.barcodeLength = n.unique(map(lambda x : len(x.sequence),
+ self.barcodeFasta))
+ if len(self.barcodeLength) > 1:
+ raise Exception("Currently, all barcodes must be the same length.")
+ else:
+ self.barcodeLength = int(self.barcodeLength)
+
+ self.barcodeSeqs = [(barcode.sequence.upper(),
+ self._rc(barcode.sequence.upper()))
+ for barcode in self.barcodeFasta]
+
+ self.adapterSidePad = adapterSidePad
+ self.insertSidePad = insertSidePad
+ self.maxHits = maxHits
+
+ if scoreMode not in ['symmetric', 'paired']:
+ raise Exception("scoreMode must either be symmetric or paired")
+ self._scoreMode = scoreMode
+
+ self.scoreFirst = scoreFirst
+ self.startTimeCutoff = startTimeCutoff
+
+ self.forwardScorer = self.aligner.makeScorer([x[0] for x in self.barcodeSeqs])
+ self.reverseScorer = self.aligner.makeScorer([x[1] for x in self.barcodeSeqs])
+
+ logging.debug(("Constructed BarcodeScorer with scoreMode: %s," + \
+ "adapterSidePad: %d, insertSidePad: %d, and scoreFirst: %r") \
+ % (scoreMode, adapterSidePad, insertSidePad, scoreFirst))
+
+ @property
+ def movieName(self):
+ return self.basH5.movieName
+
+ def makeBCLabel(self, s1, s2):
+ return BARCODE_DELIMITER.join((s1, s2))
+
+ @property
+ def barcodeLabels(self):
+ """The barcode labels are function of the barcodeNames and the
+ scoreMode, they represent the user-visible names."""
+ if self.scoreMode == 'paired':
+ return n.array([self.makeBCLabel(self.barcodeFasta[i].name,
+ self.barcodeFasta[i+1].name) for i
+ in xrange(0, len(self.barcodeSeqs), 2)])
+ else:
+ return n.array([self.makeBCLabel(x.name, x.name) for x in self.barcodeFasta])
+
+ @property
+ def barcodeNames(self):
+ """The barcode names are the FASTA names"""
+ return n.array([x.name for x in self.barcodeFasta])
+
+ @property
+ def scoreMode(self):
+ return self._scoreMode
+
+ def _rc(self, s):
+ return "".join([__RC_MAP__[c] for c in s[::-1]])
+
+ def _flankingSeqs(self, zmw):
+ def fromRange(rStart, rEnd):
+ try:
+ qSeqLeft = zmw.read(rStart - (self.barcodeLength + self.insertSidePad),
+ rStart + self.adapterSidePad).basecalls()
+ except IndexError:
+ qSeqLeft = None
+ try:
+ qSeqRight = zmw.read(rEnd - self.adapterSidePad,
+ rEnd + self.barcodeLength +
+ self.insertSidePad).basecalls()
+ except IndexError:
+ qSeqRight = None
+
+ return (qSeqLeft, qSeqRight)
+
+ adapterRegions = zmw.adapterRegions
+ if len(adapterRegions) > self.maxHits:
+ adapterRegions = adapterRegions[0:self.maxHits]
+
+ seqs = [fromRange(start, end) for (start, end) in adapterRegions]
+
+ # We only score the first barcode if we don't find any adapters
+ # *and* the start time is less than the threshold.
+ scoredFirst = False
+ if self.scoreFirst and not len(seqs):
+ s = zmw.zmwMetric('HQRegionStartTime')
+ e = zmw.zmwMetric('HQRegionEndTime')
+ # s<e => has HQ.
+ if s < e and s <= self.startTimeCutoff:
+ l = self.barcodeLength + self.insertSidePad
+ l = l if zmw.hqRegion[1] > l else zmw.hqRegion[1]
+ try:
+ bc = zmw.read(0, l).basecalls()
+ if len(bc) >= self.barcodeLength:
+ seqs.insert(0, (bc, None))
+ scoredFirst = True
+ except IndexError:
+ pass
+
+ return (seqs, scoredFirst)
+
+ def labelZmws(self, holeNumbers):
+ """Return a list of LabeledZmws for input holeNumbers"""
+ def scoreZmw(zmw):
+ adapters, scoredFirst = self._flankingSeqs(zmw)
+ adapterScores = [[]]*len(adapters)
+ barcodeScores = n.zeros(len(self.barcodeSeqs))
+
+ for i,adapter in enumerate(adapters):
+ fscores = self.forwardScorer(adapter[0])
+ rscores = self.reverseScorer(adapter[0])
+ ffscores = self.forwardScorer(adapter[1])
+ rrscores = self.reverseScorer(adapter[1])
+
+ scored = 2.0 if adapter[0] and adapter[1] else \
+ 1.0 if adapter[0] or adapter[1] else 0
+
+ # An adapter score is the average barcode score for
+ # each barcode -- that way, you can compare across
+ # adapters even if the different adapters have
+ # different numbers of flanking sequence.
+ if scored == 0:
+ adapterScores[i] = barcodeScores
+ else:
+ adapterScores[i] = n.maximum((fscores + rrscores)/scored,
+ (rscores + ffscores)/scored)
+
+ barcodeScores = reduce(lambda x, y: x + y, adapterScores) if adapterScores \
+ else n.zeros(len(self.barcodeSeqs))
+
+ return (zmw.holeNumber, len(adapters), barcodeScores, adapterScores,
+ scoredFirst)
+
+ # o here is the record immediately above.
+ def chooseSymmetric(o):
+ p = n.argsort(-o[2])
+ return LabeledZmw(o[0], o[1], p[0], o[2][p[0]], p[1], o[2][p[1]], o[3])
+ def choosePaired(o):
+ if o[1] == 1:
+ s = n.array([max(o[2][i], o[2][i + 1]) for i in \
+ xrange(0, len(self.barcodeSeqs), 2)])
+ p = n.argsort(-s)
+ s = s[p]
+ else:
+ # score the pairs by scoring the two alternate
+ # ways they could have been put on the molecule. A
+ # missed adapter will confuse this computation.
+ scores = o[3]
+ results = n.zeros(len(self.barcodeSeqs)/2)
+ for i in xrange(0, len(self.barcodeSeqs), 2):
+ pths = [0,0]
+ for j in xrange(0, len(scores)):
+ pths[j % 2] += scores[j][i]
+ pths[1 - j % 2] += scores[j][i + 1]
+ results[i/2] = max(pths)
+
+ p = n.argsort(-results)
+ s = results[p]
+
+ return LabeledZmw(o[0], o[1], p[0], s[0], p[1], s[1], o[3])
+
+ if self.scoreMode == 'symmetric':
+ choose = chooseSymmetric
+ elif self.scoreMode == 'paired':
+ choose = choosePaired
+ else:
+ raise Exception("Unsupported scoring mode in BarcodeLabeler.py")
+
+ scored = [scoreZmw(self.basH5[zmw]) for zmw in holeNumbers]
+ return [choose(scoreTup) for scoreTup in scored if scoreTup[1]]
diff --git a/src/python/pbbarcode/SWaligner.py b/src/python/pbbarcode/SWaligner.py
new file mode 100755
index 0000000..d6dae46
--- /dev/null
+++ b/src/python/pbbarcode/SWaligner.py
@@ -0,0 +1,69 @@
+#################################################################################$$
+# Copyright (c) 2011,2012, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#################################################################################$$
+from ctypes import *
+import os
+import numpy
+import pkg_resources
+
+class SWaligner(object):
+ def __init__(self):
+ # setup.py should put sw.so in the following path.
+ self.SW_DLL_PATH = os.path.dirname(os.path.abspath(__file__)) + os.path.sep + "sw.so"
+ self._dll = CDLL(self.SW_DLL_PATH)
+ self.dpMat = self._dll.allocate_dp_mat()
+
+ def score(self, tSeq, qSeq):
+ return self._dll.compute_align_score(self.dpMat, tSeq, qSeq)
+
+ def makeScorer(self, targets):
+ ScoreType = c_int * len(targets)
+ scores = ScoreType()
+ for i in range(0, len(scores)):
+ scores[i] = 0
+
+ TargetType = c_char_p * len(targets)
+ targetSeqs = TargetType()
+ for i in range(0, len(targetSeqs)):
+ targetSeqs[i] = targets[i]
+
+ targetLen = len(targets)
+
+ def scorer(query):
+ if not query:
+ return numpy.zeros(len(targets))
+
+ self._dll.compute_align_scores(scores,
+ targetLen,
+ self.dpMat,
+ query,
+ targetSeqs)
+ return numpy.array([scores[i] for i in xrange(0, len(scores))])
+ return scorer
+
+
diff --git a/src/python/pbbarcode/__init__.py b/src/python/pbbarcode/__init__.py
new file mode 100755
index 0000000..e69de29
diff --git a/src/python/pbbarcode/_version.py b/src/python/pbbarcode/_version.py
new file mode 100755
index 0000000..4f0196d
--- /dev/null
+++ b/src/python/pbbarcode/_version.py
@@ -0,0 +1 @@
+__version__='0.8.0'
diff --git a/src/python/pbbarcode/main.py b/src/python/pbbarcode/main.py
new file mode 100755
index 0000000..0b6ddd0
--- /dev/null
+++ b/src/python/pbbarcode/main.py
@@ -0,0 +1,751 @@
+#!/usr/bin/env python
+#################################################################################$$
+# Copyright (c) 2011,2012, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#################################################################################$$
+import os
+import sys
+import argparse
+import logging
+import tempfile
+import shutil
+import pkg_resources
+import re
+import subprocess
+import random
+import shutil
+
+from multiprocessing import Pool
+
+import h5py as h5
+import numpy as n
+
+from pbcore.util.ToolRunner import PBMultiToolRunner
+from pbcore.io import BaxH5Reader, BasH5Reader
+from pbcore.io import CmpH5Reader, CmpH5Alignment
+from pbcore.io.BarcodeH5Reader import *
+from pbcore.io import FastaReader, FastqWriter, FastqRecord, \
+ FastaWriter, FastaRecord
+
+from pbbarcode.BarcodeLabeler import *
+from pbbarcode._version import __version__
+
+from pbh5tools.CmpH5Utils import copyAttributes
+
+# Paths to the Barcode Datasets in the cmp.h5 file.
+BC_ALN_INFO_DS = "AlnInfo/Barcode"
+BC_INFO_NAME = "BarcodeInfo/Name"
+BC_INFO_ID = "BarcodeInfo/ID"
+
+SCORE_MODES = ['symmetric', 'paired']
+
+BAS_PLS_REGEX = r'\.ba[x|s]\.h5$|\.pl[x|s]\.h5$|\.cc[x|s]\.h5$'
+BARCODE_EXT = '.bc.h5'
+BC_REGEX = r'\.bc\.h5'
+
+def movieNameFromFile(fn):
+ return re.sub('|'.join((BC_REGEX, BAS_PLS_REGEX)) , '',
+ os.path.basename(fn))
+
+def makeBarcodeH5FromBasH5(basH5):
+ """The workhorse function for creating a barcode H5 file from a
+ base H5 file."""
+ labeler = BarcodeScorer(basH5, FastaReader(runner.args.barcodeFile),
+ runner.args.adapterSidePad, runner.args.insertSidePad,
+ scoreMode = runner.args.scoreMode,
+ maxHits = runner.args.maxAdapters,
+ scoreFirst = runner.args.scoreFirst,
+ startTimeCutoff = runner.args.startTimeCutoff)
+ if runner.args.nZmws < 0:
+ zmws = basH5.sequencingZmws
+ else:
+ zmws = basH5.sequencingZmws[0:runner.args.nZmws]
+
+ logging.debug("Labeling %d ZMWs from: %s" % (len(zmws), basH5.filename))
+ labeledZmws = labeler.labelZmws(zmws)
+ logging.debug("Labeled %d ZMWs" % len(labeledZmws))
+
+ outBase = re.sub(BAS_PLS_REGEX, BARCODE_EXT,
+ os.path.basename(basH5.filename))
+ outFile = '/'.join((runner.args.outDir, outBase))
+ logging.debug("Writing to: %s" % outFile)
+
+ writeBarcodeH5(labeledZmws, labeler, outFile,
+ runner.args.saveExtendedInfo)
+ return outFile
+
+def mpWrapper(f):
+ return makeBarcodeH5FromBasH5(BasH5Reader(f))
+
+def makeBarcodeFofnFromBasFofn():
+ inputFofn = runner.args.inputFile
+ inFiles = open(inputFofn).read().splitlines()
+
+ if not all(map(os.path.exists, inFiles)):
+ raise IOError("All files in input.fofn must exist.")
+
+ logging.debug("Using %d processes." % runner.args.nProcs)
+ if runner.args.nProcs <= 1:
+ newFiles = map(mpWrapper, inFiles)
+ else:
+ pool = Pool(runner.args.nProcs)
+ newFiles = pool.map(mpWrapper, inFiles)
+
+ oFile = open(runner.args.outFofn, 'w')
+ for nF in newFiles:
+ oFile.write(nF + "\n")
+ oFile.close()
+
+def labelAlignments():
+ logging.info("Labeling alignments using: %s" % runner.args.inputFofn)
+ bcFofn = BarcodeH5Fofn(runner.args.inputFofn)
+
+ with CmpH5Reader(runner.args.cmpH5) as cmpH5:
+ bcDS = n.zeros((len(cmpH5), 5), dtype = "int32")
+
+ for (i, aln) in enumerate(cmpH5):
+ bcReader = bcFofn.readerForMovie(aln.movieInfo.Name)
+ try:
+ lZmw = bcReader.labeledZmwFromHoleNumber(aln.HoleNumber)
+ if lZmw.nScored < runner.args.minNumBarcodes or \
+ lZmw.averageScore < runner.args.minAvgBarcodeScore or \
+ lZmw.scoreRatio < runner.args.minScoreRatio:
+ lZmw = None
+ except KeyError:
+ lZmw = None
+
+ if lZmw:
+ bcDS[i,:] = n.array([lZmw.nScored, lZmw.bestIdx, lZmw.bestScore,
+ lZmw.secondBestIdx, lZmw.secondBestScore])
+ else:
+ # either no barcode was found for this guy or they got
+ # filtered, hence the NULL_BARCODE
+ bcDS[i,:] = n.array([0,
+ len(bcReader.barcodeLabels), 0,
+ len(bcReader.barcodeLabels), 0])
+
+ # write to the cmp.h5 file.
+ H5 = h5.File(runner.args.cmpH5, 'r+')
+ if BC_INFO_ID in H5:
+ del H5[BC_INFO_ID]
+ if BC_INFO_NAME in H5:
+ del H5[BC_INFO_NAME]
+
+ # we use the first one to get the labels, if somehow they
+ # don't have all of the same stuff that will be an issue.
+ bcLabels = n.concatenate((bcFofn.barcodeLabels, n.array([BARCODE_DELIMITER])))
+ H5.create_dataset(BC_INFO_ID, data = n.array(range(0, len(bcLabels))),
+ dtype = 'int32')
+ H5.create_dataset(BC_INFO_NAME, data = bcLabels, dtype = h5.new_vlen(str))
+ if BC_ALN_INFO_DS in H5:
+ del H5[BC_ALN_INFO_DS]
+ bcDS = H5.create_dataset(BC_ALN_INFO_DS, data = bcDS, dtype = 'int32')
+ bcDS.attrs['ColumnNames'] = n.array(['count', 'index1', 'score1', 'index2',
+ 'score2'])
+ #force BarcodeMode to have numpy dtype for CmpH5Sort 'extra datasets' routine
+ bcDS.attrs['BarcodeMode'] = n.array( bcFofn.scoreMode )
+ H5.close()
+
+def zipFofns(*inFofns):
+ """Take inputFofns and return n tuples of length len(inFofns)
+ where n is the number of entries in each FOFN."""
+ def readAndSort(inFile):
+ lines = n.array(open(inFile).read().splitlines())
+ lines = lines[n.array(n.argsort([movieNameFromFile(fofnLine) for
+ fofnLine in lines]))]
+ return lines
+
+ sortedFofns = [readAndSort(inFofn) for inFofn in inFofns]
+ l = map(len, sortedFofns)
+ if len(n.unique(l)) != 1:
+ raise Exception("Fofns don't match, unequal number of inputs.")
+ else:
+ for i in xrange(0, n.unique(l)):
+ if len(n.unique([movieNameFromFile(sortedFofn[i]) for
+ sortedFofn in sortedFofns])) != 1:
+ raise Exception("Fofn elements don't match, movies differ.")
+
+ # need to un-arrayify these guys
+ return zip(*map(list, sortedFofns))
+
+def filterZmws(zmwsForBCs):
+ """Apply various filterings passed by the user. There are somewhat
+ different semantics for CCS filtering and subread filtering in
+ terms of the raw primary metrics available, e.g.,
+ HQRegionStartTime is unavailable for the CCS data and somewhat
+ irrelevant."""
+ def getHQStart(zmw):
+ try:
+ return zmw.zmwMetric('HQRegionStartTime')
+ except:
+ return 0
+
+ def getReadScore(zmw):
+ return zmw.zmwMetric("ReadScore")
+
+ def molLenGuess(zmw):
+ if zmw.baxH5.hasRawBasecalls:
+ return max(map(len, zmw.subreads)) if zmw.subreads else 0
+ else:
+ return len(zmw.ccsRead) if zmw.ccsRead else 0
+
+ def zmwFilterFx(tup):
+ zmw, lZmw = tup
+
+ mlGuess = molLenGuess(zmw)
+ if not mlGuess:
+ return False
+
+ avgScore = lZmw.averageScore
+ numScored = lZmw.nScored
+ scoreRatio = lZmw.scoreRatio
+ hqStart = getHQStart(zmw)
+ readScore = getReadScore(zmw)
+
+ ## XXX : still need to detect the chimeras
+ if mlGuess < runner.args.minMaxInsertLength or \
+ hqStart > runner.args.hqStartTime or \
+ readScore < runner.args.minReadScore or \
+ avgScore < runner.args.minAvgBarcodeScore or \
+ numScored < runner.args.minNumBarcodes or \
+ scoreRatio < runner.args.minScoreRatio:
+ return False
+ else:
+ return True
+
+ return { k:filter(zmwFilterFx, v) for k,v in zmwsForBCs.items() }
+
+def _warnOnce():
+ var = []
+ def warnOnce(msg):
+ if not var:
+ logging.warn(msg)
+ var.append(1)
+ return warnOnce
+warnOnce = _warnOnce()
+
+def getFastqRecords(zmw, lZmw = None):
+ if zmw.baxH5.hasRawBasecalls and zmw.baxH5.hasConsensusBasecalls:
+ # Only examine this parameter when passed both.
+ if runner.args.subreads:
+ reads = zmw.subreads
+ else:
+ reads = [zmw.ccsRead]
+ elif zmw.baxH5.hasRawBasecalls:
+ if runner.args.subreads:
+ warnOnce("`subreads` argument is ignored when using >= 2.1" +
+ "bas.h5 data as input.")
+ reads = zmw.subreads
+ else:
+ if runner.args.subreads:
+ warnOnce("`subreads` argument is ignored when using >= 2.1" +
+ "ccs.h5 data as input.")
+ reads = [zmw.ccsRead]
+
+ extra = (" %g %g" % (round(zmw.zmwMetric("ReadScore"), 2),
+ round(lZmw.averageScore, 2))) if lZmw else ""
+
+ return [FastqRecord(read.readName + extra,
+ read.basecalls(),
+ read.QualityValue()) for read in reads if read]
+
+def getFastqs():
+ zmwsByBarcode = getZmwsForBarcodes()
+ logging.debug("Pre-filter: Average number of ZMWs per barcode: %d" %
+ n.mean([len(zmwsByBarcode[k]) for k in zmwsByBarcode.keys()]))
+
+ zmwsByBarcode = filterZmws(zmwsByBarcode)
+ logging.debug("Post-filter: Average number of ZMWs per barcode: %d" %
+ n.mean([len(zmwsByBarcode[k]) for k in zmwsByBarcode.keys()]))
+
+ def getReadData(zmws):
+ recs = [getFastqRecords(zmw,lZmw) for zmw,lZmw in zmws]
+ recs = filter(lambda x : x, recs)
+ return [elt for sublst in recs for elt in sublst]
+
+ return {k:getReadData(zmws) for k, zmws in zmwsByBarcode.iteritems()}
+
+def emitFastqs():
+ outFiles = getFastqs()
+ outDir = runner.args.outDir
+ fasta = runner.args.fasta
+
+ if runner.args.unlabeledZmws:
+ outFiles['UNLABELED'] = getUnlabeledZmws()
+
+ if not os.path.exists(runner.args.outDir):
+ os.makedirs(runner.args.outDir)
+
+ if fasta:
+ writer = FastaWriter
+ def record(n, s, qv):
+ return FastaRecord(n, s)
+ else:
+ writer = FastqWriter
+ record = FastqRecord
+
+ l = 'a' if runner.args.fasta else 'q'
+ for k in outFiles.keys():
+ if outFiles[k]:
+ with writer("%s/%s.fast%s" % (runner.args.outDir, k, l)) as w:
+ for e in outFiles[k]:
+ tlen = len(e.sequence)-runner.args.trim
+ r = record(e.name, e.sequence[runner.args.trim:tlen],
+ e.quality[runner.args.trim:tlen])
+ if r:
+ w.writeRecord(r)
+
+def getUnlabeledZmws():
+ """Return FASTQ records for ZMWs which do not have a barcode label"""
+ unlabeledZmws = []
+
+ for basFile, barcodeFile in zipFofns(runner.args.inputFofn,
+ runner.args.barcodeFofn):
+ basH5 = BasH5Reader(basFile)
+ bcH5 = BarcodeH5Reader(barcodeFile)
+ sdiff = basH5.sequencingZmws[~n.in1d(basH5.sequencingZmws,
+ bcH5.labeledZmws.keys())]
+ for hn in sdiff:
+ unlabeledZmws.append(basH5[hn])
+
+ return reduce(lambda x,y : x+y, [getFastqRecords(unlabeledZmw) for
+ unlabeledZmw in unlabeledZmws])
+
+def getZmwsForBarcodes(labels = None):
+ """dictionary of pbcore.io.Zmw and LabeledZmw indexed by barcode
+ label"""
+ zmwsForBCs = {}
+ for basFile, barcodeFile in zipFofns(runner.args.inputFofn,
+ runner.args.barcodeFofn):
+ basH5 = BasH5Reader(basFile)
+ bcH5 = BarcodeH5Reader(barcodeFile)
+ allLabs = bcH5.barcodeLabels
+ if labels:
+ allLabs = [x for x in allLabs if x in labels]
+ logging.info("Processing only: %s" % ",".join(allLabs))
+ for label in allLabs:
+ lZmws = bcH5.labeledZmwsFromBarcodeLabel(label)
+ for lZmw in lZmws:
+ zmw = basH5[lZmw.holeNumber]
+ if not label in zmwsForBCs.keys():
+ zmwsForBCs[label] = []
+ zmwsForBCs[label].append((zmw, lZmw))
+
+ return zmwsForBCs
+
+def gconFunc(tp):
+ # called bcause multiprocess
+ rootDir, barcode = tp
+ bcdir = "/".join((rootDir, barcode))
+
+ ## call gcon
+ logging.info("In gconFunc for: %s" % barcode)
+
+ cmd = "gcon.py r --min_cov 3 %s/subreads.fasta %s/seed_read.fasta -d %s" % \
+ (bcdir, bcdir, bcdir)
+ subprocess.call(cmd, shell = True)
+
+ ## check to see if the file is empty
+ r = FastaReader("%s/g_consensus.fa" % bcdir)
+
+ if not list(r)[0].sequence:
+ return None
+
+ ## check to see if we are going to run quiver
+ if not runner.args.noQuiver:
+ # setup the blasr / sam / quiver stuff.
+ logging.info("Setup regions file, now running blasr through quiver.")
+
+ cmd = ('blasr %s %s/g_consensus.fa -nproc 1 -sam -regionTable %s/region.fofn -out ' + \
+ '%s/aligned_reads.sam') % (runner.args.inputFofn, bcdir, bcdir, bcdir)
+ logging.debug(cmd)
+ subprocess.call(cmd, shell = True)
+
+ cmd = 'samtoh5 %s/aligned_reads.sam %s/g_consensus.fa %s/aligned_reads.cmp.h5' % \
+ (bcdir, bcdir, bcdir)
+ logging.debug(cmd)
+ subprocess.call(cmd, shell = True)
+
+ cmd = ('loadPulses %s %s/aligned_reads.cmp.h5 -byread -metrics ' + \
+ 'QualityValue,InsertionQV,MergeQV,DeletionQV,DeletionTag,SubstitutionTag,' + \
+ 'SubstitutionQV') % (runner.args.inputFofn, bcdir)
+ logging.debug(cmd)
+ subprocess.call(cmd, shell = True)
+
+ cmd = 'cmph5tools.py sort --inPlace %s/aligned_reads.cmp.h5' % bcdir
+ logging.debug(cmd)
+ subprocess.call(cmd, shell = True)
+
+ cmd = ('quiver -vv --algorithm quiver -p P4-C2.AllQVsMergingByChannelModel ' \
+ '%s/aligned_reads.cmp.h5 --outputFilename %s/q_consensus.fasta ' + \
+ '--referenceFilename %s/g_consensus.fa') % (bcdir, bcdir, bcdir)
+ logging.debug(cmd)
+ subprocess.call(cmd, shell = True)
+ cFilename = 'q_consensus.fasta'
+ else:
+ cFilename = 'g_consensus.fa'
+
+ ## append results to output file.
+ bcCons = "%s/%s/%s" % (rootDir, barcode, cFilename)
+ if os.path.exists(bcCons):
+ return FastaRecord(barcode, list(FastaReader(bcCons))[0].sequence)
+ else:
+ return None
+
+def subsampleReads(e):
+ logging.debug("starting with %d zmws" % len(e))
+ if runner.args.nZmws > 0:
+ k = runner.args.nZmws if runner.args.nZmws < len(e) else len(e)
+ elif runner.args.subsample < 1:
+ k = int(len(e)*runner.args.subsample)
+ else:
+ k = len(e)
+ i = n.array(random.sample(range(0, len(e)), k), dtype = int)
+ logging.debug("subsampled down to: %d" % len(i))
+ return [e[j] for j in i]
+
+def callConsensus():
+ def makeReadAndReads(zmwsForBC):
+ ccsData = filter(lambda x:x, [zmw.ccsRead for _,_,zmw in zmwsForBC if zmw])
+ srData = reduce(lambda x,y : x+y, [zmw.subreads for zmw,_,_ in
+ zmwsForBC if zmw], [])
+ if not srData and not ccsData:
+ return (None,None)
+
+ def getSeedRead(reads, lq = 80, uq = 90,
+ sLambda = lambda x : -x.zmw.readScore):
+ lens = map(len, reads)
+ candidateRange = (n.percentile(lens, lq),
+ n.percentile(lens, uq))
+ pfReads = [read for read,l in zip(reads, lens) if
+ l >= candidateRange[0] and l <= candidateRange[1]]
+ pfReads.sort(key = sLambda)
+ return pfReads[0] if len(pfReads) else None
+
+ if ccsData:
+ ## all CCS reads should be the *same* length for an
+ ## amplicon. Let's take the middle ones
+ seedRead = getSeedRead(ccsData, lq = 30, uq = 70,
+ sLambda = lambda x: -x.zmw.numPasses)
+ if not seedRead:
+ seedRead = getSeedRead(srData)
+ logging.info("Unable to use a CCS read for the seed read.")
+ else:
+ logging.info("Using a CCS read for the seed read.")
+ else:
+ logging.info("Using a raw read for the seed read")
+ seedRead = getSeedRead(srData)
+
+ return (seedRead, srData)
+
+ # check to make sure that you have the necessary dependencies,
+ # i.e., hgap script, blasr, etc.
+ try:
+ import pbtools.pbdagcon
+ except ImportError:
+ raise ImportError("Unable to find dependency `pbdagcon` - please install.")
+
+ # retrieve ZMWs by barcode
+ if runner.args.barcode:
+ zmwsForBCs = getZmwsForBarcodes(runner.args.barcode)
+ else:
+ zmwsForBCs = getZmwsForBarcodes()
+
+ # subsample
+ zmwsForBCs = {k:subsampleReads(v) for k,v in zmwsForBCs.items()}
+
+ logging.info("unfiltered average zmws per barcode: %g" %
+ n.round(n.mean(map(len, zmwsForBCs.values()))))
+
+ # filter ZMWs
+ zmwsForBCs = filterZmws(zmwsForBCs)
+
+ logging.info("filtered average zmws per barcode: %g" %
+ n.round(n.mean(map(len, zmwsForBCs.values()))))
+
+ # now choose the best subread to seed the assembly
+ if runner.args.ccsFofn:
+ # XXX: This part depends on the filenames of the ccs and input
+ # fofns, this is essentially a workaround to the fact the the
+ # part isn't part of the API
+ ccsReaders = {movieNameFromFile(l):BasH5Reader(l) for l in
+ open(runner.args.ccsFofn).read().splitlines()}
+
+ # fill in the CCS spot.
+ for k,v in zmwsForBCs.items():
+ l = []
+ for zmw,lZmw in v:
+ r = ccsReaders[movieNameFromFile(zmw.baxH5.file.filename)]
+ l.append((zmw,lZmw,r[zmw.holeNumber]))
+ zmwsForBCs[k] = l
+ else:
+ # add none to the CCS spot.
+ zmwsForBCs = {k:[(zmw,lZmw,None) for zmw,lZmw in v]
+ for k,v in zmwsForBCs.iteritems()}
+
+ readAndReads = { k:makeReadAndReads(v) for k,v in zmwsForBCs.items() }
+
+ # remove barcodes that don't have a seed read and a set of useable reads.
+ readAndReads = { k:v for k,v in readAndReads.items() if v[0] and v[1] }
+
+ # generate FASTA files
+ outDir = runner.args.outDir
+
+ for barcode, reads in readAndReads.items():
+ bcdir = '/'.join((outDir, barcode))
+ if not os.path.exists(bcdir):
+ os.makedirs(bcdir)
+
+ # emit the seeds to separte files
+ with FastaWriter("%s/seed_read.fasta" % bcdir) as w:
+ w.writeRecord(FastaRecord(reads[0].readName, reads[0].basecalls()))
+
+ subreads = reads[1]
+
+ # emit the subreads to a single file
+ with FastaWriter("%s/subreads.fasta" % bcdir) as w:
+ for r in subreads:
+ w.writeRecord(FastaRecord(r.readName, r.basecalls()))
+
+ # construct the region file by subsetting the ZMWs that you
+ # are interested in.
+ nfofn = []
+ for inFof, in zipFofns(runner.args.inputFofn):
+ bh5 = BaxH5Reader(inFof)
+ reg = bh5.file['/PulseData/Regions']
+ inMovie = filter(lambda z : z.baxH5.movieName == bh5.movieName,
+ subreads)
+ holes = n.in1d(reg[:,0], n.array([a.holeNumber for a in inMovie]))
+ if any(holes):
+ nreg = reg[holes,:]
+ else:
+ nreg = n.empty(shape = (0, reg.shape[1]), dtype = 'int32')
+
+ fname = "%s/%s.rgn.h5" % (bcdir, movieNameFromFile(inFof))
+ nfile = h5.File(fname, 'w')
+ ndset = nfile.create_dataset('/PulseData/Regions', data = nreg,
+ maxshape = (None, None))
+ copyAttributes(reg, ndset)
+ nfile.close()
+ nfofn.append(fname)
+
+ ofile = open('%s/region.fofn' % bcdir, 'w')
+ ofile.writelines("\n".join(nfofn))
+ ofile.close()
+
+ ## call gcon
+ outDirs = [ (outDir, k) for k in readAndReads.keys() ]
+ if runner.args.nProcs == 1:
+ outFasta = filter(lambda z: z, map(gconFunc, outDirs))
+ else:
+ pool = Pool(runner.args.nProcs)
+ outFasta = filter(lambda z : z, pool.map(gconFunc, outDirs))
+
+ ## write the results
+ with FastaWriter('/'.join((outDir, "consensus.fa"))) as w:
+ for r in outFasta:
+ w.writeRecord(r)
+
+ ## optionally cleanup
+ if not runner.args.keepTmpDir:
+ for barcode, reads in readAndReads.items():
+ bcdir = '/'.join((outDir, barcode))
+ shutil.rmtree(bcdir)
+
+
+class Pbbarcode(PBMultiToolRunner):
+ def __init__(self):
+ desc = ['Utilities for labeling and annoting reads with barcode information.']
+ super(Pbbarcode, self).__init__('\n'.join(desc))
+ subparsers = self.subParsers
+
+ desc = ['Creates a barcode.h5 file from base h5 files.']
+ parser_m = subparsers.add_parser('labelZmws', description = "\n".join(desc),
+ help = 'Label zmws with barcode annotation',
+ formatter_class = \
+ argparse.ArgumentDefaultsHelpFormatter)
+ parser_m.add_argument('--outDir',
+ help = 'Where to write the newly created barcode.h5 files.',
+ default = os.getcwd())
+ parser_m.add_argument('--outFofn', help = 'Write to outFofn',
+ default = 'barcode.fofn')
+ parser_m.add_argument('--adapterSidePad', help = 'Pad with adapterSidePad bases',
+ default = 4, type = int)
+ parser_m.add_argument('--insertSidePad', help = 'Pad with insertSidePad bases',
+ default = 4, type = int)
+ parser_m.add_argument('--scoreMode',
+ help = 'The mode in which the barcodes should be scored.',
+ choices = SCORE_MODES, default = 'symmetric', type = str)
+ parser_m.add_argument('--maxAdapters', type = int, default = 20,
+ help = 'Only score the first maxAdapters')
+ parser_m.add_argument('--scoreFirst', action = 'store_true', default = False,
+ help = 'Whether to try to score the leftmost barcode in a trace.')
+ parser_m.add_argument('--startTimeCutoff',
+ help = 'Reads must start before this value in order to be ' + \
+ 'included when scoreFirst is set.', type = float,
+ default = 10.0)
+ parser_m.add_argument('--nZmws', type = int, default = -1,
+ help = 'Use the first n ZMWs for testing')
+ parser_m.add_argument('--nProcs', type = int, default = 8,
+ help = 'How many processes to use')
+ parser_m.add_argument('--saveExtendedInfo', action = 'store_true', default = False,\
+ help = 'Whether to save extended information to' + \
+ 'the barcode.h5 files; this information is useful for ' + \
+ 'debugging and chimera detection')
+ parser_m.add_argument('barcodeFile', metavar = 'barcode.fasta',
+ help = 'Input barcode fasta file')
+ parser_m.add_argument('inputFile', metavar = 'input.fofn',
+ help = 'Input base fofn')
+
+ def addFilteringOpts(parser, justBarcode = False):
+ ## These are independent of the barcode scoring
+ if not justBarcode:
+ parser.add_argument('--minMaxInsertLength', default = 0, type = int,
+ help = "ZMW Filter: exclude ZMW if the longest subread" + \
+ "is less than this amount")
+ parser.add_argument('--hqStartTime', default = float("inf"), type = float,
+ help = "ZMW Filter: exclude ZMW if start time of HQ region" + \
+ "greater than this value (seconds)")
+ parser.add_argument('--minReadScore', default = 0, type = float,
+ help = "ZMW Filter: exclude ZMW if readScore is less than" + \
+ "this value")
+
+ ## These obviously need the barcode score
+ parser.add_argument('--minAvgBarcodeScore', default = 0.0, type = float,
+ help = "ZMW Filter: exclude ZMW if average barcode score " + \
+ "is less than this value")
+ parser.add_argument('--minNumBarcodes', default = 1, type = int,
+ help = "ZMW Filter: exclude ZMW if number of barcodes observed " + \
+ "is less than this value")
+ parser.add_argument('--minScoreRatio', default = 1.0, type = float,
+ help = "ZMW Filter: exclude ZMWs whose best score divided by " + \
+ "the 2nd best score is less than this ratio")
+
+ # Not yet implemented
+ # parser.add_argument('--filterChimeras', default = False, action = 'store_true',
+ # help = "ZMW Filter: exclude ZMWs that appear to be chimeric")
+
+
+ desc = ['Adds information about barcode alignments to a cmp.h5 file',
+ 'from a previous call to "labelZmws".']
+ parser_s = subparsers.add_parser('labelAlignments', description = "\n".join(desc),
+ help = "Label reads from a barcode or region h5 file",
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ addFilteringOpts(parser_s, justBarcode = True)
+ parser_s.add_argument('inputFofn', metavar = 'barcode.fofn',
+ help = 'input barcode fofn file')
+ parser_s.add_argument('cmpH5', metavar = 'aligned_reads.cmp.h5',
+ help = 'cmp.h5 file to add barcode labels')
+
+ desc = ['Takes a bas.h5 fofn and a barcode.h5 fofn and produces',
+ 'a fast[a|q] file for each barcode.']
+ parser_s = subparsers.add_parser('emitFastqs', description = "\n".join(desc),
+ help = "Write fastq files",
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser_s.add_argument('--outDir', metavar = 'output.dir',
+ help = 'output directory to write fastq files',
+ default = os.getcwd())
+
+ parser_s.add_argument('--subreads',
+ help = 'whether to produce fastq files for the subreads;' + \
+ 'the default is to use the CCS reads. This option only' + \
+ 'applies when input.fofn has both consensus and raw reads,' + \
+ 'otherwise the read type from input.fofn will be returned.',
+ action = 'store_true',
+ default = False)
+ parser_s.add_argument('--unlabeledZmws',
+ help = 'whether to emit a fastq file for the unlabeled ZMWs.' + \
+ ' These are the ZMWs where no adapters are found typically',
+ action = 'store_true',
+ default = False)
+
+ parser_s.add_argument('--trim', help = 'trim off barcodes and any excess constant sequence',
+ default = 20, type = int)
+ parser_s.add_argument('--fasta', help = ('whether the files produced should be FASTA files as' +
+ 'opposed to FASTQ'),
+ action = 'store_true',
+ default = False)
+ addFilteringOpts(parser_s)
+ parser_s.add_argument('inputFofn', metavar = 'input.fofn',
+ help = 'input base or CCS fofn file')
+ parser_s.add_argument('barcodeFofn', metavar = 'barcode.fofn',
+ help = 'input barcode.h5 fofn file')
+
+ desc = ['Compute consensus sequences for each barcode.']
+ parser_s = subparsers.add_parser('consensus', description = "\n".join(desc),
+ help = "Compute a consensus sequence for each barcode." + \
+ "This command relies on the presence of pbdagcon",
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser_s.add_argument('--subsample', default = 1, type = float,
+ help = "Subsample ZMWs")
+ parser_s.add_argument('--nZmws', default = -1, type = int,
+ help = "Take n ZMWs")
+ parser_s.add_argument('--outDir', default = '.', type = str,
+ help = "Use this directory to output results")
+ parser_s.add_argument('--keepTmpDir', action = 'store_true', default = False)
+ parser_s.add_argument('--ccsFofn', default = '', type = str,
+ help = 'Obtain CCS data from ccsFofn instead of input.fofn')
+ parser_s.add_argument('--nProcs', default = 16, type = int,
+ help = 'Use nProcs to execute.')
+ parser_s.add_argument('--noQuiver', action = 'store_true',
+ default = False)
+ addFilteringOpts(parser_s)
+
+ parser_s.add_argument('inputFofn', metavar = 'input.fofn',
+ help = 'input bas.h5 fofn file')
+ parser_s.add_argument('barcodeFofn', metavar = 'barcode.fofn',
+ help = 'input bc.h5 fofn file')
+
+ parser_s.add_argument('--barcode', default = None, type = str, nargs = "+",
+ help = "Use this to extract consensus for just one barcode.")
+
+ def getVersion(self):
+ return __version__
+
+ def run(self):
+ logging.debug("Arguments" + str(self.args))
+
+ if self.args.subCommand == 'labelZmws':
+ makeBarcodeFofnFromBasFofn()
+ elif self.args.subCommand == 'labelAlignments':
+ labelAlignments()
+ elif self.args.subCommand == 'emitFastqs':
+ emitFastqs()
+ elif self.args.subCommand == 'consensus':
+ callConsensus()
+ else:
+ sys.exit(1)
+
+# Module-level tool instance; the sub-command functions in this module
+# read their options from runner.args.
+runner = Pbbarcode()
+
+def main():
+ """The entry point for pbbarcode"""
+ sys.exit(runner.start())
+
+#if __name__ == '__main__':
+# runner = Pbbarcode()
+# sys.exit(runner.start())
diff --git a/tests/cram/consensus.t.disabled b/tests/cram/consensus.t.disabled
new file mode 100644
index 0000000..5bd9285
--- /dev/null
+++ b/tests/cram/consensus.t.disabled
@@ -0,0 +1,88 @@
+Locate the pbcore example data, list both bas.h5 files in a fofn, label them and call consensus
+ $ export INH5=`python -c "from pbcore import data ; print data.getCmpH5()"`
+ $ export INBH51=`python -c "from pbcore import data ; print data.getBasH5s()[0]"`
+ $ export INBH52=`python -c "from pbcore import data ; print data.getBasH5s()[1]"`
+ $ export BARCODE_FASTA=$TESTDIR/../../etc/barcode.fasta
+ $ echo $INBH51 > bas.fofn
+ $ echo $INBH52 >> bas.fofn
+ $ pbbarcode labelZmws $BARCODE_FASTA bas.fofn
+ $ pbbarcode consensus bas.fofn barcode.fofn
+ [INFO] 2013-08-02T00:28:46 [blasr] started.
+ [INFO] 2013-08-02T00:28:46[INFO] [blasr] started.
+ 2013-08-02T00:28:46 [blasr] started.
+ [INFO] 2013-08-02T00:28:46 [blasr] ended.
+ [INFO] 2013-08-02T00:28:46 [blasr] ended.
+ [INFO] 2013-08-02T00:28:46 [blasr] ended.
+ [INFO] 2013-08-02T00:28:46 [blasr] started.
+ [INFO] 2013-08-02T00:28:46 [blasr] ended.
+ [INFO] 2013-08-02T00:28:46 [blasr] started.
+ [INFO] 2013-08-02T00:28:46 [blasr] started.
+ [INFO] 2013-08-02T00:28:46 [blasr] ended.
+ [INFO] 2013-08-02T00:28:46 [blasr] ended.
+ [INFO] 2013-08-02T00:28:46 [blasr] started.
+ [INFO] 2013-08-02T00:28:46 [blasr] started.
+ [INFO] 2013-08-02T00:28:46 [blasr] started.
+ [INFO] 2013-08-02T00:28:46 [blasr] ended.
+ [INFO] 2013-08-02T00:28:46 [blasr] ended.
+ [INFO] 2013-08-02T00:28:46 [blasr] ended.
+ [INFO] 2013-08-02T00:28:46 [blasr] started.
+ [INFO] 2013-08-02T00:28:46 [blasr] ended.
+ [INFO] 2013-08-02T00:28:46 [blasr] started.
+ [INFO] 2013-08-02T00:28:46 [blasr] started.
+ [INFO] 2013-08-02T00:28:46 [blasr] started.
+ [INFO] 2013-08-02T00:28:46 [blasr] ended.
+ [INFO] 2013-08-02T00:28:46 [blasr] ended.
+ [INFO] 2013-08-02T00:28:46 [blasr] ended.
+ [INFO] 2013-08-02T00:28:46 [blasr] started.
+ [INFO] 2013-08-02T00:28:46 [blasr] started.
+ [INFO] 2013-08-02T00:28:46 [blasr] started.
+ [INFO] 2013-08-02T00:28:46 [blasr] ended.
+ [INFO] 2013-08-02T00:28:46 [blasr] ended.
+ [INFO] 2013-08-02T00:28:46 [blasr] ended.
+ [INFO] 2013-08-02T00:28:46 [blasr] started.
+ [INFO] 2013-08-02T00:28:46 [blasr] started.
+ [INFO] 2013-08-02T00:28:46 [blasr] started.
+ [INFO] 2013-08-02T00:28:46 [blasr] started.
+ [INFO] 2013-08-02T00:28:46 [blasr] ended.
+ [INFO] 2013-08-02T00:28:46 [blasr] ended.
+ [INFO] 2013-08-02T00:28:46 [blasr] ended.
+ [INFO] 2013-08-02T00:28:46 [blasr] ended.
+ [INFO] [INFO] 2013-08-02T00:28:462013-08-02T00:28:46 [blasr] started. [blasr] started.
+
+ [INFO] 2013-08-02T00:28:46 [blasr] started.
+ [INFO] 2013-08-02T00:28:46 [blasr] started.
+ [INFO] 2013-08-02T00:28:46 [blasr] ended.
+ [INFO] 2013-08-02T00:28:46 [blasr] ended.
+ [INFO] 2013-08-02T00:28:46 [blasr] ended.
+ [INFO] 2013-08-02T00:28:46 [blasr] ended.
+ [INFO] 2013-08-02T00:28:46 [blasr] started.
+ [INFO] 2013-08-02T00:28:46 [blasr] started.
+ [INFO] 2013-08-02T00:28:46 [blasr] started.
+ [INFO] 2013-08-02T00:28:46 [blasr] started.
+ [INFO] 2013-08-02T00:28:46 [blasr] ended.
+ [INFO] 2013-08-02T00:28:46 [blasr] ended.
+ [INFO] 2013-08-02T00:28:46 [blasr] ended.
+ [INFO] 2013-08-02T00:28:46 [blasr] ended.
+ [INFO] 2013-08-02T00:28:46 [blasr] started.
+ [INFO] 2013-08-02T00:28:46 [blasr] started.
+ [INFO] 2013-08-02T00:28:46 [blasr] started.
+ [INFO] 2013-08-02T00:28:46 [blasr] started.
+ [INFO] 2013-08-02T00:28:46 [blasr] ended.
+ [INFO] 2013-08-02T00:28:46 [blasr] ended.
+ [INFO] 2013-08-02T00:28:46 [blasr] ended.
+ [INFO] 2013-08-02T00:28:46 [blasr] ended.
+ [INFO] 2013-08-02T00:28:46 [blasr] started.
+ [INFO] 2013-08-02T00:28:46 [blasr] started.
+ [INFO] 2013-08-02T00:28:46 [blasr] started.
+ [INFO] 2013-08-02T00:28:46 [blasr] started.
+ [INFO] 2013-08-02T00:28:46 [blasr] ended.
+ [INFO] 2013-08-02T00:28:46 [blasr] ended.
+ [INFO] 2013-08-02T00:28:46 [blasr] ended.
+ [INFO] 2013-08-02T00:28:46 [blasr] ended.
+ [INFO] 2013-08-02T00:28:46 [blasr] started.
+ [INFO] 2013-08-02T00:28:46 [blasr] started.
+ [INFO] 2013-08-02T00:28:46 [blasr] ended.
+ [INFO] 2013-08-02T00:28:46 [blasr] ended.
+ [INFO] 2013-08-02T00:28:46 [blasr] started.
+ [INFO] 2013-08-02T00:28:46 [blasr] ended.
+ [INFO] 2013-08-02T00:28:46 [blasr] started.
+ [INFO] 2013-08-02T00:28:46 [blasr] ended.
diff --git a/tests/cram/sanity.t b/tests/cram/sanity.t
new file mode 100644
index 0000000..310c4db
--- /dev/null
+++ b/tests/cram/sanity.t
@@ -0,0 +1,55 @@
+Locate the pbcore example data and list both bas.h5 files in bas.fofn
+ $ export INH5=`python -c "from pbcore import data ; print data.getCmpH5()"`
+ $ export INBH51=`python -c "from pbcore import data ; print data.getBasH5s()[0]"`
+ $ export INBH52=`python -c "from pbcore import data ; print data.getBasH5s()[1]"`
+ $ export BARCODE_FASTA=$TESTDIR/../../etc/barcode.fasta
+ $ echo $INBH51 > bas.fofn
+ $ echo $INBH52 >> bas.fofn
+Label the ZMWs with the default options and with several scoring options
+ $ pbbarcode labelZmws $BARCODE_FASTA bas.fofn
+ $ pbbarcode labelZmws --scoreMode paired $BARCODE_FASTA bas.fofn
+ $ pbbarcode labelZmws --scoreMode paired --scoreFirst $BARCODE_FASTA bas.fofn
+ $ pbbarcode labelZmws --scoreMode paired --scoreFirst --adapterSidePad 0 --insertSidePad 0 $BARCODE_FASTA bas.fofn
+Emit per-barcode files as FASTA, as trimmed FASTQ, and as trimmed FASTQ of the subreads
+ $ pbbarcode emitFastqs --fasta bas.fofn barcode.fofn
+ $ pbbarcode emitFastqs --trim 20 bas.fofn barcode.fofn
+ $ pbbarcode emitFastqs --subreads --trim 20 bas.fofn barcode.fofn
+Copy the example cmp.h5 locally, change its permissions and add the barcode labels to the copy
+ $ cp $INH5 ./aligned_reads.cmp.h5
+ $ chmod 766 ./aligned_reads.cmp.h5
+ $ pbbarcode labelAlignments barcode.fofn aligned_reads.cmp.h5
+Check that same holes get the same barcode (consistent scoring)
+ $ cmph5tools.py stats --what "(Movie,HoleNumber,Barcode,AverageBarcodeScore)" aligned_reads.cmp.h5 | uniq
+ Movie Barcode AverageBarcodeScore HoleNumber
+ m110818_075520_42141_c100129202555500000315043109121112_s2_p0 bc5--bc10 13.00 3008
+ m110818_075520_42141_c100129202555500000315043109121112_s1_p0 bc5--bc10 12.50 2001
+ m110818_075520_42141_c100129202555500000315043109121112_s1_p0 bc5--bc10 12.00 4009
+ m110818_075520_42141_c100129202555500000315043109121112_s1_p0 bc3--bc4 12.57 2008
+ m110818_075520_42141_c100129202555500000315043109121112_s2_p0 bc5--bc10 14.33 3006
+ m110818_075520_42141_c100129202555500000315043109121112_s2_p0 bc3--bc4 12.00 1000
+ m110818_075520_42141_c100129202555500000315043109121112_s2_p0 bc5--bc10 12.00 4004
+ m110818_075520_42141_c100129202555500000315043109121112_s1_p0 bc5--bc10 14.50 1006
+ m110818_075520_42141_c100129202555500000315043109121112_s1_p0 bc5--bc10 12.00 4006
+ m110818_075520_42141_c100129202555500000315043109121112_s2_p0 bc5--bc10 13.33 2006
+ m110818_075520_42141_c100129202555500000315043109121112_s2_p0 bc5--bc10 12.67 3002
+ m110818_075520_42141_c100129202555500000315043109121112_s2_p0 bc5--bc10 13.33 2006
+ m110818_075520_42141_c100129202555500000315043109121112_s2_p0 bc5--bc10 12.67 1009
+ m110818_075520_42141_c100129202555500000315043109121112_s2_p0 bc5--bc10 12.67 3002
+ m110818_075520_42141_c100129202555500000315043109121112_s2_p0 bc5--bc10 12.67 1009
+ m110818_075520_42141_c100129202555500000315043109121112_s1_p0 bc5--bc10 13.33 1000
+ m110818_075520_42141_c100129202555500000315043109121112_s1_p0 bc5--bc10 12.33 1007
+ m110818_075520_42141_c100129202555500000315043109121112_s2_p0 bc5--bc10 12.50 9
+ m110818_075520_42141_c100129202555500000315043109121112_s2_p0 bc3--bc4 13.00 1004
+ m110818_075520_42141_c100129202555500000315043109121112_s2_p0 bc5--bc10 12.00 2002
+ m110818_075520_42141_c100129202555500000315043109121112_s2_p0 bc3--bc4 12.80 2004
+ m110818_075520_42141_c100129202555500000315043109121112_s2_p0 bc3--bc4 12.00 4007
+ m110818_075520_42141_c100129202555500000315043109121112_s2_p0 bc3--bc4 12.80 2004
+ m110818_075520_42141_c100129202555500000315043109121112_s1_p0 bc3--bc4 12.00 3008
+ m110818_075520_42141_c100129202555500000315043109121112_s2_p0 bc3--bc4 14.33 2009
+ m110818_075520_42141_c100129202555500000315043109121112_s1_p0 bc5--bc10 14.50 2007
+ m110818_075520_42141_c100129202555500000315043109121112_s1_p0 bc3--bc4 12.57 2008
+ m110818_075520_42141_c100129202555500000315043109121112_s2_p0 bc3--bc4 16.00 1002
+ m110818_075520_42141_c100129202555500000315043109121112_s2_p0 bc3--bc4 13.33 1008
+ m110818_075520_42141_c100129202555500000315043109121112_s2_p0 bc5--bc10 12.50 9
+ m110818_075520_42141_c100129202555500000315043109121112_s2_p0 bc3--bc4 14.00 2000
+ m110818_075520_42141_c100129202555500000315043109121112_s1_p0 bc3--bc4 11.67 9
+ m110818_075520_42141_c100129202555500000315043109121112_s2_p0 bc3--bc4 14.00 2000
+ m110818_075520_42141_c100129202555500000315043109121112_s1_p0 bc3--bc4 11.67 9
+ m110818_075520_42141_c100129202555500000315043109121112_s2_p0 bc3--bc4 14.33 8
+ m110818_075520_42141_c100129202555500000315043109121112_s1_p0 bc3--bc4 14.33 2003
diff --git a/tests/test_basic.py b/tests/test_basic.py
new file mode 100755
index 0000000..064bf71
--- /dev/null
+++ b/tests/test_basic.py
@@ -0,0 +1,32 @@
+import logging
+import unittest
+
+# this is purely for the coverage to not fail when it's generated
+import pbbarcode
+
+log = logging.getLogger(__name__)
+
+
+class TestBasic(unittest.TestCase):
+ def test_01(self):
+ """Place holder so jenkins will generate a coverage report"""
+ self.assertTrue(True)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/pbbarcode.git
More information about the debian-med-commit
mailing list