[med-svn] [kineticstools] 01/03: Imported Upstream version 0.5.1+20150828+git3aa3d96+dfsg
Afif Elghraoui
afif-guest at moszumanska.debian.org
Tue Dec 8 08:30:02 UTC 2015
This is an automated email from the git hooks/post-receive script.
afif-guest pushed a commit to branch master
in repository kineticstools.
commit 3aad1fff0cd46aca33e2ae24f5e6b7d603c9a876
Author: Afif Elghraoui <afif at ghraoui.name>
Date: Mon Dec 7 22:48:29 2015 -0800
Imported Upstream version 0.5.1+20150828+git3aa3d96+dfsg
---
.gitignore | 5 +
CHANGELOG | 8 +
LICENSES.txt | 32 +
Makefile | 49 ++
README.md | 12 +
bin/__init__.py | 29 +
bin/copyIpdSummaryDataset.py | 125 +++
bin/testShared.py | 75 ++
bin/writeSummaryToCmp.py | 160 ++++
doc/Makefile | 153 ++++
doc/conf.py | 245 ++++++
doc/index.rst | 10 +
doc/make.bat | 190 +++++
doc/manual.rst | 180 +++++
doc/regional_detection_manual.rst | 131 ++++
kineticsTools/BasicLdaEnricher.py | 68 ++
kineticsTools/KineticWorker.py | 838 ++++++++++++++++++++
kineticsTools/MedakaLdaEnricher.py | 110 +++
kineticsTools/MixtureEstimationMethods.py | 314 ++++++++
kineticsTools/ModificationDecode.py | 368 +++++++++
kineticsTools/MultiSiteCommon.py | 176 +++++
kineticsTools/MultiSiteDetection.py | 320 ++++++++
kineticsTools/PositiveControlEnricher.py | 147 ++++
kineticsTools/ReferenceUtils.py | 154 ++++
kineticsTools/ResultWriter.py | 870 +++++++++++++++++++++
kineticsTools/WorkerProcess.py | 260 ++++++
kineticsTools/__init__.py | 29 +
kineticsTools/ipdModel.py | 611 +++++++++++++++
kineticsTools/ipdSummary.py | 780 ++++++++++++++++++
kineticsTools/pipelineTools.py | 49 ++
kineticsTools/resources/C2.h5 | Bin 0 -> 12164810 bytes
kineticsTools/resources/P4-C2.h5 | Bin 0 -> 14529175 bytes
kineticsTools/resources/P5-C3.h5 | Bin 0 -> 29242809 bytes
kineticsTools/resources/P6-C4.h5 | Bin 0 -> 9644191 bytes
kineticsTools/resources/XL-C2.h5 | Bin 0 -> 14426526 bytes
kineticsTools/resources/XL-XL.h5 | Bin 0 -> 14426526 bytes
.../p4_c2_arabidopsis_2_2_binary_classifier.csv | 127 +++
.../resources/p4_c2_medaka_2_binary_classifier.csv | 127 +++
kineticsTools/resources/unknown.h5 | Bin 0 -> 12164810 bytes
kineticsTools/sharedArray.py | 51 ++
kineticsTools/summarizeModifications.py | 201 +++++
kineticsTools/tree_predict.c | 212 +++++
setup.py | 29 +
strand-conventions.txt | 33 +
test/cram/case-ctrl.t | 37 +
test/cram/detection.t | 76 ++
test/cram/detection_bam.t | 56 ++
test/cram/detection_bam_dataset.t | 56 ++
test/cram/detection_bam_lossless.t | 50 ++
test/cram/identify.t | 79 ++
test/cram/long_running/README.txt | 42 +
test/cram/long_running/detect_and_identify_Bsub.t | 31 +
test/cram/long_running/detect_and_identify_Cagg.t | 26 +
test/cram/long_running/detect_and_identify_Hpyl.t | 28 +
test/cram/long_running/detect_and_identify_Mjan.t | 28 +
test/cram/long_running/run_on_cluster.py | 14 +
test/cram/methyl-fraction-case-ctrl.t | 35 +
test/cram/methyl-fraction-identify.t.off | 77 ++
test/cram/portability.sh | 4 +
test/cram/version.t | 32 +
test/data/c2-c2-lambda-mod-decode.cmp.h5 | Bin 0 -> 3508085 bytes
test/data/lambda/reference.info.xml | 19 +
test/data/lambda/sequence/lambda.fasta | 810 +++++++++++++++++++
test/data/lambda/sequence/lambda.fasta.fai | 1 +
test/data/p4-c2-lambda-mod-decode.cmp.h5 | Bin 0 -> 9549747 bytes
test/detectionMethylFractionTest.py | 52 ++
test/methyFractionTest.py | 51 ++
test/speed.py | 27 +
test/test.py | 181 +++++
test/test_ReferenceUtils.py | 73 ++
test/test_inputs.py | 147 ++++
test/test_tool_contract.py | 110 +++
72 files changed, 9420 insertions(+)
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..8ea5439
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,5 @@
+build/
+dist/
+src/pbtools.kineticsTools.egg-info/
+*.pyc
+kineticsTools.egg-info/
diff --git a/CHANGELOG b/CHANGELOG
new file mode 100644
index 0000000..8a4bfe6
--- /dev/null
+++ b/CHANGELOG
@@ -0,0 +1,8 @@
+
+Version 0.5.0 (9/29/2014)
+
+ * Migrate algorithm parameters (hdf5 files containing the GBM
+ trees) into kineticsTools, out of SMRTpipe
+ * Will now FAIL with an informative message if the chemistry is
+ "unknown" or is not one of the chemistries we have trained for.
+ * Add CHANGELOG
diff --git a/LICENSES.txt b/LICENSES.txt
new file mode 100644
index 0000000..ccdee9e
--- /dev/null
+++ b/LICENSES.txt
@@ -0,0 +1,32 @@
+Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+* Neither the name of Pacific Biosciences nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS CONTRIBUTORS
+BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
+IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..98f9dde
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,49 @@
+# Build, install and test driver for kineticsTools.
+# DATA and PREFIX are only read by the pip-install target and must be
+# supplied by the caller (e.g. `make pip-install PREFIX=... DATA=...`).
+SHELL = /bin/bash -e
+
+all: build install
+
+build:
+	python setup.py build --executable="/usr/bin/env python"
+
+bdist:
+	python setup.py build --executable="/usr/bin/env python"
+	python setup.py bdist --formats=egg
+
+install:
+	python setup.py install
+
+develop:
+	python setup.py develop
+
+clean:
+	rm -rf build/;\
+	find . -name "*.egg-info" | xargs rm -rf;\
+	find . -name "*.pyc" | xargs rm -rf;\
+	rm -rf dist/
+
+test: tests
+check: tests
+tests: cram-tests unit-tests
+
+cram-tests:
+	cram test/cram/*.t
+
+long-tests:
+	cram test/cram/long_running/*.t
+
+unit-tests:
+	nosetests -s -v test/*.py
+
+# Remove any previously installed copy (under either historical package
+# name) before installing from the working tree; a missing package is not
+# an error, hence the trailing `|| true`.
+pip-install:
+	@which pip > /dev/null
+	@pip freeze|grep 'kineticsTools=='>/dev/null \
+	  && ( pip uninstall -y kineticsTools \
+	    || pip uninstall -y pbtools.kineticsTools ) \
+	  || true
+	@pip install --no-index \
+	  --install-option="--install-data=$(DATA)" \
+	  --install-option="--install-scripts=$(PREFIX)/bin" \
+	  ./
+
+# Every target here is a command, not a file.  `build` and `test` in
+# particular share their names with the build/ and test/ directories, so
+# without this declaration make would consider them already up to date.
+.PHONY: all build bdist install develop clean test check tests
+.PHONY: cram-tests long-tests unit-tests pip-install
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..1e95e30
--- /dev/null
+++ b/README.md
@@ -0,0 +1,12 @@
+kineticsTools
+=============
+
+Tools for detecting DNA modifications from single molecule, real-time (SMRT®) sequencing data. This tool implements the P_ModificationDetection module in SMRT® Portal, used by the RS_Modification_Detection and RS_Modifications_and_Motif_Detection protocols. Researchers interested in understanding or extending the modification detection algorithms can use these tools as a starting point.
+
+Academic Publications:
+ * [Characterization of DNA methyltransferase specificities using single-molecule, real-time DNA sequencing](http://nar.oxfordjournals.org/content/40/4/e29)
+ * [The methylomes of six bacteria](http://nar.oxfordjournals.org/content/early/2012/10/02/nar.gks891.full)
+
+Documentation:
+ * [Tool documentation](http://github.com/PacificBiosciences/kineticsTools/blob/master/doc/manual.rst)
+ * [Methods description](http://github.com/PacificBiosciences/kineticsTools/blob/master/doc/whitepaper/kinetics.pdf)
diff --git a/bin/__init__.py b/bin/__init__.py
new file mode 100755
index 0000000..402d7a8
--- /dev/null
+++ b/bin/__init__.py
@@ -0,0 +1,29 @@
+#################################################################################
+# Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#################################################################################
diff --git a/bin/copyIpdSummaryDataset.py b/bin/copyIpdSummaryDataset.py
new file mode 100755
index 0000000..ebcfb24
--- /dev/null
+++ b/bin/copyIpdSummaryDataset.py
@@ -0,0 +1,125 @@
+#!/usr/bin/env python
+#################################################################################
+# Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#################################################################################
+
+import os, sys
+
+from pbcore.util.ToolRunner import PBToolRunner
+from operator import xor
+import h5py
+
+# Version info
+__version__ = "1.1"
+
+
+def validateFile(p):
+    """
+    Return the absolute form of path p when it names an existing regular
+    file; raise IOError otherwise.  Used as the ``type=`` converter for the
+    command-line path options of this script.
+    """
+    if not os.path.isfile(p):
+        raise IOError("Unable to find {p}.".format(p=p))
+    return os.path.abspath(p)
+
+
+class CopyIpdSummaryDatasets(PBToolRunner):
+    """
+    Copy IpdRatio datasets from infile:/ref0000x to outfile:/ref000x/Kinetics/IpdRatio
+    or from infile:/ref0000x to mergefile:/ref000x (Dataset)
+    """
+    # NOTE: the docstring above is handed to the base-class constructor in
+    # __init__, so it is program text as well as documentation.
+
+    def __init__(self):
+        super(CopyIpdSummaryDatasets, self).__init__(CopyIpdSummaryDatasets.__doc__)
+
+        # Every path option is converted by validateFile, so all files named
+        # on the command line must already exist; the two output variants are
+        # opened in 'r+' mode and updated in place.
+        self.parser.add_argument('--infile',
+                                 required=True,
+                                 type=validateFile,
+                                 dest='infile',
+                                 help='Input cmp.h5 filename')
+
+        self.parser.add_argument('--outfile',
+                                 type=validateFile,
+                                 required=False,
+                                 help='Output cmp.h5 filename')
+
+        self.parser.add_argument('--mergefile',
+                                 type=validateFile,
+                                 required=False,
+                                 help='Filename of output h5 file for merging')
+
+    def validateArgs(self):
+        """Require exactly one of --outfile / --mergefile."""
+        if not xor(bool(self.args.mergefile), bool(self.args.outfile)):
+            raise Exception("Exactly one of --outfile, --mergefile is required")
+
+    def getVersion(self):
+        """Return the script's version string."""
+        return __version__
+
+    def copyToCmpH5(self):
+        """
+        For every top-level object in the input file whose name also exists
+        at the top level of the output file, copy it to
+        <name>/Kinetics/IpdRatio in the output file, replacing any existing
+        IpdRatio entry.  Names absent from the output file are skipped.
+        """
+        # Context managers make sure both files are closed (and the output
+        # flushed) even if a copy fails part-way through.
+        with h5py.File(self.args.infile, mode='r') as inFile:
+            with h5py.File(self.args.outfile, mode='r+') as outFile:
+                for name in inFile:
+                    if '/' + name not in outFile:
+                        continue
+
+                    targetGroup = outFile['/' + name]
+
+                    if 'Kinetics' in targetGroup:
+                        kinGroup = targetGroup['Kinetics']
+                    else:
+                        kinGroup = targetGroup.create_group('Kinetics')
+
+                    # The low-level copy does not overwrite, so drop any
+                    # previous result first.
+                    if 'IpdRatio' in kinGroup:
+                        del kinGroup['IpdRatio']
+
+                    h5py.h5o.copy(inFile.id, name, kinGroup.id, 'IpdRatio')
+
+    def copyToMergeFile(self):
+        """
+        Copy every top-level object of the input file into the merge file
+        under the same name.
+        """
+        with h5py.File(self.args.infile, mode='r') as inFile:
+            with h5py.File(self.args.mergefile, mode='r+') as mergeFile:
+                for name in inFile:
+                    h5py.h5o.copy(inFile.id, name, mergeFile.id, name)
+
+    def run(self):
+        """Dispatch on which output option was given; always returns 0."""
+        if self.args.outfile:
+            self.copyToCmpH5()
+        else:
+            self.copyToMergeFile()
+        return 0
+
+
+def main():
+    """Create the copy tool, start it, and return whatever start() returns."""
+    return CopyIpdSummaryDatasets().start()
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/bin/testShared.py b/bin/testShared.py
new file mode 100755
index 0000000..21fd7fe
--- /dev/null
+++ b/bin/testShared.py
@@ -0,0 +1,75 @@
+#################################################################################
+# Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#################################################################################
+
+from multiprocessing.process import Process, current_process
+from src.kineticsTools.sharedArray import SharedArray
+
+# Test script for making sure that shared memory backed numpy arrays work properly
+# FIXME -- migrate to test dir if possible
+
+
+class Sub(Process):
+    """
+    Child process that is handed a SharedArray by its parent and prints a
+    few of its elements, to show the data is visible across processes.
+    """
+
+    def __init__(self, sa):
+        Process.__init__(self)
+        self.sa = sa
+        # Array wrapper obtained in the parent, before the child is started.
+        self.arr = sa.getNumpyWrapper()
+
+    def run(self):
+        import time
+
+        # Each print takes one parenthesised argument, which produces the
+        # same output under the Python 2 print statement and the Python 3
+        # print function.
+        print("self.arr[10] = %f, Process = %s" % (self.arr[10], current_process()))
+
+        print(self.arr.shape)
+
+        # Index of the last element along the first axis.
+        n = self.arr.shape[0] - 1
+
+        print("self.arr[%d] = %f, Process = %s" % (n, self.arr[n], current_process()))
+        # NOTE(review): the reason for the 10 s pause is not stated in the
+        # original; presumably it keeps the child alive for inspection.
+        time.sleep(10)
+
+
+class Test:
+    """
+    Parent side of the shared-memory check: allocate a SharedArray, fill it,
+    then let a Sub child process read it back.
+    """
+
+    def __init__(self):
+        shared = SharedArray(dtype="f", shape=50000000)
+        view = shared.getNumpyWrapper()
+
+        # Every element is 99.0 except index 10, which the child prints.
+        view[:] = 99.0
+        view[10] = 100.0
+
+        self.sa = shared
+        self.arr = view
+
+    def start(self):
+        """Run one Sub child against the shared array and wait for it."""
+        child = Sub(self.sa)
+        child.start()
+        child.join()
+
+
+if __name__ == "__main__":
+    Test().start()
diff --git a/bin/writeSummaryToCmp.py b/bin/writeSummaryToCmp.py
new file mode 100755
index 0000000..ae655d0
--- /dev/null
+++ b/bin/writeSummaryToCmp.py
@@ -0,0 +1,160 @@
+#!/usr/bin/env python
+#################################################################################
+# Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#################################################################################
+
+
+import cProfile
+from pbcore.io import GffReader, Gff3Record
+import os
+import logging
+import sys
+
+from pbcore.util.ToolRunner import PBToolRunner
+
+__version__ = "1.0"
+
+
+class IpdRatioSummaryWriter(PBToolRunner):
+    """
+    Command-line tool that copies an alignment summary GFF to a new file,
+    adding to each 'region' record the number of 'modified_base' calls from
+    a modifications GFF that fall inside it, per strand.
+    """
+
+    def __init__(self):
+        desc = ['Summarizes kinetic modifications in the alignment_summary.gff file',
+                'Notes: For all command-line arguments, default values are listed in [].']
+        super(IpdRatioSummaryWriter, self).__init__('\n'.join(desc))
+
+        # Despite its name, --pickle carries the modifications GFF path (see
+        # its help text); the value is stored as self.args.pickle.
+        self.parser.add_argument('--pickle',
+                                 dest="pickle",
+                                 help='Name of input GFF file [%(default)s]')
+
+        self.parser.add_argument('--alignmentSummary',
+                                 dest="alignmentSummary",
+                                 help='Name alignment summary file [%(default)s]')
+
+        self.parser.add_argument('--outfile',
+                                 dest="outfile",
+                                 help='Name of modified alignment summary GFF file [%(default)s]')
+
+        self.parser.add_argument("--profile",
+                                 action="store_true",
+                                 dest="doProfiling",
+                                 default=False,
+                                 help="Enable Python-level profiling (using cProfile).")
+
+    def getVersion(self):
+        """Return the script's version string."""
+        return __version__
+
+    def validateArgs(self):
+        """Report a usage error unless both input GFF files exist."""
+        # The parser defines no "modifications" attribute; the path lives in
+        # self.args.pickle.  An omitted option is None, which is treated the
+        # same as a missing file instead of being passed to os.path.exists.
+        modifications = self.args.pickle
+        if not (modifications and os.path.exists(modifications)):
+            self.parser.error('input modifications gff file provided does not exist')
+
+        alignmentSummary = self.args.alignmentSummary
+        if not (alignmentSummary and os.path.exists(alignmentSummary)):
+            self.parser.error('input alignment summary gff file provided does not exist')
+
+    def run(self):
+        """
+        Configure logging, then run _mainLoop, under cProfile when --profile
+        was given (profile data goes to profile-main4.out).
+        """
+        self.options = self.args
+
+        # Log generously
+        logFormat = '%(asctime)s [%(levelname)s] %(message)s'
+        logging.basicConfig(level=logging.INFO, format=logFormat)
+        stdOutHandler = logging.StreamHandler(sys.stdout)
+        logging.Logger.root.addHandler(stdOutHandler)
+        logging.info("t1")
+
+        if self.args.doProfiling:
+            cProfile.runctx("self._mainLoop()",
+                            globals=globals(),
+                            locals=locals(),
+                            filename="profile-main4.out")
+
+        else:
+            return self._mainLoop()
+
+    def _mainLoop(self):
+        """
+        Stream the alignment summary to the output file: '#' lines pass
+        through unchanged, the tool's own headers are inserted before the
+        first record, and 'region' records gain modsfwd / modsrev counts.
+        """
+
+        # Read in the existing modifications.gff (path supplied via --pickle)
+        modReader = GffReader(self.args.pickle)
+
+        # Set up some additional headers to be injected
+        headers = [
+            ('source', 'kineticModificationCaller 1.3.1'),
+            ('source-commandline', " ".join(sys.argv)),
+            ('attribute-description', 'modsfwd - count of detected DNA modifications on forward strand'),
+            ('attribute-description', 'modsrev - count of detected DNA modifications on reverse strand')
+        ]
+
+        # Get modification calls
+        # NOTE(review): only position and strand are kept, so a hit is
+        # matched to a region by coordinates alone, without comparing the
+        # sequence they belong to -- confirm this is intended for references
+        # with more than one contig.
+        hits = [{"pos": x.start, "strand": x.strand} for x in modReader if x.type == 'modified_base']
+
+        self.seqMap = {}
+        inHeader = True
+
+        # Both files are closed on exit, including on error, so the output
+        # is always flushed.
+        with open(self.args.alignmentSummary) as summaryFile:
+            with open(self.args.outfile, "w") as summaryWriter:
+
+                # Loop through
+                for line in summaryFile:
+                    # Pass any metadata line straight through
+                    if line[0] == "#":
+
+                        # Parse headers
+                        splitFields = line.replace('#', '').split(' ')
+                        field = splitFields[0]
+                        value = " ".join(splitFields[1:])
+                        if field == 'sequence-header':
+                            # Remember the internal -> external sequence name mapping
+                            [internalTag, delim, externalTag] = value.strip().partition(' ')
+                            self.seqMap[internalTag] = externalTag
+                        summaryWriter.write(line.strip() + "\n")
+                        continue
+
+                    if inHeader:
+                        # We are at the end of the header -- write the tool-specific headers
+                        for field in headers:
+                            summaryWriter.write("##%s %s\n" % field)
+                        inHeader = False
+
+                    # Parse the line
+                    rec = Gff3Record.fromString(line)
+
+                    if rec.type == 'region':
+                        # Get the hits in this interval, add them to the gff record
+                        intervalHits = [h for h in hits if rec.start <= h['pos'] <= rec.end]
+                        strand0Hits = len([h for h in intervalHits if h['strand'] == '+'])
+                        strand1Hits = len([h for h in intervalHits if h['strand'] == '-'])
+
+                        rec.modsfwd = strand0Hits
+                        rec.modsrev = strand1Hits
+
+                        # NOTE(review): indentation was lost in the archived
+                        # copy of this file; the write is assumed to belong
+                        # to the 'region' branch -- confirm against the
+                        # upstream source.
+                        summaryWriter.write(str(rec) + "\n")
+
+if __name__ == "__main__":
+    # The tool class defined in this file is IpdRatioSummaryWriter; no
+    # ModificationSummary name exists here.
+    kt = IpdRatioSummaryWriter()
+    kt.start()
diff --git a/doc/Makefile b/doc/Makefile
new file mode 100644
index 0000000..afbfe0d
--- /dev/null
+++ b/doc/Makefile
@@ -0,0 +1,153 @@
+# Makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line.
+SPHINXOPTS =
+SPHINXBUILD = sphinx-build
+PAPER =
+BUILDDIR = _build
+
+# Internal variables.
+PAPEROPT_a4 = -D latex_paper_size=a4
+PAPEROPT_letter = -D latex_paper_size=letter
+ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
+# the i18n builder cannot share the environment and doctrees with the others
+I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
+
+# All documentation targets are commands rather than files, including the
+# texinfo and info targets defined further down.
+.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man texinfo info changes linkcheck doctest gettext
+
+help:
+ @echo "Please use \`make <target>' where <target> is one of"
+ @echo " html to make standalone HTML files"
+ @echo " dirhtml to make HTML files named index.html in directories"
+ @echo " singlehtml to make a single large HTML file"
+ @echo " pickle to make pickle files"
+ @echo " json to make JSON files"
+ @echo " htmlhelp to make HTML files and a HTML help project"
+ @echo " qthelp to make HTML files and a qthelp project"
+ @echo " devhelp to make HTML files and a Devhelp project"
+ @echo " epub to make an epub"
+ @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
+ @echo " latexpdf to make LaTeX files and run them through pdflatex"
+ @echo " text to make text files"
+ @echo " man to make manual pages"
+ @echo " texinfo to make Texinfo files"
+ @echo " info to make Texinfo files and run them through makeinfo"
+ @echo " gettext to make PO message catalogs"
+ @echo " changes to make an overview of all changed/added/deprecated items"
+ @echo " linkcheck to check all external links for integrity"
+ @echo " doctest to run all doctests embedded in the documentation (if enabled)"
+
+clean:
+ -rm -rf $(BUILDDIR)/*
+
+html:
+ $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
+ @echo
+ @echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
+
+dirhtml:
+ $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
+ @echo
+ @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
+
+singlehtml:
+ $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
+ @echo
+ @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
+
+pickle:
+ $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
+ @echo
+ @echo "Build finished; now you can process the pickle files."
+
+json:
+ $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
+ @echo
+ @echo "Build finished; now you can process the JSON files."
+
+htmlhelp:
+ $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
+ @echo
+ @echo "Build finished; now you can run HTML Help Workshop with the" \
+ ".hhp project file in $(BUILDDIR)/htmlhelp."
+
+qthelp:
+ $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
+ @echo
+ @echo "Build finished; now you can run "qcollectiongenerator" with the" \
+ ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
+ @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/kineticsTools.qhcp"
+ @echo "To view the help file:"
+ @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/kineticsTools.qhc"
+
+devhelp:
+ $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
+ @echo
+ @echo "Build finished."
+ @echo "To view the help file:"
+ @echo "# mkdir -p $$HOME/.local/share/devhelp/kineticsTools"
+ @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/kineticsTools"
+ @echo "# devhelp"
+
+epub:
+ $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
+ @echo
+ @echo "Build finished. The epub file is in $(BUILDDIR)/epub."
+
+latex:
+ $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+ @echo
+ @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
+ @echo "Run \`make' in that directory to run these through (pdf)latex" \
+ "(use \`make latexpdf' here to do that automatically)."
+
+latexpdf:
+ $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+ @echo "Running LaTeX files through pdflatex..."
+ $(MAKE) -C $(BUILDDIR)/latex all-pdf
+ @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
+
+text:
+ $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
+ @echo
+ @echo "Build finished. The text files are in $(BUILDDIR)/text."
+
+man:
+ $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
+ @echo
+ @echo "Build finished. The manual pages are in $(BUILDDIR)/man."
+
+texinfo:
+ $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
+ @echo
+ @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
+ @echo "Run \`make' in that directory to run these through makeinfo" \
+ "(use \`make info' here to do that automatically)."
+
+# Build the Texinfo sources, then run makeinfo through the generated
+# Makefile.  $(MAKE) is used for the recursive call (as in the latexpdf
+# target) so command-line flags and the jobserver reach the sub-make.
+info:
+	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
+	@echo "Running Texinfo files through makeinfo..."
+	$(MAKE) -C $(BUILDDIR)/texinfo info
+	@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
+
+gettext:
+ $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
+ @echo
+ @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
+
+changes:
+ $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
+ @echo
+ @echo "The overview file is in $(BUILDDIR)/changes."
+
+linkcheck:
+ $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
+ @echo
+ @echo "Link check complete; look for any errors in the above output " \
+ "or in $(BUILDDIR)/linkcheck/output.txt."
+
+doctest:
+ $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
+ @echo "Testing of doctests in the sources finished, look at the " \
+ "results in $(BUILDDIR)/doctest/output.txt."
diff --git a/doc/conf.py b/doc/conf.py
new file mode 100755
index 0000000..d017b6e
--- /dev/null
+++ b/doc/conf.py
@@ -0,0 +1,245 @@
+# -*- coding: utf-8 -*-
+#
+# kineticsTools documentation build configuration file, created by
+# sphinx-quickstart on Tue Mar 6 09:09:23 2012.
+#
+# This file is execfile()d with the current directory set to its containing dir.
+#
+# Note that not all possible configuration values are present in this
+# autogenerated file.
+#
+# All configuration values have a default; values that are commented out
+# serve to show the default.
+
+import sys
+import os
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+sys.path.insert(0, os.path.abspath('.'))
+sys.path.insert(0, os.path.abspath('../src'))
+sys.path.insert(0, os.path.abspath('../src/pbtools/kineticsTools'))
+
+# -- General configuration -----------------------------------------------------
+
+# If your documentation needs a minimal Sphinx version, state it here.
+#needs_sphinx = '1.0'
+
+# Add any Sphinx extension module names here, as strings. They can be extensions
+# coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
+extensions = ['sphinx.ext.autodoc', 'sphinx.ext.intersphinx', 'sphinx.ext.todo', 'sphinx.ext.coverage', 'sphinx.ext.viewcode']
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# The suffix of source filenames.
+source_suffix = '.rst'
+
+# The encoding of source files.
+#source_encoding = 'utf-8-sig'
+
+# The master toctree document.
+master_doc = 'index'
+
+# General information about the project.
+project = u'kineticsTools'
+copyright = u'2012, Patrick Marks, David Alexander, Onureena Banerjee'
+
+# The version info for the project you're documenting, acts as replacement for
+# |version| and |release|, also used in various other places throughout the
+# built documents.
+#
+# The short X.Y version.
+version = '1.4'
+# The full version, including alpha/beta/rc tags.
+release = '1.4'
+
+# The language for content autogenerated by Sphinx. Refer to documentation
+# for a list of supported languages.
+#language = None
+
+# There are two options for replacing |today|: either, you set today to some
+# non-false value, then it is used:
+#today = ''
+# Else, today_fmt is used as the format for a strftime call.
+#today_fmt = '%B %d, %Y'
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+exclude_patterns = ['_build']
+
+# The reST default role (used for this markup: `text`) to use for all documents.
+#default_role = None
+
+# If true, '()' will be appended to :func: etc. cross-reference text.
+#add_function_parentheses = True
+
+# If true, the current module name will be prepended to all description
+# unit titles (such as .. function::).
+#add_module_names = True
+
+# If true, sectionauthor and moduleauthor directives will be shown in the
+# output. They are ignored by default.
+#show_authors = False
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = 'sphinx'
+
+# A list of ignored prefixes for module index sorting.
+#modindex_common_prefix = []
+
+
+# -- Options for HTML output ---------------------------------------------------
+
+# The theme to use for HTML and HTML Help pages. See the documentation for
+# a list of builtin themes.
+html_theme = 'default'
+
+# Theme options are theme-specific and customize the look and feel of a theme
+# further. For a list of options available for each theme, see the
+# documentation.
+#html_theme_options = {}
+
+# Add any paths that contain custom themes here, relative to this directory.
+#html_theme_path = []
+
+# The name for this set of Sphinx documents. If None, it defaults to
+# "<project> v<release> documentation".
+#html_title = None
+
+# A shorter title for the navigation bar. Default is the same as html_title.
+#html_short_title = None
+
+# The name of an image file (relative to this directory) to place at the top
+# of the sidebar.
+#html_logo = None
+
+# The name of an image file (within the static path) to use as favicon of the
+# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
+# pixels large.
+#html_favicon = None
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['_static']
+
+# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
+# using the given strftime format.
+#html_last_updated_fmt = '%b %d, %Y'
+
+# If true, SmartyPants will be used to convert quotes and dashes to
+# typographically correct entities.
+#html_use_smartypants = True
+
+# Custom sidebar templates, maps document names to template names.
+#html_sidebars = {}
+
+# Additional templates that should be rendered to pages, maps page names to
+# template names.
+#html_additional_pages = {}
+
+# If false, no module index is generated.
+#html_domain_indices = True
+
+# If false, no index is generated.
+#html_use_index = True
+
+# If true, the index is split into individual pages for each letter.
+#html_split_index = False
+
+# If true, links to the reST sources are added to the pages.
+#html_show_sourcelink = True
+
+# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
+#html_show_sphinx = True
+
+# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
+#html_show_copyright = True
+
+# If true, an OpenSearch description file will be output, and all pages will
+# contain a <link> tag referring to it. The value of this option must be the
+# base URL from which the finished HTML is served.
+#html_use_opensearch = ''
+
+# This is the file name suffix for HTML files (e.g. ".xhtml").
+#html_file_suffix = None
+
+# Output file base name for HTML help builder.
+htmlhelp_basename = 'kineticsToolsdoc'
+
+
+# -- Options for LaTeX output --------------------------------------------------
+
+latex_elements = {
+ # The paper size ('letterpaper' or 'a4paper').
+ #'papersize': 'letterpaper',
+
+ # The font size ('10pt', '11pt' or '12pt').
+ #'pointsize': '10pt',
+
+ # Additional stuff for the LaTeX preamble.
+ #'preamble': '',
+}
+
+# Grouping the document tree into LaTeX files. List of tuples
+# (source start file, target name, title, author, documentclass [howto/manual]).
+latex_documents = [
+ ('index', 'kineticsTools.tex', u'kineticsTools Documentation',
+ u'Patrick Marks, David Alexander, Onureena Banerjee', 'manual'),
+]
+
+# The name of an image file (relative to this directory) to place at the top of
+# the title page.
+#latex_logo = None
+
+# For "manual" documents, if this is true, then toplevel headings are parts,
+# not chapters.
+#latex_use_parts = False
+
+# If true, show page references after internal links.
+#latex_show_pagerefs = False
+
+# If true, show URL addresses after external links.
+#latex_show_urls = False
+
+# Documents to append as an appendix to all manuals.
+#latex_appendices = []
+
+# If false, no module index is generated.
+#latex_domain_indices = True
+
+
+# -- Options for manual page output --------------------------------------------
+
+# One entry per manual page. List of tuples
+# (source start file, name, description, authors, manual section).
+man_pages = [
+ ('index', 'kineticstools', u'kineticsTools Documentation',
+ [u'Patrick Marks, David Alexander, Onureena Banerjee'], 1)
+]
+
+# If true, show URL addresses after external links.
+#man_show_urls = False
+
+
+# -- Options for Texinfo output ------------------------------------------------
+
+# Grouping the document tree into Texinfo files. List of tuples
+# (source start file, target name, title, author,
+# dir menu entry, description, category)
+texinfo_documents = [
+ ('index', 'kineticsTools', u'kineticsTools Documentation',
+ u'Patrick Marks, David Alexander, Onureena Banerjee', 'kineticsTools', 'One line description of project.',
+ 'Miscellaneous'),
+]
+
+# Documents to append as an appendix to all manuals.
+#texinfo_appendices = []
+
+# If false, no module index is generated.
+#texinfo_domain_indices = True
+
+# How to display URL addresses: 'footnote', 'no', or 'inline'.
+#texinfo_show_urls = 'footnote'
diff --git a/doc/index.rst b/doc/index.rst
new file mode 100644
index 0000000..bec0282
--- /dev/null
+++ b/doc/index.rst
@@ -0,0 +1,10 @@
+kineticsTools
+=============
+
+Contents:
+
+.. toctree::
+ :maxdepth: 2
+
+ manual
+ regional_detection_manual
diff --git a/doc/make.bat b/doc/make.bat
new file mode 100644
index 0000000..b5d4eca
--- /dev/null
+++ b/doc/make.bat
@@ -0,0 +1,190 @@
+ at ECHO OFF
+
+REM Command file for Sphinx documentation
+
+if "%SPHINXBUILD%" == "" (
+ set SPHINXBUILD=sphinx-build
+)
+set BUILDDIR=_build
+set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% .
+set I18NSPHINXOPTS=%SPHINXOPTS% .
+if NOT "%PAPER%" == "" (
+ set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS%
+ set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS%
+)
+
+if "%1" == "" goto help
+
+if "%1" == "help" (
+ :help
+ echo.Please use `make ^<target^>` where ^<target^> is one of
+ echo. html to make standalone HTML files
+ echo. dirhtml to make HTML files named index.html in directories
+ echo. singlehtml to make a single large HTML file
+ echo. pickle to make pickle files
+ echo. json to make JSON files
+ echo. htmlhelp to make HTML files and a HTML help project
+ echo. qthelp to make HTML files and a qthelp project
+ echo. devhelp to make HTML files and a Devhelp project
+ echo. epub to make an epub
+ echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter
+ echo. text to make text files
+ echo. man to make manual pages
+ echo. texinfo to make Texinfo files
+ echo. gettext to make PO message catalogs
+ echo. changes to make an overview over all changed/added/deprecated items
+ echo. linkcheck to check all external links for integrity
+ echo. doctest to run all doctests embedded in the documentation if enabled
+ goto end
+)
+
+if "%1" == "clean" (
+ for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i
+ del /q /s %BUILDDIR%\*
+ goto end
+)
+
+if "%1" == "html" (
+ %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished. The HTML pages are in %BUILDDIR%/html.
+ goto end
+)
+
+if "%1" == "dirhtml" (
+ %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml.
+ goto end
+)
+
+if "%1" == "singlehtml" (
+ %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml.
+ goto end
+)
+
+if "%1" == "pickle" (
+ %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished; now you can process the pickle files.
+ goto end
+)
+
+if "%1" == "json" (
+ %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished; now you can process the JSON files.
+ goto end
+)
+
+if "%1" == "htmlhelp" (
+ %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished; now you can run HTML Help Workshop with the ^
+.hhp project file in %BUILDDIR%/htmlhelp.
+ goto end
+)
+
+if "%1" == "qthelp" (
+ %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished; now you can run "qcollectiongenerator" with the ^
+.qhcp project file in %BUILDDIR%/qthelp, like this:
+ echo.^> qcollectiongenerator %BUILDDIR%\qthelp\kineticsTools.qhcp
+ echo.To view the help file:
+ echo.^> assistant -collectionFile %BUILDDIR%\qthelp\kineticsTools.ghc
+ goto end
+)
+
+if "%1" == "devhelp" (
+ %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished.
+ goto end
+)
+
+if "%1" == "epub" (
+ %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished. The epub file is in %BUILDDIR%/epub.
+ goto end
+)
+
+if "%1" == "latex" (
+ %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished; the LaTeX files are in %BUILDDIR%/latex.
+ goto end
+)
+
+if "%1" == "text" (
+ %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished. The text files are in %BUILDDIR%/text.
+ goto end
+)
+
+if "%1" == "man" (
+ %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished. The manual pages are in %BUILDDIR%/man.
+ goto end
+)
+
+if "%1" == "texinfo" (
+ %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo.
+ goto end
+)
+
+if "%1" == "gettext" (
+ %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished. The message catalogs are in %BUILDDIR%/locale.
+ goto end
+)
+
+if "%1" == "changes" (
+ %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.The overview file is in %BUILDDIR%/changes.
+ goto end
+)
+
+if "%1" == "linkcheck" (
+ %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Link check complete; look for any errors in the above output ^
+or in %BUILDDIR%/linkcheck/output.txt.
+ goto end
+)
+
+if "%1" == "doctest" (
+ %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Testing of doctests in the sources finished, look at the ^
+results in %BUILDDIR%/doctest/output.txt.
+ goto end
+)
+
+:end
diff --git a/doc/manual.rst b/doc/manual.rst
new file mode 100644
index 0000000..c9b2c21
--- /dev/null
+++ b/doc/manual.rst
@@ -0,0 +1,180 @@
+
+
+
+========
+Overview
+========
+
+kineticsTools loads IPDs observed at each position in the genome, compares those IPDs to the value expected for unmodified DNA, and outputs the result of this statistical test.
+The expected IPD value for unmodified DNA can come from either an *in-silico control* or an *amplified control*. The in silico control is trained by PacBio and shipped with the package. It predicts the IPD using the local sequence context around the current position.
+An amplified control dataset is generated by sequencing unmodified DNA with the same sequence as the test sample. An amplified control sample is usually generated by whole-genome amplification of the original sample.
+
+
+Dependencies
+------------
+kineticsTools depends on:
+ * pbcore (http://github.com/PacificBiosciences/pbcore)
+ * h5py
+ * numpy
+ * scipy
+
+Modification Detection
+----------------------
+The basic mode of kineticsTools does an independent comparison of IPDs at each position on the genome, for each strand, and emits various statistics to CSV and GFF (after applying a significance filter).
+
+Modifications Identification
+----------------------------
+kineticsTools also has a *Modification Identification* mode that can decode multi-site IPD 'fingerprints' into a reduced set of calls of specific modifications. This feature has the following benefits:
+ * Different modifications occurring on the same base can be distinguished (for example m5C and m4C)
+ * The signal from one modification is combined into one statistic, improving sensitivity, removing extra peaks, and correctly centering the call
+
+
+=========
+Algorithm
+=========
+
+Synthetic Control
+-----------------
+Studies of the relationship between IPD and sequence context reveal that most of the variation in mean IPD across a genome can be predicted from a 12-base sequence context surrounding the active site of the DNA polymerase. The bounds of the relevant context window correspond to the window of DNA in contact with the polymerase, as seen in DNA/polymerase crystal structures. To simplify the process of finding DNA modifications with PacBio data, the tool includes a pre-trained lookup table [...]
+
+
+Filtering and Trimming
+----------------------
+
+kineticsTools uses the Mapping QV generated by BLASR and stored in the cmp.h5 file to ignore reads that aren't confidently mapped. The default minimum Mapping QV required is 10, implying that BLASR has :math:`90\%` confidence that the read is correctly mapped. Because of the range of read lengths inherent in PacBio data, this threshold can be changed using the --mapQvThreshold command line argument, or via the SMRTPortal configuration dialog for Modification Detection.
+
+There are a few features of PacBio data that require special attention in order to achieve good modification detection performance.
+kineticsTools inspects the alignment between the observed bases and the reference sequence -- in order for an IPD measurement to be included in the analysis, the PacBio read sequence must match the reference sequence for :math:`k` bases around the cognate base. In the current module :math:`k=1`.
+The IPD distribution at some locus can be thought of as a mixture between the 'normal' incorporation process IPD, which is sensitive to the local sequence context and DNA modifications, and a contaminating 'pause' process IPD, which has a much longer duration (mean >10x longer than normal) but happens rarely (~1% of IPDs).
+Note: Our current understanding is that pauses do not carry useful information about the methylation state of the DNA, however a more careful analysis may be warranted. Also note that modifications that drastically increase the
+Roughly 1% of observed IPDs are generated by pause events. Capping observed IPDs at the global 99th percentile is motivated by theory from robust hypothesis testing. Some sequence contexts may have naturally longer IPDs; to avoid capping too much data at those contexts, the cap threshold is adjusted per context as follows:
+capThreshold = max(global99, 5*modelPrediction, percentile(ipdObservations, 75))
+
+
+Statistical Testing
+-------------------
+We test the hypothesis that IPDs observed at a particular locus in the sample have a longer mean than IPDs observed at the same locus in unmodified DNA. If we have generated a Whole Genome Amplified dataset, which removes DNA modifications, we use a case-control, two-sample t-test. This tool also provides a pre-calibrated 'synthetic control' model which predicts the unmodified IPD, given a 12 base sequence context. In the synthetic control case we use a one-sample t-test, with an adju [...]
+
+======
+Inputs
+======
+
+aligned_reads.cmp.h5
+--------------------
+
+A standard cmp.h5 file containing alignments and IPD information supplies the kinetic data used to perform modification detection. The standard cmp.h5 file of a SMRTportal job is data/aligned_reads.cmp.h5.
+
+Reference Sequence
+------------------
+
+The tool requires the reference sequence used to perform alignments. Currently this must be supplied via the path to a SMRTportal reference repository entry.
+
+=======
+Outputs
+=======
+
+The modification detection tool provides results in a variety of formats suitable for in-depth statistical analysis,
+quick reference, and consumption by visualization tools such as PacBio SMRTView.
+Results are generally indexed by reference position and reference strand. In all cases the strand value refers to the strand carrying the modification in the DNA sample. Remember that the kinetic effect of the modification is observed in read sequences aligning to the opposite strand. So reads aligning to the positive strand carry information about modification on the negative strand and vice versa, but in this toolkit we always report the strand containing the putative modification.
+
+modifications.csv
+-----------------
+The modifications.csv file contains one row for each (reference position, strand) pair that appeared in the dataset with coverage at least x.
+x defaults to 3, but is configurable with the '--minCoverage' flag to ipdSummary.py. The reference position index is 1-based for compatibility with the gff file and the R environment.
+
+Output columns
+--------------
+
+**in-silico control mode**
+
+================ ===========
+Column Description
+================ ===========
+refId reference sequence ID of this observation
+tpl 1-based template position
+strand native sample strand where kinetics were generated. '0' is the strand of the original FASTA, '1' is opposite strand from FASTA
+base the cognate base at this position in the reference
+score Phred-transformed pvalue that a kinetic deviation exists at this position
+tMean capped mean of normalized IPDs observed at this position
+tErr capped standard error of normalized IPDs observed at this position (standard deviation / sqrt(coverage))
+modelPrediction normalized mean IPD predicted by the synthetic control model for this sequence context
+ipdRatio tMean / modelPrediction
+coverage count of valid IPDs at this position (see Filtering section for details)
+frac estimate of the fraction of molecules that carry the modification
+fracLow 2.5% confidence bound of frac estimate
+fracUpp 97.5% confidence bound of frac estimate
+================ ===========
+
+**case-control mode**
+
+================ ===========
+Column Description
+================ ===========
+refId reference sequence ID of this observation
+tpl 1-based template position
+strand native sample strand where kinetics were generated. '0' is the strand of the original FASTA, '1' is opposite strand from FASTA
+base the cognate base at this position in the reference
+score Phred-transformed pvalue that a kinetic deviation exists at this position
+caseMean mean of normalized case IPDs observed at this position
+controlMean mean of normalized control IPDs observed at this position
+caseStd standard deviation of case IPDs observed at this position
+controlStd standard deviation of control IPDs observed at this position
+ipdRatio caseMean / controlMean
+testStatistic t-test statistic
+coverage mean of case and control coverage
+controlCoverage count of valid control IPDs at this position (see Filtering section for details)
+caseCoverage count of valid case IPDs at this position (see Filtering section for details)
+================ ===========
+
+
+
+modifications.gff
+-----------------
+The modifications.gff is compliant with the GFF Version 3 specification (http://www.sequenceontology.org/gff3.shtml). Each template position / strand pair whose p-value exceeds the pvalue threshold appears as a row. The template position is 1-based, per the GFF spec. The strand column refers to the strand carrying the detected modification, which is the opposite strand from those used to detect the modification. The GFF confidence column is a Phred-transformed pvalue of detection.
+
+**Note on genome browser compatibility**
+
+The modifications.gff file will not work directly with most genome browsers. You will likely need to make a copy of the GFF file and convert the _seqid_ columns from the generic 'ref0000x' names generated by PacBio, to the FASTA headers present in the original reference FASTA file. The mapping table is written in the header of the modifications.gff file in ``#sequence-header`` tags. This issue will be resolved in the 1.4 release of kineticsTools.
+
+The auxiliary data column of the GFF file contains other statistics which may be useful for downstream analysis or filtering, in particular the coverage level of the reads used to make the call and the +/- 20bp sequence context surrounding the site.
+
+
+================ ===========
+Column Description
+================ ===========
+seqid Fasta contig name
+source Name of tool -- 'kinModCall'
+type Modification type -- in identification mode this will be m6A, m4C, or m5C for identified bases, or the generic tag 'modified_base' if a kinetic event was detected that does not match a known modification signature
+start Modification position on contig
+end Modification position on contig
+score Phred transformed p-value of detection - this is the single-site detection p-value
+strand Sample strand containing modification
+phase Not applicable
+attributes Extra fields relevant to base mods. IPDRatio is traditional IPDRatio, context is the reference sequence -20bp to +20bp around the modification, and coverage level is the number of IPD observations used after Mapping QV filtering and accuracy filtering. If the row results from an identified modification we also include an identificationQv tag with the value from the modification identification procedure. identificationQv is the phred-transformed probability of an incorre [...]
+================ ===========
+
+motifs.gff
+----------
+If the Motif Finder tool is run, it will generate motifs.gff, which is a reprocessed version of modifications.gff with the following changes. If a detected modification occurs on a motif detected by the motif finder, the modification is annotated with motif data. An attribute 'motif' is added containing the motif string, and an attribute 'id' is added containing the motif id, which is the motif string for unpaired motifs or 'motifString1/motifString2' for paired motifs. If a motif instance [...]
+
+
+motif_summary.csv
+-----------------
+
+If the Motif Finder tool is run, motif_summary.csv is generated, summarizing the modified motifs discovered by the tool. The CSV contains one row per detected motif, with the following columns
+
+================== ===========
+Column Description
+================== ===========
+motifString Detected motif sequence
+centerPos Position in motif of modification (0-based)
+fraction Fraction of instances of this motif with modification QV above the QV threshold
+nDetected Number of instances of this motif with modification QV above the QV threshold
+nGenome Number of instances of this motif in reference sequence
+groupTag A string identifying the motif grouping. For paired motifs this is "<motifString1>/<motifString2>"; for unpaired motifs this equals motifString
+partnerMotifString motifString of paired motif (motif with reverse-complementary motifString)
+meanScore Mean Modification Qv of detected instances
+meanIpdRatio Mean IPD ratio of detected instances
+meanCoverage Mean coverage of detected instances
+objectiveScore Objective score of this motif in the motif finder algorithm
+================== ===========
diff --git a/doc/regional_detection_manual.rst b/doc/regional_detection_manual.rst
new file mode 100644
index 0000000..c97f74e
--- /dev/null
+++ b/doc/regional_detection_manual.rst
@@ -0,0 +1,131 @@
+
+
+======================
+Regional m5C Detection
+======================
+
+This document describes a simple workflow for obtaining basic estimates of hypo-methylated regions in eukaryotic data. This application is based on and borrows heavily from the many original ideas of [1].
+
+Following [1], we define a hypo-methylated region here to be a genomic region of any length containing at least 50 CG sites that are all (or mostly) judged by a bisulfite sequencing caller to be less than 50% methylated.
+
+
+Requirements
+------------
+
+To estimate hypo-methylated regions as described here, the following two scripts are required:
+
+- MaximumScoringSubsequences.py
+- runMaxScoringSubsequences.py
+
+In addition, the following two m5C classifiers are available to choose from:
+
+- p4_c2_medaka_2_binary_classifier.csv
+- p4_c2_arabidopsis_2_2_binary_classifier.csv
+
+
+The workflow is currently two steps. The first step requires running ipdSummary.py, the basic modification detection tool, from the command line with special arguments. The second step requires running one of the additional scripts provided.
+
+The scripts listed above have no additional dependencies beyond those required for kineticsTools.
+
+An additional requirement relates to the chemistry used to collect data to which this workflow can be applied. Since kinetics generally vary from chemistry to chemistry, models must be applied to data collected using the same chemistry as the training data. Both models provided were trained using P4-C2 data. **These P4-C2 models are not expected to work on P5-C3 chemistry data.**
+
+
+
+Step 1. Application of LDA classifier
+-------------------------------------
+
+
+The first step involves running ipdSummary.py from the command line with the following arguments::
+
+ ipdSummary.py --useLDA
+ --refContigIndex <reference index>
+ --m5Cclassifier <csv file containing m5C binary classifier weights>
+ --m5Cgff <m5C scores gff>
+ --reference <FASTA file>
+ <cmp.h5 file>
+
+The current scheme is best suited to be run on one contig of the reference at a time: <reference index> specifies a contig number indexed from 1. This is due to a limitation on the regional detection script, runMaxScoringSubsequences.py, which assumes that the input GFF file contains scores corresponding to only one reference.
+
+The score is computed using weights from an LDA classifier. The classifier should have two columns stored in a csv format. Each column should contain 127 values: 126 weights and one offset. The first column contains weights for data mapped to the forward strand, and the second column contains weights for the reverse strand. There are two classifiers provided.
+
+The file <m5C scores gff> specifies the location of the output file, which will assign one score to each CG site in the specified contig of the reference.
+
+
+
+Training features
+~~~~~~~~~~~~~~~~~
+
+The authors of [1] suggest training a classifier to help assign rough scores to each individual CG site. Following their approach, we select training features that can be derived using the standard modifications.csv output of ipdSummary.py:
+
+The modifications.csv file contains the following statistics for each site and each strand: tMean (mean IPD), tErr (standard error of IPDs), cov (coverage), modelPrediction (predicted mean IPD for this sequence context at that site). From these we can compute:
+
+1. standard deviation
+2. difference between tMean and modelPrediction
+3. alternative error estimate obtained by adding standard deviation and model error in quadrature
+4. t-statistic scaled to remove dependence on coverage
+
+The predictors we chose are tMean, modelPrediction, standard deviation, scaled t-statistic, difference between tMean and modelPrediction, alternative error estimate values for every site in a window [-10, +10] around the site of interest. Prior to training, a log-transformation is applied to all of these, except for the scaled t-statistics.
+
+Following [1], we assume that there is generally some concordance in the methylation status of CG sites on the two strands, and that kinetic information for the two strands can be combined. Our method is to apply a different classifier to each strand, and then add the corresponding classification scores of the two strands for each CG site.
+
+
+
+Training labels and data
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+The training labels were obtained from bisulfite sequencing data. A CG site was labeled un-methylated if bisulfite sequencing estimated that less than 0.5 of the molecules contained m5C.
+
+The authors of [1] kindly provided tables summarizing their bisulfite sequencing data for the Medaka genome, as well as RSII sequencing data that was used for training the model stored in the file p4_c2_medaka_2_binary_classifier.csv. In addition, we provide an alternative model for which the data was collected internally: p4_c2_arabidopsis_2_2_binary_classifier.csv. The bisulfite sequencing results for this data were provided by Chongyuan Luo in the Ecker Lab at the Salk Institute fo [...]
+
+
+
+Output containing m5C scores
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The GFF file < m5C scores gff > will have one row for every CG site in the specified contig of the reference.
+
+1. The first column will list the reference index, and the second and third columns will be 'kinModCall' and 'CG', respectively.
+2. The fourth and fifth columns should be identical, listing the template position of the CG site.
+3. The sixth column will contain a score that is obtained by summing the LDA scores for forward and reverse strands. A negative score is expected to loosely correspond to un-methylated sites.
+4. The seventh column should only contain '+', for the forward strand.
+5. The ninth column contains some extra information: the coverage and IPD ratio on the forward strand at that CG site.
+
+
+
+
+Step 2. Estimation of hypo-methylated regions
+----------------------------------------------
+
+The second step involves taking the output of step 1 as the input to the additional scripts provided::
+
+ runMaxScoringSubsequences.py --infile <m5C scores gff>
+ --outfile <m5C regions output gff>
+
+We apply the general method of [2] to the individual CG site scores obtained in Step 1 to estimate regions of hypo-methylation.
+
+The authors of [1] have developed a new method of boundary estimation that is specialized to this application and may yield superior results. Their implementation is available here: https://github.com/hacone/AgIn
+
+
+
+Output containing estimates of hypo-methylated regions
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+
+The GFF file <m5C regions output gff> will have one row for each hypo-methylated region in the specified contig of the reference.
+
+Following [1], we assume that hypo- and hyper-methylated regions alternate.
+
+1. The first column will list the reference index, and the second and third columns will be 'region' and 'hypomethylated', respectively.
+2. The fourth and fifth columns contain start and stop positions of the hypo-methylated region.
+3. The sixth column contains the negative sum of scores of CG sites in that region.
+4. The seventh column should only contain '+', for the forward strand.
+5. The ninth column contains some extra information: the number of CG sites in that region, as well as the average coverage and IPD ratio of CG sites in that region on the forward strand.
+
+Once again, we follow [1] and report only regions containing at least 50 CG sites.
+
+
+References
+----------
+
+1. Suzuki, Yuta, Wei Qu, Tatsuya Tsukahara, Stephen W. Turner, Jonas Korlach, Hideaki Yurino, Jun Yoshimura, Hiroyuki Takeda, and Shinichi Morishita, Completing CpG methylation statuses in a vertebrate genome by integrating SMRT sequencing kinetic data, to appear.
+2. Ruzzo, Walter L. and Martin Tompa, A Linear Time Algorithm for Finding All Maximal Scoring Subsequences, 7th International Conference on Intelligent Systems for Molecular Biology, Heidelberg, Germany, August 1999.
diff --git a/kineticsTools/BasicLdaEnricher.py b/kineticsTools/BasicLdaEnricher.py
new file mode 100755
index 0000000..55e8192
--- /dev/null
+++ b/kineticsTools/BasicLdaEnricher.py
@@ -0,0 +1,68 @@
+# Basic LDA Enricher class
+
+from math import sqrt
+import math
+import scipy.stats as s
+import array as a
+
+from scipy.optimize import fminbound
+from scipy.special import gammaln as gamln
+from numpy import log, pi, log10, e, log1p, exp
+import numpy as np
+
+from MultiSiteCommon import MultiSiteCommon
+from MixtureEstimationMethods import MixtureEstimationMethods
+
+
+class BasicLdaEnricher(MultiSiteCommon):
+
+ def __init__(self, gbmModel, sequence, rawKinetics, identifyFlag, modsToCall=['H', 'J', 'K']):
+
+ MultiSiteCommon.__init__(self, gbmModel, sequence, rawKinetics)
+
+ # FIXME: For debugging LDA, load in parameters for forward and reverse strands:
+
+ self.fwd_model = np.genfromtxt("/home/UNIXHOME/obanerjee/nat_fwd_model_expanded.csv", delimiter=',')
+ self.rev_model = np.genfromtxt("/home/UNIXHOME/obanerjee/nat_rev_model_expanded.csv", delimiter=',')
+
+ if identifyFlag:
+ if 'K' in modsToCall:
+ self.fwd_model = np.genfromtxt("/home/UNIXHOME/obanerjee/tet_fwd_model_expanded.csv", delimiter=',')
+ self.rev_model = np.genfromtxt("/home/UNIXHOME/obanerjee/tet_rev_model_expanded.csv", delimiter=',')
+
+ # write a method to take perSiteResults dictionary in and add a column Ca5C
+ def useLDAmodel(self, kinetics, pos, model, up, down):
+ """ Test out LDA model """
+
+ res = np.zeros((up + down + 1, 5))
+ ind = 0
+
+ # range from -down to +up
+ for offset in range(-down, (up + 1)):
+ a = pos + offset
+ # res[ind,] = [kinetics[a]["tMean"], kinetics[a]["modelPrediction"], kinetics[a]["tErr"], kinetics[a]["coverage"]]
+ res[ind, ] = [kinetics[a]["tMean"], kinetics[a]["modelPrediction"], kinetics[a]["tErr"], kinetics[a]["coverage"], np.exp(kinetics[a]["tStatistic"]) - 0.01]
+ ind += 1
+
+ apply = np.hstack(np.log(res + 0.01).transpose())
+ tmp = sum(np.multiply(apply, model[1:])) + model[0]
+ return tmp
+
+ def callLDAstrand(self, kinetics, strand, model, up, down):
+ tmp = [d for d in kinetics if d["strand"] == strand]
+ tmp.sort(key=lambda x: x["tpl"])
+
+ L = len(tmp)
+ for pos in range(down, (L - up)):
+ if tmp[pos]["base"] == 'C':
+ tmp[pos]["Ca5C"] = self.useLDAmodel(tmp, pos, model, up, down)
+
+ return tmp
+
+ def callEnricherFunction(self, kinetics, up=10, down=10):
+
+ fwd = self.callLDAstrand(kinetics, 0, self.fwd_model, up, down)
+ rev = self.callLDAstrand(kinetics, 1, self.rev_model, up, down)
+ res = fwd + rev
+ res.sort(key=lambda x: x["tpl"])
+ return res
diff --git a/kineticsTools/KineticWorker.py b/kineticsTools/KineticWorker.py
new file mode 100755
index 0000000..824f911
--- /dev/null
+++ b/kineticsTools/KineticWorker.py
@@ -0,0 +1,838 @@
+#################################################################################
+# Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#################################################################################
+
+from math import sqrt
+import math
+from scipy.special import erfc
+import logging
+
+import scipy.stats as s
+import numpy as np
+import scipy.stats.mstats as mstats
+import sys
+
+from MixtureEstimationMethods import MixtureEstimationMethods
+from MultiSiteCommon import MultiSiteCommon, canonicalBaseMap, modNames, ModificationPeakMask, FRAC, FRAClow, FRACup, log10e
+
+from MultiSiteDetection import *
+
+from MedakaLdaEnricher import MedakaLdaEnricher
+from BasicLdaEnricher import BasicLdaEnricher
+from PositiveControlEnricher import PositiveControlEnricher
+
+from kineticsTools.ModificationDecode import ModificationDecode, ModificationPeakMask
+
+from WorkerProcess import WorkerProcess, WorkerThread
+import pdb
+import traceback
+
+# Raw ipd record: field list used as a numpy structured dtype, one row per
+# observed IPD -- 'tpl' (uint32), 'strand' (int64) and 'ipd' (float32).
+ipdRec = [('tpl', '<u4'), ('strand', '<i8'), ('ipd', '<f4')]
+
+class KineticWorker(object):
+
+ """
+ Manages the summarization of pulse features over a single reference
+ """
+
+ def __init__(self, ipdModel):
+ """Keep a reference to the IPD model; debug mode starts switched off."""
+ self.ipdModel = ipdModel
+ self.debug = False
+
+ def _prepForReferenceWindow(self, referenceWindow):
+ """ Set up member variable to call modifications on a window.
+
+ Reads .start, .end, .refId and .refName from referenceWindow and sets
+ self.refId, self.refName, self.meanIpdFunc, self.manyManyIpdFunc,
+ self.cognateBaseFunc, self.pad and self.sequence.
+ """
+ start = referenceWindow.start
+ end = referenceWindow.end
+ # FIXME some inconsistency in how reference info is retrieved -
+ # DataSet API uses Name, ipdModel.py uses ID
+ self.refId = referenceWindow.refId
+ self.refName = referenceWindow.refName
+ # NOTE(review): refInfoTable is not used again in this method.  self.caseCmpH5
+ # is not assigned in KineticWorker.__init__; presumably it comes from the
+ # WorkerProcess/WorkerThread base class -- confirm.
+ refInfoTable = self.caseCmpH5.referenceInfo(self.refName)
+
+ # Each chunk is from a single reference -- fire up meanIpd func on the current reference
+ self.meanIpdFunc = self.ipdModel.predictIpdFunc(self.refId)
+ self.manyManyIpdFunc = self.ipdModel.predictManyIpdFunc(self.refId)
+
+ # Get the cognate base at a given position
+ self.cognateBaseFunc = self.ipdModel.cognateBaseFunc(self.refId)
+
+ # Padding needed for multi-site models
+ self.pad = self.ipdModel.gbmModel.pre + self.ipdModel.gbmModel.post + 1
+
+ # Sequence we work over
+ self.sequence = self.ipdModel.getReferenceWindow(self.refId, 0, start, end)
+
+ def onChunk(self, referenceWindow):
+
+ # Setup the object for a new window.
+ self._prepForReferenceWindow(referenceWindow)
+
+ # start and end are the windows of the reference that we are responsible for reporting data from.
+ # We may elect to pull data from a wider window for use with positive control
+ start = referenceWindow.start
+ end = referenceWindow.end
+
+ # Trim end coordinate to length of current template
+ end = min(end, self.ipdModel.refLength(self.refId))
+
+ if self.options.identify:
+ # If we are attempting to identify modifications, get the raw data for a slightly expanded window
+ # then do the decoding, then weave the modification results back into the main results
+
+ padStart = start - self.pad
+ padEnd = end + self.pad
+ perSiteResults = self._summarizeReferenceRegion((padStart, padEnd), self.options.methylFraction, self.options.identify)
+
+ if self.options.useLDA:
+
+ # FIXME: add on a column "Ca5C" containing LDA score for each C-residue site
+ # Below is an example of how to use an alternative, the BasicLdaEnricher, which does not use the positive control model
+ # PositiveControlEnricher currently uses a logistic regression model trained using SMRTportal job 65203 (native E. coli)
+
+ lda = MedakaLdaEnricher( self.ipdModel.gbmModel, self.sequence, perSiteResults, self.options.m5Cclassifier )
+ # lda = BasicLdaEnricher( self.ipdModel.gbmModel, self.sequence, perSiteResults, self.options.identify, self.options.modsToCall )
+ # lda = PositiveControlEnricher(self.ipdModel.gbmModel, self.sequence, perSiteResults)
+ perSiteResults = lda.callEnricherFunction(perSiteResults)
+
+ try:
+ # Handle different modes of 'extra analysis' here -- this one is for multi-site m5C detection
+ # mods = self._multiSiteDetection(perSiteResults, (start, end))
+ mods = self._decodePositiveControl(perSiteResults, (start, end))
+ except:
+ type, value, tb = sys.exc_info()
+ traceback.print_exc()
+ pdb.post_mortem(tb)
+
+ finalCalls = []
+
+ # Weave together results
+ for strand in [0, 1]:
+ strandSign = 1 if strand == 0 else -1
+
+ siteDict = dict((x['tpl'], x) for x in perSiteResults if start <= x['tpl'] < end and x['strand'] == strand)
+ modDict = dict((x['tpl'], x) for x in mods if start <= x['tpl'] < end and x['strand'] == strand)
+
+ # Go through the modifications - add tags for identified mods to per-site stats
+ # add a 'offTarget' tag to the off target peaks.
+ for (pos, mod) in modDict.items():
+
+ # Only convert to positive control call if we actually have enough
+ # coverage on the cognate base!
+ if siteDict.has_key(mod['tpl']):
+
+ # Copy mod identification data
+ siteDict[mod['tpl']]['modificationScore'] = mod['QMod']
+ siteDict[mod['tpl']]['modification'] = mod['modification']
+
+ if self.options.methylFraction and mod.has_key(FRAC):
+ siteDict[mod['tpl']][FRAC] = mod[FRAC]
+ siteDict[mod['tpl']][FRAClow] = mod[FRAClow]
+ siteDict[mod['tpl']][FRACup] = mod[FRACup]
+
+ # Copy any extra properties that were added
+ newKeys = set(mod.keys()) - set(siteDict[mod['tpl']].keys())
+ for nk in newKeys:
+ siteDict[mod['tpl']][nk] = mod[nk]
+
+ if mod.has_key('Mask'):
+ # The decoder should supply the off-target peak mask
+ mask = mod['Mask']
+ mask.append(0) # make sure we always mask the cognate position
+ else:
+ # If the decoder doesn't supply a mask - use a hard-coded version
+ # FIXME - this branch is deprecated
+ mask = ModificationPeakMask[mod['modification']]
+
+ # Mask out neighbor peaks that may have been caused by this mod
+ for offset in mask:
+ shadowPos = mod['tpl'] + strandSign * offset
+ if siteDict.has_key(shadowPos):
+ siteDict[shadowPos]['offTargetPeak'] = True
+
+ finalCalls.extend(siteDict.values())
+
+ # Sort by template position
+ finalCalls.sort(key=lambda x: x['tpl'])
+ return finalCalls
+
+ else:
+ result = self._summarizeReferenceRegion((start, end), self.options.methylFraction, self.options.identify)
+
+ if self.options.useLDA and self.controlCmpH5 is None:
+
+ # FIXME: add on a column "Ca5C" containing LDA score for each C-residue site
+ lda = MedakaLdaEnricher( self.ipdModel.gbmModel, self.sequence, result, self.options.m5Cclassifier )
+ # lda = BasicLdaEnricher(self.ipdModel.gbmModel, self.sequence, result, self.options.identify)
+ # lda = PositiveControlEnricher(self.ipdModel.gbmModel, self.sequence, result)
+ results = lda.callEnricherFunction(result)
+
+ result.sort(key=lambda x: x['tpl'])
+ return result
+
+ def _summarizeReferenceRegion(self, targetBounds, methylFractionFlag, identifyFlag):
+ """Compute the ipd stats for a chunk of the reference
+
+ targetBounds is a (start, end) pair.  Returns a list of per-site result
+ dicts: one per case site with more than 2 IPD observations and, in the
+ case/control workflow, a matching control site that also has more than 2.
+ """
+ (start, end) = targetBounds
+ logging.info('Making summary: %d to %d' % (start, end))
+
+ caseReferenceGroupId = self.caseCmpH5.referenceInfo(self.refName).Name
+ (caseChunks, capValue) = self._fetchChunks(caseReferenceGroupId, targetBounds, self.caseCmpH5)
+
+ if self.controlCmpH5 is None:
+ # in silico control workflow -- only get data from the main 'case' cmp.h5
+
+ goodSites = [x for x in caseChunks if x['data']['ipd'].size > 2]
+
+ # Flip the strand, and make predictions for the whole chunk
+ predictions = self.manyManyIpdFunc([(x['tpl'], 1 - x['strand']) for x in goodSites])
+ goodSitesWithPred = zip(goodSites, predictions)
+
+ return [self._computePositionSyntheticControl(x, capValue, methylFractionFlag, identifyFlag, prediction.item()) for (x, prediction) in goodSitesWithPred]
+
+ else:
+ # case/control workflow -- get data from the case and control files and compare
+ result = []
+
+ contigName = self.caseCmpH5.referenceInfo(self.refName).FullName
+ controlRefTable = self.controlCmpH5.referenceInfoTable
+
+ # Make sure this RefId contains a refGroup in the control cmp.h5 file
+ # if self.refId in self.controlCmpH5.referenceInfoTable.Name:
+ # if self.refId in [ int( str.split('ref')[1] ) for str in self.controlCmpH5.referenceInfoTable.Name ]:
+ if contigName in controlRefTable.FullName:
+
+ # NOTE(review): the control file is queried by the row's ID while the
+ # case file above is queried by Name -- confirm this is intended.
+ controlRefRow = controlRefTable[controlRefTable['FullName'] == contigName][0]
+ (controlChunks, controlCapValue) = self._fetchChunks(controlRefRow.ID, targetBounds, self.controlCmpH5)
+ # Index the control sites by (strand, tpl) so each case site can look up its partner.
+ controlSites = {(x['strand'], x['tpl']): x for x in controlChunks}
+
+ for caseChunk in caseChunks:
+ # try:
+ # FIXME: catch None or the exception.
+ caseKey = (caseChunk['strand'], caseChunk['tpl'])
+ controlChunk = controlSites.get(caseKey) # , default = None)
+
+ if controlChunk and \
+ caseChunk['data']['ipd'].size > 2 and \
+ controlChunk['data']['ipd'].size > 2:
+ result.append(self._computePositionTraditionalControl(caseChunk, controlChunk, capValue, controlCapValue, methylFractionFlag, identifyFlag))
+ # except:
+ # pass
+
+ return result
+
+ def _decodePositiveControl(self, kinetics, bounds):
+ """Decode modifications on both strands with ModificationDecode.
+
+ kinetics is a list of per-site dicts and bounds a (start, end) pair.
+ Only sites whose coverage exceeds self.options.identifyMinCov are used.
+ Returns a list of the decoder's modification dicts, each tagged with
+ 'strand' and with 'tpl' mapped back to template coordinates.
+ """
+
+ (kinStart, kinEnd) = bounds
+ # Calling bounds expressed in the coordinates of the padded sequence window.
+ callBounds = (self.pad, kinEnd - kinStart + self.pad)
+
+ chunkFwd = dict((x['tpl'], x) for x in kinetics if x['strand'] == 0 and x['coverage'] > self.options.identifyMinCov)
+ chunkRev = dict((x['tpl'], x) for x in kinetics if x['strand'] == 1 and x['coverage'] > self.options.identifyMinCov)
+
+ modCalls = []
+
+ # Fwd sequence window
+ canonicalSequence = self.ipdModel.getReferenceWindow(self.refId, 0, kinStart - self.pad, kinEnd + self.pad)
+
+ # Map the raw kinetics into the frame-of reference of our sequence snippets
+ def toRef(p):
+ return p - (kinStart - self.pad)
+
+ def fromRef(r):
+ return r + (kinStart - self.pad)
+
+ mappedChunk = dict((toRef(pos), k) for (pos, k) in chunkFwd.items())
+
+ # Decode the modifications
+ decoder = ModificationDecode(self.ipdModel.gbmModel, canonicalSequence, mappedChunk, callBounds, self.options.methylMinCov, self.options.modsToCall, self.options.methylFraction, self.options.useLDA)
+
+ # Map the modification positions back to normal template indices
+ for (r, mod) in decoder.decode().items():
+ mod["strand"] = 0
+ mod['tpl'] = fromRef(r)
+ modCalls.append(mod)
+
+ # Repeat decoding on reverse sequence
+ # Reverse sequence
+ canonicalSequence = self.ipdModel.getReferenceWindow(self.refId, 1, kinStart - self.pad, kinEnd + self.pad)
+
+ # Map the raw kinetics into the frame-of reference of our sequence snippets
+ # (toRef/fromRef are redefined here; on this strand both apply the same formula.)
+ def toRef(p):
+ return len(canonicalSequence) - p + (kinStart - self.pad)
+
+ def fromRef(r):
+ return len(canonicalSequence) - r + (kinStart - self.pad)
+
+ mappedChunk = dict((toRef(pos), k) for (pos, k) in chunkRev.items())
+ decoder = ModificationDecode(self.ipdModel.gbmModel, canonicalSequence, mappedChunk, callBounds, self.options.methylMinCov, self.options.modsToCall, self.options.methylFraction, self.options.useLDA)
+
+ for (r, mod) in decoder.decode().items():
+ mod["strand"] = 1
+ mod['tpl'] = fromRef(r)
+ modCalls.append(mod)
+
+ return modCalls
+
+ def _multiSiteDetection(self, kinetics, bounds):
+ """Decode modifications on both strands with MultiSiteDetection.
+
+ Same flow as _decodePositiveControl, but the decoder is MultiSiteDetection
+ and it is given only the model, sequence, kinetics, calling bounds and
+ self.options.methylMinCov.  Returns a list of modification dicts tagged
+ with 'strand' and template-coordinate 'tpl'.
+ """
+
+ (kinStart, kinEnd) = bounds
+ # Calling bounds expressed in the coordinates of the padded sequence window.
+ callBounds = (self.pad, kinEnd - kinStart + self.pad)
+
+ chunkFwd = dict((x['tpl'], x) for x in kinetics if x['strand'] == 0 and x['coverage'] > self.options.identifyMinCov)
+ chunkRev = dict((x['tpl'], x) for x in kinetics if x['strand'] == 1 and x['coverage'] > self.options.identifyMinCov)
+
+ modCalls = []
+
+ # Fwd sequence window
+ canonicalSequence = self.ipdModel.getReferenceWindow(self.refId, 0, kinStart - self.pad, kinEnd + self.pad)
+
+ # Map the raw kinetics into the frame-of reference of our sequence snippets
+ def toRef(p):
+ return p - (kinStart - self.pad)
+
+ def fromRef(r):
+ return r + (kinStart - self.pad)
+
+ mappedChunk = dict((toRef(pos), k) for (pos, k) in chunkFwd.items())
+
+ # Decode the modifications
+ decoder = MultiSiteDetection(self.ipdModel.gbmModel, canonicalSequence, mappedChunk, callBounds, self.options.methylMinCov)
+
+ # Map the modification positions back to normal template indices
+ for (r, mod) in decoder.decode().items():
+ mod["strand"] = 0
+ mod['tpl'] = fromRef(r)
+ modCalls.append(mod)
+
+ # Repeat decoding on reverse sequence
+ # Reverse sequence
+ canonicalSequence = self.ipdModel.getReferenceWindow(self.refId, 1, kinStart - self.pad, kinEnd + self.pad)
+
+ # Map the raw kinetics into the frame-of reference of our sequence snippets
+ # (toRef/fromRef are redefined here; on this strand both apply the same formula.)
+ def toRef(p):
+ return len(canonicalSequence) - p + (kinStart - self.pad)
+
+ def fromRef(r):
+ return len(canonicalSequence) - r + (kinStart - self.pad)
+
+ mappedChunk = dict((toRef(pos), k) for (pos, k) in chunkRev.items())
+ decoder = MultiSiteDetection(self.ipdModel.gbmModel, canonicalSequence, mappedChunk, callBounds, self.options.methylMinCov)
+
+ for (r, mod) in decoder.decode().items():
+ mod["strand"] = 1
+ mod['tpl'] = fromRef(r)
+ modCalls.append(mod)
+
+ return modCalls
+
+ def _fetchChunks(self, refGroupId, targetBounds, cmpH5File):
+ """Get the IPDs for each position/strand on the given reference in the given window, from the given cmpH5 file
+
+ Returns (chunks, capValue): the per-(tpl, strand) chunks produced by
+ _chunkRawIpds and the IPD value used later as a cap on extreme IPDs.
+ """
+ (start, end) = targetBounds
+
+ # Take <= N alignments overlapping window with
+ # - mapQV >= threshold,
+ # - identity >= MIN_IDENTITY (currently 0.0, i.e. the filter is off)
+ # - readLength >= MIN_READLENGTH
+ # (the N are randomly chosen if there are more)
+ # N = self.options.maxAlignments, default=1500
+ MIN_IDENTITY = 0.0 # identity filter was broken
+ # previously. leaving "off" for now for
+ # bw compat
+ MIN_READLENGTH = 50
+
+ hits = [ hit for hit in cmpH5File.readsInRange(refGroupId,
+ max(start, 0), end)
+ if ((hit.mapQV >= self.options.mapQvThreshold) and
+ (hit.identity >= MIN_IDENTITY) and
+ (hit.readLength >= MIN_READLENGTH)) ]
+ logging.info("Retrieved %d hits" % len(hits))
+ if len(hits) > self.options.maxAlignments:
+ # XXX a bit of a hack - to ensure deterministic behavior when
+ # running in parallel, re-seed the RNG before each call
+ # (the re-seed only happens when no explicit randomSeed option was given)
+ if self.options.randomSeed is None:
+ np.random.seed(len(hits))
+ hits = np.random.choice(hits, size=self.options.maxAlignments, replace=False)
+
+ # FIXME -- we are dealing with the IPD format change from seconds to frames here
+ # NOTE -- the frame rate of the first read group is applied to every hit
+ factor = 1.0 / cmpH5File.readGroupTable[0].FrameRate
+ # Should be handled in pbcore
+ #for alnFile in cmpH5File.resourceReaders():
+ # ver = alnFile.version[0:3]
+ # if ver == '1.2':
+ # factor = 1.0
+ # else:
+ # # NOTE -- assuming that all movies have the same frame rate!
+ # fr = cmpH5File.readGroupTable[0].FrameRate
+ # factor = 1.0 / fr
+ # break
+
+ rawIpds = self._loadRawIpds(hits, start, end, factor)
+ ipdVect = rawIpds['ipd']
+
+ if ipdVect.size < 10:
+ # Default is there is no coverage
+ capValue = 5.0
+ else:
+ # Compute IPD quantiles on the current block -- will be used for trimming extreme IPDs
+ capValue = np.percentile(ipdVect, self.options.cap_percentile)
+
+ chunks = self._chunkRawIpds(rawIpds)
+ return chunks, capValue
+
+ def _loadRawIpds(self, alnHitIter, targetStart=-1, targetEnd=3e12, factor=1.0):
+ """
+ Get a numpy structured array (dtype ipdRec) of the raw ipds in the given alignment hits.
+ The result holds all forward-strand records sorted by template position, followed by
+ all reverse-strand records sorted by template position.
+ Factor is a normalization factor to the get units into seconds.
+ Only positions with targetStart <= position < targetEnd are kept.
+ """
+
+ # Put in an empty 'starter' array -- the np.concatenate call below will fail on an empty list
+ array0 = np.zeros(0, dtype=ipdRec)
+
+ # Maintain separate lists for each strand to speed up sorting
+ s0list = [array0]
+ s1list = [array0]
+
+ for aln in alnHitIter:
+ # Pull out error-free position
+ # (alignment columns where neither the read nor the reference has a gap '-')
+ matched = np.logical_and(np.array([x != '-' for x in aln.read()]), np.array([x != '-' for x in aln.reference()]))
+
+ # Normalize kinetics of the entire subread
+ rawIpd = aln.IPD() * factor
+
+ # Drop NaN IPDs from the mask (result written back into `matched`)
+ np.logical_and(np.logical_not(np.isnan(rawIpd)), matched, out=matched)
+
+ normalization = self._subreadNormalizationFactor(rawIpd[matched])
+ rawIpd /= normalization
+
+ # Trim down to just the position that cover our interval
+ referencePositions = aln.referencePositions()
+ np.logical_and(referencePositions < targetEnd, matched, matched)
+ np.logical_and(referencePositions >= targetStart, matched, matched)
+ nm = matched.sum()
+
+ # Bail out if we don't have any samples
+ if nm == 0:
+ continue
+
+ ipd = rawIpd[matched]
+ tpl = referencePositions[matched]
+
+ dfTemp = np.zeros(nm, dtype=ipdRec)
+ dfTemp['ipd'] = ipd
+ dfTemp['tpl'] = tpl
+ dfTemp['strand'] = aln.isReverseStrand
+
+ if aln.isForwardStrand:
+ s0list.append(dfTemp)
+ else:
+ s1list.append(dfTemp)
+
+ # Sort the set of ipd observations
+ s0Ipds = np.concatenate(s0list)
+ sortOrder = np.argsort(s0Ipds['tpl'])
+ s0Ipds = s0Ipds[sortOrder]
+
+ s1Ipds = np.concatenate(s1list)
+ sortOrder = np.argsort(s1Ipds['tpl'])
+ s1Ipds = s1Ipds[sortOrder]
+
+ return np.concatenate([s0Ipds, s1Ipds])
+
+ def _chunkRawIpds(self, rawIpds):
+ """
+ Return a list of dicts {'tpl', 'strand', 'data'}, one for each run of consecutive
+ records sharing the same (tpl, strand); 'data' is a slice of the rawIpds recarray.
+ If self.options.maxCoverage is set, each slice is truncated to that many records.
+ """
+ views = []
+
+ # Bail out if we have no data
+ if rawIpds.size == 0:
+ return views
+
+ start = 0
+ tpl = rawIpds['tpl']
+ strand = rawIpds['strand']
+
+ # Start off at the first chunk
+ curIdx = (tpl[0], strand[0])
+ for i in xrange(1, rawIpds.shape[0]):
+ newIdx = (tpl[i], strand[i])
+
+ # In this case we are still in the same chunk -- continue
+ if curIdx == newIdx:
+ continue
+
+ # In this case we have completed the chunk -- emit the chunk
+ else:
+ obj = {'tpl': curIdx[0], 'strand': curIdx[1], 'data': rawIpds[start:i]}
+ views.append(obj)
+ start = i
+ curIdx = newIdx
+
+ # Make sure to return final chunk
+ obj = {'tpl': curIdx[0], 'strand': curIdx[1], 'data': rawIpds[start:]}
+ views.append(obj)
+
+ # If the user has specified a maximum coverage level to use, enforce it here -- just take the first n reads
+ if self.options.maxCoverage is not None:
+ maxCov = self.options.maxCoverage
+ for x in views:
+ d = x['data']
+ d = d[0:maxCov]
+ x['data'] = d
+
+ return views
+
+ def _subreadNormalizationFactor(self, rawIpds):
+ """
+ Normalize subread ipds
+ """
+
+ # Default normalization factor -- this value should very rarely get used
+ if rawIpds.size < 2:
+ return 0.1
+
+ if np.isnan(rawIpds).any():
+ print "got nan: %s" % str(rawIpds)
+
+ if rawIpds.mean() < 0.0001:
+ print "small"
+ print "got small: %s" % str(rawIpds)
+
+ capValue = min(10, np.percentile(rawIpds, 99))
+ capIpds = np.minimum(rawIpds, capValue)
+ return capIpds.mean()
+
+ def computeObservationPValue(self, siteObs):
+ """
+ Compute a p-value on the observation of a kinetic event
+
+ siteObs must provide 'modelPrediction', 'tMean' and 'tErr'.  The model
+ error is fixed at 10% of the prediction.  Returns a Python float.
+ """
+
+ # p-value of detection -- FIXME needs much more thought here!
+ # p-value computation (slightly robustified Gaussian model)
+ # emf - rms fractional error of background model
+ # em - rms error of background model = um * emf
+ # um - predicted mean of unmodified ipd from model
+ # uo - (trimmed) observed mean ipd
+ # eo - (trimmed) standard error of observed mean (std / sqrt(coverage))
+ # Null model is ~N(um, em^2 + eo^2)
+ # Then compute standard gaussian p-value = erfc((uo-um) / sqrt(2 * (em^2 + eo^2))) / 2
+ # FIXME? -- right now we only detect the case where the ipd gets longer.
+
+ um = siteObs['modelPrediction']
+
+ # FIXME -- pipe through model error
+ em = 0.1 * um
+ # em = model.fractionalModelError * em
+
+ uo = siteObs['tMean']
+ eo = siteObs['tErr']
+
+ pvalue = erfc((uo - um) / sqrt(2 * (em ** 2 + eo ** 2))) / 2
+ return pvalue.item()
+
+ def computeObservationTstatistic(self, siteObs):
+ """
+ Compute a p-value on the observation of a kinetic event
+ """
+
+ # p-value of detection -- FIXME needs much more thought here!
+ # p-value computation (slightly robustified Gaussian model)
+ # emf - rms fractional error of background model
+ # em - rms error of background model = um * emf
+ # um - predicted mean of unmodified ipd from model
+ # uo - (trimmed) observed mean ipd
+ # eo - (trimmed) standard error of observed mean (std / sqrt(coverage))
+ # Null model is ~N(um, em^2 + eo^2)
+ # Then compute standard gaussian p-value = erfc((uo-um) / sqrt(2 * (em^2 + eo^2))) / 2
+ # FIXME? -- right now we only detect the case where the ipd gets longer.
+
+ um = siteObs['modelPrediction']
+
+ # FIXME -- pipe through model error
+ #em = 0.06 * um + 0.12 * um**2.0
+ em = 0.01 + 0.03 * um + 0.06 * um ** (1.7)
+ # em = model.fractionalModelError * em
+
+ uo = siteObs['tMean']
+ eo = siteObs['tErr']
+
+ import scipy.stats as s
+
+ t = -(uo - um) / sqrt(em ** 2 + eo ** 2)
+ return t
+
+ def computeObservationPValueTTest(self, siteObs):
+ """Return the t-distribution CDF at siteObs['tStatistic'] with max(1, coverage - 1) degrees of freedom."""
+ t = siteObs['tStatistic']
+ df = max(1, siteObs['coverage'] - 1)
+
+ # NOTE(review): s.t._cdf is a private scipy method; the public s.t.cdf is
+ # presumably interchangeable here -- confirm before switching.
+ pvalue = s.t._cdf(t, df)
+ return pvalue.item()
+
+ def _computePositionSyntheticControl(self, caseObservations, capValue, methylFractionFlag, identifyFlag, modelPrediction=None):
+ """Summarize the observed ipds at one template position/strand, using the synthetic ipd model
+
+ caseObservations is a chunk dict with 'tpl', 'strand' and 'data'.  When
+ modelPrediction is None it is computed with self.meanIpdFunc.  Returns a
+ dict of per-site statistics (coverage, tMean, tErr, ipdRatio, tStatistic,
+ pvalue, score, ...) and, when requested, the methylated-fraction estimates.
+ """
+
+ # Compute stats on the observed ipds
+ d = caseObservations['data']['ipd']
+ res = dict()
+
+ # ref00000x name
+ res['refId'] = self.refId
+
+ # FASTA header name
+ res['refName'] = self.refName
+
+ # NOTE -- this is where the strand flipping occurs -- make sure to reproduce this in the all calling methods
+ strand = res['strand'] = 1 - caseObservations['strand']
+ tpl = res['tpl'] = caseObservations['tpl']
+ res['coverage'] = d.size
+
+ # Don't compute these stats - they just take time and confuse things
+ # res['mean'] = d.mean().item()
+ # res['median'] = np.median(d).item()
+ # res['std'] = np.std(d).item()
+ # Compute the predicted IPD from the model
+ # NOTE! The ipd model is in the observed read strand
+ if modelPrediction is None:
+ modelPrediction = self.meanIpdFunc(tpl, strand).item()
+ res['modelPrediction'] = modelPrediction
+
+ res['base'] = self.cognateBaseFunc(tpl, strand)
+
+ # Store in case of methylated fraction estimation:
+ res['rawData'] = d
+
+ # Try a hybrid capping approach -- cap at the higher of
+ # - 4x the model prediction (the code below uses 4.0 * modelPrediction)
+ # - 90th percentile of the local data (at low coverage we pick a lower percentile to ensure we trim the highest datapoint)
+ # - global cap value
+
+ # NOTE: d.size == 1 would divide by zero here; the caller visible in
+ # _summarizeReferenceRegion only passes sites with more than 2 observations.
+ percentile = min(90, (1.0 - 1.0 / (d.size - 1)) * 100)
+ localPercentile = np.percentile(d, percentile)
+ capValue = max(capValue, 4.0 * modelPrediction, localPercentile)
+
+ # np.minimum(d, capValue, out=d) # this version will send capped IPDs to modified fraction estimator
+ d = np.minimum(d, capValue)
+
+ # Trimmed stats
+ res['tMean'] = d.mean().item()
+ res['tErr'] = np.std(d).item() / sqrt(d.size)
+
+ ipdRatio = res['tMean'] / res['modelPrediction']
+ if not np.isnan(ipdRatio):
+ res['ipdRatio'] = ipdRatio
+ else:
+ res['ipdRatio'] = 1.0
+
+ # Don't know the modification yet
+ res["modification"] = "."
+
+ # use ttest-based pvalue
+ # res['pvalue'] = self.computeObservationPValue(res)
+ res['tStatistic'] = self.computeObservationTstatistic(res)
+ res['pvalue'] = self.computeObservationPValueTTest(res)
+
+ # Clamp the p-value away from zero so that log10 below stays finite
+ pvalue = max(sys.float_info.min, res['pvalue'])
+ score = round(-10.0 * math.log10(pvalue))
+ res['score'] = score
+
+ # If the methylFractionFlag is set, then estimate fraction using just modelPrediction in the detection case.
+ if methylFractionFlag and pvalue < self.options.pvalue and not identifyFlag:
+ if res['coverage'] > self.options.methylMinCov:
+ modelPrediction = self.meanIpdFunc(tpl, strand).item()
+
+ # Instantiate mixture estimation methods:
+ mixture = MixtureEstimationMethods(self.ipdModel.gbmModel.post, self.ipdModel.gbmModel.pre, res, self.options.methylMinCov)
+ x = mixture.detectionMixModelBootstrap(modelPrediction, d)
+ # x = self.detectionMixModelBootstrap(modelPrediction, d)
+
+ res[FRAC] = x[0]
+ res[FRAClow] = x[1]
+ res[FRACup] = x[2]
+ else:
+ res[FRAC] = np.nan
+ res[FRACup] = np.nan
+ res[FRAClow] = np.nan
+
+ # print res
+ return res
+
+##
+# straight port from R's p.adjust.
+##
+ def _cummin(v):
+ # Running minimum: r[i] is the smallest of v[0..i].
+ # NOTE(review): takes no `self`, and `array` is not among the names this
+ # file imports explicitly (presumably supplied by a star import) -- confirm.
+ r = array([v[0]] * len(v))
+ r[0] = v[0]
+ for i in xrange(1, len(v)):
+ r[i] = r[i - 1] if r[i - 1] < v[i] else v[i]
+ return r
+
+ def _BH_FDR(pvals):
+ # Adjusted p-values: scale the p-values in decreasing order by n / rank,
+ # take the running minimum, cap at 1, and restore the input order.
+ # NOTE(review): takes no `self`; `array`, `argsort` and `_cummin` must be
+ # resolvable as globals when this runs -- confirm where they come from.
+ s = array(range(len(pvals), 0, -1))
+ o = array(argsort(pvals)[::-1])
+ r = array(argsort(o))
+ return [1 if x > 1 else x for x in (_cummin(float(len(pvals)) / s * pvals[o]))[r]]
+
+##
+# Null simulation. the test below assumes that IPDs are normal after
+# capping and logging. FIXME: permutation based
+##
+# def sim(N=100):
+# return [ _tTest(np.exp(x1),np.exp(x2), 100)['pvalue'] for x1,x2 in
+# zip([ np.random.normal(size=100) for g in range(0, N)],
+# [ np.random.normal(size=100) for g in range(0, N) ] )]
+#
+## _tTest(np.exp(np.random.normal(1.5, size = 100)), np.exp(np.random.normal(1., size = 100)))
+##
+ def _tTest(x, y, exclude=95):
+ """Compute a one-sided Welch t-statistic.
+
+ Both samples are capped at their `exclude`-th percentile and log
+ transformed first.  Returns {'testStatistic': ..., 'pvalue': ...}; the
+ p-value is taken from the normal distribution, not the t distribution.
+ Takes no `self`: it is used as the default `testProcedure` of
+ _computePositionTraditionalControl and called there as a plain function.
+ """
+ with np.errstate(all="ignore"):
+ def cappedSlog(v):
+ # Cap at the percentile, drop NaNs, replace non-positive values by
+ # 1/76 and take the natural log.
+ q = np.percentile(v, exclude)
+ v2 = v.copy()
+ v2 = v2[~np.isnan(v2)]
+ v2[v2 > q] = q
+ v2[v2 <= 0] = 1. / (75 + 1)
+ return np.log(v2)
+ x1 = cappedSlog(x)
+ x2 = cappedSlog(y)
+ sx1 = np.var(x1) / len(x1)
+ sx2 = np.var(x2) / len(x2)
+ totalSE = np.sqrt(sx1 + sx2)
+ if totalSE == 0:
+ stat = 0
+ else:
+ stat = (np.mean(x1) - np.mean(x2)) / totalSE
+
+ #df = (sx1 + sx2)**2 / (sx1**2/(len(x1)-1) + sx2**2/(len(x2) - 1))
+ #pval = 1 - scidist.t.cdf(stat, df)
+
+ # Scipy's t distribution CDF implementaton has inadequate
+ # precision. We have switched to the normal distribution for
+ # better behaved p values.
+ pval = 0.5 * erfc(stat / sqrt(2))
+
+ return {'testStatistic': stat, 'pvalue': pval}
+
+ def _computePositionTraditionalControl(self, caseObservations, controlObservations, capValue, controlCapValue, methylFractionFlag, identifyFlag, testProcedure=_tTest):
+ # Case/control summary for one template position/strand.  Returns a dict with
+ # coverage, mean/median/std of the capped case and control IPDs, 'ipdRatio',
+ # the result of `testProcedure` ('testStatistic', 'pvalue'), 'score' and,
+ # when requested, the methylated-fraction estimates.
+
+ # NOTE(review): oCapValue and oControlCapValue are not read again in this method.
+ oCapValue = capValue
+ oControlCapValue = controlCapValue
+
+ # NOTE: the string below follows other statements, so Python does not
+ # treat it as this function's docstring.
+ """Summarize the observed ipds at one template position/strand, using a case-control analysis"""
+ # Compute stats on the observed ipds
+ caseData = caseObservations['data']['ipd']
+ controlData = controlObservations['data']['ipd']
+
+ # cap both the native and control data, more or less as it is done in computePositionSyntheticControl:
+ # (here the median of the data stands in for the model prediction)
+ percentile = min( 90, ( 1.0 - 1.0 / ( caseData.size - 1 ) ) * 100 )
+ localPercentile = np.percentile( caseData, percentile )
+ capValue = max( capValue, 4.0 * np.median(caseData).item(), localPercentile )
+ caseData = np.minimum( caseData, capValue )
+
+ percentile = min( 90, ( 1.0 - 1.0 / ( controlData.size - 1 ) ) * 100 )
+ localPercentile = np.percentile( controlData, percentile )
+ controlCapValue = max( controlCapValue, 4.0 * np.median(controlData).item(), localPercentile )
+ controlData = np.minimum( controlData, controlCapValue )
+
+
+ res = dict()
+ res['refId'] = self.refId
+
+ # FASTA header name
+ res['refName'] = self.refName
+
+ strand = res['strand'] = 1 - caseObservations['strand']
+ tpl = res['tpl'] = caseObservations['tpl']
+ res['base'] = self.cognateBaseFunc(tpl, strand)
+
+ res['coverage'] = int(round((caseData.size + controlData.size) / 2.0)) # need a coverage annotation
+
+ res['caseCoverage'] = caseData.size
+ res['controlCoverage'] = controlData.size
+
+ res['caseMean'] = caseData.mean().item()
+ res['caseMedian'] = np.median(caseData).item()
+ res['caseStd'] = np.std(caseData).item()
+
+ res['controlMean'] = controlData.mean().item()
+ res['controlMedian'] = np.median(controlData).item()
+ res['controlStd'] = np.std(controlData).item()
+
+ # Ratio of trimmed means; falls back to 1.0 when the control mean is ~0
+ trim = (0.001, 0.03)
+ ctrlMean = mstats.trimmed_mean(controlData, trim).item()
+ if abs(ctrlMean) > 1e-3:
+ res['ipdRatio'] = (mstats.trimmed_mean(caseData, trim).item() / ctrlMean)
+ else:
+ res['ipdRatio'] = 1.0
+
+ testResults = testProcedure(caseData, controlData)
+ res['testStatistic'] = testResults['testStatistic']
+ res['pvalue'] = testResults['pvalue']
+
+ # res['testStatistic'] = ( res['caseMedian'] - res['controlMedian'] ) / sqrt( res['caseStd']**2 + res['controlStd']**2 )
+ # res['pvalue'] = 0.5 * erfc(res['testStatistic'] / sqrt(2))
+
+ # Clamp the p-value away from zero so that log10 below stays finite
+ pvalue = max(sys.float_info.min, res['pvalue'])
+ res['score'] = round(-10.0 * math.log10(pvalue))
+
+ # print res
+
+ # If the methylFractionFlag is set, then estimate fraction using the control mean in the detection case.
+ if methylFractionFlag and pvalue < self.options.pvalue and not identifyFlag:
+ if res['controlCoverage'] > self.options.methylMinCov and res['caseCoverage'] > self.options.methylMinCov:
+ # Instantiate mixture estimation methods:
+ mixture = MixtureEstimationMethods(self.ipdModel.gbmModel.post, self.ipdModel.gbmModel.pre, res, self.options.methylMinCov)
+ x = mixture.detectionMixModelBootstrap(res['controlMean'], caseData)
+ res[FRAC] = x[0]
+ res[FRAClow] = x[1]
+ res[FRACup] = x[2]
+ else:
+ res[FRAC] = np.nan
+ res[FRACup] = np.nan
+ res[FRAClow] = np.nan
+
+ return res
+
+
+class KineticWorkerProcess(KineticWorker, WorkerProcess):
+
+ """Worker that executes as a process."""
+
+ def __init__(self, options, workQueue, resultsQueue, ipdModel, sharedAlignmentSet=None):
+ # Both bases are initialised explicitly: WorkerProcess receives the options,
+ # queues and alignment set, KineticWorker receives the IPD model.
+ WorkerProcess.__init__(self, options, workQueue, resultsQueue, sharedAlignmentSet)
+ KineticWorker.__init__(self, ipdModel)
+
+
+class KineticWorkerThread(KineticWorker, WorkerThread):
+
+ """Worker that executes as a thread (for debugging purposes only)."""
+
+ def __init__(self, options, workQueue, resultsQueue, ipdModel, sharedAlignmentSet=None):
+ # Both bases are initialised explicitly: WorkerThread receives the options,
+ # queues and alignment set, KineticWorker receives the IPD model.
+ WorkerThread.__init__(self, options, workQueue, resultsQueue, sharedAlignmentSet)
+ KineticWorker.__init__(self, ipdModel)
diff --git a/kineticsTools/MedakaLdaEnricher.py b/kineticsTools/MedakaLdaEnricher.py
new file mode 100755
index 0000000..4bf49e5
--- /dev/null
+++ b/kineticsTools/MedakaLdaEnricher.py
@@ -0,0 +1,110 @@
+# Try to implement method used in Morishita et al.'s Medaka fish genome paper here
+
+from collections import defaultdict, Counter
+
+import os
+from math import sqrt
+import math
+import scipy.stats as s
+import array as a
+
+from scipy.optimize import fminbound
+from scipy.special import gammaln as gamln
+from numpy import log, pi, log10, e, log1p, exp
+import numpy as np
+
+from MultiSiteCommon import MultiSiteCommon
+
+
+class MedakaLdaEnricher(MultiSiteCommon):
+
+    def __init__(self, gbmModel, sequence, rawKinetics, m5Cclassifier):
+        """Initialize the base class and load the two LDA coefficient vectors.
+
+        m5Cclassifier is handed to numpy.genfromtxt as a comma-delimited
+        table. Column 0 is kept as self.fwd_model and column 1 as
+        self.rev_model.
+        """
+        MultiSiteCommon.__init__(self, gbmModel, sequence, rawKinetics)
+
+        coefficients = np.genfromtxt(m5Cclassifier, delimiter=',')
+        self.fwd_model, self.rev_model = coefficients[:, 0], coefficients[:, 1]
+
+
+
+    # write a method to take perSiteResults dictionary in and add a column Ca5C
+    def useLDAmodel(self, kinetics, pos, model, up, down):
+        """Evaluate the linear (LDA) score for the window around one site.
+
+        For every offset in [-down, up] around ``pos``, six features are read
+        from ``kinetics``: tMean, modelPrediction, std, exp(t0) - 0.01,
+        ipdRatio and the combined error ``den``. The log of (features + 0.01),
+        flattened feature-by-feature, is multiplied elementwise with
+        ``model[1:]`` and summed; ``model[0]`` is added as the intercept.
+        """
+
+        print "From use LDA model.\n"
+
+        features = np.zeros((up + down + 1, 6))
+
+        # range from -down to +up
+        for row, offset in enumerate(range(-down, (up + 1))):
+            site = kinetics[pos + offset]
+            predicted = site["modelPrediction"]
+
+            std = site["tErr"] * sqrt(site["coverage"])
+            mErr = 0.01 + 0.03 * predicted + 0.06 * predicted ** 2
+            den = sqrt(mErr ** 2 + std ** 2)
+            t0 = (site["tMean"] - predicted) / den
+
+            features[row, ] = [site["tMean"], predicted, std, np.exp(t0) - 0.01, site["ipdRatio"], den]
+
+        # transpose + hstack lays the predictors out one feature column at a time
+        predictors = np.hstack(np.log(features + 0.01).transpose())
+        return sum(np.multiply(predictors, model[1:])) + model[0]
+
+
+    def callLDAstrand(self, kinetics, strand, model, up, down):
+        """Attach LDA scores ("Ca5C") to the CG sites of one strand.
+
+        The records of ``kinetics`` whose "strand" equals ``strand`` are sorted
+        by "tpl"; positions ``down`` .. ``len - up - 1`` of that list are
+        scanned. On strand 0 a 'C' followed by 'G' gets the score stored on
+        the 'C' record; on strand 1 a 'C' preceded by 'G' gets the score
+        stored on the preceding ('G') record. The sorted list is returned.
+        """
+
+        print "From callLDAstrand.\n"
+
+        sites = sorted((d for d in kinetics if d["strand"] == strand), key=lambda x: x["tpl"])
+        positions = range(down, (len(sites) - up))
+
+        if strand == 0:
+            for pos in positions:
+                if sites[pos]["base"] == 'C' and sites[pos + 1]["base"] == 'G':
+                    sites[pos]["Ca5C"] = self.useLDAmodel(sites, pos, model, up, down)
+        elif strand == 1:
+            for pos in positions:
+                if sites[pos]["base"] == 'C' and sites[pos - 1]["base"] == 'G':
+                    sites[pos - 1]["Ca5C"] = self.useLDAmodel(sites, pos, model, up, down)
+
+        return sites
+
+
+    def aggregate(self, dataset, group_by_key, sum_value_key):
+        """Replace each item's ``sum_value_key`` value with its group total.
+
+        Items are grouped by ``item[group_by_key]``; only items that carry
+        ``sum_value_key`` contribute to, and are overwritten with, the total.
+        ``dataset`` is modified in place and returned.
+        """
+
+        print "From aggregate.\n"
+
+        # Pass 1: accumulate one running total per distinct group value.
+        totals = {}
+        for record in dataset:
+            if not record.has_key(sum_value_key):
+                continue
+            group = record[group_by_key]
+            if totals.has_key(group):
+                totals[group] += record[sum_value_key]
+            else:
+                totals[group] = record[sum_value_key]
+
+        # Pass 2: write the total back onto every contributing item.
+        for record in dataset:
+            if record.has_key(sum_value_key):
+                record[sum_value_key] = totals[record[group_by_key]]
+
+        return dataset
+
+
+
+    def callEnricherFunction(self, kinetics, up=10, down=10):
+        """Score both strands with their LDA models and merge the results.
+
+        Strand 0 is scored with self.fwd_model and strand 1 with
+        self.rev_model. The two lists are concatenated, sorted by "tpl", and
+        passed through aggregate() so that rows sharing a tpl carry the sum of
+        their Ca5C values.
+        """
+
+        print "From callEnricher function.\n"
+
+        forwardSites = self.callLDAstrand(kinetics, 0, self.fwd_model, up, down)
+        reverseSites = self.callLDAstrand(kinetics, 1, self.rev_model, up, down)
+        merged = sorted(forwardSites + reverseSites, key=lambda x: x["tpl"])
+
+        # Equivalent of R's: a <- aggregate(Ca5C ~ tpl, df, sum), with the
+        # summed value stored back under Ca5C on every row of the group.
+        return self.aggregate(merged, 'tpl', 'Ca5C')
diff --git a/kineticsTools/MixtureEstimationMethods.py b/kineticsTools/MixtureEstimationMethods.py
new file mode 100755
index 0000000..5944123
--- /dev/null
+++ b/kineticsTools/MixtureEstimationMethods.py
@@ -0,0 +1,314 @@
+#################################################################################
+# Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#################################################################################
+
+from math import sqrt
+import math
+import scipy.stats as s
+import array as a
+
+from scipy.optimize import fminbound
+from numpy import log, pi, log10, e, log1p, exp
+import numpy as np
+
+import traceback
+
+# log10(e): multiplying a natural logarithm by this converts it to base 10.
+log10e = log10(e)
+
+# The tables below are left commented out in this module; MultiSiteCommon.py
+# defines live versions of the same names.
+# canonicalBaseMap = { 'A': 'A', 'C':'C', 'G':'G', 'T':'T', 'H':'A', 'I':'C', 'J':'C', 'K':'C' }
+# modNames = { 'H':'m6A', 'I':'m5C', 'J':'m4C', 'K':'m5C' }
+
+# ModificationPeakMask = { 'm6A' : [0, -5], 'm4C': [0, -5], 'm5C': [2, 0, -1, -2, -4, -5, -6] }
+
+# Labels for modified fraction:
+
+# FRAC = 'frac'
+# FRAClow = 'fracLow'
+# FRACup = 'fracUp'
+
+# Standard-normal quantiles at 2.5% and 97.5%, computed once at import time
+# and read by bcaQuantile().
+
+k1 = s.norm.ppf(0.025)
+k2 = s.norm.ppf(0.975)
+
+
+class MixtureEstimationMethods(object):
+
+    def __init__(self, gbmModelPost, gbmModelPre, rawKinetics, methylMinCov):
+        """
+        Store the inputs shared by the mixture-estimation methods.
+
+        gbmModelPost, gbmModelPre: callers in this package pass gbmModel.post and
+        gbmModel.pre. They are stored swapped: self.pre = gbmModelPost and
+        self.post = gbmModelPre.
+        rawKinetics: mapping that bootstrap() indexes by position, reading the
+        "rawData" entry of the record it finds.
+        methylMinCov: kept as self.methylMinCov.
+        """
+        self.methylMinCov = methylMinCov
+        self.rawKinetics = rawKinetics
+
+        # These switch because we are changing viewpoints
+        self.pre, self.post = gbmModelPost, gbmModelPre
+
+    # Return value of mixture model log likelihood function
+    def mixModelFn(self, p, a0, a1):
+        """Negative log-likelihood of the mixture (1 - p) * a0 + p * a1.
+
+        Elements whose mixture value is exactly zero are left out of the sum.
+        """
+        mixture = (1 - p) * a0 + p * a1
+        nonzero = mixture[np.nonzero(mixture)]
+        return -np.log(nonzero).sum()
+
+    # Try to speed up calculation by avoiding a call to scipy.stats.norm.pdf()
+    def replaceScipyNormPdf(self, data, mu):
+        """Return exp(-data / mu) / mu, evaluated elementwise over data.
+
+        Note that, despite the method name, this expression is the density of
+        an exponential distribution with mean mu.
+        """
+        scaled = np.divide(data, mu)
+        return np.exp(-scaled) / mu
+        # Earlier variant, kept for reference (t = data / mu):
+        #   np.exp(t - t**2 / 2.0) / mu
+        # pdf for normal distribution: res = res / sqrt( 2 * pi ) (can factor out sqrt(2 * pi))
+
+ # Return optimum argument (mixing proportion) of mixture model log likelihood function.
+ def estimateSingleFraction(self, mu1, data, mu0, L, optProp = True ):
+ """Estimate the mixing proportion p of the (mu0, mu1) two-component mixture.
+
+ a0 and a1 are replaceScipyNormPdf(data, mu0) and replaceScipyNormPdf(data, mu1).
+ The result is 0.0 when sum(a1 / a0) <= L, 1.0 when sum(a0 / a1) <= L, and
+ otherwise the fminbound minimizer of mixModelFn over [0.01, 0.99].
+ With optProp=False the value mixModelFn(p, a0, a1) at that proportion is
+ returned instead of p itself. Callers in this file pass L = len(data).
+ """
+ # NOTE: ignoring the warnings here is sloppy, should be looked
+ # at later.
+ with np.errstate(all="ignore"):
+ a0 = self.replaceScipyNormPdf(data, mu0)
+ a1 = self.replaceScipyNormPdf(data, mu1)
+
+ # With f the log-likelihood, f'(0) = sum(a1/a0) - L. If f'(0) <= 0 then
+ # f'(1) <= 0 as well and the solution is p-hat <= 0, so clamp to 0.
+ if np.divide(a1, a0).sum() <= L:
+ res = 0.0
+ # Likewise f'(1) = L - sum(a0/a1). If f'(1) >= 0 then f'(0) >= 0 as well
+ # and the solution is p-hat >= 1, so clamp to 1.
+ elif np.divide(a0, a1).sum() <= L:
+ res = 1.0
+ else:
+ # bounded minimization of a convex, single-variable function on [0.01, 0.99]
+ res = fminbound(self.mixModelFn, 0.01, 0.99, args=(a0, a1), xtol=1e-02)
+
+ if optProp:
+ # return the optimal proportion
+ return res
+ else:
+ # return the corresponding (negative log) likelihood function value
+ return self.mixModelFn( res, a0, a1 )
+
+
+ # Try bias-corrected, accelerated quantiles for bootstrap confidence intervals
+ def bcaQuantile(self, estimate, bootDist, data, mu0, mu1, nSamples, n):
+ """Return the pair (q1, q2) of percentile ranks (0-100) for a BCa interval.
+
+ estimate: the point estimate; bootDist: the bootstrap estimates.
+ data, mu0, mu1: forwarded to estimateSingleFraction for the leave-one-out
+ (jackknife) estimates that feed the acceleration term.
+ nSamples: the proportion of bootDist <= estimate is divided by nSamples + 1.
+ n: number of leave-one-out estimates computed (one per index of data).
+ """
+
+ # Fraction of bootstrap estimates not above the point estimate. bootstrap()
+ # already passes its own nSamples + 1 (= len(bootDist)) as nSamples, so
+ # from that caller the divisor exceeds len(bootDist) and tmp stays below 1.
+ tmp = sum(y <= estimate for y in bootDist) / float(nSamples + 1)
+ if tmp > 0 and tmp < 1:
+
+ # bias correction
+ z0 = s.norm.ppf(tmp)
+
+ # acceleration: jackknife estimates, each computed with one observation removed
+ x = np.zeros(n)
+ for i in range(n):
+ x[i] = self.estimateSingleFraction(mu1, np.delete(data, i), mu0, n - 1)
+ xbar = np.mean(x)
+ denom = np.power(np.sum(np.power(x - xbar, 2)), 1.5)
+ # Jackknife estimates (almost) identical: fall back to the plain 2.5 / 97.5 percentiles
+ if abs(denom) < 1e-4:
+ q1 = 2.5
+ q2 = 97.5
+ else:
+ # NOTE: this local 'a' shadows the module-level 'array as a' import inside this method
+ a = np.divide(np.sum(np.power(x - xbar, 3)), denom) / 6.0
+
+ # quantiles: (k1 and k2 are defined globally)
+ q1 = 100 * s.norm.cdf(z0 + (z0 + k1) / (1 - a * (z0 + k1)))
+ q2 = 100 * s.norm.cdf(z0 + (z0 + k2) / (1 - a * (z0 + k2)))
+
+ elif tmp == 0.0:
+ q1 = 0
+ q2 = 0
+
+ elif tmp == 1.0:
+ q1 = 100
+ q2 = 100
+
+ # NOTE(review): if tmp is NaN none of the branches assigns q1/q2 - confirm that cannot happen
+ return (q1, q2)
+
+    # Bootstraps mix prop estimates to return estimate and bounds for 95% confidence interval
+    def bootstrap(self, pos, mu0, mu1, nSamples=500):
+        """Bootstrap the mixing-proportion estimate at one position.
+
+        pos: key into self.rawKinetics; the record's "rawData" entry is the
+        sample. It is indexed with an integer array, so it is expected to be
+        a NumPy array.
+        mu0, mu1: means of the two mixture components.
+        nSamples: number of case-resampled bootstrap replicates.
+
+        Returns a length-3 array [estimate, lower, upper]; the bounds are the
+        percentiles of the bootstrap distribution selected by bcaQuantile().
+        All three entries are NaN when pos has no kinetics record or its
+        sample is empty.
+        """
+        missing = np.array([float('nan'), float('nan'), float('nan')])
+
+        if pos not in self.rawKinetics:
+            return missing
+
+        sample = self.rawKinetics[pos]["rawData"]
+        L = len(sample)
+        if L == 0:
+            # Nothing to resample from.
+            return missing
+
+        res = np.zeros(3)
+        X = np.zeros(nSamples + 1)
+        res[0] = self.estimateSingleFraction(mu1, sample, mu0, L)
+        # The estimate on the original sample is stored as the last entry.
+        X[nSamples] = res[0]
+
+        for i in range(nSamples):
+            # scipy's randint excludes the upper bound: (0, L) draws indices
+            # 0..L-1. The former (0, L - 1) never resampled the last
+            # observation and was an invalid distribution for L == 1.
+            bootstrappedSamples = sample[s.randint.rvs(0, L, size=L)]
+            X[i] = self.estimateSingleFraction(mu1, bootstrappedSamples, mu0, L)
+
+        q1, q2 = self.bcaQuantile(res[0], X, sample, mu0, mu1, (nSamples + 1), L)
+        res[1] = np.percentile(X, q1)
+        res[2] = np.percentile(X, q2)
+        return res
+
+ # Returns [estimate, 95% CI lower bnd, 95% CI upper bound] using a weighted sum
+ # The hope is that this would work better for a multi-site signature, such as m5C_TET
+ def estimateMethylatedFractions(self, pos, meanVector, modMeanVector, maskPos):
+ """Estimate the modified fraction at pos, optionally combining several offsets.
+
+ meanVector / modMeanVector: expected values without / with the modification;
+ both are indexed at self.post + offset.
+ maskPos: offsets relative to pos. When it is empty a single bootstrap() at
+ pos is returned. Otherwise bootstrap() is run at pos + offset for every
+ offset whose modified mean exceeds the unmodified one, and the three
+ results are combined with weights proportional to estimate * (mu1 - mu0).
+ If the weights sum to no more than 1e-3 the result stays [0, 0, 0].
+ """
+
+ maskPos = np.array(maskPos)
+ L = len(maskPos)
+ if L == 0:
+ res = self.bootstrap(pos, meanVector[self.post], modMeanVector[self.post])
+ else:
+ est = np.zeros(L)
+ low = np.zeros(L)
+ upp = np.zeros(L)
+ res = np.zeros(3)
+ wts = np.zeros(L)
+
+ # for offset in maskPos:
+ for count in range(L):
+ offset = maskPos[count]
+ mu0 = meanVector[self.post + offset]
+ mu1 = modMeanVector[self.post + offset]
+ # Offsets where the modification does not raise the mean keep weight 0
+ if mu1 > mu0:
+ k = self.bootstrap((pos + offset), mu0, mu1)
+ wts[count] = k[0] * (mu1 - mu0)
+ est[count] = k[0]
+ low[count] = k[1]
+ upp[count] = k[2]
+
+ # Normalize the weights and form the weighted estimate and bounds
+ if sum(wts) > 1e-3:
+ wts = wts / sum(wts)
+ res[0] = np.multiply(est, wts).sum()
+ res[1] = np.multiply(low, wts).sum()
+ res[2] = np.multiply(upp, wts).sum()
+
+ # print str(res)
+ return res
+
+
+    # Return the optimal mixing proportion in the detection case: estimate both p and mu1
+    def optimalMixProportion(self, data, mu0, L):
+        """Estimate the mixing proportion when mu1 is unknown.
+
+        mu1 is chosen by fminbound over [mu0, 10 * mu0]. The objective is
+        estimateSingleFraction with optProp=False, i.e. the likelihood
+        function value at the best proportion for that mu1, not the
+        proportion itself. The proportion for the chosen mu1 is then returned.
+        """
+        objectiveArgs = (data, mu0, L, False)
+        bestMu1 = fminbound(self.estimateSingleFraction, mu0, 10.0 * mu0, args=objectiveArgs, xtol=1e-01)
+        return self.estimateSingleFraction(bestMu1, data, mu0, L)
+
+    # Bootstraps mix prop estimates to return estimate and simple bounds for 95% confidence interval
+    def detectionMixModelBootstrap(self, modelPrediction, data, nSamples=100):
+        """Bootstrap the detection-mode mixing proportion.
+
+        modelPrediction: used as mu0 for optimalMixProportion().
+        data: the observations; resampled with replacement nSamples times.
+
+        Returns a length-4 array [estimate, 2.5th percentile, 97.5th
+        percentile, 1.0]; the last entry is a constant placeholder weight.
+        """
+        # Case-resampled bootstrapped estimates:
+        L = len(data)
+        res = np.zeros(4)
+
+        res[0] = self.optimalMixProportion(data, modelPrediction, L)
+        X = np.zeros(nSamples + 1)
+        # The estimate on the original data is stored as the last entry.
+        X[nSamples] = res[0]
+        for i in range(nSamples):
+            # scipy's randint excludes the upper bound: (0, L) draws indices
+            # 0..L-1. The former (0, L - 1) never resampled the last
+            # observation and was an invalid distribution for L == 1.
+            resampledData = [data[j] for j in s.randint.rvs(0, L, size=L)]
+            X[i] = self.optimalMixProportion(resampledData, modelPrediction, L)
+
+        # A very basic way to estimate the 95% confidence interval:
+        res[1] = np.percentile(X, 2.5)
+        res[2] = np.percentile(X, 97.5)
+
+        # Estimate a weight:
+        # weight = np.maximum( (x[1] - modelPrediction), 0 )
+        res[3] = 1.0
+        return res
+
+    # Everything below here is unused for now:
+    # Return second derivative of mixture model log likelihood function - unused for now
+    def mixModelFnPrime2(self, p, a0, a1):
+        """Sum of (a1 - a0)**2 / m**2, with m = (1 - p) * a0 + p * a1.
+
+        Elements where m**2 is zero are left out of the sum.
+        """
+        mixSquared = np.square((1 - p) * a0 + p * a1)
+        keep = np.nonzero(mixSquared)
+        diffSquared = np.square(a1 - a0)
+        return np.divide(diffSquared[keep], mixSquared[keep]).sum()
+
+    # Return third derivative of mixture model log likelihood function - unused for now
+    def mixModelFnPrime3(self, p, a0, a1):
+        """Negated sum of (a1 - a0)**3 / m**3, with m = (1 - p) * a0 + p * a1.
+
+        Elements where m**3 is zero are left out of the sum.
+        """
+        mixCubed = np.power((1 - p) * a0 + p * a1, 3)
+        keep = np.nonzero(mixCubed)
+        diffCubed = np.power(a1 - a0, 3)
+        return -np.divide(diffCubed[keep], mixCubed[keep]).sum()
+
+    # Try removing very large values before case resampling for bootstrap estimation - unused for now
+    def processSample(self, sample):
+        """Return, as a list, the values of sample inside the boxplot fences.
+
+        The fences are the 25th / 75th percentiles moved outwards by 1.5 times
+        the inter-quartile range; values on a fence are excluded.
+
+        The previous implementation used filter() with a predicate that
+        returned the value itself, so an in-range value of 0 was dropped
+        because it is falsy. The comparison is now used directly.
+        """
+        q1 = np.percentile(sample, 25)
+        q3 = np.percentile(sample, 75)
+        whisker = 1.5 * (q3 - q1)
+        lowerFence = q1 - whisker
+        upperFence = q3 + whisker
+
+        return [x for x in sample if lowerFence < x < upperFence]
+
+    # Return derivative of mixture model log likelihood function -- unused for now
+    def mixModelFnPrime(self, p, a0, a1):
+        """Derivative with respect to p of mixModelFn(p, a0, a1).
+
+        Computed as -sum((a1 - a0) / m) with m = (1 - p) * a0 + p * a1;
+        elements where m is zero are left out of the sum.
+        """
+        mixture = (1 - p) * a0 + p * a1
+        keep = np.nonzero(mixture)
+        diff = a1 - a0
+        return -np.divide(diff[keep], mixture[keep]).sum()
+
+    # bisection minimization of convex, single-variable function - unused for now
+    # much slower than fminbound
+    def homeMadeMinimization(self, a0, a1, low, up, xtol=1e-02, maxIters=500):
+        """Minimize mixModelFn over [low, up] by bisecting on its derivative.
+
+        The bracket is halved until it is no wider than xtol or maxIters
+        iterations have run; the last midpoint evaluated is returned.
+
+        Fixes: the midpoint used to be computed as (up - low) / 2 (half the
+        bracket width) instead of (up + low) / 2, and p0 was unbound when the
+        bracket was already within xtol.
+        """
+        # Defined up front so a bracket already within xtol still has a result.
+        p0 = (low + up) / 2.0
+        nIters = 0
+        while (up - low) > xtol and nIters < maxIters:
+            p0 = (low + up) / 2.0
+            # Non-positive slope: the minimum is at or to the right of p0.
+            if self.mixModelFnPrime(p0, a0, a1) <= 0:
+                low = p0
+            else:
+                up = p0
+            nIters += 1
+        return p0
diff --git a/kineticsTools/ModificationDecode.py b/kineticsTools/ModificationDecode.py
new file mode 100755
index 0000000..36ba3a5
--- /dev/null
+++ b/kineticsTools/ModificationDecode.py
@@ -0,0 +1,368 @@
+#################################################################################
+# Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#################################################################################
+
+from math import sqrt
+import math
+import scipy.stats as s
+import array as a
+
+from scipy.optimize import fminbound
+from scipy.special import gammaln as gamln
+from numpy import log, pi, log10, e, log1p, exp
+import numpy as np
+
+from MultiSiteCommon import MultiSiteCommon, canonicalBaseMap, modNames, ModificationPeakMask, FRAC, FRAClow, FRACup, log10e
+from MixtureEstimationMethods import MixtureEstimationMethods
+
+
+class ModificationDecode(MultiSiteCommon):
+
+    def __init__(self, gbmModel, sequence, rawKinetics, callBounds, methylMinCov, modsToCall=['H', 'J', 'K'], methylFractionFlag=False, useLDAFlag=False):
+        """Initialize the base class and record the decoding options.
+
+        callBounds: its first two items are the start (inclusive) and end
+        (exclusive) of the region in which modifications may be called.
+        methylMinCov, modsToCall and methylFractionFlag are stored under the
+        same names; useLDAFlag is stored as self.useLDA.
+        """
+        MultiSiteCommon.__init__(self, gbmModel, sequence, rawKinetics)
+
+        # Extents in which we will attempt to call a modification
+        self.callStart, self.callEnd = callBounds[0], callBounds[1]
+        self.callRange = xrange(self.callStart, self.callEnd)
+
+        self.modsToCall = modsToCall
+        self.methylMinCov = methylMinCov
+        self.methylFractionFlag = methylFractionFlag
+        self.useLDA = useLDAFlag
+
+    def decode(self):
+        """Use this method to do the full modification finding protocol"""
+
+        # Preparation steps, run strictly in this order:
+        #   1. find potential modification sites
+        #   2. compute the mean ipds under all possible composite hypotheses
+        #   3. fill out the forward matrix
+        for step in (self.findAlternates, self.computeContextMeans, self.fwdRecursion):
+            step()
+
+        # Trace back the fwd matrix to get the modification calls, then
+        # compute a confidence for each one and return the results
+        return self.scoreMods(self.traceback())
+
+ def findAlternates(self):
+ """ Use rules about where IPD peaks appear to generate, in self.alternateBases,
+ the set of possible modified bases that will be tested during decoding.
+ Peaks scoring below scoreThresholdLow are ignored; on-target 'H' and 'J'
+ candidates additionally require a score above scoreThresholdHigh."""
+
+ scoreThresholdLow = 16
+ scoreThresholdHigh = 19
+ seq = self.sequence
+
+ for (pos, peak) in self.rawKinetics.items():
+ score = peak['score']
+
+ # if self.useLDA:
+ # Try using LDA model to identify putative Ca5C, regardless of scores
+ # if peak.has_key('Ca5C'):
+ # if peak['Ca5C'] < 0:
+ # self.alternateBases[pos].add('K')
+
+ # Exclude points with low score
+ if score < scoreThresholdLow:
+ continue
+
+ # note -- don't use the tpl in the actual array - use the dict key
+ # we have reversed the indexing to deal with the reverse strand
+
+ if self.callStart <= pos < self.callEnd:
+ c = seq[pos]
+
+ # On-target A peak
+ if 'H' in self.modsToCall and c == 'A' and score > scoreThresholdHigh:
+ self.alternateBases[pos].add('H')
+
+ # On-target C peak
+ if 'J' in self.modsToCall and c == 'C' and score > scoreThresholdHigh:
+ self.alternateBases[pos].add('J')
+
+ if 'K' in self.modsToCall:
+ if c == 'C':
+ self.alternateBases[pos].add('K')
+
+ # A peak can also come from a modified C nearby -- 5caC.
+ # The C positions tested are pos-2, pos+1, pos+2, pos+5 and pos+6.
+ # NOTE(review): each test indexes seq before checking the call bounds,
+ # so seq[pos + k] is evaluated even when pos + k >= self.callEnd, and
+ # seq[pos - 2] wraps around for pos < 2 - confirm that the sequence is
+ # always long enough, or swap the operands of 'and'.
+
+ if seq[pos - 2] == 'C' and pos - 2 >= self.callStart:
+ self.alternateBases[pos - 2].add('K')
+
+ if seq[pos + 1] == 'C' and pos + 1 < self.callEnd:
+ self.alternateBases[pos + 1].add('K')
+
+ if seq[pos + 2] == 'C' and pos + 2 < self.callEnd:
+ self.alternateBases[pos + 2].add('K')
+
+ if seq[pos + 5] == 'C' and pos + 5 < self.callEnd:
+ self.alternateBases[pos + 5].add('K')
+
+ if seq[pos + 6] == 'C' and pos + 6 < self.callEnd:
+ self.alternateBases[pos + 6].add('K')
+
+ def fwdRecursion(self):
+ """Fill the forward (Viterbi-style) matrices over [self.lStart, self.lEnd).
+
+ For every position and candidate context the per-position score, the best
+ cumulative score and the predecessor context achieving it are computed.
+ The results are stored as self.scores, self.fwdScore and self.fwdPrevState,
+ each a dict keyed by position and then by context.
+ """
+ start = self.lStart
+ end = self.lEnd
+
+ # Likelihood of each configuration at each position
+ scores = dict()
+
+ # fwd score matrix and fwd lookback matrix
+ fwdScore = dict()
+ fwdPrevState = dict()
+
+ # Fill out first column of score & fwd matrix
+ scores[start] = dict((cfg, self.scorePosition(start, cfg)) for cfg in self.getConfigs(start))
+
+ # First column of fwd matrix is the same as the score matrix, with 'None' in the index matrix
+ fwdScore[start] = scores[start]
+ fwdPrevState[start] = dict((x, None) for x in scores[start].keys())
+
+ for centerPos in xrange(start + 1, end):
+
+ # Score and fwd column for current position
+ scoreCol = dict()
+ fwdScoreCol = dict()
+ fwdPrevStateCol = dict()
+
+ # Loop over current state options
+ for cfg in self.getConfigs(centerPos):
+ score = self.scorePosition(centerPos, cfg)
+ scoreCol[cfg] = score
+
+ # Sentinels: stay unchanged if no previous state is compatible with cfg
+ bestPrevState = None
+ bestScore = -1e20
+
+ # Loop over previous state options; only states that overlap cfg
+ # (see compareStates) are allowed as predecessors
+ for (prevCfg, prevScore) in fwdScore[centerPos - 1].items():
+ if self.compareStates(cfg, prevCfg) and prevScore + score > bestScore:
+ bestScore = prevScore + score
+ bestPrevState = prevCfg
+
+ fwdScoreCol[cfg] = bestScore
+ fwdPrevStateCol[cfg] = bestPrevState
+
+ scores[centerPos] = scoreCol
+ fwdScore[centerPos] = fwdScoreCol
+ fwdPrevState[centerPos] = fwdPrevStateCol
+
+ # Publish the matrices for traceback() and the region-scoring helpers
+ self.fwdScore = fwdScore
+ self.fwdPrevState = fwdPrevState
+ self.scores = scores
+
+ def traceback(self):
+ """
+ Trace back the fwd matrix to get the best scoring configuration of modifications.
+
+ Starts from the best-scoring context in the last column (position self.lEnd - 1)
+ and follows self.fwdPrevState backwards until a None predecessor is reached.
+ Returns a dict mapping position -> called base code for every position whose
+ center base in the chosen context differs from self.sequence.
+ """
+ # 'start' is only referenced by the commented-out LDA code further down
+ start = self.lStart
+ end = self.lEnd
+
+ modCalls = dict()
+
+ # The base at the center (cognate) position of a context string
+ def cogBase(cfg):
+ return cfg[self.pre]
+
+ pos = end - 1
+ currentCol = self.fwdScore[end - 1]
+ bestConfig = max(currentCol, key=lambda x: currentCol[x])
+
+ while True:
+ if cogBase(bestConfig) != self.sequence[pos]:
+ # Found a modification - save it!
+ modCalls[pos] = cogBase(bestConfig)
+
+ # Step to the predecessor context chosen by fwdRecursion()
+ bestConfig = self.fwdPrevState[pos][bestConfig]
+ pos -= 1
+
+ # None marks the first column (or a context that had no compatible predecessor)
+ if bestConfig is None:
+ break
+
+ # if self.useLDA:
+ # allow LDA-predicted sites through to GFF file
+ # for pos in range(start, end):
+ # if self.rawKinetics.has_key(pos):
+ # if self.rawKinetics[pos].has_key('Ca5C'):
+ # if 'K' in self.modsToCall:
+ # cutoff = min( 0, self.rawKinetics[pos]['coverage']/20.0 - 3.0 )
+ # cutoff = 0
+ # echoSites = [modCalls[pos + i] for i in [-6, -5, -2, -1, 2] if modCalls.has_key(pos + i)]
+ # else:
+ # cutoff = -2.25
+ # echoSites = [modCalls[pos + i] for i in range(-10,11) if modCalls.has_key(pos + i)]
+ # if self.rawKinetics[pos]['Ca5C'] < cutoff:
+ # so long as those sites are not in the vicinity of a m6A/m4C call
+ # if 'H' not in echoSites and 'J' not in echoSites:
+ # modCalls[pos] = 'K'
+ # else:
+ # remove any non-LDA-predicted sites from modCalls dictionary?
+ # if modCalls.has_key(pos):
+ # del modCalls[pos]
+ # correct adjacent calls:
+ # if self.useLDA:
+ # for pos in range(start + 2, end - 2, 2 ):
+ # x = [pos + i for i in range(-2,3) if modCalls.has_key( pos + i) and self.rawKinetics.has_key( pos + i) ]
+ # y = [modCalls[j] for j in x]
+ # if y.count('K') > 1:
+ # tmp = [self.rawKinetics[j]['Ca5C'] for j in x if self.rawKinetics[j].has_key('Ca5C') ]
+ # if len(tmp) > 0:
+ # lowest = min(tmp)
+ # for j in x:
+ # if self.rawKinetics[j].has_key('Ca5C'):
+ # if self.rawKinetics[j]['Ca5C'] > lowest:
+ # del modCalls[j]
+ #
+ # if adjacent m5C calls are made by the LDA, select the one that has the lower LDA score (Ca5C)
+ # for pos in range(start, end):
+ # if modCalls.has_key(pos) and self.rawKinetics.has_key(pos) and modCalls.has_key(pos+1) and self.rawKinetics.has_key(pos+1):
+ # if self.rawKinetics[pos].has_key('Ca5C') and self.rawKinetics[pos+1].has_key('Ca5C'):
+ # if self.rawKinetics[pos]['Ca5C'] < self.rawKinetics[pos+1]['Ca5C']:
+ # del modCalls[pos+1]
+ # else:
+ # del modCalls[pos]
+ return modCalls
+
+ def scoreMods(self, modCalls):
+ """
+ For each modification in the best scoring configuration, score a config excluding the current mod against the winning config
+ use this value as the Qmod for the deleted modification.
+
+ modCalls: dict position -> modified base code, as returned by traceback().
+ Returns a dict position -> {'modification', 'QMod', 'LLR', 'Mask'}; the
+ FRAC / FRAClow / FRACup entries are added when methylFractionFlag is set,
+ the position has kinetics and its coverage exceeds methylMinCov.
+ """
+
+ qvModCalls = dict()
+
+ # Mutable character array copy of the sequence (Python 2 array type 'c')
+ modSeq = a.array('c')
+ modSeq.fromstring(self.sequence)
+
+ # Apply the found modifications to the raw sequence
+ for (pos, call) in modCalls.items():
+ modSeq[pos] = call
+
+ for (pos, call) in modCalls.items():
+
+ # Score the modified template at all positions affected by this mod
+ modScore = self.scoreRegion(pos - self.post, pos + self.pre, modSeq)
+ modScores = self.getRegionScores(pos - self.post, pos + self.pre, modSeq)
+
+ # Expected means with the modification in place (only needed for fraction estimation)
+ if self.methylFractionFlag and self.rawKinetics.has_key(pos):
+ if self.rawKinetics[pos]["coverage"] > self.methylMinCov:
+ modifiedMeanVectors = self.getContextMeans(pos - self.post, pos + self.pre, modSeq)
+
+ # Switch back to the unmodified base and re-score
+ modSeq[pos] = canonicalBaseMap[call]
+ noModScore = self.scoreRegion(pos - self.post, pos + self.pre, modSeq)
+ noModScores = self.getRegionScores(pos - self.post, pos + self.pre, modSeq)
+
+ # Expected means with this one modification removed
+ if self.methylFractionFlag and self.rawKinetics.has_key(pos):
+ if self.rawKinetics[pos]["coverage"] > self.methylMinCov:
+ unModifiedMeanVectors = self.getContextMeans(pos - self.post, pos + self.pre, modSeq)
+
+ # Put back the modified base
+ modSeq[pos] = call
+
+ # Compute score difference
+ llr = modScore - noModScore
+
+ # Convert from LLR to phred-scaled probability of modification:
+ # 10 * log10(exp(llr) + 1), written as 10 * log10(e) * (llr + log1p(exp(-llr)))
+ qModScore = 10 * llr * log10e + 10 * log1p(exp(-llr)) * log10e
+
+ # Figure out which secondary peaks were likely generated by this modification
+ # What is the posterior that the peak was generated by this mod?
+ maskPos = self.findMaskPositions(pos, modScores, noModScores)
+
+ # FIXME: Without this, currently, the identificationQv score is too low for many Ca5C sites
+ # if self.useLDA:
+ # if self.rawKinetics.has_key(pos):
+ # if self.rawKinetics[pos].has_key('Ca5C'):
+ # llr = -self.rawKinetics[pos]['Ca5C']
+ # qModScore = 100 * llr * log10e + 100*log1p(exp(-llr))*log10e
+ if self.methylFractionFlag and self.rawKinetics.has_key(pos):
+
+ if self.rawKinetics[pos]["coverage"] > self.methylMinCov:
+
+ # Instantiate mixture estimation methods:
+ mixture = MixtureEstimationMethods(self.gbmModel.post, self.gbmModel.pre, self.rawKinetics, self.methylMinCov)
+
+ # Use modifiedMeanVectors and unmodifiedMeanVectors to calculate mixing proportion, and 95% CI limits.
+ # The offsets combined are the fixed ModificationPeakMask entry for this modification type.
+ methylFracEst, methylFracLow, methylFracUpp = mixture.estimateMethylatedFractions(pos, unModifiedMeanVectors, modifiedMeanVectors, ModificationPeakMask[modNames[call]])
+
+ qvModCalls[pos] = {'modification': modNames[call], 'QMod': qModScore, 'LLR': llr, 'Mask': maskPos,
+ FRAC: methylFracEst, FRAClow: methylFracLow, FRACup: methylFracUpp}
+
+ else:
+ # Coverage too low for a fraction estimate: store the call without fraction fields
+ qvModCalls[pos] = {'modification': modNames[call], 'QMod': qModScore, 'LLR': llr, 'Mask': maskPos}
+
+ else:
+ # Store the full results
+ qvModCalls[pos] = {'modification': modNames[call], 'QMod': qModScore, 'LLR': llr, 'Mask': maskPos}
+
+ return qvModCalls
+
+    def scoreRegion(self, start, end, sequence):
+        """Sum the stored scores over positions start..end (inclusive).
+
+        For each position the context is sliced out of ``sequence`` and looked
+        up in self.scores[pos]; positions absent from self.scores add nothing.
+        """
+        total = 0.0
+        for pos in xrange(start, end + 1):
+            window = sequence[(pos - self.pre):(pos + self.post + 1)]
+            ctx = window.tostring()
+            if not self.scores.has_key(pos):
+                continue
+            total += self.scores[pos][ctx]
+
+        return total
+
+    def getRegionScores(self, start, end, sequence):
+        """Return the stored scores of positions start..end (inclusive) as an array.
+
+        Entry i belongs to position start + i; positions absent from
+        self.scores keep the initial value 0.
+        """
+        regionScores = np.zeros(end - start + 1)
+
+        for idx, pos in enumerate(xrange(start, end + 1)):
+            window = sequence[(pos - self.pre):(pos + self.post + 1)]
+            ctx = window.tostring()
+            if not self.scores.has_key(pos):
+                continue
+            regionScores[idx] = self.scores[pos][ctx]
+
+        return regionScores
+
+    def findMaskPositions(self, pos, modScores, noModScores):
+        """Return the offsets (relative to pos) of neighboring peaks explained by the mod.
+
+        A position in [pos - post, pos + pre] is added to the mask if
+          a) it has a single-site qv > 20, and
+          b) its score under the modified hypothesis exceeds the score under
+             the unmodified hypothesis by more than 1.0.
+        """
+        first = pos - self.post
+        last = pos + self.pre
+
+        maskPos = []
+        for idx, site in enumerate(xrange(first, last + 1)):
+            if not self.rawKinetics.has_key(site):
+                continue
+            if not self.rawKinetics[site]["score"] > 20:
+                continue
+            if modScores[idx] - noModScores[idx] > 1.0:
+                maskPos.append(site - pos)
+
+        return maskPos
+
+    def compareStates(self, current, prev):
+        """True when prev, shifted left by one, overlaps current (all but its last item)."""
+        return prev[1:] == current[:-1]
diff --git a/kineticsTools/MultiSiteCommon.py b/kineticsTools/MultiSiteCommon.py
new file mode 100755
index 0000000..3dca876
--- /dev/null
+++ b/kineticsTools/MultiSiteCommon.py
@@ -0,0 +1,176 @@
+#################################################################################
+# Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#################################################################################
+
+from math import sqrt
+import math
+import scipy.stats as s
+import array as a
+
+from scipy.optimize import fminbound
+from scipy.special import gammaln as gamln
+from numpy import log, pi, log10, e, log1p, exp
+import numpy as np
+
+
+# log10(e): multiplying a natural logarithm by this converts it to base 10.
+log10e = log10(e)
+# Base code -> canonical base; H maps to A, and I, J, K map to C.
+canonicalBaseMap = {'A': 'A', 'C': 'C', 'G': 'G', 'T': 'T', 'H': 'A', 'I': 'C', 'J': 'C', 'K': 'C'}
+# Modified base code -> modification name (I and K both map to m5C).
+modNames = {'H': 'm6A', 'I': 'm5C', 'J': 'm4C', 'K': 'm5C'}
+# Modification name -> list of integer offsets; ModificationDecode.scoreMods passes
+# the entry for a call to estimateMethylatedFractions as its maskPos argument.
+ModificationPeakMask = {'m6A': [0, -5], 'm4C': [0, -5], 'm5C': [2, 0, -1, -2, -4, -5, -6]}
+
+# Labels (dictionary keys) for modified fraction and its lower / upper bounds:
+
+FRAC = 'frac'
+FRAClow = 'fracLow'
+FRACup = 'fracUp'
+
+# Standard-normal quantiles at 2.5% and 97.5%, computed once at import time
+
+k1 = s.norm.ppf(0.025)
+k2 = s.norm.ppf(0.975)
+
+
+class MultiSiteCommon(object):
+
+    def __init__(self, gbmModel, sequence, rawKinetics):
+        """
+        All indexes are 0-based into the sequence.
+
+        Find a set of sites that _might_ have a modification - each modification type will include a list of
+        'neighbor peaks' that can add the current site to the 'options' list.
+        6mA and 4mC will use only the on-target peak
+        5caC will use on target, -2 and -6.
+
+        Only hits that make this list will be tested in the mod identification process
+
+        Use the viterbi algorithm to find the optimal modifications to include, by measuring the per-site likelihood
+        of the observed IPD, given the underlying sequence and methylation states.
+        """
+        self.gbmModel = gbmModel
+        self.sequence = sequence
+        self.rawKinetics = rawKinetics
+
+        # Every modified base code gets the same log-prior, log(0.05)
+        modPrior = math.log(0.05)
+        self.modPriors = dict((code, modPrior) for code in ('H', 'I', 'J', 'K'))
+
+        # These switch because we are changing viewpoints
+        self.pre, self.post = gbmModel.post, gbmModel.pre
+
+        # Extents that we will use for likelihoods
+        self.lStart = self.pre
+        self.lEnd = len(self.sequence) - self.post
+        self.likelihoodRange = xrange(self.lStart, self.lEnd)
+
+        # Each position starts out with its own base as the only candidate
+        self.alternateBases = dict((x, set(sequence[x])) for x in xrange(len(sequence)))
+
+    def _possibleConfigs(self, start, end):
+        """Enumerate every candidate context string for positions start..end (inclusive).
+
+        One character is taken from self.alternateBases at each position.
+        For a single position the stored set itself is returned; otherwise a
+        list of strings is built recursively.
+        """
+        heads = self.alternateBases[start]
+        if start == end:
+            return heads
+
+        tails = self._possibleConfigs(start + 1, end)
+        return [head + tail for tail in tails for head in heads]
+
+    def getConfigs(self, centerIdx):
+        """Candidate contexts for the window [centerIdx - pre, centerIdx + post]."""
+        return self._possibleConfigs(centerIdx - self.pre, centerIdx + self.post)
+
+    def computeContextMeans(self):
+        """Generate a hash of the mean ipd for all candidate contexts"""
+        # Collect the distinct contexts over the whole likelihood range
+        contexts = set()
+        for pos in self.likelihoodRange:
+            contexts.update(self.getConfigs(pos))
+
+        # One model call for all contexts, then pair each context with its prediction
+        allContexts = list(contexts)
+        predictions = self.gbmModel.getPredictions(allContexts)
+        self.contextMeanTable = dict(zip(allContexts, predictions))
+
+    # Log-t pdf - copied from scipy distributions.py line 3836
+    def _logpdf(self, x, df):
+        """Log-density of Student's t distribution with df degrees of freedom at x."""
+        r = df * 1.0
+        halfUp = (r + 1) / 2
+        gammaPart = gamln(halfUp) - gamln(r / 2)
+        return gammaPart - (0.5 * log(r * pi) + halfUp * log(1 + (x ** 2) / r))
+
+    def singleScore(self, position, context):
+        """Log-likelihood of the observed mean at position under the given context.
+
+        Returns 0 when there are no kinetics for the position. Otherwise a t
+        statistic is formed from the observed mean ('tMean') and the model mean
+        for the context, scaled by the combined model and observation ('tErr')
+        errors, and evaluated under a t distribution with
+        max(1, coverage - 1) degrees of freedom.
+        """
+        if not self.rawKinetics.has_key(position):
+            return 0
+
+        siteObs = self.rawKinetics[position]
+
+        # mu of model, error in model
+        um = self.contextMeanTable[context]
+        # FIXME -- unify this with the error model used in KineticWorker.py
+        # em = 0.06 * um + 0.12 * um**2.0
+        em = 0.01 + 0.03 * um + 0.06 * um ** (1.7)
+
+        # mu and error of the observation
+        uo = siteObs['tMean']
+        eo = siteObs['tErr']
+
+        t = -(uo - um) / sqrt(em ** 2 + eo ** 2)
+        df = max(1, siteObs['coverage'] - 1)
+
+        # logLikelihood = s.t.logpdf(t, df).item()
+        return self._logpdf(t, df).item()
+
+    def scorePosition(self, position, context):
+        """ Compute the likelihood of the observed IPDs at position, given the context"""
+
+        # Prior for the base at the center of the context:
+        # unmodified bases get a prior of 0, modified bases get a prior less than 0.
+        centerBase = context[self.pre]
+        prior = self.modPriors[centerBase] if self.modPriors.has_key(centerBase) else 0.0
+
+        # Positions where we don't have enough coverage are scored by the prior alone
+        if not self.rawKinetics.has_key(position):
+            return prior
+
+        return self.singleScore(position, context) + prior
+
+    # Return expected IPDs for a portion [start, end] of the sequence.
+    def getContextMeans(self, start, end, sequence):
+        """Return the list of expected means for positions start..end (inclusive).
+
+        Contexts already in self.contextMeanTable are read from it; any other
+        context is sent to self.gbmModel.getPredictions.
+        """
+        meanVector = []
+        for pos in xrange(start, end + 1):
+            ctx = sequence[(pos - self.pre):(pos + self.post + 1)].tostring()
+            if self.contextMeanTable.has_key(ctx):
+                mean = self.contextMeanTable[ctx]
+            else:
+                # NOTE(review): the result for the one-element list is appended as
+                # returned, not indexed - presumably a length-1 sequence rather than
+                # a scalar like the table entries; confirm callers expect this.
+                mean = self.gbmModel.getPredictions([ctx])
+            meanVector.append(mean)
+        return meanVector
diff --git a/kineticsTools/MultiSiteDetection.py b/kineticsTools/MultiSiteDetection.py
new file mode 100755
index 0000000..3c57536
--- /dev/null
+++ b/kineticsTools/MultiSiteDetection.py
@@ -0,0 +1,320 @@
+#################################################################################
+# Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#################################################################################
+
+from math import sqrt
+import math
+import scipy.stats as s
+import array as a
+import sys
+
+from numpy import log, pi, log10, e, log1p, exp
+import numpy as np
+import re
+
+ # Base-10 logarithm of e
+ log10e = log10(e)
+
+ # canonicalBaseMap sends each base letter to a canonical base: A, C, G and T map
+ # to themselves, the extra letters H, I, J and K map to A, C, C and C.
+ # modNames gives the modification name associated with each extra letter.
+ canonicalBaseMap = {'A': 'A', 'C': 'C', 'G': 'G', 'T': 'T', 'H': 'A', 'I': 'C', 'J': 'C', 'K': 'C'}
+ modNames = {'H': 'm6A', 'I': 'm5C', 'J': 'm4C', 'K': 'm5C'}
+
+ # Letter substituted into a sequence to mark a candidate m5C site
+ m5CCode = 'I'
+
+ # IUPAC nucleotide codes mapped to the concrete bases each one stands for
+ iupacMap = {
+     'A': 'A',
+     'C': 'C',
+     'G': 'G',
+     'T': 'T',
+     'K': 'GT',
+     'M': 'AC',
+     'R': 'AG',
+     'Y': 'CT',
+     'S': 'CG',
+     'W': 'AT',
+     'B': 'CGT',
+     'D': 'AGT',
+     'H': 'ACT',
+     'V': 'ACG',
+     'N': 'ACGT'
+ }
+
+
+ def findMotifPositions(seq, motifs):
+     """Return the sorted start positions in seq where any motif matches.
+
+     seq    -- sequence string to search
+     motifs -- iterable of motif strings written with IUPAC codes
+
+     Every occurrence is reported, including occurrences that overlap one
+     another (e.g. 'CC' in 'CCC' gives [0, 1]). A position matched by more
+     than one motif is reported once. Raises KeyError for a motif
+     character that is not a key of iupacMap.
+     """
+     positions = set()
+
+     for motif in motifs:
+         # One character class per motif letter, honouring degenerate bases
+         pattern = ''.join("[" + iupacMap[c] + "]" for c in motif)
+
+         # A zero-width lookahead makes the scan advance one base at a time,
+         # so overlapping occurrences are not skipped by finditer.
+         matcher = re.compile("(?=" + pattern + ")")
+         positions.update(m.start() for m in matcher.finditer(seq))
+
+     return sorted(positions)
+
+
+ class MultiSiteDetection(object):
+ """
+ Scores candidate modification sites found by motif search, combining the
+ observations at the positions surrounding each site.
+ """
+
+ # NOTE(review): motifs uses a mutable list as its default value; it is not
+ # mutated in the code of this class, but callers share the same list object.
+ def __init__(self, gbmModel, sequence, rawKinetics, callBounds, methylMinCov, motifs=['CG']):
+ """
+ Store the model, sequence and observations and derive the index ranges.
+
+ gbmModel -- object providing .pre, .post and getPredictions()
+ sequence -- reference sequence; indexed per position and searched for motifs
+ rawKinetics -- mapping from position to a record with 'tMean' and 'tErr' entries
+ callBounds -- pair (start, end); motif sites are kept when start <= pos < end
+ methylMinCov -- stored as given; not read elsewhere in this class
+ motifs -- motif strings passed to findMotifPositions
+ """
+
+ self.methylMinCov = methylMinCov
+ self.motifs = motifs
+
+ self.gbmModel = gbmModel
+ self.sequence = sequence
+
+ self.callStart = callBounds[0]
+ self.callEnd = callBounds[1]
+
+ # Extents that we will attempt to call a modification
+ self.callRange = xrange(self.callStart, self.callEnd)
+
+ # These switch because we are changing viewpoints
+ self.pre = gbmModel.post
+ self.post = gbmModel.pre
+
+ self.lStart = self.pre
+ self.lEnd = len(self.sequence) - self.post
+
+ # Extents that we will use for likelihoods
+ self.likelihoodRange = xrange(self.lStart, self.lEnd)
+
+ # position -> list of candidate letters; starts with just the reference base,
+ # findMotifs() appends the modified letter at motif sites
+ self.alternateBases = dict((x, list(sequence[x])) for x in xrange(len(sequence)))
+
+ self.rawKinetics = rawKinetics
+
+ def getConfigs(self, centerIdx):
+ ''' Enumerate all the contexts centered at centerIdx with one
+ modification added '''
+ start = centerIdx - self.pre
+ end = centerIdx + self.post
+ return self._possibleConfigs(start, end)
+
+ def _possibleConfigs(self, start, end):
+ ''' Enumerate all the contexts coming from the substring self.sequence[start,end] with one
+ modification added '''
+
+ # Recursion on the first position of the window; the base case is a
+ # single position, whose candidates are its alternate bases.
+ if start == end:
+ return self.alternateBases[start]
+ else:
+ r = []
+ allSuffixes = self._possibleConfigs(start + 1, end)
+
+ # The first suffix is always the one with no modifications
+ # Only add the alternate to that one -- that way we only
+ # get configurations with a single modification, not all combos
+
+ noModsSuffix = allSuffixes[0]
+ if len(allSuffixes) > 1:
+ restSuffixes = allSuffixes[1:]
+ else:
+ restSuffixes = []
+
+ # The noMods suffix get the alternates
+ for c in self.alternateBases[start]:
+ r.append(c + noModsSuffix)
+
+ # the other suffixes already have mods -- they just get the unmodified base
+ for suffix in restSuffixes:
+ r.append(self.alternateBases[start][0] + suffix)
+
+ return r
+
+ # Compute something for all the windows in [start, end]
+ # Returns one context string per position, cut from sequence as
+ # [pos - pre, pos + post]; slices of sequence must provide .tostring().
+ def getContexts(self, start, end, sequence):
+ contexts = []
+
+ for pos in xrange(start, end + 1):
+ ctx = sequence[(pos - self.pre):(pos + self.post + 1)].tostring()
+ contexts.append(ctx)
+
+ return contexts
+
+ def computeContextMeans(self):
+ """Generate a hash of the mean ipd for all candidate contexts"""
+
+ allContexts = []
+
+ # Requires self.motifPositions, which findMotifs() sets
+ for pos in self.motifPositions:
+ for offsetPos in xrange(pos - self.post, pos + self.pre + 1):
+ cfgs = self.getConfigs(offsetPos)
+ allContexts.extend(cfgs)
+
+ # One model call for all contexts; predictions are paired with contexts by order
+ predictions = self.gbmModel.getPredictions(allContexts)
+ self.contextMeanTable = dict(zip(allContexts, predictions))
+
+ def decode(self):
+ """Use this method to do the full modification finding protocol"""
+
+ # Find sites matching the desired motif
+ self.findMotifs()
+
+ # Compute all the required mean ipds under all possible composite hypotheses
+ self.computeContextMeans()
+
+ # Compute a confidence for each mod and return results
+ return self.scorePositions()
+
+ def findMotifs(self):
+ """ Mark all the positions matching the requested motif """
+
+ # Generate list of matching positions
+ allMotifPositions = findMotifPositions(self.sequence, self.motifs)
+ self.motifPositions = []
+
+ for pos in allMotifPositions:
+ # Only use bases that are inside the callBounds
+ if self.callStart <= pos < self.callEnd:
+ # 'I' is the letter used for the modified base at this site
+ self.alternateBases[pos].append('I')
+ self.motifPositions.append(pos)
+
+ def multiSiteDetection(self, positions, nullPred, modPred, centerPosition):
+ ''' kinetics, nullPred, and modifiedPred are parallel arrays
+ containing the observations and predictions surrounding a
+ single candidate motif site. Estimate the p-value of
+ modification and the modified fraction here.
+
+ centerPosition is a dict-like record; the results are written into
+ it in place and the same object is returned. '''
+
+ # Apply the error model to the predictions
+ nullErr = 0.01 + 0.03 * nullPred + 0.06 * nullPred ** (1.7)
+ # NOTE(review): modErr is computed but not used below -- confirm whether
+ # dPredSigma was meant to use it.
+ modErr = 0.01 + 0.03 * modPred + 0.06 * modPred ** (1.7)
+
+ obsMean = np.zeros(nullPred.shape)
+ obsErr = np.zeros(nullPred.shape)
+
+ # Get the observations into the same array format
+ for i in xrange(len(positions)):
+ position = positions[i]
+
+ if self.rawKinetics.has_key(position):
+ siteObs = self.rawKinetics[position]
+ obsMean[i] = siteObs['tMean']
+ obsErr[i] = siteObs['tErr']
+ else:
+ # Crank up the variance -- we don't have an observation at this
+ # position, so we should ignore it.
+ obsMean[i] = 0.0
+ obsErr[i] = 999999999
+
+ # Subtract off the background model from the observations and the modified prediction
+ dObs = obsMean - nullPred
+ # Error of observation and prediction are uncorrelated
+ obsSigma = obsErr ** 2 + nullErr ** 2
+ invObsSigma = 1.0 / obsSigma
+
+ # Error of null prediction and mod prediction are probably correlated -- need a better estimate of the error of the difference!!
+ dPred = modPred - nullPred
+ dPredSigma = (obsErr ** 2 + nullErr ** 2) / 2 # Just stubbing in a factor of 2 here...
+
+ # Inverse-variance weights along dPred, normalised so that
+ # (weights * dPred).sum() equals 1
+ weightsNumerator = invObsSigma * dPred
+ weights = weightsNumerator / (dPred * weightsNumerator).sum()
+
+ signalEstimate = (weights * dObs).sum()
+ varianceEstimate = (np.abs(weights) * obsSigma).sum()
+
+ maxSignal = (weights * dPred).sum()
+ maxSignalVariance = (np.abs(weights) * dPredSigma).sum()
+
+ # Now just run the standard erf on this Gaussian to quantify the probability that there is some signal
+ # What we want now:
+ #
+ # 1. p-value that dObs * dPred (dot product) is greater than 0.
+ # 2. Distribution of \alpha, where dObs = \alpha dPred, where \alpha \in [0,1], with appropriate error propagation
+ # 2a. Is it possible to summarize 2 with a Beta distribution?
+
+ # NOTE(review): the CDF argument divides by varianceEstimate, not by its
+ # square root -- confirm this is the intended statistic.
+ pvalue = s.norm._cdf(-signalEstimate / varianceEstimate)
+ # Clamp away from zero so that log10 below stays finite
+ pvalue = max(sys.float_info.min, pvalue)
+ score = -10.0 * log10(pvalue)
+
+ centerPosition['MSscore'] = score
+ centerPosition['MSpvalue'] = pvalue
+
+ centerPosition['signal'] = signalEstimate
+ centerPosition['variance'] = varianceEstimate
+
+ centerPosition['modelSignal'] = maxSignal
+ centerPosition['modelVariance'] = maxSignalVariance
+
+ centerPosition['Mask'] = []
+
+ return centerPosition
+
+ def scorePositions(self):
+ """
+ Score each motif site in the sequence.
+
+ Returns a dict keyed by position. Only motif positions that have an
+ entry in self.rawKinetics are scored; each value is that rawKinetics
+ record after multiSiteDetection has added its result fields to it.
+ """
+
+ qvModCalls = dict()
+
+ # Mutable copy of the sequence so that a single base can be swapped
+ dnaSeq = a.array('c')
+ dnaSeq.fromstring(self.sequence)
+
+ for pos in self.motifPositions:
+ if self.rawKinetics.has_key(pos):
+
+ # Fetch unmodified positions
+ nullPred = self.getRegionPredictions(pos - self.post, pos + self.pre, dnaSeq)
+
+ # Fetch modified positions and reset sequence
+ originalBase = dnaSeq[pos]
+ dnaSeq[pos] = m5CCode
+ modifiedPred = self.getRegionPredictions(pos - self.post, pos + self.pre, dnaSeq)
+ dnaSeq[pos] = originalBase
+
+ # Position that contribute to this call
+ positions = xrange(pos - self.post, pos + self.pre + 1)
+
+ # Run the multi-site detection and save the results
+ centerStats = self.rawKinetics[pos]
+ centerStats = self.multiSiteDetection(positions, nullPred, modifiedPred, centerStats)
+
+ qvModCalls[pos] = centerStats
+
+ return qvModCalls
+
+ # Returns an array with the table mean for every position in [start, end].
+ # A context missing from self.contextMeanTable raises KeyError.
+ def getRegionPredictions(self, start, end, sequence):
+ predictions = np.zeros(end - start + 1)
+
+ for pos in xrange(start, end + 1):
+ ctx = sequence[(pos - self.pre):(pos + self.post + 1)].tostring()
+ predictions[pos - start] = self.contextMeanTable[ctx]
+
+ return predictions
diff --git a/kineticsTools/PositiveControlEnricher.py b/kineticsTools/PositiveControlEnricher.py
new file mode 100755
index 0000000..6fa6bd9
--- /dev/null
+++ b/kineticsTools/PositiveControlEnricher.py
@@ -0,0 +1,147 @@
+# Positive Control Enricher class
+
+from math import sqrt
+import math
+import scipy.stats as s
+import array as a
+
+from scipy.optimize import fminbound
+from scipy.special import gammaln as gamln
+from numpy import *
+from numpy import log, pi, log10, e, log1p, exp
+import numpy as np
+
+from MultiSiteCommon import MultiSiteCommon
+from MixtureEstimationMethods import MixtureEstimationMethods
+
+
+ class PositiveControlEnricher(MultiSiteCommon):
+ """
+ Adds a logistic-regression score ("Ca5C") to the kinetics records of
+ cytosine positions, using separate weight vectors for each strand.
+ """
+
+ def __init__(self, gbmModel, sequence, rawKinetics):
+
+ MultiSiteCommon.__init__(self, gbmModel, sequence, rawKinetics)
+ # NOTE(review): the weight files are read from hard-coded absolute paths
+ # in a user's home directory; construction fails where they are absent.
+ self.fwd_model = np.genfromtxt("/home/UNIXHOME/obanerjee/initial_lr_model_weights_fwd.csv", delimiter=',')
+ self.rev_model = np.genfromtxt("/home/UNIXHOME/obanerjee/initial_lr_model_weights_rev.csv", delimiter=',')
+ self.fwd_model = np.squeeze(np.asarray(self.fwd_model))
+ self.rev_model = np.squeeze(np.asarray(self.rev_model))
+
+ # Encode a base letter as an integer: A=1, C=2, G=3, anything else=4
+ def fn(self, l):
+ if l == "A":
+ return 1
+ if l == "C":
+ return 2
+ if l == "G":
+ return 3
+ return 4
+
+ # Denominator of the t-statistic: model error (from mu0) and tErr
+ # combined in quadrature
+ def tStatisticDenominator(self, mu0, tErr):
+
+ em = 0.01 + 0.03 * mu0 + 0.06 * mu0 ** 1.7
+ den = sqrt(em ** 2 + tErr ** 2)
+ return den
+
+ def applyLRmodel(self, kinetics, pos, unmodIPDs, modifIPDs, model, up, down, context):
+ """ Compute a logistic-regression score for the site at pos.
+
+ kinetics -- list of records indexed by position, with 'tMean', 'tErr', 'coverage'
+ unmodIPDs, modifIPDs -- predicted means over the window, indexed by offset + down
+ model -- weight vector; model[0] is the intercept, model[1:] the feature weights
+ up, down -- number of positions used after / before pos
+ context -- mutable sequence of base letters
+
+ Returns -z - log(1 + exp(-z)) where z is the linear predictor.
+ """
+
+ # One row per window position, seven features per row
+ res = np.zeros((up + down + 1, 7))
+ ind = 0
+
+ # range from -down to +up
+ for offset in range(-down, (up + 1)):
+ # Local name 'a' hides the module-level alias of the array module
+ # inside this method
+ a = pos + offset
+ tmp = np.squeeze(np.asarray([kinetics[a]["tMean"], kinetics[a]["tErr"], kinetics[a]["coverage"], unmodIPDs[offset + down], modifIPDs[offset + down]]))
+
+ # get t-statistics corresponding to mu0 and mu1:
+ den = self.tStatisticDenominator(tmp[3], tmp[1])
+ k1 = -(tmp[0] - tmp[3]) / den
+ k2 = -(tmp[0] - tmp[4]) / den
+ # The five raw values are log-transformed (offset by 0.01); the two
+ # t-statistics are appended untransformed
+ tmp = np.append(np.log(tmp + 0.01), k1)
+ tmp = np.append(tmp, k2)
+
+ res[ind, ] = tmp
+ ind += 1
+
+ # collected features for prediction:
+ apply = np.hstack(res.transpose())
+ apply = np.squeeze(np.asarray(apply))
+ apply[isnan(apply)] = 0
+
+ # include context:
+ context = context[(pos - 10):(pos + 11)]
+ # NOTE(review): the slice has 21 entries with pos at index 10, so this
+ # removes the entry for pos + 1 -- confirm which entry was meant.
+ del context[11]
+ context = np.array(map(self.fn, context))
+ apply = np.concatenate([apply, context])
+
+ # calculate logistic regression score:
+ z = sum(np.multiply(apply, model[1:])) + model[0]
+ score = -z - np.log(1 + np.exp(-z))
+
+ return score
+
+ # Score every 'C' record of one strand and store the result under "Ca5C".
+ # Returns the strand's records sorted by "tpl".
+ def callLRstrand(self, kinetics, strand, model, up, down):
+
+ tmp = [d for d in kinetics if d["strand"] == strand]
+ tmp.sort(key=lambda x: x["tpl"])
+
+ modSeq = a.array('c')
+ modSeq.fromstring(self.sequence)
+
+ L = len(tmp)
+
+ # NOTE(review): pos indexes both the sorted record list and modSeq, which
+ # presumes record i describes sequence position i -- confirm.
+ for pos in range(down, (L - up)):
+
+ if tmp[pos]["base"] == 'C':
+
+ # Get H1 means:
+ modSeq[pos] = 'I'
+ modifIPDs = self.getContextMeans(pos - 10, pos + 10, modSeq)
+
+ # Get H0 means:
+ modSeq[pos] = 'C'
+ unmodIPDs = self.getContextMeans(pos - 10, pos + 10, modSeq)
+
+ tmp[pos]["Ca5C"] = self.applyLRmodel(tmp, pos, unmodIPDs, modifIPDs, model, up, down, modSeq)
+
+ return tmp
+
+ # Entry point: scores strand 0 with the forward weights and strand 1 with
+ # the reverse weights, then returns all records sorted by "tpl".
+ def callEnricherFunction(self, kinetics, up=10, down=10):
+
+ # Compute all the required mean ipds under all possible composite hypotheses
+ self.computeContextMeans()
+
+ fwd = self.callLRstrand(kinetics, 0, self.fwd_model, up, down)
+ rev = self.callLRstrand(kinetics, 1, self.rev_model, up, down)
+ res = fwd + rev
+ res.sort(key=lambda x: x["tpl"])
+ return res
+
+ def scoreMods(self, modCalls):
+ """
+ For each modification in the best scoring configuration, score a config excluding the current mod against the winning config
+ use this value as the Qmod for the deleted modification
+ """
+ # NOTE(review): this method looks unfinished. The modCalls argument is
+ # overwritten below, and canonicalBaseMap, call, kinetics, up, down and
+ # basicDecision are not defined in this method or by the imports visible
+ # in this file. The applyLRmodel call also passes fewer arguments than
+ # that method's signature takes.
+
+ qvModCalls = dict()
+
+ modSeq = a.array('c')
+ modSeq.fromstring(self.sequence)
+
+ modCalls = [i for i in range(len(modSeq)) if modSeq[i] == 'C']
+
+ for pos in modCalls:
+
+ # now apply the modification at this position:
+ modSeq[pos] = 'K'
+ modifIPDs = self.getContextMeans(pos - self.post, pos + self.pre, modSeq)
+
+ # using canonical base map, try to get H0 means:
+ modSeq[pos] = canonicalBaseMap[call]
+ unmodIPDs = self.getContextMeans(pos - self.post, pos + self.pre, modSeq)
+
+ # try to collect related statistics: tMean, tErr, tStatistic
+ tmp = self.applyLRmodel(kinetics, pos, unmodIPDs, modifIPDs, modSeq, up, down)
+
+ # now try to use these vectors to make a basic decision:
+ basicDecision[pos] = {'score': tmp}
+
+ return basicDecision
diff --git a/kineticsTools/ReferenceUtils.py b/kineticsTools/ReferenceUtils.py
new file mode 100755
index 0000000..a48ca8d
--- /dev/null
+++ b/kineticsTools/ReferenceUtils.py
@@ -0,0 +1,154 @@
+#################################################################################
+# Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#################################################################################
+
+# FIXME all of this belongs somewhere else (probably either pbcore.io.dataset
+# or a future base module for resequencing apps)
+
+from collections import namedtuple
+import itertools
+import math
+import re
+import os
+
+from pbcore.io import AlignmentSet, ReferenceSet
+
+ # FIXME pbcore keys contigs by name, but kineticsTools usually keys by ID
+ # A region of one reference contig: carries both the contig's ID and its name,
+ # plus the start and end of the region.
+ ReferenceWindow = namedtuple("ReferenceWindow", ["refId", "refName", "start", "end"])
+
+
+ class ReferenceUtils():
+     """Static helpers for reference contigs and reference windows."""
+
+     @staticmethod
+     def loadReferenceContigs(referencePath, alignmentSet):
+         """
+         Load the reference contigs, and tag each one with the ref.cmpH5ID it
+         was assigned in the alignment file(s).  Return a list of contigs,
+         which are used to set up IpdModel.
+
+         Raises KeyError if the alignment set names a reference (FullName)
+         whose contig id is not present in the reference file.
+         """
+         # FIXME we should get rid of this entirely, but I think it requires
+         # fixing the inconsistency in how contigs are referenced here versus in
+         # pbcore.io
+
+         # Read contigs from FASTA file (or XML dataset)
+         refReader = ReferenceSet(referencePath)
+         contigs = [x for x in refReader]
+         contigDict = dict([(x.id, x) for x in contigs])
+
+         # initially each contig has an id of None -- this will be overwritten with the id from the cmp.h5, if there are any
+         # reads mapped to it.
+         for x in contigs:
+             x.cmph5ID = None
+
+         # Mark each contig with its ID from the cmp.h5 - contigs are matched
+         # by comparing the contig id with the reference's FullName
+         for x in alignmentSet.referenceInfoTable:
+             contigDict[x.FullName].cmph5ID = x.ID
+
+         return contigs
+
+     @staticmethod
+     def referenceWindowsFromAlignment(ds, refInfoLookup):
+         """Return one ReferenceWindow per entry of ds.refWindows.
+
+         Each entry is (name, start, end); refInfoLookup(name) supplies the ID.
+         """
+         return [ ReferenceWindow(refId=refInfoLookup(w[0]).ID,
+                                  refName=w[0],
+                                  start=w[1],
+                                  end=w[2]) for w in ds.refWindows ]
+
+     @staticmethod
+     def parseReferenceWindow(s, refInfoLookup):
+         """Parse "name:start-end" or a bare contig name into a ReferenceWindow.
+
+         Returns None when s is None.  A bare name selects the whole contig;
+         an explicit end is clipped to the contig length.  Raises ValueError
+         if start or end is not an integer.
+         """
+         if s is None:
+             return None
+         m = re.match("(.*):(.*)-(.*)", s)
+         if m:
+             refContigInfo = refInfoLookup(m.group(1))
+             refId = refContigInfo.ID
+             refName = refContigInfo.Name
+             refStart = int(m.group(2))
+             refEnd = min(int(m.group(3)), refContigInfo.Length)
+         else:
+             refContigInfo = refInfoLookup(s)
+             refId = refContigInfo.ID
+             refName = refContigInfo.Name
+             refStart = 0
+             refEnd = refContigInfo.Length
+         return ReferenceWindow(refId=refId, refName=refName, start=refStart,
+                                end=refEnd)
+
+     @staticmethod
+     def createReferenceWindows(refInfo):
+         """Return a ReferenceWindow spanning the full length of every record in refInfo."""
+         return [ ReferenceWindow(refId=r.ID,
+                                  refName=r.Name,
+                                  start=0,
+                                  end=r.Length) for r in refInfo ]
+
+
+     @staticmethod
+     def enumerateChunks(referenceStride, referenceWindow):
+         """
+         Enumerate all work chunks on this reference contig (restricted to
+         the windows, if provided).
+
+         Yields ReferenceWindow objects of at most referenceStride bases whose
+         boundaries fall on multiples of referenceStride where possible.  An
+         empty window yields nothing.
+         """
+         def intersection(int1, int2):
+             s1, e1 = int1
+             s2, e2 = int2
+             si, ei = max(s1, s2), min(e1, e2)
+             if si < ei:
+                 return (si, ei)
+             else:
+                 return None
+
+         def enumerateIntervals(bounds, stride):
+             """
+             Enumerate windows of size "stride", attempting to align window
+             boundaries on multiple of stride.
+             """
+             def alignDown(chunk, x):
+                 # Explicit floor division: same result for integers whether
+                 # or not true division is in effect
+                 return (x // chunk) * chunk
+             def alignUp(chunk, x):
+                 return int(math.ceil(float(x)/chunk)*chunk)
+
+             start, end = bounds
+             roundStart = alignDown(stride, start)
+             roundEnd = alignUp (stride, end)
+
+             for s in xrange(roundStart, roundEnd, stride):
+                 roundWin = (s, s + stride)
+                 overlap = intersection(bounds, roundWin)
+                 # An empty window (start == end) not aligned on the stride
+                 # has no overlap; skip it instead of yielding None, which
+                 # the caller cannot unpack.
+                 if overlap is not None:
+                     yield overlap
+
+         for (s, e) in enumerateIntervals((referenceWindow.start,
+                                           referenceWindow.end), referenceStride):
+             yield ReferenceWindow(refId=referenceWindow.refId,
+                                   refName=referenceWindow.refName,
+                                   start=s, end=e)
+
+     @staticmethod
+     def loadAlignmentChemistry(alignmentSet):
+         """Return the most frequent entry of alignmentSet.sequencingChemistry.
+
+         Raises ValueError if the alignment set reports no chemistry at all.
+         """
+         from collections import Counter
+
+         # Counter tallies every occurrence.  itertools.groupby only groups
+         # consecutive equal items, so on unsorted input a later run of a
+         # chemistry would overwrite the count of an earlier run.
+         chemCounts = Counter(alignmentSet.sequencingChemistry)
+         if not chemCounts:
+             raise ValueError("alignment set reports no sequencing chemistry")
+         majorityChem = chemCounts.most_common(1)[0][0]
+         return majorityChem
diff --git a/kineticsTools/ResultWriter.py b/kineticsTools/ResultWriter.py
new file mode 100755
index 0000000..d5e74e2
--- /dev/null
+++ b/kineticsTools/ResultWriter.py
@@ -0,0 +1,870 @@
+#################################################################################
+# Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#################################################################################
+
+import cProfile
+import logging
+import os.path
+import os
+from multiprocessing import Process
+import cPickle
+import h5py
+import numpy as np
+from pbcore.io import GffWriter, Gff3Record
+import sys
+from kineticsTools.pipelineTools import consumer
+import math
+
+
+ # Labels for modified fraction: the estimate and its lower and upper bounds.
+ # These strings serve both as keys of the per-site records and as the
+ # names of the corresponding csv columns and HDF5 datasets.
+ FRAC = 'frac'
+ FRAClow = 'fracLow'
+ FRACup = 'fracUp'
+
+
+ class ResultCollectorProcess(Process):
+
+     """
+     Gathers results and writes to a file.
+
+     Results arrive on a queue as (chunkId, datum) pairs in any order and are
+     handed to onResult() in increasing chunkId order, starting from 0.  Each
+     worker signals completion by putting None on the queue; the collector
+     stops after options.numWorkers such sentinels.
+     """
+
+     def __init__(self, options, resultsQueue):
+         """
+         options      -- object providing numWorkers and doProfiling
+         resultsQueue -- queue providing get() and task_done()
+         """
+         Process.__init__(self)
+         self.daemon = True
+         self.options = options
+         self._resultsQueue = resultsQueue
+
+     def _run(self):
+         """Drain the results queue, dispatching chunks to onResult() in order."""
+         logging.info("Process %s (PID=%d) started running", self.name, self.pid)
+
+         self.onStart()
+
+         nextChunkId = 0
+         chunkCache = {}
+
+         sentinelsReceived = 0
+         while sentinelsReceived < self.options.numWorkers:
+             result = self._resultsQueue.get()
+             self._resultsQueue.task_done()
+
+             if result is None:
+                 sentinelsReceived += 1
+             else:
+                 # Write out chunks in chunkId order.
+                 # Buffer received chunks until they can be written in order
+                 (chunkId, datum) = result
+                 chunkCache[chunkId] = datum
+
+                 # The rawData field is large and unused.  Delete it to mitigate
+                 # risk of OOM problems
+                 for column in datum:
+                     if 'rawData' in column:
+                         del column['rawData']
+
+                 # Write out all the chunks that we can
+                 while nextChunkId in chunkCache:
+                     nextChunk = chunkCache.pop(nextChunkId)
+                     self.onResult(nextChunk)
+
+                     nextChunkId += 1
+
+         logging.info("Result thread shutting down...")
+         self.onFinish()
+
+     def run(self):
+         """Process entry point; runs _run(), under cProfile when options.doProfiling is set."""
+
+         if self.options.doProfiling:
+             cProfile.runctx("self._run()",
+                             globals=globals(),
+                             locals=locals(),
+                             filename="profile-%s.out" % self.name)
+         else:
+             self._run()
+
+     # ==================================
+     # Overridable interface begins here.
+     #
+     def onStart(self):
+         """Called once before any result is read."""
+         pass
+
+     def onResult(self, result):
+         """Called with each chunk's datum, in chunkId order."""
+         pass
+
+     def onFinish(self):
+         """Called once after the last sentinel has been received."""
+         pass
+
+
+class KineticsWriter(ResultCollectorProcess):
+
+ def __init__(self, options, resultQueue, refInfo, ipdModel):
+     """Initialise the collector base class, then keep the reference info and IPD model."""
+     ResultCollectorProcess.__init__(self, options, resultQueue)
+
+     # refInfo is read by hdf5CsvConsumer to size its datasets
+     self.ipdModel = ipdModel
+     self.refInfo = refInfo
+
+ @consumer
+ def msCsvConsumer(self, filename):
+     """
+     Consume IPD summary rows and write them to csv
+
+     Only rows that carry a "signal" entry are written.  The file handle is
+     closed when the generator is closed or when writing fails; a failure is
+     logged and ends the consumer.
+     """
+
+     # Open the csv file
+     f = self.openWriteHandle(filename)
+     delim = ","
+
+     cols = ["refName", "tpl", "strand", "base", "score", "tMean", "tErr", "modelPrediction", "ipdRatio", "coverage", "signal", "variance", "MSscore"]
+
+     # Special cases for formatting columns of the csv; columns without a
+     # handler are written with str().  Only columns listed in cols need one.
+     threeF = lambda x: "%.3f" % x
+     fourF = lambda x: "%.4f" % x
+
+     handlers = {
+         "refName": lambda x: "\"%s\"" % x,
+         # tpl is written one higher than the stored value
+         "tpl": lambda x: str(x.item() + 1),
+         "score": lambda x: "%d" % x,
+         "tMean": threeF,
+         "tErr": threeF,
+         "modelPrediction": threeF,
+         "ipdRatio": threeF,
+         "signal": fourF,
+         "variance": fourF,
+         "MSscore": lambda x: "%d" % x,
+     }
+
+     def fmt(rowData, colName):
+         # A missing column becomes an empty csv field
+         if colName not in rowData:
+             return ""
+
+         if colName in handlers:
+             return handlers[colName](rowData[colName])
+         else:
+             return str(rowData[colName])
+
+     try:
+         f.write(delim.join(cols) + "\n")
+
+         while True:
+             # Pull a list of record in from the producer
+             itemList = (yield)
+
+             for item in itemList:
+                 if "signal" in item:
+                     values = [fmt(item, col) for col in cols]
+                     f.write(delim.join(values) + "\n")
+
+     except GeneratorExit:
+         return
+     except Exception:
+         # Best effort, as before: report the failure and stop consuming
+         logging.exception("Failed while writing %s", filename)
+     finally:
+         f.close()
+
+ # NOTE(review): this definition creates the output file and three datasets and
+ # then ends; it never receives any data.  A second method with the same name
+ # is defined further down in this class, which would replace this one when the
+ # class body is executed -- confirm this copy is a leftover that can be removed.
+ @consumer
+ def hdf5CsvConsumer(self, filename):
+
+ grp = h5py.File(filename, "w")
+
+ # Total number of reference bases determines the dataset length
+ y = [int(ref.Length) for ref in self.refInfo]
+ dataLength = sum(y)
+ y.append(8192)
+ chunkSize = min(dataLength, 8192 * 2)
+ # print "dataLength = ", dataLength, " chunkSize = ", chunkSize, " y = ", y
+
+ refIdDataset = grp.create_dataset('refId', (dataLength,), dtype="u4", compression="gzip", chunks=(chunkSize,), compression_opts=2)
+ tplDataset = grp.create_dataset('tpl', (dataLength,), dtype="u4", compression="gzip", chunks=(chunkSize,), compression_opts=2)
+ strandDataset = grp.create_dataset('strand', (dataLength,), dtype="u1", compression="gzip", chunks=(chunkSize,), compression_opts=2)
+
+
+
+
+ @consumer
+ def csvConsumer(self, filename):
+     """
+     Consume IPD summary rows and write them to csv
+
+     The column set depends on the options: in-silico control columns when
+     options.control is None, case-control columns otherwise, with the
+     modified-fraction columns appended when options.methylFraction is set.
+     The file handle is closed when the generator is closed or when writing
+     fails; a failure is logged and ends the consumer.
+     """
+
+     # Open the csv file
+     f = self.openWriteHandle(filename)
+     delim = ","
+
+     if self.options.control is None:
+
+         # Columns for in-silico control
+         cols = ["refName", "tpl", "strand", "base", "score", "tMean", "tErr", "modelPrediction", "ipdRatio", "coverage"]
+
+         if self.options.methylFraction:
+             cols += [FRAC, FRAClow, FRACup]
+         elif self.options.useLDA:
+             # FIXME: For testing LDA model, to look at LDA scores in csv output (run without --methylFraction or --control):
+             cols.append("Ca5C")
+
+     else:
+         # Columns for case-control
+         cols = ["refName", "tpl", "strand", "base", "score", "pvalue", "caseMean", "controlMean", "caseStd", "controlStd", "ipdRatio", "testStatistic", "coverage", "controlCoverage", "caseCoverage"]
+
+         if self.options.methylFraction:
+             cols += [FRAC, FRAClow, FRACup]
+
+     # Special cases for formatting columns of the csv; columns without a
+     # handler are written with str()
+     handlers = dict()
+     threeF = lambda x: "%.3f" % x
+
+     handlers["refName"] = lambda x: "\"%s\"" % x
+
+     # tpl is written one higher than the stored value
+     handlers["tpl"] = lambda x: str(x.item() + 1)
+     handlers["score"] = lambda x: "%d" % x
+
+     handlers["tMean"] = threeF
+     handlers["modelPrediction"] = threeF
+     handlers["caseMean"] = threeF
+     handlers["controlMean"] = threeF
+     handlers["ipdRatio"] = threeF
+     handlers["pvalue"] = lambda x: "%.3e" % x
+
+     # caseStd gets the same formatting as controlStd (the original
+     # registered controlStd twice and left caseStd unformatted)
+     handlers["caseStd"] = threeF
+     handlers["controlStd"] = threeF
+     handlers["tErr"] = threeF
+
+     # FIXME: remove this line later:
+     handlers["Ca5C"] = threeF
+
+     handlers[FRAC] = threeF
+     handlers[FRAClow] = threeF
+     handlers[FRACup] = threeF
+
+     def fmt(rowData, colName):
+         # A missing column becomes an empty csv field
+         if colName not in rowData:
+             return ""
+
+         if colName in handlers:
+             return handlers[colName](rowData[colName])
+         else:
+             return str(rowData[colName])
+
+     try:
+         f.write(delim.join(cols) + "\n")
+
+         while True:
+             # Pull a list of record in from the producer
+             itemList = (yield)
+
+             for item in itemList:
+                 values = [fmt(item, col) for col in cols]
+                 f.write(delim.join(values) + "\n")
+
+     except GeneratorExit:
+         return
+     except Exception:
+         # Best effort, as before: report the failure and stop consuming
+         logging.exception("Failed while writing %s", filename)
+     finally:
+         f.close()
+
+ @consumer
+ def hdf5CsvConsumer(self, filename):
+
+ grp = h5py.File(filename, "w")
+
+ y = [int(ref.Length) for ref in self.refInfo]
+ dataLength = sum(y)
+ y.append(8192)
+ chunkSize = min(dataLength, 8192 * 2)
+ # print "dataLength = ", dataLength, " chunkSize = ", chunkSize, " y = ", y
+
+ refIdDataset = grp.create_dataset('refId', (dataLength,), dtype="u4", compression="gzip", chunks=(chunkSize,), compression_opts=2)
+ tplDataset = grp.create_dataset('tpl', (dataLength,), dtype="u4", compression="gzip", chunks=(chunkSize,), compression_opts=2)
+ strandDataset = grp.create_dataset('strand', (dataLength,), dtype="u1", compression="gzip", chunks=(chunkSize,), compression_opts=2)
+ baseDataset = grp.create_dataset('base', (dataLength,), dtype="a1", compression="gzip", chunks=(chunkSize,), compression_opts=2)
+ scoreDataset = grp.create_dataset('score', (dataLength,), dtype="u4", compression="gzip", chunks=(chunkSize,), compression_opts=2)
+ tMeanDataset = grp.create_dataset('tMean', (dataLength,), dtype="f4", compression="gzip", chunks=(chunkSize,), compression_opts=2)
+ tErrDataset = grp.create_dataset('tErr', (dataLength,), dtype="f4", compression="gzip", chunks=(chunkSize,), compression_opts=2)
+ modelPredictionDataset = grp.create_dataset('modelPrediction', (dataLength,), dtype="f4", compression="gzip", chunks=(chunkSize,), compression_opts=2)
+ ipdRatioDataset = grp.create_dataset('ipdRatio', (dataLength,), dtype="f4", compression="gzip", chunks=(chunkSize,), compression_opts=2)
+ coverageDataset = grp.create_dataset('coverage', (dataLength,), dtype="u4", compression="gzip", chunks=(chunkSize,), compression_opts=2)
+
+ if self.options.methylFraction:
+ fracDataset = grp.create_dataset(FRAC, (dataLength,), dtype="f4", compression="gzip", chunks=(chunkSize,), compression_opts=2)
+ fracLowDataset = grp.create_dataset(FRAClow, (dataLength,), dtype="f4", compression="gzip", chunks=(chunkSize,), compression_opts=2)
+ fracUpDataset = grp.create_dataset(FRACup, (dataLength,), dtype="f4", compression="gzip", chunks=(chunkSize,), compression_opts=2)
+
+ try:
+ while True:
+ # Get a chunk of IPD records
+ chunk = (yield)
+
+ if len(chunk) == 0:
+ continue
+
+ '''
+ # determine the correct group:
+ refIdDataset = grp['refId']
+ tplDataset = grp['tpl']
+ strandDataset = grp['strand']
+ baseDataset = grp['base']
+ scoreDataset = grp['score']
+ tMeanDataset = grp['tMean']
+ tErrDataset = grp['tErr']
+ modelPredictionDataset = grp['modelPrediction']
+ ipdRatioDataset = grp['ipdRatio']
+ coverageDataset = grp['coverage']
+ if self.options.methylFraction:
+ fracDataset = grp[FRAC]
+ fracLowDataset = grp[FRAClow]
+ fracUpDataset = grp[FRACup]
+ '''
+
+ start = min(x['tpl'] for x in chunk)
+ end = min(max(x['tpl'] for x in chunk), tplDataset.shape[0] - 1)
+
+ arrLen = end - start + 1
+
+ refId = np.empty(arrLen, dtype="u4")
+ tpl = np.zeros(arrLen, dtype="u4")
+ strand = np.zeros(arrLen, dtype="u1")
+ base = np.zeros(arrLen, dtype="a1")
+ score = np.zeros(arrLen, dtype="u4")
+ tMean = np.zeros(arrLen, dtype="f4")
+ tErr = np.zeros(arrLen, dtype="f4")
+ modelPrediction = np.zeros(arrLen, dtype="f4")
+ ipdRatio = np.zeros(arrLen, dtype="f4")
+ coverage = np.zeros(arrLen, dtype="u4")
+ if self.options.methylFraction:
+ frac = np.empty(arrLen, dtype="f4")
+ fracLow = np.empty(arrLen, dtype="f4")
+ fracUp = np.empty(arrLen, dtype="f4")
+
+ # Fill out the ipd observations into the dataset
+ for x in chunk:
+ # offset into the current chunk
+ idx = x['tpl'] - start
+
+ # Data points past the end of the reference can make it through -- filter them out here
+ if idx < arrLen:
+ refId[idx] = int(x['refId'])
+ tpl[idx] = int(x['tpl'])
+ strand[idx] = int(x['strand'])
+ base[idx] = x['base']
+ score[idx] = int(x['score'])
+ tMean[idx] = float(x['tMean'])
+ tErr[idx] = float(x['tErr'])
+ modelPrediction[idx] = float(x['modelPrediction'])
+ ipdRatio[idx] = float(x['ipdRatio'])
+ coverage[idx] = int(x['coverage'])
+ if self.options.methylFraction:
+ if FRAC in x:
+ frac[idx] = float(x[FRAC])
+ fracLow[idx] = float(x[FRAClow])
+ fracUp[idx] = float(x[FRACup])
+ else:
+ frac[idx] = np.nan
+ fracLow[idx] = np.nan
+ fracUp[idx] = np.nan
+
+ refIdDataset[start:(end + 1)] = refId
+ tplDataset[start:(end + 1)] = tpl
+ strandDataset[start:(end + 1)] = strand
+ baseDataset[start:(end + 1)] = base
+ scoreDataset[start:(end + 1)] = score
+ tMeanDataset[start:(end + 1)] = tMean
+ tErrDataset[start:(end + 1)] = tErr
+ modelPredictionDataset[start:(end + 1)] = modelPrediction
+ ipdRatioDataset[start:(end + 1)] = ipdRatio
+ coverageDataset[start:(end + 1)] = coverage
+ if self.options.methylFraction:
+ fracDataset[start:(end + 1)] = frac
+ fracLowDataset[start:(end + 1)] = fracLow
+ fracUpDataset[start:(end + 1)] = fracUp
+
+ except GeneratorExit:
+ # Close down the h5 file
+ grp.close()
+ return
+
+ # an alternative version that collects data into groups according to reference:
@consumer
def alt_hdf5CsvConsumer(self, filename):
    """
    Similar to csv consumer but writing to hdf5 format, with the data
    collected into one HDF5 group per reference.

    For every entry of self.refInfo a group named str(ref.Name) is created,
    holding one gzip-compressed dataset of length ref.Length per output
    column.  Each chunk sent to this consumer is a sequence of site records
    (mappings); the group is selected by the 'refId' of the first record and
    the slice [min tpl, max tpl] of every dataset is overwritten.

    The file is closed when the generator is closed or when any error
    escapes the processing loop.
    """
    # Per-position columns in creation/write order: (name, dtype, converter).
    # A converter of None means the record value is stored as given.
    columns = (
        ('tpl', "u4", int),
        ('strand', "u1", int),
        ('base', "a1", None),
        ('score', "u4", int),
        ('tMean', "f4", float),
        ('tErr', "f4", float),
        ('modelPrediction', "f4", float),
        ('ipdRatio', "f4", float),
        ('coverage', "u4", int),
    )
    if self.options.methylFraction:
        fracNames = (FRAC, FRAClow, FRACup)
    else:
        fracNames = ()

    f = h5py.File(filename, "w")
    try:
        # Map reference ID -> HDF5 group holding that reference's datasets
        groups = {}
        for ref in self.refInfo:
            chunkSize = min(ref.Length, 8192)
            grp = f.create_group(str(ref.Name))
            for (name, dtype, _) in columns:
                grp.create_dataset(name, (ref.Length,), dtype=dtype,
                                   compression="gzip", chunks=(chunkSize,))
            for name in fracNames:
                grp.create_dataset(name, (ref.Length,), dtype="f4",
                                   compression="gzip", chunks=(chunkSize,))
            groups[ref.ID] = grp

        while True:
            # Get a chunk of IPD records
            chunk = (yield)

            if len(chunk) == 0:
                continue

            # determine the correct group:
            grp = groups[chunk[0]['refId']]

            start = min(x['tpl'] for x in chunk)
            end = min(max(x['tpl'] for x in chunk), grp['tpl'].shape[0] - 1)
            arrLen = end - start + 1

            if arrLen <= 0:
                # Every record lies past the end of the reference; there is
                # nothing to write (and a negative length cannot be allocated).
                continue

            buffers = {}
            for (name, dtype, _) in columns:
                buffers[name] = np.zeros(arrLen, dtype=dtype)
            for name in fracNames:
                # Start from NaN so positions without a record never receive
                # uninitialised memory.
                buffers[name] = np.empty(arrLen, dtype="f4")
                buffers[name].fill(np.nan)

            # Fill out the ipd observations into the buffers
            for x in chunk:
                # offset into the current chunk
                idx = x['tpl'] - start

                # Data points past the end of the reference can make it
                # through -- filter them out here
                if idx >= arrLen:
                    continue

                for (name, _, convert) in columns:
                    if convert is None:
                        buffers[name][idx] = x[name]
                    else:
                        # NOTE(review): values are accumulated, so two records
                        # mapping to the same position are summed -- confirm
                        # this is intended when both strands are present.
                        buffers[name][idx] += convert(x[name])

                if fracNames:
                    hasFrac = FRAC in x
                    for name in fracNames:
                        buffers[name][idx] = float(x[name]) if hasFrac else np.nan

            # Write our chunk into the main datasets
            for (name, _, _) in columns:
                grp[name][start:(end + 1)] = buffers[name]
            for name in fracNames:
                grp[name][start:(end + 1)] = buffers[name]

    except GeneratorExit:
        return
    finally:
        # Close down the h5 file, whatever ended the loop
        f.close()
+
def openWriteHandle(self, filename):
    """
    Open `filename` for writing and return the file object.

    A name ending in 'gz' yields a gzip.GzipFile (compression level 3);
    any other name yields a plain file opened in "w" mode with a
    64 KiB (2 << 15 bytes) buffer.
    """
    if not filename.endswith('gz'):
        return open(filename, "w", 2 << 15)

    import gzip
    return gzip.GzipFile(filename, mode="w", compresslevel=3)
+
@consumer
def ipdRatioH5Consumer(self, fileName):
    """
    Create an HDF5 file containing a uint32 dataset for each reference, with size equal to the
    reference length. Write packed IPD ratios into the file chunk-wise.

    Each chunk sent to this consumer is a sequence of site records; the
    dataset is chosen from the 'refId' of the first record.  For every
    record the IPD ratio is multiplied by 100, truncated to an integer and
    capped at 2**16 - 1; strand 0 occupies the lower 16 bits of the
    position's uint32 and any other strand the upper 16 bits.

    The file is closed when the generator is closed or when any error
    escapes the processing loop.
    """
    maxPacked = 2 ** 16 - 1

    f = h5py.File(fileName, "w")
    try:
        # Map reference ID -> dataset for that reference
        dsDict = {}

        for ref in self.refInfo:
            logging.info("Creating IpdRatio dataset w/ name: %s, Size: %d",
                         str(ref.Name), ref.Length)

            chunkSize = min(ref.Length, 8192)

            dsDict[ref.ID] = f.create_dataset(str(ref.Name), (ref.Length,),
                                              dtype="u4",
                                              compression='gzip',
                                              chunks=(chunkSize,))

        while True:
            # Get a chunk of IPD records
            chunk = (yield)

            if len(chunk) == 0:
                continue

            ds = dsDict[chunk[0]['refId']]

            start = min(x['tpl'] for x in chunk)
            end = min(max(x['tpl'] for x in chunk), ds.shape[0] - 1)
            arrLen = end - start + 1

            if arrLen <= 0:
                # Every record lies past the end of the reference; there is
                # nothing to write (and a negative length cannot be allocated).
                continue

            arr = np.zeros(arrLen, dtype="u4")

            # Fill out the ipd observations into the buffer
            for x in chunk:
                # offset into the current chunk
                idx = x['tpl'] - start

                # Data points past the end of the reference can make it through -- filter them out here
                # NOTE - figure out why the KineticsWorker generates these in the first place.
                if idx >= arrLen:
                    continue

                # convert to a 16 bit uint with a conversion factor of 100
                #
                # Note (Bug 26065): the upstream code sometimes
                # gives a NaN for ipdRatio (?). We are not attempting
                # to investigate this in the 2.3 timeframe, we are just aiming
                # to 1) not crash and 2) not call a modification here.
                if np.isnan(x['ipdRatio']):
                    val = 100  # ipdRatio of 1
                else:
                    val = min(maxPacked, int(x['ipdRatio'] * 100))

                # strand 0 is the lower 16 bits, strand 1 is the upper 16 bits
                if x['strand'] != 0:
                    val = val << 16

                arr[idx] += val

            # Write our chunk into the main dataset
            ds[start:(end + 1)] = arr

    except GeneratorExit:
        return
    finally:
        # Close down the h5 file, whatever ended the loop
        f.close()
+
@consumer
def pickleConsumer(self, fileName):
    """
    Consume IPD summary rows and pickle to a 'None' terminated stream.

    Every object sent to this consumer is dumped to `fileName` as its own
    pickle; the pickler memo is cleared after each dump so the objects do
    not reference each other.  When the generator is closed a final `None`
    is dumped as the end-of-stream sentinel.
    """
    # Pickle data is a byte stream, so the file is opened in binary mode.
    f = open(fileName, "wb")
    try:
        pickleStream = cPickle.Pickler(f)

        while True:
            # Pickle a record
            n = (yield)
            pickleStream.dump(n)
            pickleStream.clear_memo()

    except GeneratorExit:
        # Write an end sentinel to the pickle stream
        pickleStream.dump(None)
        return
    finally:
        # Release the file handle, whatever ended the loop
        f.close()
+
+
+
def makeGffRecord(self, siteObs):
    """
    Convert the internal site observation object into a GFF entry.

    `siteObs` is a mapping that must provide 'tpl', 'strand', 'coverage',
    'ipdRatio', 'refName' and 'score'.  The optional keys 'motif', 'id',
    'modificationScore', 'modification' and (when
    self.options.methylFraction is set) the methylated-fraction keys add
    further attributes or determine the record type.

    Requires self.snippetFunc to have been set; it is called with the
    template position and strand to obtain the sequence context.

    Returns a Gff3Record.
    """
    # Some useful attributes about the observation
    # - cognate base
    # - context snippet
    # - ipd ratio
    # - coverage
    snippet = self.snippetFunc(siteObs['tpl'], siteObs['strand'])
    attributes = [('coverage', siteObs['coverage']),
                  ('context', snippet),
                  ('IPDRatio', siteObs['ipdRatio'])]

    # Base of detected mod -- single position, closed,open
    # interval.
    # Note -- internally the tool uses 0-based reference
    # coordinates, however in gff the template indices are
    # 1-based. Make that adjustment here.
    # On start vs. end: My reading of the gff spec
    # (http://www.sequenceontology.org/resources/gff3.html) says
    # to me that 1-base long feature (e.g. a modified base) should
    # have start + 1 == end, and 0-base long features
    # (e.g. insertions) should have start == end. This is not the
    # convention that Marco has adopted in SMRTView, or the
    # convention that EviCons originally used. We will adopt
    # their convention here, for now.
    start = siteObs['tpl'] + 1
    end = siteObs['tpl'] + 1

    if 'motif' in siteObs:
        attributes.append(('motif', "%s" % siteObs['motif']))

    if 'id' in siteObs:
        attributes.append(('id', "%s" % siteObs['id']))

    if self.options.methylFraction and FRAC in siteObs:
        attributes.append(('frac', "%.3f" % siteObs[FRAC]))
        attributes.append(('fracLow', "%.3f" % siteObs[FRAClow]))
        attributes.append(('fracUp', "%.3f" % siteObs[FRACup]))

    if 'modificationScore' in siteObs:
        # Report the QV from the modification identification module as a special tag
        attributes.append(('identificationQv', "%d" % int(round(siteObs['modificationScore']))))

    # If we have an identified mod, use it as the record type; otherwise
    # use the old generic term.  A modification of '.' is treated like a
    # missing one, and 'nMd' maps to the record type '.'.
    recordType = 'modified_base'
    if 'modification' in siteObs:
        modification = siteObs['modification']
        if modification == 'nMd':
            recordType = '.'
        elif modification != '.':
            recordType = modification

    refName = siteObs['refName']
    score = int(round(siteObs['score']))
    strand = '+' if siteObs['strand'] == 0 else '-'

    return Gff3Record(refName, start, end,
                      type=recordType,
                      score=score,
                      strand=strand,
                      source='kinModCall',
                      attributes=attributes)
+
@consumer
def gffConsumer(self, filename):
    """
    Consume IPD summary rows, filter them and write to GFF.

    Each object sent to this consumer is a sequence of site observations.
    An observation is written only when its coverage exceeds
    self.options.minCoverage and it is either an identified modification
    or a detection whose score exceeds -10 * log10(self.options.pvalue).

    The output file is closed when the generator is closed or when any
    error escapes header writing or the processing loop.
    """
    f = self.openWriteHandle(filename)
    try:
        gff = GffWriter(f)

        # write headers describing the program that generated the data
        gff.writeHeader('##source ipdSummary v2.0')
        gff.writeHeader('##source-commandline %s' % self.options.cmdLine)

        # Write the reference renaming info into the gff headers ala evicons
        for entry in self.refInfo:
            gff.writeHeader("##sequence-region %s 1 %d"
                            % (entry.Name, entry.Length))

        minScore = -10 * math.log10(self.options.pvalue)

        # Reference whose snippet function is currently cached in
        # self.snippetFunc; -1 means none yet
        snippetRef = -1

        while True:
            # Pull in the next list of site observations
            siteObsList = (yield)

            for siteObs in siteObsList:
                # self.snippetFunc is a function that return a reference snippet given a template position and a strand
                if snippetRef != siteObs['refId']:
                    self.snippetFunc = self.ipdModel.snippetFunc(siteObs['refId'], 20, 20)
                    snippetRef = siteObs['refId']

                # Two cases for gff entries:
                # 1. 'Identified modification' - will have a 'modification' key
                #     - use the modification name as the gff event type
                #     - use 'modificationScore' for the gff score
                # 2. Detected - no 'modification' key
                #     - use 'modified_base' as the event type
                #     - use the single site 'score' property as the gff score
                #     - do not put this kind into the gff if it contains the a 'offTargetPeak' tag
                if siteObs['coverage'] > self.options.minCoverage:
                    identified = ('modification' in siteObs and
                                  siteObs['modification'] != '.')

                    # Case 1, or (lazily evaluated) Case 2
                    if identified or (siteObs['score'] > minScore and
                                      'offTargetPeak' not in siteObs):
                        gff.writeRecord(self.makeGffRecord(siteObs))

    except GeneratorExit:
        return
    finally:
        # Release the output handle, whatever ended the loop
        f.close()
+
+
def makeM5CgffRecord(self, siteObs):
    """
    Build the GFF record of type 'CG' for one site observation.

    `siteObs` must provide 'tpl', 'coverage', 'ipdRatio', 'refId', 'Ca5C'
    and 'strand'.  The record starts and ends at tpl + 1, carries the
    'Ca5C' value formatted to three decimals as its score, and has
    'coverage' and 'IPDRatio' attributes.

    Returns a Gff3Record.
    """
    # Single-position feature: start and end are both tpl + 1
    position = siteObs['tpl'] + 1

    recordAttributes = [('coverage', siteObs['coverage']),
                        ('IPDRatio', siteObs['ipdRatio'])]

    # NOTE(review): the sequence name is taken from 'refId' here, whereas
    # makeGffRecord uses 'refName' -- confirm this is intended.
    sequenceName = siteObs['refId']
    formattedScore = "%.3f" % siteObs['Ca5C']

    if siteObs['strand'] == 0:
        strandSymbol = '+'
    else:
        strandSymbol = '-'

    return Gff3Record(sequenceName, position, position,
                      type='CG',
                      score=formattedScore,
                      strand=strandSymbol,
                      source='kinModCall', attributes=recordAttributes)
+
+
@consumer
def m5CgffConsumer(self, filename):
    """
    Consume lists of site observations and write the m5C GFF output.

    Only observations that carry a 'Ca5C' key and lie on strand 0 are
    written, one record each via makeM5CgffRecord.  No ##sequence-region
    headers are written to this file.

    The output file is closed when the generator is closed or when any
    error escapes header writing or the processing loop.
    """
    f = self.openWriteHandle(filename)
    try:
        gff = GffWriter(f)

        # write headers describing the program that generated the data
        gff.writeHeader('##source ipdSummary v2.0')
        gff.writeHeader('##source-commandline %s' % self.options.cmdLine)

        while True:
            # Pull in the next list of site observations
            siteObsList = (yield)

            for siteObs in siteObsList:
                if 'Ca5C' in siteObs and siteObs['strand'] == 0:
                    gff.writeRecord(self.makeM5CgffRecord(siteObs))

    except GeneratorExit:
        return
    finally:
        # Release the output handle, whatever ended the loop
        f.close()
+
+
+
+
def onStart(self):
    """
    Open one output sink per requested output file and store them in
    self.sinkList.

    An output is produced when self.options.outfile is set (the file is
    then named "<outfile>.<extension>") or when the option named after the
    output type is set; the per-type option overrides the default name.
    """
    # Spec for what kinds of output files we can generate.
    # Entry format is (<option field name>, <extension>, <writer consumer function>)
    fileSpec = [
        ('m5Cgff', 'm5C.gff', self.m5CgffConsumer),
        ('gff', 'gff', self.gffConsumer),
        ('csv', 'csv', self.csvConsumer),
        ('ms_csv', 'ms.csv', self.msCsvConsumer),
        # The pickle output is written by the pickle consumer (this entry
        # previously pointed at csvConsumer, producing CSV text instead).
        ('pickle', 'pickle', self.pickleConsumer),
        ('summary_h5', 'summary.h5', self.ipdRatioH5Consumer),
        ('csv_h5', 'h5', self.hdf5CsvConsumer)
    ]

    sinkList = []

    # Go through the possible output file types and
    # determine if they should be output
    for (fileType, ext, func) in fileSpec:
        name = None

        # The 'outfile' argument causes all outputs to be generated
        if self.options.outfile:
            name = self.options.outfile + '.' + ext

        # Individual outputs can be specified - these filenames override the default
        explicitName = getattr(self.options, fileType)
        if explicitName:
            name = explicitName

        if name:
            sinkList.append(func(name))

    self.sinkList = sinkList
+
def onResult(self, resultChunk):
    """Send `resultChunk` to every sink in self.sinkList, in order."""
    openSinks = self.sinkList
    for outputSink in openSinks:
        outputSink.send(resultChunk)
+
def onFinish(self):
    """Close every sink in self.sinkList, in order."""
    openSinks = self.sinkList
    for outputSink in openSinks:
        outputSink.close()
diff --git a/kineticsTools/WorkerProcess.py b/kineticsTools/WorkerProcess.py
new file mode 100755
index 0000000..8f942b0
--- /dev/null
+++ b/kineticsTools/WorkerProcess.py
@@ -0,0 +1,260 @@
+#################################################################################
+# Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#################################################################################
+
+import cProfile
+import logging
+import os.path
+import copy
+from multiprocessing import Process
+from multiprocessing.process import current_process
+from threading import Thread, Event
+from urlparse import urlparse
+import warnings
+
+import numpy as np
+
+import pbcore.io
+from pbcore.io.opener import (openAlignmentFile, openIndexedAlignmentFile)
+
+
+# FIXME this should ultimately go somewhere else. actually, so should the
+# rest of this module.
def _openFiles(self, refFile=None, sharedIndices=None):
    """
    Open a reader for every external resource of `self` and append it to
    self._openReaders.

    When `sharedIndices` is given, its k-th entry is passed as the shared
    index for the k-th resource, so indices (but not file handles) can be
    reused between dataset instances.  If the indexed open raises IOError
    or ValueError, the resource is opened without an index instead.

    Raises IOError if a resource cannot be opened at all.
    """
    logger = logging.getLogger()
    logger.debug("Opening resources")

    for position, externalResource in enumerate(self.externalResources):
        path = urlparse(externalResource.resourceId).path
        index = None if sharedIndices is None else sharedIndices[position]

        try:
            reader = openIndexedAlignmentFile(
                path,
                referenceFastaFname=refFile,
                sharedIndex=index)
        except (IOError, ValueError):
            # Fall back to the plain (un-indexed) reader
            logger.info("pbi file missing for {f}, operating with "
                        "reduced speed and functionality".format(
                            f=path))
            reader = openAlignmentFile(path,
                                       referenceFastaFname=refFile)

        if not reader:
            raise IOError("{f} fails to open".format(f=path))

        self._openReaders.append(reader)

    logger.debug("Done opening resources")
+
+def _reopen (self):
+ """
+ Force re-opening of underlying alignment files, preserving the
+ reference and indices if present, and return a copy of the
+ AlignmentSet. This is a workaround to allow us to share the index
+ file(s) already loaded in memory while avoiding multiprocessing
+ problems related to .bam files.
+ """
+ refFile = None
+ if not self.isCmpH5:
+ refFile = self._referenceFile
+ newSet = copy.deepcopy(self)
+ newSet._referenceFastaFname = refFile
+ if not self.isCmpH5 and not self.hasPbi:
+ self.close()
+ newSet._openFiles(refFile=refFile)
+ else:
+ indices = [ f.index for f in self.resourceReaders() ]
+ self.close()
+ _openFiles(newSet, refFile=refFile, sharedIndices=indices)
+ return newSet
+
+
class Worker(object):

    """
    Base class for workers that read chunk descriptors from the task
    queue, process each one with onChunk(), then push (chunkId, result)
    tuples onto the results queue for another process to write out.

    All tasks that are O(genome length * coverage depth) should be
    distributed to Worker processes, leaving the ResultCollector
    process only O(genome length) work to do.

    This class is a mixin: the concrete subclass must also provide `name`,
    `pid`, `_lowPriority()` and `isTerminated()`.
    """

    def __init__(self, options, workQueue, resultsQueue,
                 sharedAlignmentSet=None):
        """
        options            -- option object read by the worker (infile,
                              reference, control, randomSeed, doProfiling)
        workQueue          -- queue of (chunkId, datum) tuples, terminated
                              by a None sentinel
        resultsQueue       -- queue that receives (chunkId, result) tuples
                              and a final None sentinel
        sharedAlignmentSet -- optional already-opened AlignmentSet to be
                              re-opened by this worker
        """
        self.options = options
        self.daemon = True
        self._workQueue = workQueue
        self._resultsQueue = resultsQueue
        self._sharedAlignmentSet = sharedAlignmentSet

    def _run(self):
        """Open the input data, then process chunks until the sentinel arrives."""
        logging.info("Worker %s (PID=%d) started running", self.name, self.pid)
        if self._sharedAlignmentSet is not None:
            # XXX this will create an entirely new AlignmentSet object, but
            # keeping any indices already loaded into memory
            self.caseCmpH5 = _reopen(self._sharedAlignmentSet)
            self._sharedAlignmentSet = None
        else:
            warnings.warn("Shared AlignmentSet not used")
            self.caseCmpH5 = pbcore.io.AlignmentSet(self.options.infile,
                referenceFastaFname=self.options.reference)

        self.controlCmpH5 = None
        if self.options.control is not None:
            # We have a cmp.h5 with control values -- load that cmp.h5
            self.controlCmpH5 = pbcore.io.AlignmentSet(self.options.control,
                referenceFastaFname=self.options.reference)

        # NOTE(review): the generator is only seeded here when no seed was
        # requested; a user-supplied randomSeed is not applied in this
        # method -- confirm it is handled elsewhere.
        if self.options.randomSeed is None:
            np.random.seed(42)
        self.onStart()

        while True:
            if self.isTerminated():
                break

            chunkDesc = self._workQueue.get()
            if chunkDesc is None:
                # Sentinel indicating end of input.  Place a sentinel
                # on the results queue and end this worker process.
                self._resultsQueue.put(None)
                self._workQueue.task_done()
                break
            else:
                (chunkId, datum) = chunkDesc
                logging.info("Got chunk: (%s, %s) -- Process: %s",
                             chunkId, str(datum), current_process())
                result = self.onChunk(datum)

                logging.debug("Process %s: putting result.", current_process())
                self._resultsQueue.put((chunkId, result))
                self._workQueue.task_done()

        self.onFinish()

        logging.info("Process %s (PID=%d) done; exiting.", self.name, self.pid)

    def run(self):
        """Lower this worker's priority, then run it (optionally under cProfile)."""
        # Make the workers run with lower priority -- hopefully the results writer will win
        # It is single threaded so it could become the bottleneck
        self._lowPriority()

        if self.options.doProfiling:
            # Profile output goes to "profile-<worker name>.out"
            cProfile.runctx("self._run()",
                            globals=globals(),
                            locals=locals(),
                            filename="profile-%s.out" % self.name)
        else:
            self._run()

    #==
    # Begin overridable interface
    #==
    def onStart(self):
        """Hook called once, after the input data has been opened."""
        pass

    def onChunk(self, target):
        """
        This function is the heart of the matter.

        referenceWindow, alnHits -> result
        """
        pass

    def onFinish(self):
        """Hook called once, after the processing loop has ended."""
        pass
+
+
class WorkerProcess(Worker, Process):

    """Worker that executes as a process."""

    def __init__(self, *args, **kwds):
        # Worker.__init__ does not chain to Process, so initialise it
        # explicitly before handing the arguments to Worker.
        Process.__init__(self)
        super(WorkerProcess, self).__init__(*args, **kwds)
        self.daemon = True

    def _lowPriority(self):
        """
        Set the priority of the process to below-normal.
        """
        import sys

        # sys.getwindowsversion only exists on Windows
        isWindows = hasattr(sys, "getwindowsversion")

        if isWindows:
            # Based on:
            #   "Recipe 496767: Set Process Priority In Windows" on ActiveState
            #   http://code.activestate.com/recipes/496767/
            import win32api
            import win32process
            import win32con

            pid = win32api.GetCurrentProcessId()
            handle = win32api.OpenProcess(win32con.PROCESS_ALL_ACCESS, True, pid)
            win32process.SetPriorityClass(handle, win32process.BELOW_NORMAL_PRIORITY_CLASS)
        else:
            os.nice(10)

    def isTerminated(self):
        """A process worker never reports itself as terminated."""
        return False
+
+
class WorkerThread(Worker, Thread):

    """Worker that executes as a thread (for debugging purposes only)."""

    def __init__(self, *args, **kwds):
        # Worker.__init__ does not chain to Thread, so initialise it
        # explicitly before handing the arguments to Worker.
        Thread.__init__(self)
        super(WorkerThread, self).__init__(*args, **kwds)
        # Not named `_stop`: on Python 3 threading.Thread uses an internal
        # method of that name, and shadowing it with an Event breaks the
        # thread's shutdown.
        self._stopEvent = Event()
        self.daemon = True
        self.exitcode = 0

    def terminate(self):
        """Ask the worker loop to stop before fetching its next chunk."""
        self._stopEvent.set()

    def isTerminated(self):
        """Return True once terminate() has been called."""
        return self._stopEvent.is_set()

    @property
    def pid(self):
        # Threads have no PID of their own; -1 is a placeholder
        return -1

    def _lowPriority(self):
        """No-op: thread priority is left unchanged."""
        pass
diff --git a/kineticsTools/__init__.py b/kineticsTools/__init__.py
new file mode 100755
index 0000000..402d7a8
--- /dev/null
+++ b/kineticsTools/__init__.py
@@ -0,0 +1,29 @@
+#################################################################################
+# Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#################################################################################
diff --git a/kineticsTools/ipdModel.py b/kineticsTools/ipdModel.py
new file mode 100755
index 0000000..3cc39cc
--- /dev/null
+++ b/kineticsTools/ipdModel.py
@@ -0,0 +1,611 @@
+#################################################################################
+# Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#################################################################################
+
+import logging
+import os
+import re
+import h5py
+import numpy as np
+import ctypes as C
+from kineticsTools.sharedArray import SharedArray
+from pkg_resources import Requirement, resource_filename
+
+# Shorthand dtype objects used by the lookup tables and shared arrays in this module.
+byte = np.dtype('byte')
+float32 = np.dtype('float32')
+uint8 = np.dtype('uint8')
+
+# Map for ascii encoded bases to integers 0-3 -- will be used to define a 24-bit lookup code
+# for fetching predicted IPDs from the kinetic LUT.
+
+# We start everything at 0, so anything will map to 'A' unless it appears in this table
+lutCodeMap = np.zeros(256, dtype=uint8)
+maps = {'a': 0, 'A': 0, 'c': 1, 'C': 1, 'g': 2, 'G': 2, 't': 3, 'T': 3}
+for k in maps:
+    lutCodeMap[ord(k)] = maps[k]
+lutReverseMap = {0: 'A', 1: 'C', 2: 'G', 3: 'T'}
+
+# Same table, but every byte that is not a/c/g/t (either case) maps to 4 ('N')
+seqCodeMap = np.ones(256, dtype=uint8) * 4
+for k in maps:
+    seqCodeMap[ord(k)] = maps[k]
+seqMap = {0: 'A', 1: 'C', 2: 'G', 3: 'T', 4: 'N'}
+seqMapNp = np.array(['A', 'C', 'G', 'T', 'N'])
+
+# Complement of each sequence code, as a dict and as an array indexable by a code array
+seqMapComplement = {0: 'T', 1: 'G', 2: 'C', 3: 'A', 4: 'N'}
+seqMapComplementNp = np.array(['T', 'G', 'C', 'A', 'N'])
+
+# Base letters for modification calling
+# 'H' : m6A, 'I' : m5C, 'J' : m4C, 'K' : m5C/TET
+baseToCode = {'N': 0, 'A': 0, 'C': 1, 'G': 2, 'T': 3, 'H': 4, 'I': 5, 'J': 6, 'K': 7}
+baseToCanonicalCode = {'N': 0, 'A': 0, 'C': 1, 'G': 2, 'T': 3, 'H': 0, 'I': 1, 'J': 1, 'K': 1}
+
+# Inverse of baseToCode. 'N' and 'A' share code 0, so a plain inversion would make
+# codeToBase[0] depend on dict iteration order; 'N' is skipped so code 0 is always 'A'.
+codeToBase = dict((code, base) for (base, code) in baseToCode.items() if base != 'N')
+
+def _getAbsPath(fname):
+    """Return the path pkg_resources reports for `fname` inside the kineticsTools package."""
+    requirement = Requirement.parse('kineticsTools')
+    relativePath = 'kineticsTools/%s' % fname
+    return resource_filename(requirement, relativePath)
+
+class GbmContextModel(object):
+
+ """
+ Class for computing ipd predictions on contexts. Evaluate the GBM tree model for a list of contexts
+ Contexts may contain arbitrary combinations of modified bases
+ """
+
+    def __init__(self, modelH5Group, modelIterations=-1):
+        """
+        Load a GBM tree model and precompute the flattened arrays used by the native predictors.
+
+        modelH5Group    -- mapping from dataset name to an object sliceable with [:]. The datasets
+                           read are VarNames, Variables, LeftNodes, RightNodes, MissingNodes,
+                           SplitCodes, CSplits and InitialValue.
+        modelIterations -- if > 0, the number of trees to evaluate, capped at the number of trees
+                           stored in the model. Any other value means "use every tree".
+        """
+
+        # This will hold the ctypes function pointer
+        # It will be lazily initialized
+        self.nativeInnerPredict = None
+        self.nativeInnerPredictCtx = None
+
+        def ds(name):
+            return modelH5Group[name][:]
+
+        self.varNames = ds("VarNames")
+        # Variable names are a letter followed by a context position: 'M<pos>' and 'R<pos>'.
+        # Each dict maps the context position to the variable's column index.
+        self.modFeatureIdx = dict((int(self.varNames[x][1:]), x) for x in range(len(self.varNames)) if self.varNames[x][0] == 'M')
+        self.canonicalFeatureIdx = dict((int(self.varNames[x][1:]), x) for x in range(len(self.varNames)) if self.varNames[x][0] == 'R')
+
+        self.pre = 10
+        self.post = 4
+        self.ctxSize = self.pre + self.post + 1
+
+        self.splitVar = ds("Variables")
+        self.leftNodes = ds("LeftNodes")
+        self.rightNodes = ds("RightNodes")
+        self.missingNodes = ds("MissingNodes")
+
+        self.splitVar16 = self.splitVar.astype(np.int16)
+
+        self.splitCodes = ds("SplitCodes").astype(np.float32)
+
+        self.cSplits = ds("CSplits")
+        self.maxCSplits = self.cSplits.shape[1]
+
+        self.initialValue = ds("InitialValue").astype(np.float32)[0]
+
+        exp = 2 ** np.arange(self.cSplits.shape[1] - 1, -1, -1)
+        self.bSplits = ((self.cSplits > 0) * exp).sum(1)
+
+        # total number of trees in model
+        self.nTrees = self.splitVar.shape[0]
+        self.treeSize = self.splitVar.shape[1]
+
+        # Node links are stored per tree; adding each tree's starting offset turns them
+        # into indices into the flattened (nTrees * treeSize) arrays.
+        offsets = np.floor(np.arange(0, self.leftNodes.size) / self.treeSize) * self.treeSize
+        offsets = offsets.astype(np.int32)
+
+        self.leftNodesOffset = self.leftNodes.flatten().astype(np.int32) + offsets
+        self.rightNodesOffset = self.rightNodes.flatten().astype(np.int32) + offsets
+        self.missingNodesOffset = self.missingNodes.flatten().astype(np.int32) + offsets
+
+        self.splitCodesCtx = self.splitCodes.copy().flatten()
+
+        splitCodesCtxView = self.splitCodesCtx.view()
+        splitCodesCtxView.dtype = np.uint32
+
+        # Pack the cSplits as a bit array directly into the splitCode array
+        # using a uint32 view of the splitCode array
+        flatSplitVar = self.splitVar.flatten()
+
+        powOfTwo = 2 ** np.arange(self.maxCSplits)
+
+        # For every node with a split variable (splitVar != -1) the split code is a pointer to
+        # a cSplit row: pack that row into a uint32 (bit j set when cs[j] > 0) and overwrite
+        # the node's slot. Each slot is read and written independently, so all nodes can be
+        # handled in one vectorized step.
+        isSplit = flatSplitVar != -1
+        cSplitRows = self.splitCodesCtx[isSplit].astype(np.int64)
+        packed = ((self.cSplits[cSplitRows, :] > 0) * powOfTwo).sum(1)
+        splitCodesCtxView[isSplit] = packed
+
+        # If the user has requested fewer iterations, update nTrees.
+        # Never exceed the number of trees actually stored: nTrees is handed to the native
+        # predictors as the tree count for the flattened arrays built above.
+        if modelIterations > 0:
+            self.nTrees = min(modelIterations, self.nTrees)
+
+ def _initNativeTreePredict(self):
+ """
+ Load the native tree_predict library and bind its two prediction entry points.
+ Sets self._lib, self.nativeInnerPredict and self.nativeInnerPredictCtx.
+ Needs to be invoked lazily because the native function pointer cannot be pickled.
+ Raises Exception if the library file cannot be found.
+ """
+
+ import platform
+
+ if platform.system() == "Windows":
+
+ # On Windows the DLL is looked up next to this source file
+ libfn = "tree_predict.dll"
+ path = os.path.dirname(os.path.abspath(__file__))
+ windowsLib = path + os.path.sep + libfn
+
+ if os.path.exists(windowsLib):
+ self._lib = np.ctypeslib.load_library(libfn, path)
+ else:
+ raise Exception("can't find tree_predict.dll")
+ else:
+ # Elsewhere the shared object is located through pkg_resources (see _getAbsPath)
+ DLL_PATH = _getAbsPath("tree_predict.so")
+
+ if os.path.exists(DLL_PATH):
+ self._lib = np.ctypeslib.load_library("tree_predict.so", DLL_PATH)
+ else:
+ raise Exception("can't find tree_predict.so")
+
+ lpb = self._lib
+
+ lpb.init_native.argtypes = [C.c_int]
+
+ # ctypes pointer types used in the argument lists below
+ fp = C.POINTER(C.c_float)
+ fpp = C.POINTER(fp)
+ ip = C.POINTER(C.c_int)
+ sp = C.POINTER(C.c_int16)
+ ui64p = C.POINTER(C.c_uint64)
+
+ # Slow version -- argument order follows the innerPredict prototype quoted in getPredictionsSlow
+ args = [fp, fpp, C.c_int, ip, ip, ip, fp, ip, ip, ip, C.c_float, C.c_int, C.c_int, C.c_int]
+ lpb.innerPredict.argtypes = args
+ self.nativeInnerPredict = lpb.innerPredict
+
+ # Fast version
+
+ # void innerPredictCtx(
+ # int ctxSize, float radPredF[], uint64_t contextPack[], int cRows,
+ # int16 left[], int16 right[], int16 missing[], float splitCode[], int16 splitVar[],
+ # int varTypes[], float initialValue, int treeSize, int numTrees, int maxCSplitSize)
+ # NOTE(review): the prototype above lists left/right/missing as int16, but they are declared
+ # here as int pointers and getPredictions passes int32 arrays -- confirm against the C source.
+
+ args = [C.c_int, fp, ui64p, C.c_int, ip, ip, ip, fp, sp, ip, C.c_float, C.c_int, C.c_int, C.c_int]
+ lpb.innerPredictCtx.argtypes = args
+ self.nativeInnerPredictCtx = lpb.innerPredictCtx
+
+ def getPredictionsSlow(self, ctxStrings, nTrees=None):
+ """
+ Compute IPD predictions for arbitrary methylation-containing contexts.
+ Builds one float32 column per context position and feature kind, then calls the native innerPredict.
+ ctxStrings -- sequence of context strings over the letters in baseToCode
+ nTrees -- number of trees to evaluate (defaults to self.nTrees)
+ Returns exp() of the native output, one value per context; the raw output is kept in self.predictions.
+ """
+ # C prototype that we call:
+ # void innerPredict(
+ # float[] radPredF,
+ # IntPtr[] dataMatrix,
+ # int cRows, int[] left, int[] right, int[] missing,
+ # float[] splitCode, int[] splitVar, int[] cSplits,
+ # int[] varTypes, float initialValue,
+ # int treeSize, int numTrees, int maxCSplitSize);
+
+ # Make sure native library is initialized
+ if self.nativeInnerPredict is None:
+ self._initNativeTreePredict()
+
+ # Helpers that expose a numpy array's buffer as a typed ctypes pointer
+ def fp(arr):
+ return arr.ctypes.data_as(C.POINTER(C.c_float))
+
+ def ip(arr):
+ return arr.ctypes.data_as(C.POINTER(C.c_int))
+
+ if nTrees is None:
+ nTrees = self.nTrees
+
+ n = len(ctxStrings)
+
+ # One column per context position: mCols holds the modification-aware codes,
+ # rCols the canonical-base codes
+ mCols = [np.zeros(n, dtype=np.float32) for x in xrange(self.ctxSize)]
+ rCols = [np.zeros(n, dtype=np.float32) for x in xrange(self.ctxSize)]
+
+ for stringIdx in xrange(len(ctxStrings)):
+ s = ctxStrings[stringIdx]
+
+ for i in xrange(len(s)):
+ mCols[i][stringIdx] = baseToCode[s[i]]
+ rCols[i][stringIdx] = baseToCanonicalCode[s[i]]
+
+ # Array of column pointers, ordered by each variable's index in the model
+ dataPtrs = (C.POINTER(C.c_float) * (2 * self.ctxSize))()
+
+ varTypes = np.zeros(2 * self.ctxSize, dtype=np.int32)
+
+ for i in xrange(self.ctxSize):
+ dataPtrs[self.modFeatureIdx[i]] = mCols[i].ctypes.data_as(C.POINTER(C.c_float))
+ dataPtrs[self.canonicalFeatureIdx[i]] = rCols[i].ctypes.data_as(C.POINTER(C.c_float))
+
+ # 8 matches the number of distinct codes in baseToCode, 4 the number in baseToCanonicalCode
+ varTypes[self.modFeatureIdx[i]] = 8
+ varTypes[self.canonicalFeatureIdx[i]] = 4
+
+ self.predictions = np.zeros(len(ctxStrings), dtype=np.float32)
+
+ # mCols, rCols and varTypes stay referenced as locals for the duration of the call,
+ # so the raw pointers handed to the native code remain valid
+ self.nativeInnerPredict(
+ fp(self.predictions), dataPtrs,
+ n, ip(self.leftNodes), ip(self.rightNodes), ip(self.missingNodes),
+ fp(self.splitCodes), ip(self.splitVar), ip(self.cSplits),
+ ip(varTypes), self.initialValue, self.treeSize, nTrees, self.maxCSplits)
+
+ return np.exp(self.predictions)
+
+    def getPredictions(self, ctxStrings, nTrees=None):
+        """
+        Compute IPD predictions for arbitrary methylation-containing contexts.
+
+        Each context string is packed into a single uint64 (4 bits per base, placed at the
+        slot given by self.modFeatureIdx) and handed to the native innerPredictCtx.
+
+        ctxStrings -- sequence of context strings over the letters in baseToCode
+        nTrees     -- number of trees to evaluate (defaults to self.nTrees)
+
+        Returns exp() of the native output, one value per context; the raw output is kept
+        in self.predictions.
+        """
+        # C prototype that we call:
+        # void innerPredictCtx(
+        #   int ctxSize, float[] radPredF,
+        #   int[] contextPack,
+        #   int cRows, int[] left, int[] right, int[] missing,
+        #   float[] splitCode, int[] splitVar,
+        #   int[] varTypes, float initialValue,
+        #   int treeSize, int numTrees, int maxCSplitSize);
+
+        # Make sure native library is initialized
+        if self.nativeInnerPredictCtx is None:
+            self._initNativeTreePredict()
+
+        # ctypes pointer types for the array arguments
+        floatPtr = C.POINTER(C.c_float)
+        intPtr = C.POINTER(C.c_int)
+        uint64Ptr = C.POINTER(C.c_uint64)
+        int16Ptr = C.POINTER(C.c_int16)
+
+        n = len(ctxStrings)
+
+        if nTrees is None:
+            nTrees = self.nTrees
+
+        packCol = np.zeros(n, dtype=np.uint64)
+
+        for stringIdx, ctx in enumerate(ctxStrings):
+            code = 0
+
+            for pos, base in enumerate(ctx):
+                modBits = baseToCode[base]
+                slotForPosition = self.modFeatureIdx[pos]
+                code |= modBits << (4 * slotForPosition)
+
+            packCol[stringIdx] = code
+
+        varTypes = np.zeros(2 * self.ctxSize, dtype=np.int32)
+
+        for pos in xrange(self.ctxSize):
+            varTypes[self.modFeatureIdx[pos]] = 8
+            varTypes[self.canonicalFeatureIdx[pos]] = 4
+
+        self.predictions = np.zeros(len(ctxStrings), dtype=np.float32)
+
+        self.nativeInnerPredictCtx(
+            self.ctxSize,
+            self.predictions.ctypes.data_as(floatPtr),
+            packCol.ctypes.data_as(uint64Ptr),
+            n,
+            self.leftNodesOffset.ctypes.data_as(intPtr),
+            self.rightNodesOffset.ctypes.data_as(intPtr),
+            self.missingNodesOffset.ctypes.data_as(intPtr),
+            self.splitCodesCtx.ctypes.data_as(floatPtr),
+            self.splitVar16.ctypes.data_as(int16Ptr),
+            varTypes.ctypes.data_as(intPtr),
+            self.initialValue, self.treeSize, nTrees, self.maxCSplits)
+
+        return np.exp(self.predictions)
+
+
+class IpdModel:
+
+ """
+ Predicts the IPD of any context, possibly containing multiple modifications.
+ We use a 4^12 entry LUT to get the predictions for contexts without modifications,
+ then we use the GbmModel to get predictions in the presence of arbitrary mods.
+ Note on the coding scheme: for each contig we store a byte-array that has size = contig.length + 2*self.pad.
+ The upper 4 bits contain a lookup into seqMap, which can contain N's. This is used for giving
+ template snippets that may contain N's if the reference sequence does, or if the snippet
+ runs off the end of the reference (the padding is coded as N).
+ The lower 4 bits contain a lookup into lutReverseMap, which never contains N's (N's are coded as A's).
+ """
+
+    def __init__(self, fastaRecords, modelFile, modelIterations=-1):
+        """
+        Load the reference sequences and the ipd model into structures that can be
+        used as numpy arrays in worker processes.
+
+        fastaRecords    -- list of FastaRecords, in the cmp.h5 file order. Records whose
+                           cmph5ID is None are skipped.
+        modelFile       -- path of the HDF5 file holding the "/AllMods_GbmModel" group.
+        modelIterations -- forwarded to GbmContextModel.
+
+        If modelFile does not exist a message is logged and no model is attached, so
+        self.gbmModel, self.predictIpdFunc and self.predictManyIpdFunc are left unset.
+        """
+
+        self.pre = 10
+        self.post = 4
+
+        self.pad = 30
+        self.base4 = 4 ** np.array(range(self.pre + self.post + 1))
+
+        self.refDict = {}
+        self.refLengthDict = {}
+
+        for contig in fastaRecords:
+            if contig.cmph5ID is None:
+                # This contig has no mapped reads -- skip it
+                continue
+
+            rawSeq = contig.sequence[:]
+            refSeq = np.fromstring(rawSeq, dtype=byte)
+
+            # Store the reference length
+            self.refLengthDict[contig.cmph5ID] = len(rawSeq)
+
+            # Make a shared array
+            sa = SharedArray(dtype='B', shape=len(rawSeq) + self.pad * 2)
+            saWrap = sa.getNumpyWrapper()
+
+            # Lut Codes convert Ns to As so that we don't put Ns into the Gbm Model
+            # Seq Codes leaves Ns as Ns for getting reference snippets out
+            innerLutCodes = lutCodeMap[refSeq]
+            innerSeqCodes = seqCodeMap[refSeq]
+            innerCodes = np.bitwise_or(innerLutCodes, np.left_shift(innerSeqCodes, 4))
+
+            saWrap[self.pad:(len(rawSeq) + self.pad)] = innerCodes
+
+            # Padding codes -- the lut array is padded with 0s the sequence array is padded with N's (4)
+            outerCodes = np.left_shift(np.ones(self.pad, dtype=uint8) * 4, 4)
+            saWrap[0:self.pad] = outerCodes
+            saWrap[(len(rawSeq) + self.pad):(len(rawSeq) + 2 * self.pad)] = outerCodes
+
+            self.refDict[contig.cmph5ID] = sa
+
+        # No correction factor for IPDs everything is normalized to 1
+        self.meanIpd = 1
+
+        # Find and open the ipd model file
+        self.lutPath = modelFile
+        if os.path.exists(self.lutPath):
+            h5File = h5py.File(self.lutPath, mode='r')
+            try:
+                gbmModelGroup = h5File["/AllMods_GbmModel"]
+                # GbmContextModel copies every dataset it needs out of the group while it
+                # is being constructed, so the file can be closed as soon as it returns.
+                self.gbmModel = GbmContextModel(gbmModelGroup, modelIterations)
+            finally:
+                h5File.close()
+
+            # We always use the model -- no more LUTS
+            self.predictIpdFunc = self.predictIpdFuncModel
+            self.predictManyIpdFunc = self.predictManyIpdFuncModel
+        else:
+            logging.info("Couldn't find model file: %s" % self.lutPath)
+
+    def _loadIpdTable(self, nullModelGroup):
+        """
+        Copy the null kinetic model out of `nullModelGroup`.
+
+        "KineticValues" (must have dtype uint8) is read into a new SharedArray stored as
+        self.sharedArray; "Lut", the second-level table, is stored as self.floatLut.
+        """
+        kineticValues = nullModelGroup["KineticValues"]
+
+        # the first-level table is expected to hold uint8 entries
+        assert(kineticValues.dtype == uint8)
+
+        # Allocate shared memory sized to the dataset and fill it through its numpy wrapper
+        nEntries = kineticValues.shape[0]
+        self.sharedArray = SharedArray('B', nEntries)
+        kineticValues.read_direct(self.sharedArray.getNumpyWrapper())
+
+        # Second-level LUT
+        self.floatLut = nullModelGroup["Lut"][:]
+
+    def refLength(self, refId):
+        """Return the stored sequence length of reference `refId` (KeyError if unknown)."""
+        lengths = self.refLengthDict
+        return lengths[refId]
+
+    def cognateBaseFunc(self, refId):
+        """
+        Return a function f(tplPos, tplStrand) giving the single reference base at tplPos:
+        the base itself for strand 0, its complement for any other strand value.
+        """
+
+        # FIXME -- what is the correct strand to return?!
+        # FIXME -- what to do about padding when the snippet runs off the end of the reference
+        # how do we account for / indicate what is happening
+        refArray = self.refDict[refId].getNumpyWrapper()
+
+        def f(tplPos, tplStrand):
+            # Skip over the padding, then take the sequence code from the upper 4 bits
+            seqCode = np.right_shift(refArray[tplPos + self.pad], 4)
+
+            if tplStrand == 0:
+                # Forward strand
+                return seqMap[seqCode]
+
+            # Reverse strand
+            return seqMapComplement[seqCode]
+
+        return f
+
+    def snippetFunc(self, refId, pre, post):
+        """
+        Return a function f(tplPos, tplStrand) that yields the reference snippet covering
+        `pre` bases before and `post` bases after tplPos, read along the requested strand.
+        """
+
+        refArray = self.refDict[refId].getNumpyWrapper()
+
+        def f(tplPos, tplStrand):
+            """Closure for returning a reference snippet. The reference is padded with N's for bases falling outside the extents of the reference"""
+            # index of tplPos in the padded array
+            center = tplPos + self.pad
+
+            if tplStrand == 0:
+                # Forward strand: left-to-right window, plain bases
+                codes = refArray[(center - pre):(center + 1 + post)]
+                letters = seqMapNp
+            else:
+                # Reverse strand: right-to-left window, complemented bases
+                codes = refArray[(center + pre):(center - post - 1):-1]
+                letters = seqMapComplementNp
+
+            # sequence codes live in the upper 4 bits
+            return letters[np.right_shift(codes, 4)].tostring()
+
+        return f
+
+    def getReferenceWindow(self, refId, tplStrand, start, end):
+        """
+        Return the reference sequence between `start` and `end` as a string: the bases in
+        reference order for strand 0, the reversed complement for any other strand value.
+        """
+
+        refArray = self.refDict[refId].getNumpyWrapper()
+
+        # shift both coordinates past the leading padding
+        first = start + self.pad
+        last = end + self.pad
+
+        if tplStrand == 0:
+            # Forward strand
+            window = refArray[first:last]
+            decode = seqMap
+        else:
+            # Reverse strand
+            # NOTE(review): this slice covers (start, end] while the forward one covers
+            # [start, end) -- confirm with callers whether the shift by one is intended.
+            window = refArray[last:first:-1]
+            decode = seqMapComplement
+
+        # sequence codes live in the upper 4 bits
+        return "".join(decode[x] for x in np.right_shift(window, 4))
+
+    def predictIpdFuncLut(self, refId):
+        """
+        Return a function f(tplPos, tplStrand) that looks the expected IPD up in the
+        pre-computed table.
+
+        Each (pre+post+1) base context gets mapped to an integer by converting each
+        nucleotide to a base-4 digit (A=0, C=1, etc) and weighting the digits with
+        self.base4. That integer indexes the first-level table, whose entry (at least 1)
+        indexes the float table. Contexts near the ends of the reference are coded by
+        padding the context with 0. Requires the tables set up by _loadIpdTable.
+        """
+
+        # Materialize the numpy wrappers around the shared data
+        refArray = self.refDict[refId].getNumpyWrapper()
+        lutArray = self.sharedArray.getNumpyWrapper()
+        floatLut = self.floatLut
+
+        def f(tplPos, tplStrand):
+            # index of tplPos in the padded array
+            center = tplPos + self.pad
+
+            if tplStrand == 0:
+                # Forward strand
+                window = refArray[(center + self.pre):(center - self.post - 1):-1]
+                digits = np.bitwise_and(window, 0xf)
+            else:
+                # Reverse strand
+                window = refArray[(center - self.pre):(center + 1 + self.post)]
+                digits = 3 - np.bitwise_and(window, 0xf)
+
+            code = (self.base4 * digits).sum()
+            lutIndex = max(1, lutArray[code])
+            return floatLut[lutIndex]
+
+        return f
+
+    def predictIpdFuncModel(self, refId):
+        """
+        Return a function f(tplPos, tplStrand) that predicts the IPD at one site by
+        extracting the context snippet around the site and evaluating the GBM model on it.
+        """
+
+        # Snippet extractor for this reference; note it is built with (self.post, self.pre)
+        contextAt = self.snippetFunc(refId, self.post, self.pre)
+
+        def f(tplPos, tplStrand):
+            # The model takes a list of contexts; send a single one and unwrap the result
+            contexts = [contextAt(tplPos, tplStrand)]
+            predictions = self.gbmModel.getPredictions(contexts)
+            return predictions[0]
+
+        return f
+
+    def predictManyIpdFuncModel(self, refId):
+        """
+        Return a function fMany(sites) that predicts the IPDs of many sites in one GBM
+        model call. Each site is indexable as (tplPos, tplStrand).
+        """
+
+        # Snippet extractor for this reference; note it is built with (self.post, self.pre)
+        contextAt = self.snippetFunc(refId, self.post, self.pre)
+
+        def fMany(sites):
+            contexts = []
+            for site in sites:
+                contexts.append(contextAt(site[0], site[1]))
+            return self.gbmModel.getPredictions(contexts)
+
+        return fMany
+
+ def modPredictIpdFunc(self, refId, mod):
+ """
+ Return a function f(tplPos, relativeModPos, readStrand) that builds the context around
+ tplPos with the base at relativeModPos replaced by the modification letter `mod`.
+ The GBM prediction call is currently commented out, so f always returns 0.0.
+ """
+
+ refArray = self.refDict[refId].getNumpyWrapper()
+
+ def f(tplPos, relativeModPos, readStrand):
+
+ # skip over the padding
+ tplPos += self.pad
+
+ # Read sequence matches forward strand
+ if readStrand == 0:
+ slc = 3 - np.bitwise_and(refArray[(tplPos - self.pre):(tplPos + 1 + self.post)], 0xf)
+
+ # Reverse strand
+ else:
+ slc = np.bitwise_and(refArray[(tplPos + self.pre):(tplPos - self.post - 1):-1], 0xf)
+
+ # Modify the indicated position
+ # (slc is a fresh array produced by the arithmetic above, not a view of refArray)
+ slc[relativeModPos + self.pre] = baseToCode[mod]
+
+ # Context as a string of base letters; unused while the prediction call below is disabled
+ slcString = "".join([codeToBase[x] for x in slc])
+
+ # Get the prediction for this context
+ # return self.gbmModel.getPredictions([slcString])[0]
+ return 0.0
+
+ return f
diff --git a/kineticsTools/ipdSummary.py b/kineticsTools/ipdSummary.py
new file mode 100755
index 0000000..783d251
--- /dev/null
+++ b/kineticsTools/ipdSummary.py
@@ -0,0 +1,780 @@
+#!/usr/bin/env python
+#################################################################################
+# Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#################################################################################
+"""
+Tool for detecting DNA base-modifications from kinetic signatures.
+"""
+
+import cProfile
+import functools
+import gc
+import itertools
+import argparse
+import json
+
+import os
+import logging
+import sys
+import multiprocessing
+import time
+import threading
+import numpy as np
+import Queue
+import traceback
+from pkg_resources import Requirement, resource_filename
+
+from pbcommand.models import FileTypes, SymbolTypes, get_pbparser
+from pbcommand.cli import pbparser_runner
+from pbcommand.utils import setup_log
+from pbcore.io import AlignmentSet
+
+from kineticsTools.KineticWorker import KineticWorkerThread, KineticWorkerProcess
+from kineticsTools.ResultWriter import KineticsWriter
+from kineticsTools.ipdModel import IpdModel
+from kineticsTools.ReferenceUtils import ReferenceUtils
+
+# Version info: Perforce keyword strings and the numbers parsed out of them
+__p4revision__ = "$Revision: #1 $"
+__p4change__ = "$Change: 100972 $"
+# "$Revision: #1 $" -> second whitespace-separated field "#1" -> 1
+revNum = int(__p4revision__.strip("$").split()[1].lstrip("#"))
+# "$Change: 100972 $" -> text after the last ':' -> 100972
+changeNum = int(__p4change__.strip("$").rpartition(":")[2])
+__version__ = "2.2"
+
+class Constants(object):
+    """Identifiers and default values used when registering the ipdSummary tool and its options."""
+
+    # Tool identity and the command used to run it from a resolved tool contract
+    TOOL_ID = "kinetics_tools.tasks.ipd_summary"
+    TOOL_NAME = "ipdSummary"
+    DRIVER_EXE = "python -m kineticsTools.ipdSummary --resolved-tool-contract"
+
+    # Task option ids
+    PVALUE_ID = "kinetics_tools.task_options.pvalue"
+    MAX_LENGTH_ID = "kinetics_tools.task_options.max_length"
+    METHYL_FRACTION_ID = "kinetics_tools.task_options.compute_methyl_fraction"
+    IDENTIFY_ID = "kinetics_tools.task_options.identify"
+
+    # Option defaults
+    PVALUE_DEFAULT = 0.01
+    MAX_LENGTH_DEFAULT = 3 * 10 ** 12
+
+def _getResourcePath():
+    """Return the path pkg_resources reports for the kineticsTools 'resources' directory."""
+    requirement = Requirement.parse('kineticsTools')
+    return resource_filename(requirement, 'kineticsTools/resources')
+
+def _validateResource(func, p):
+    """
+    Basic func for validating files, dirs, etc...
+
+    Returns the absolute form of `p` when the predicate `func` accepts it,
+    otherwise raises IOError.
+    """
+    if not func(p):
+        raise IOError("Unable to find {p}".format(p=p))
+    return os.path.abspath(p)
+
+
+def _validateNoneOrResource(func, p):
+    """
+    Validate an optional resource: None is passed through untouched, anything
+    else is checked (and made absolute) by _validateResource.
+    """
+    return p if p is None else _validateResource(func, p)
+
+
+# Single-argument validators, usable as argparse `type=` callables:
+# they return the absolute path or raise IOError.
+validateFile = functools.partial(_validateResource, os.path.isfile)
+validateDir = functools.partial(_validateResource, os.path.isdir)
+
+# Variants that let None through unchanged.
+validateNoneOrFile = functools.partial(_validateNoneOrResource, os.path.isfile)
+validateNoneOrDir = functools.partial(_validateNoneOrResource, os.path.isdir)
+
+def get_parser():
+ """
+ Build and return the pbcommand parser for ipdSummary.
+ Inputs, outputs and options are registered on the tool-contract parser, the argparse
+ parser, or both; the advanced command-line-only options are added by _get_more_options.
+ """
+ p = get_pbparser(
+ tool_id=Constants.TOOL_ID,
+ version=__version__,
+ name=Constants.TOOL_NAME,
+ description=__doc__,
+ driver_exe=Constants.DRIVER_EXE,
+ is_distributed=True,
+ nproc=SymbolTypes.MAX_NPROC)
+ # Input registered through the combined parser
+ p.add_input_file_type(FileTypes.DS_ALIGN, "alignment_set",
+ "Alignment DataSet", "BAM or Alignment DataSet")
+ # FIXME just use a positional argument...
+ # The reference is a tool-contract input file but a required --reference option on the command line
+ p.tool_contract_parser.add_input_file_type(FileTypes.DS_REF, "reference",
+ "Reference DataSet", "Fasta or Reference DataSet")
+ p.arg_parser.parser.add_argument("--reference", action="store",
+ required=True,
+ type=validateFile, help="Fasta or Reference DataSet")
+ # XXX GFF and CSV are "option" for arg parser, not tool contract
+ p.tool_contract_parser.add_output_file_type(FileTypes.GFF, "gff",
+ name="GFF file",
+ description="GFF file of modified bases",
+ default_name="basemods.gff")
+ p.tool_contract_parser.add_output_file_type(FileTypes.CSV, "csv",
+ name="CSV file",
+ description="CSV file of per-nucleotide information",
+ default_name="basemods.csv")
+ p.arg_parser.parser.add_argument("--gff", action="store", default=None,
+ help="Output GFF file of modified bases")
+ p.arg_parser.parser.add_argument("--csv", action="store", default=None,
+ help="Output CSV file out per-nucleotide information")
+ # FIXME use central --nproc option
+ p.arg_parser.parser.add_argument('--numWorkers', '-j',
+ dest='numWorkers',
+ default=1,
+ type=int,
+ help='Number of thread to use (-1 uses all logical cpus)')
+ # common options
+ p.add_float(Constants.PVALUE_ID, "pvalue",
+ default=Constants.PVALUE_DEFAULT,
+ name="P-value",
+ description="P-value cutoff")
+ p.add_int(Constants.MAX_LENGTH_ID,
+ option_str="maxLength",
+ default=Constants.MAX_LENGTH_DEFAULT,
+ name="Max sequence length",
+ description="Maximum number of bases to process per contig")
+ p.add_str(Constants.IDENTIFY_ID,
+ option_str="identify",
+ default="",
+ name="Identify basemods",
+ description="Specific modifications to identify (comma-separated "+\
+ "list). Currrent options are m6A, m4C, m5C_TET. Cannot be "+\
+ "used with --control.")
+ # The same description is used for the tool-contract option and as argparse help text;
+ # argparse %-formats help strings, hence the doubled '%%'
+ _DESC = "In the --identify mode, add --methylFraction to "+\
+ "command line to estimate the methylated fraction, along with "+\
+ "95%% confidence interval bounds."
+ # FIXME tool contract parser and argparser conflict
+ p.tool_contract_parser.add_boolean(Constants.METHYL_FRACTION_ID,
+ option_str="methylFraction",
+ default=False,
+ name="Compute methyl fraction",
+ description=_DESC)
+ p.arg_parser.parser.add_argument("--methylFraction", action="store_true",
+ help=_DESC)
+ # Advanced options exist on the argparse parser only
+ _get_more_options(p.arg_parser.parser)
+ return p
+
+def _get_more_options(parser):
+    """
+    Advanced options that won't be exposed via tool contract interface.
+
+    Registers the options on `parser` (an argparse parser) and returns it.
+    """
+
+    def slurpWindowFile(fname):
+        # Join the stripped lines of the file into the comma-delimited form used by -w.
+        # The file is closed as soon as it has been read.
+        with open(fname) as windowFile:
+            return ",".join(map(str.strip, windowFile.readlines()))
+
+    def parseBool(text):
+        # argparse hands the raw string to `type`; bool("False") is True, so the value
+        # has to be interpreted explicitly.
+        value = text.strip().lower()
+        if value in ("1", "true", "t", "yes", "y", "on"):
+            return True
+        if value in ("", "0", "false", "f", "no", "n", "off"):
+            return False
+        raise argparse.ArgumentTypeError("expected a boolean value, got %r" % text)
+
+    parser.add_argument('--outfile',
+                        dest='outfile',
+                        default=None,
+                        help='Use this option to generate all possible output files. Argument here is the root filename of the output files.')
+
+    # FIXME: Need to add an extra check for this; it can only be used if --useLDA flag is set.
+    parser.add_argument('--m5Cgff',
+                        dest='m5Cgff',
+                        default=None,
+                        help='Name of output GFF file containing m5C scores')
+
+    # FIXME: Make sure that this is specified if --useLDA flag is set.
+    # '--m5Cclassifer' is the historical (misspelled) flag and is kept for backwards
+    # compatibility; '--m5Cclassifier' is the spelling used in the validateArgs error message.
+    parser.add_argument('--m5Cclassifier', '--m5Cclassifer',
+                        dest='m5Cclassifier',
+                        default=None,
+                        help='Specify csv file containing a 127 x 2 matrix')
+
+    parser.add_argument('--csv_h5',
+                        dest='csv_h5',
+                        default=None,
+                        help='Name of csv output to be written in hdf5 format.')
+
+    parser.add_argument('--pickle',
+                        dest='pickle',
+                        default=None,
+                        help='Name of output pickle file.')
+
+    parser.add_argument('--summary_h5',
+                        dest='summary_h5',
+                        default=None,
+                        help='Name of output summary h5 file.')
+
+    parser.add_argument('--ms_csv',
+                        dest='ms_csv',
+                        default=None,
+                        help='Multisite detection CSV file.')
+
+    # Calculation options:
+
+    parser.add_argument('--control',
+                        dest='control',
+                        default=None,
+                        type=validateNoneOrFile,
+                        help='cmph.h5 file containing a control sample. Tool will perform a case-control analysis')
+
+    # Temporary addition to test LDA for Ca5C detection:
+    parser.add_argument('--useLDA',
+                        action="store_true",
+                        dest='useLDA',
+                        default=False,
+                        help='Set this flag to debug LDA for m5C/Ca5C detection')
+
+    # Parameter options:
+
+    parser.add_argument('--paramsPath',
+                        dest='paramsPath',
+                        default=_getResourcePath(),
+                        type=validateNoneOrDir,
+                        help='Directory containing in-silico trained model for each chemistry')
+
+    parser.add_argument('--minCoverage',
+                        dest='minCoverage',
+                        default=3,
+                        type=int,
+                        help='Minimum coverage required to call a modified base')
+
+    parser.add_argument('--maxQueueSize',
+                        dest='maxQueueSize',
+                        default=20,
+                        type=int,
+                        help='Max Queue Size')
+
+    parser.add_argument('--maxCoverage',
+                        dest='maxCoverage',
+                        type=int, default=-1,
+                        help='Maximum coverage to use at each site')
+
+    parser.add_argument('--mapQvThreshold',
+                        dest='mapQvThreshold',
+                        type=float,
+                        default=-1.0)
+
+    parser.add_argument('--ipdModel',
+                        dest='ipdModel',
+                        default=None,
+                        help='Alternate synthetic IPD model HDF5 file')
+
+    parser.add_argument('--modelIters',
+                        dest='modelIters',
+                        type=int,
+                        default=-1,
+                        help='[Internal] Number of GBM model iteration to use')
+
+    parser.add_argument('--cap_percentile',
+                        dest='cap_percentile',
+                        type=float,
+                        default=99.0,
+                        help='Global IPD percentile to cap IPDs at')
+
+    parser.add_argument("--methylMinCov",
+                        type=int,
+                        dest='methylMinCov',
+                        default=10,
+                        help="Do not try to estimate methylFraction unless coverage is at least this.")
+
+    parser.add_argument("--identifyMinCov",
+                        type=int,
+                        dest='identifyMinCov',
+                        default=5,
+                        help="Do not try to identify the modification type unless coverage is at least this.")
+
+    parser.add_argument("--maxAlignments",
+                        type=int,
+                        dest="maxAlignments",
+                        default=1500,
+                        help="Maximum number of alignments to use for a given window")
+
+    # Computation management options:
+
+    parser.add_argument("-w", "--referenceWindow", "--referenceWindows",
+                        "--refContigs",  # backwards compatibility
+                        type=str,
+                        dest='referenceWindowsAsString',
+                        default=None,
+                        help="The window (or multiple comma-delimited windows) of the reference to " + \
+                             "be processed, in the format refGroup[:refStart-refEnd] " + \
+                             "(default: entire reference).")
+
+    parser.add_argument("--refContigIndex", type=int, dest='refContigIndex', default=-1,
+                        help="For debugging purposes only - rather than enter a reference contig name, simply enter an index")
+
+    # Shares its dest with -w: the file contents are converted to the same string form
+    parser.add_argument("-W", "--referenceWindowsFile",
+                        "--refContigsFile",  # backwards compatibility
+                        type=slurpWindowFile,
+                        dest='referenceWindowsAsString',
+                        default=None,
+                        help="A file containing reference window designations, one per line")
+
+    parser.add_argument("--skipUnrecognizedContigs",
+                        type=parseBool,
+                        default=False,
+                        help="Whether to skip, or abort, unrecognized contigs in the -w/-W flags")
+    # FIXME shouldn't it always do this?
+    parser.add_argument("--alignmentSetRefWindows",
+                        action="store_true",
+                        dest="referenceWindowsFromAlignment",
+                        help="Use refWindows in dataset")
+
+    # Debugging help options:
+
+    parser.add_argument("--threaded", "-T",
+                        action="store_true",
+                        dest="threaded",
+                        default=False,
+                        help="Run threads instead of processes (for debugging purposes only)")
+
+    parser.add_argument("--profile",
+                        action="store_true",
+                        dest="doProfiling",
+                        default=False,
+                        help="Enable Python-level profiling (using cProfile).")
+
+    parser.add_argument('--usePdb',
+                        action='store_true',
+                        dest="usePdb",
+                        default=False,
+                        help="Enable dropping down into pdb debugger if an Exception is raised.")
+
+    parser.add_argument("--seed",
+                        action="store",
+                        dest="randomSeed",
+                        type=int,
+                        default=None,
+                        help="Random seed (for development and debugging purposes only)")
+
+    # Verbosity
+    parser.add_argument("--verbose",
+                        action="store_true",
+                        default=False)
+    return parser
+
+
class KineticsToolsRunner(object):
    """
    Drives one ipdSummary run.

    Validates the parsed arguments, loads the alignment set, reference and
    IPD model, launches the worker and result-writer children, feeds them
    reference chunks through a work queue and waits for everything to finish.
    """

    def __init__(self, args):
        """
        :param args: parsed command-line namespace (as produced by the parser
                     returned from get_parser()).
        """
        self.args = args
        self._sharedAlignmentSet = None

    def start(self):
        """Validate the arguments, then run; returns whatever run() returns."""
        self.validateArgs()
        return self.run()

    def getVersion(self):
        """Return the module-level version string."""
        return __version__

    def validateArgs(self):
        """
        Check argument combinations that the parser cannot express on its own.
        Problems are reported through parser.error().
        """
        parser = get_parser()
        if not os.path.exists(self.args.alignment_set):
            parser.error('Input AlignmentSet file provided does not exist')

        if self.args.identify and self.args.control:
            parser.error('--control and --identify are mutually exclusive. Please choose one or the other')

        if self.args.useLDA:
            if self.args.m5Cclassifier is None:
                parser.error('Please specify a folder containing forward.csv and reverse.csv classifiers in --m5Cclassifier.')

        if self.args.m5Cgff:
            if not self.args.useLDA:
                parser.error('m5Cgff file can only be generated in --useLDA mode.')

        # if self.args.methylFraction and not self.args.identify:
        #    parser.error('Currently, --methylFraction only works when the --identify option is specified.')

    def run(self):
        """
        Translate --identify into modification codes, seed the RNG if asked,
        and execute the main loop (optionally under cProfile).

        :return: the value returned by _mainLoop()
        """
        # Figure out what modifications to identify
        mods = self.args.identify
        modsToCall = []
        if mods:
            items = mods.split(",")

            if 'm6A' in items:
                modsToCall.append('H')

            if 'm4C' in items:
                modsToCall.append('J')

            if 'm5C_TET' in items:
                modsToCall.append('K')

            # From here on 'identify' is a flag; the requested codes live in modsToCall
            self.args.identify = True
            self.args.modsToCall = modsToCall

        self.options = self.args
        self.options.cmdLine = " ".join(sys.argv)
        self._workers = []

        # set random seed
        # XXX note that this is *not* guaranteed to yield reproducible results
        # independently of the number of processing cores used!
        if self.options.randomSeed is not None:
            np.random.seed(self.options.randomSeed)

        ret = None
        try:
            if self.args.doProfiling:
                # runctx executes the statement inside the namespace we pass in,
                # so the main loop's return value is read back from that dict.
                profileNamespace = {"self": self}
                cProfile.runctx("ret = self._mainLoop()",
                                globals(),
                                profileNamespace,
                                filename="profile.out")
                ret = profileNamespace.get("ret")
            else:
                ret = self._mainLoop()
        finally:
            # Be sure to shutdown child processes if we get an exception on the main thread
            if not self.args.threaded:
                for w in self._workers:
                    if w.is_alive():
                        w.terminate()

        return ret

    def _initQueues(self):
        """Create the work queue and the results queue, both bounded by maxQueueSize."""
        maxSize = self.options.maxQueueSize

        # Work chunks are created by the main thread and put on this queue
        # They will be consumed by KineticWorker threads, stored in self._workers
        if self.options.threaded:
            self._workQueue = Queue.Queue(maxSize)
        else:
            self._workQueue = multiprocessing.JoinableQueue(maxSize)

        # Completed chunks are put on this queue by KineticWorker threads
        # They are consumed by the KineticsWriter process
        self._resultsQueue = multiprocessing.JoinableQueue(maxSize)

    def _launchSlaveProcesses(self):
        """
        Launch a group of worker processes (self._workers), the queue
        (self._workQueue) that will be used to send them chunks of
        work, and the queue that will be used to receive back the
        results (self._resultsQueue).

        Additionally, launch the result collector process.
        """
        availableCpus = multiprocessing.cpu_count()
        logging.info("Available CPUs: %d" % (availableCpus,))
        logging.info("Requested worker processes: %d" % (self.options.numWorkers,))

        # Use all CPUs if numWorkers < 1
        if self.options.numWorkers < 1:
            self.options.numWorkers = availableCpus

        # Warn if more workers were requested than there are CPUs
        if self.options.numWorkers > availableCpus:
            logging.warning("More worker processes requested (%d) than CPUs available (%d);"
                            " may result in suboptimal performance."
                            % (self.options.numWorkers, availableCpus))

        self._initQueues()

        if self.options.threaded:
            # Threaded (debugging) mode always runs a single worker
            self.options.numWorkers = 1
            WorkerType = KineticWorkerThread
        else:
            WorkerType = KineticWorkerProcess

        # Launch the worker processes
        self._workers = []
        for i in range(self.options.numWorkers):
            p = WorkerType(self.options, self._workQueue, self._resultsQueue,
                           self.ipdModel,
                           sharedAlignmentSet=self._sharedAlignmentSet)
            self._workers.append(p)
            p.start()
        logging.info("Launched worker processes.")

        # Launch result collector
        self._resultCollectorProcess = KineticsWriter(self.options, self._resultsQueue, self.refInfo, self.ipdModel)
        self._resultCollectorProcess.start()
        logging.info("Launched result collector process.")

        # Spawn a thread that monitors worker threads for crashes
        self.monitoringThread = threading.Thread(target=monitorChildProcesses, args=(self._workers + [self._resultCollectorProcess],))
        self.monitoringThread.start()

    def _queueChunksForWindow(self, refWindow):
        """
        Compute the chunk extents and queue up the work for a single reference

        NOTE: currently a placeholder -- it reads the window fields and queues
        nothing; _mainLoop does the chunking itself.
        """
        winId = refWindow.refId
        winStart = refWindow.start
        winEnd = refWindow.end
        pass

    def loadReferenceAndModel(self, referencePath):
        """
        Load the reference contigs and build self.ipdModel.

        Requires loadSharedAlignmentSet() to have been called first. Exits the
        process with status 1 if the requested model or parameter path is
        missing, or if the chemistry cannot be identified.

        :param referencePath: path to the reference handed to ReferenceUtils
        """
        assert self._sharedAlignmentSet is not None
        # Load the reference contigs - annotated with their refID from the cmp.h5
        logging.info("Loading reference contigs %s" % referencePath)
        contigs = ReferenceUtils.loadReferenceContigs(referencePath,
                                                      alignmentSet=self._sharedAlignmentSet)

        # There are three different ways the ipdModel can be loaded.
        # In order of precedence they are:
        #  1. Explicit path passed to --ipdModel
        #  2. Path to parameter bundle, model selected using the cmp.h5's sequencingChemistry tags
        #  3. Fall back to built-in model.

        # By default, use built-in model
        ipdModel = None

        if self.args.ipdModel:
            ipdModel = self.args.ipdModel
            logging.info("Using passed in ipd model: %s" % self.args.ipdModel)
            if not os.path.exists(self.args.ipdModel):
                logging.error("Couldn't find model file: %s" % self.args.ipdModel)
                sys.exit(1)
        elif self.args.paramsPath:
            if not os.path.exists(self.args.paramsPath):
                logging.error("Params path doesn't exist: %s" % self.args.paramsPath)
                sys.exit(1)

            majorityChem = ReferenceUtils.loadAlignmentChemistry(
                self._sharedAlignmentSet)
            ipdModel = os.path.join(self.args.paramsPath, majorityChem + ".h5")
            if majorityChem == 'unknown':
                logging.error("Chemistry cannot be identified---cannot perform kinetic analysis")
                sys.exit(1)
            elif not os.path.exists(ipdModel):
                logging.error("Aborting, no kinetics model available for this chemistry: %s" % ipdModel)
                sys.exit(1)
            else:
                logging.info("Using Chemistry matched IPD model: %s" % ipdModel)

        self.ipdModel = IpdModel(contigs, ipdModel, self.args.modelIters)

    def loadSharedAlignmentSet(self, cmpH5Filename):
        """
        Read the input AlignmentSet so the indices can be shared with the
        slaves.  This is also used to pass to ReferenceUtils for setting up
        the ipdModel object.
        """
        logging.info("Reading AlignmentSet: %s" % cmpH5Filename)
        logging.info("           reference: %s" % self.args.reference)
        self._sharedAlignmentSet = AlignmentSet(cmpH5Filename,
                                                referenceFastaFname=self.args.reference)
        # XXX this should ensure that the file(s) get opened, including any
        # .pbi indices - but need to confirm this
        self.refInfo = self._sharedAlignmentSet.referenceInfoTable

    def _mainLoop(self):
        """
        Main loop
        First launch the worker and writer processes
        Then we loop over ReferenceGroups in the cmp.h5.  For each contig we will:
        1. Load the sequence into the main memory of the parent process
        2. Chunk up the contig and submit the chunk descriptions to the work queue
        Finally, wait for the writer process to finish.

        :return: 0 on completion
        """

        # This looks scary but it's not.  Python uses reference
        # counting and has a secondary, optional garbage collector for
        # collecting garbage cycles.  Unfortunately when a cyclic GC
        # happens when a thread is calling cPickle.dumps, the
        # interpreter crashes sometimes.  See Bug 19704.  Since we
        # don't leak garbage cycles, disabling the cyclic GC is
        # essentially harmless.
        gc.disable()

        # Load a copy of the cmpH5 alignment index to share with the slaves
        self.loadSharedAlignmentSet(self.args.alignment_set)

        # Load reference and IpdModel
        self.loadReferenceAndModel(self.args.reference)

        # Spawn workers
        self._launchSlaveProcesses()

        # WARNING -- cmp.h5 file must be opened AFTER worker processes have been spawned
        # cmp.h5 we're using -- use this to orchestrate the work
        self.cmph5 = self._sharedAlignmentSet
        logging.info('Generating kinetics summary for [%s]' % self.args.alignment_set)

        #self.referenceMap = self.cmph5['/RefGroup'].asDict('RefInfoID', 'ID')
        #self.alnInfo = self.cmph5['/AlnInfo'].asRecArray()

        # Resolve the windows that will be visited.
        if self.args.referenceWindowsAsString is not None:
            self.referenceWindows = []
            for s in self.args.referenceWindowsAsString.split(","):
                try:
                    win = ReferenceUtils.parseReferenceWindow(s, self.cmph5.referenceInfo)
                    self.referenceWindows.append(win)
                except Exception:
                    # 'except Exception' (not a bare except) so Ctrl-C and
                    # SystemExit are never mistaken for a bad window.
                    if self.args.skipUnrecognizedContigs:
                        continue
                    else:
                        logging.error("Could not resolve reference window: %s" % s)
                        raise Exception("Unrecognized contig!")
        elif self.args.referenceWindowsFromAlignment:
            self.referenceWindows = ReferenceUtils.referenceWindowsFromAlignment(self._sharedAlignmentSet, self.cmph5.referenceInfo)
        else:
            self.referenceWindows = ReferenceUtils.createReferenceWindows(
                self.refInfo)

        # Each work item is a (sequence number, chunk) pair
        self.workChunkCounter = 0

        # Iterate over references
        for window in self.referenceWindows:
            logging.info('Processing window/contig: %s' % (window,))
            for chunk in ReferenceUtils.enumerateChunks(1000, window):
                self._workQueue.put((self.workChunkCounter, chunk))
                self.workChunkCounter += 1

        # Shutdown worker threads with None sentinels
        for i in range(self.args.numWorkers):
            self._workQueue.put(None)

        for w in self._workers:
            w.join()

        # Join on the result queue and the resultsCollector process.
        # This ensures all the results are written before shutdown.
        self.monitoringThread.join()
        self._resultsQueue.join()
        self._resultCollectorProcess.join()
        logging.info("ipdSummary.py finished. Exiting.")
        del self.cmph5
        return 0
+
+
def monitorChildProcesses(children):
    """
    Poll the given child processes once per second.

    If any child reports a nonzero exit code, log it, terminate every child
    that is still alive and hard-exit the whole program with that code.
    Returns 0 once no child is alive and none failed.

    Polling is used because it is portable -- catching SIGCHLD doesn't work
    on Windows.
    """
    while True:
        stillRunning = any(child.is_alive() for child in children)
        failures = [child.exitcode for child in children if child.exitcode]

        if failures:
            exitcode = failures[0]
            logging.error("Child process exited with exitcode=%d.  Aborting." % exitcode)

            # Kill all the child processes
            for child in children:
                if child.is_alive():
                    child.terminate()

            os._exit(exitcode)

        if not stillRunning:
            return 0

        time.sleep(1)
+
def args_runner(args):
    """
    Set the root logger level from args.verbose (INFO when set, WARN
    otherwise), then run KineticsToolsRunner and return its result.
    """
    level = logging.INFO if args.verbose else logging.WARN
    logging.getLogger().setLevel(level)
    return KineticsToolsRunner(args).start()
+
def resolved_tool_contract_runner(resolved_contract):
    """
    Run ipdSummary from a resolved tool contract.

    The contract's files and options are turned into an argv-style list,
    parsed with the regular argparse parser and passed to args_runner.

    :param resolved_contract:
    :type resolved_contract: ResolvedToolContract
    :return: Exit code
    """
    task = resolved_contract.task
    inputs = task.input_files
    outputs = task.output_files

    # Positional alignment set first, then the fixed set of flags
    cli = [inputs[0]]
    cli += ["--reference", inputs[1]]
    cli += ["--gff", outputs[0]]
    cli += ["--csv", outputs[1]]
    cli += ["--numWorkers", str(task.nproc)]
    cli += ["--pvalue", str(task.options[Constants.PVALUE_ID])]
    cli.append("--alignmentSetRefWindows")

    if "PACBIO_TEST_ENV" not in os.environ:
        cli.append("--verbose")  # we need this for pbsmrtpipe debugging

    maxLength = task.options[Constants.MAX_LENGTH_ID]
    if maxLength:
        cli += ["--maxLength", str(maxLength)]

    if task.options[Constants.METHYL_FRACTION_ID]:
        cli.append("--methylFraction")

    identify = task.options[Constants.IDENTIFY_ID]
    if identify:
        cli += ["--identify", identify]

    parsed = get_parser().arg_parser.parser.parse_args(cli)
    return args_runner(parsed)
+
def main(argv=sys.argv, out=sys.stdout):
    """
    Command-line entry point for ipdSummary.

    Configures logging and hands control to pbparser_runner. If an exception
    escapes, the traceback is printed to stderr; when the debugger flag is on
    the command line a post-mortem debugger session is opened, otherwise the
    exception is re-raised.

    :param argv: full argument vector, including the program name
    :param out: unused here; kept so existing callers are unaffected
    :return: exit code from pbparser_runner, or 1 after a post-mortem session
    """
    # Log generously
    logFormat = '%(asctime)s [%(levelname)s] %(message)s'
    logging.basicConfig(format=logFormat, level=logging.WARN)
    log = logging.getLogger()
    try:
        return pbparser_runner(
            argv=argv[1:],
            parser=get_parser(),
            args_runner_func=args_runner,
            contract_runner_func=resolved_tool_contract_runner,
            alog=log,
            setup_log_func=setup_log)
    # FIXME is there a more central place to deal with this?
    except Exception:
        tb = sys.exc_info()[2]
        traceback.print_exc(file=sys.stderr)
        # The raw argv is inspected because argument parsing itself may be
        # what failed, in which case no parsed args object exists yet.
        # The registered flag is --usePdb; --pdb is still honoured for
        # backwards compatibility.
        if '--usePdb' in argv or '--pdb' in argv:
            try:
                # this has better integration with ipython and is nicer
                # pip install ipdb
                import ipdb as debugger
            except ImportError:
                import pdb as debugger
            debugger.post_mortem(tb)
            # The run still failed: report that to the caller / shell
            return 1
        else:
            # exit non-zero
            raise


if __name__ == "__main__":
    sys.exit(main())
diff --git a/kineticsTools/pipelineTools.py b/kineticsTools/pipelineTools.py
new file mode 100755
index 0000000..9bebc1f
--- /dev/null
+++ b/kineticsTools/pipelineTools.py
@@ -0,0 +1,49 @@
+#################################################################################
+# Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#################################################################################
+
+
def consumer(func):
    """
    Decorator for generator-based coroutines.

    The wrapped callable creates the generator and advances it to its first
    ``yield`` so that callers can immediately ``send()`` values into it.
    """
    def start(*args, **kwargs):
        c = func(*args, **kwargs)
        # Builtin next() works on Python 2.6+ and 3; the .next() method
        # exists only on Python 2 generators.
        next(c)
        return c
    return start
+
+
def broadcast(source, consumers):
    """
    Send every item of ``source`` to each object in ``consumers`` via its
    ``send()`` method, one item at a time and in consumer order.
    """
    for element in source:
        for sink in consumers:
            sink.send(element)
+
+
def concat(sources):
    """
    Lazily yield the items of each iterable in ``sources``, one iterable
    after another.
    """
    for iterable in sources:
        for element in iterable:
            yield element
diff --git a/kineticsTools/resources/C2.h5 b/kineticsTools/resources/C2.h5
new file mode 100644
index 0000000..24f7aa9
Binary files /dev/null and b/kineticsTools/resources/C2.h5 differ
diff --git a/kineticsTools/resources/P4-C2.h5 b/kineticsTools/resources/P4-C2.h5
new file mode 100644
index 0000000..07257d7
Binary files /dev/null and b/kineticsTools/resources/P4-C2.h5 differ
diff --git a/kineticsTools/resources/P5-C3.h5 b/kineticsTools/resources/P5-C3.h5
new file mode 100644
index 0000000..2ae6398
Binary files /dev/null and b/kineticsTools/resources/P5-C3.h5 differ
diff --git a/kineticsTools/resources/P6-C4.h5 b/kineticsTools/resources/P6-C4.h5
new file mode 100644
index 0000000..cf4cfd1
Binary files /dev/null and b/kineticsTools/resources/P6-C4.h5 differ
diff --git a/kineticsTools/resources/XL-C2.h5 b/kineticsTools/resources/XL-C2.h5
new file mode 100644
index 0000000..1faba2c
Binary files /dev/null and b/kineticsTools/resources/XL-C2.h5 differ
diff --git a/kineticsTools/resources/XL-XL.h5 b/kineticsTools/resources/XL-XL.h5
new file mode 100644
index 0000000..33f0835
Binary files /dev/null and b/kineticsTools/resources/XL-XL.h5 differ
diff --git a/kineticsTools/resources/p4_c2_arabidopsis_2_2_binary_classifier.csv b/kineticsTools/resources/p4_c2_arabidopsis_2_2_binary_classifier.csv
new file mode 100644
index 0000000..31c42b8
--- /dev/null
+++ b/kineticsTools/resources/p4_c2_arabidopsis_2_2_binary_classifier.csv
@@ -0,0 +1,127 @@
+-0.218870747,-0.252296538
+0.009324668,0.013659583
+0.139073625,0.004327513
+-0.010024067,0.002370422
+0.001229129,0.013383895
+0.231935545,-0.012581095
+-0.133892317,0.030021062
+0.072941839,0.014663078
+0.200562058,0.012691494
+0.336882256,0.068263955
+-0.161984698,0.008037882
+0.156208948,0.151879085
+0.017805064,-0.174598072
+0.084943053,0.343678448
+0.021894413,0.189624168
+0.027199322,0.046792204
+0.038449519,-0.157653145
+-0.004553913,0.233520756
+0.017739224,-0.00656977
+0.00040477,-0.009147228
+0.018913452,0.141442149
+0.01723663,0.005939711
+-0.001509455,-0.026567498
+-0.091656331,-0.042768396
+0.006739003,-0.023139076
+0.006679003,-0.034187465
+-0.200488411,-0.05390173
+0.011096867,-0.026677887
+0.040737686,-0.019861829
+-0.036645252,0.012537116
+-0.182446634,-0.086781613
+-0.021607461,-0.059958426
+-0.153131769,-0.162308175
+-0.058371593,-0.030380554
+-0.085603514,-0.169454223
+0.014903424,-0.044905908
+-0.021743656,0.028112579
+-0.021922745,-0.000132207
+-0.048161677,-0.205708718
+-0.037487143,0.003347113
+-0.032050087,-0.003627029
+-0.036263561,-0.091881356
+-0.04104759,-0.010528393
+0.006027174,0.009387437
+-0.027318759,0.008528345
+-0.010480415,0.003833058
+0.027518674,0.002791557
+0.004471273,0.009885338
+0.05319448,-0.001450885
+0.016878561,0.013034766
+-0.02470007,0.01688694
+-0.041653994,0.00370469
+0.005170487,0.020647149
+-0.023842272,-0.022235403
+0.013998305,0.02461326
+-0.001358025,-0.053077243
+0.007812159,-0.019603953
+0.013817534,0.02176098
+-0.004356039,0.063094906
+0.007357544,0.005873228
+-0.001224538,0.024301919
+0.004310251,-4.63E-06
+0.004540021,-0.026968278
+0.002385904,0.014666919
+-0.067739851,-0.021594789
+0.170076815,-0.018122064
+-0.063900244,-0.022972143
+-0.077113697,-0.026427232
+0.339599231,-0.032022644
+-0.261863089,-0.013452093
+-0.025196034,-0.017754203
+0.192768366,-0.070623944
+0.356266243,0.075288945
+-0.252447128,-0.022874061
+0.192092776,0.200421851
+-0.012982893,-0.264650804
+0.086885173,0.350656269
+-0.088931805,0.181672619
+-0.027270774,-0.038357158
+-0.026733385,-0.229478971
+-0.029584447,0.348460548
+-0.024587521,-0.071602598
+-0.047511176,-0.061912675
+-0.021336366,0.185451741
+-0.024487417,-0.0420043
+0.010834123,0.040227081
+0.230729956,0.047095909
+-0.01676307,0.025509498
+-0.005449874,0.04757136
+0.432423956,0.041320635
+-0.144989185,0.056698948
+0.032204153,0.034524907
+0.237207309,0.000154378
+0.51932889,0.155045568
+-0.140377237,0.067996309
+0.309340717,0.31418726
+0.076176657,-0.144217517
+0.170546567,0.513132671
+0.006990989,0.234530076
+0.048942978,0.018679625
+0.060372264,-0.157520938
+0.043607765,0.439229474
+0.055226367,-0.009916884
+0.032454858,-0.005520198
+0.055177013,0.233323504
+0.058284221,0.016468104
+0.009263449,0.009841781
+-0.030901101,0.010728086
+-0.006903412,0.002724818
+0.02996559,0.004538181
+-0.005128673,0.010166215
+0.058584076,-0.000427652
+0.017853652,0.012975994
+-0.024880741,0.01962102
+-0.04452473,0.003296791
+0.010087771,0.021114386
+-0.02643034,-0.024186631
+0.013735312,0.030075644
+-0.002309233,-0.055766848
+0.00963127,-0.02000103
+0.014702049,0.022520047
+-0.004204991,0.067413949
+0.008792223,-0.002654732
+-0.000693335,0.026412245
+0.005814738,0.003737305
+0.0056328,-0.031868488
+0.003960519,0.016194588
diff --git a/kineticsTools/resources/p4_c2_medaka_2_binary_classifier.csv b/kineticsTools/resources/p4_c2_medaka_2_binary_classifier.csv
new file mode 100644
index 0000000..f5ab9df
--- /dev/null
+++ b/kineticsTools/resources/p4_c2_medaka_2_binary_classifier.csv
@@ -0,0 +1,127 @@
+0.51596517,0.492119279
+0.033244393,0.061960192
+0.190050033,0.07724606
+0.050472699,0.051253638
+0.03484651,0.074793294
+0.274164546,0.054307813
+-0.053919259,0.052039278
+0.000916289,0.050683811
+0.313818371,0.058962758
+0.354856114,0.133637498
+-0.013269899,0.095682742
+0.15862209,0.144413298
+0.10145909,0.005189001
+0.122764215,0.353988031
+0.074612792,0.315113798
+0.042484104,0.026806174
+0.051312336,-0.038696728
+0.046116639,0.277172933
+0.066033022,0.036370075
+0.051521887,0.053583259
+0.053644854,0.201273742
+0.064185234,0.033201108
+0.036569791,0.038506794
+0.013502063,0.049026131
+0.047648606,0.042746656
+0.084075431,0.041432757
+-0.11103893,0.030791378
+0.015979415,0.035440508
+-0.00392581,0.027834947
+0.168791058,0.043659745
+-0.030639907,-0.033777346
+0.166435199,0.042538152
+-0.157416958,-0.173785499
+0.047167848,0.159901009
+-0.040307089,-0.021491724
+0.049709494,0.165602053
+0.025628597,0.00552635
+0.0432618,0.016856426
+0.008143651,-0.105205315
+0.040731462,0.085655752
+0.031970515,0.048387649
+0.044793904,0.03556469
+0.03182507,0.057174611
+0.002993061,-0.024733883
+-0.056141071,-0.022198043
+-0.025265651,-0.024136111
+0.029004868,-0.035583871
+-0.03059251,-0.020038671
+0.051245849,-0.021048014
+-0.004330038,-0.014506825
+-0.067422923,-0.014351906
+-0.059883638,-0.034066748
+0.035698984,-0.000371064
+-0.053383378,-0.059245859
+-0.000775527,0.022136995
+-0.021382978,-0.065390222
+-0.012627214,-0.079083255
+-0.010989587,-0.019282829
+-0.014839439,0.037083375
+-0.013747,-0.037679005
+-0.024341655,0.030759433
+-0.022409187,-0.035544417
+-0.013760928,-0.069526323
+-0.017670613,-0.019340712
+-0.047259529,0.023905666
+0.15746913,0.003672149
+-0.025278789,-0.015711883
+-0.066790302,-0.000344686
+0.344708178,-0.018705273
+-0.141986923,0.008983992
+0.022559771,0.003192303
+0.179348456,-0.031655483
+0.307530374,0.096549919
+-0.202572131,-0.016684057
+0.177746887,0.214684354
+-0.032879363,-0.196683971
+0.076079142,0.298400985
+-0.053825153,0.189464778
+-0.008235229,-0.004655509
+-0.002104796,-0.110291414
+-0.012813954,0.316057427
+0.01711505,-0.054533252
+-0.028319053,-0.022486186
+-0.021754743,0.164912854
+0.0071595,-0.032270022
+-0.003325397,0.023453398
+0.17654797,0.028219929
+0.002824093,0.008506981
+-0.04922892,0.033360537
+0.385203477,0.023516436
+-0.069898674,0.01659877
+0.004842098,0.022848864
+0.145027313,0.015303013
+0.385496021,0.167414844
+-0.179705098,0.05314459
+0.316039048,0.318198797
+0.054291242,-0.154712008
+0.163071303,0.375479755
+0.024903298,0.149511745
+0.016855507,0.021279824
+0.008050536,-0.055553153
+0.037972988,0.382378247
+0.02530156,-0.049285676
+0.019551372,0.00519561
+0.00885095,0.165709053
+0.032360164,-0.023973502
+-0.000847717,-0.028408572
+-0.060053632,-0.023689772
+-0.028183122,-0.025223134
+0.032518964,-0.031836728
+-0.041653442,-0.020359697
+0.050424451,-0.022486612
+-0.011566061,-0.016965572
+-0.06391329,-0.019035785
+-0.056769378,-0.037838451
+0.032809697,0.000554223
+-0.064850447,-0.067641001
+0.000704702,0.021263061
+-0.027349863,-0.060079548
+-0.012936899,-0.073479743
+-0.011051679,-0.025649713
+-0.017112687,0.039532864
+-0.016648634,-0.035849036
+-0.025995846,0.030533353
+-0.021676951,-0.036789507
+-0.014375702,-0.068707132
+-0.019733061,-0.018526155
diff --git a/kineticsTools/resources/unknown.h5 b/kineticsTools/resources/unknown.h5
new file mode 100644
index 0000000..5162e73
Binary files /dev/null and b/kineticsTools/resources/unknown.h5 differ
diff --git a/kineticsTools/sharedArray.py b/kineticsTools/sharedArray.py
new file mode 100755
index 0000000..4a09c93
--- /dev/null
+++ b/kineticsTools/sharedArray.py
@@ -0,0 +1,51 @@
+#################################################################################
+# Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#################################################################################
+
+from multiprocessing.sharedctypes import RawArray
+import warnings
+import numpy as np
+
+
class SharedArray:

    """
    Thin holder for a multiprocessing RawArray, plus a helper that exposes
    the same memory as a numpy array.
    """

    def __init__(self, dtype, shape):
        # Both arguments are passed straight through to RawArray
        self._rawArray = RawArray(dtype, shape)

    def getNumpyWrapper(self):
        """
        Return a numpy array backed by the raw shared array.

        Any warnings raised while numpy builds the wrapper are suppressed.
        """
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            wrapped = np.ctypeslib.as_array(self._rawArray)
        return wrapped
diff --git a/kineticsTools/summarizeModifications.py b/kineticsTools/summarizeModifications.py
new file mode 100755
index 0000000..59c6257
--- /dev/null
+++ b/kineticsTools/summarizeModifications.py
@@ -0,0 +1,201 @@
+#!/usr/bin/env python
+#################################################################################
+# Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#################################################################################
+
+"""
+Summarizes kinetic modifications in the alignment_summary.gff file.
+"""
+
+import cProfile
+from itertools import groupby
+import os
+import logging
+import sys
+
+from pbcommand.models import FileTypes, get_pbparser
+from pbcommand.cli import pbparser_runner
+from pbcommand.common_options import add_debug_option
+from pbcommand.utils import setup_log
+from pbcore.io import GffReader, Gff3Record
+
+# Version info...
+__version__ = "1.0"
+
+
class Constants(object):
    """Identifiers handed to get_pbparser() when the tool's parser is built."""
    # Tool id (also used as the parser's display name)
    TOOL_ID = "kinetics_tools.tasks.summarize_modifications"
    # Command line used to run this module from a resolved tool contract
    DRIVER_EXE = "python -m kineticsTools.summarizeModifications --resolved-tool-contract"
+
+
class ModificationSummary(object):
    """
    Annotates the 'region' records of an alignment summary GFF with
    per-strand counts of the modification calls found in a modifications GFF.
    """

    # Event types that are counted; this order is also the column order of
    # the comma-separated modsfwd / modsrev attributes.
    _KNOWN_MODIFICATION_EVENTS = ("modified_base", "m6A", "m4C", "m5C")

    def __init__(self, modifications, alignmentSummary, outfile):
        """
        :param modifications: path of the modifications GFF to read
        :param alignmentSummary: path of the alignment summary GFF to read
        :param outfile: path of the annotated summary GFF to write
        """
        self.modifications = modifications
        self.alignmentSummary = alignmentSummary
        self.outfile = outfile
        # Set here as well as in run() so countModificationTypes() is usable
        # on a freshly constructed object.
        self.knownModificationEvents = list(self._KNOWN_MODIFICATION_EVENTS)

    def run(self, profile=False):
        """
        Execute the summary, optionally under cProfile (written to profile.out).

        :return: 0
        """
        self.knownModificationEvents = list(self._KNOWN_MODIFICATION_EVENTS)
        if profile:
            cProfile.runctx("self._mainLoop()",
                            globals=globals(),
                            locals=locals(),
                            filename="profile.out")
            return 0
        else:
            return self._mainLoop()

    def countModificationTypes(self, mods):
        """
        Count modification hits per event type.

        :param mods: iterable of dicts, each with a "type" key
        :return: dict mapping type to count; every known event type is
                 present, with 0 when it has no hits
        """
        counts = dict((x, 0) for x in self.knownModificationEvents)
        for mod in mods:
            kind = mod["type"]
            counts[kind] = counts.get(kind, 0) + 1
        return counts

    def _mainLoop(self):
        """
        Copy the alignment summary to the output file, injecting the
        tool-specific headers after the existing header lines and adding
        modsfwd / modsrev attributes to every 'region' record.

        :return: 0
        """
        # Read in the existing modifications.gff
        modReader = GffReader(self.modifications)

        headerString = ",".join(['"' + x + '"' for x in self.knownModificationEvents])

        # Set up some additional headers to be injected
        headers = [
            ('source', 'kineticModificationCaller 1.3.3'),
            ('source-commandline', " ".join(sys.argv)),
            ('attribute-description', 'modsfwd - count of detected DNA modifications on forward strand by modification event type'),
            ('attribute-description', 'modsrev - count of detected DNA modifications on reverse strand by modification event type'),
            ('region-modsfwd', headerString),
            ('region-modsrev', headerString),
        ]

        # Get modification calls, grouped by sequence id so that each region
        # only has to scan the hits on its own sequence
        hitsBySeqid = {}
        for x in modReader:
            if x.type in self.knownModificationEvents:
                hit = {"pos": x.start, "strand": x.strand, "seqid": x.seqid, "type": x.type}
                hitsBySeqid.setdefault(x.seqid, []).append(hit)

        self.seqMap = {}
        inHeader = True

        # Summary reader / modified gff writer; both are closed on exit,
        # including when a record fails to parse
        with open(self.alignmentSummary) as summaryFile:
            with open(self.outfile, "w") as summaryWriter:
                for line in summaryFile:
                    # Pass any metadata line straight through
                    if line[0] == "#":

                        # Parse headers
                        splitFields = line.replace('#', '').split(' ')
                        field = splitFields[0]
                        value = " ".join(splitFields[1:])
                        if field == 'sequence-header':
                            [internalTag, delim, externalTag] = value.strip().partition(' ')
                            self.seqMap[internalTag] = externalTag
                        summaryWriter.write(line.strip() + "\n")
                        continue

                    if inHeader:
                        # We are at the end of the header -- write the tool-specific headers
                        for field in headers:
                            summaryWriter.write(("##%s %s" % field) + "\n")
                        inHeader = False

                    # Parse the line
                    rec = Gff3Record.fromString(line)

                    if rec.type == 'region':
                        # Get the hits in this interval, add them to the gff record
                        intervalHits = [h for h in hitsBySeqid.get(rec.seqid, ())
                                        if rec.start <= h['pos'] <= rec.end]

                        cFwd = self.countModificationTypes([h for h in intervalHits if h['strand'] == '+'])
                        cRev = self.countModificationTypes([h for h in intervalHits if h['strand'] == '-'])

                        rec.modsfwd = ",".join([str(cFwd[x]) for x in self.knownModificationEvents])
                        rec.modsrev = ",".join([str(cRev[x]) for x in self.knownModificationEvents])

                    summaryWriter.write(str(rec) + "\n")
        return 0
+
+
def args_runner(args):
    """Build a ModificationSummary from parsed command-line args and run it."""
    summary = ModificationSummary(
        modifications=args.modifications,
        alignmentSummary=args.alignmentSummary,
        outfile=args.outfile)
    return summary.run()
+
def resolved_tool_contract_runner(resolved_tool_contract):
    """
    Build a ModificationSummary from a resolved tool contract and run it.

    The first two input files are the modifications GFF and the alignment
    summary GFF; the first output file receives the result.
    """
    task = resolved_tool_contract.task
    summary = ModificationSummary(
        modifications=task.input_files[0],
        alignmentSummary=task.input_files[1],
        outfile=task.output_files[0])
    return summary.run()
+
def get_parser():
    """
    Build the pbcommand parser for this tool: two GFF inputs (modifications,
    alignment summary) and one GFF output.
    """
    parser = get_pbparser(
        tool_id=Constants.TOOL_ID,
        version=__version__,
        name=Constants.TOOL_ID,
        description=__doc__,
        driver_exe=Constants.DRIVER_EXE)

    # Inputs
    parser.add_input_file_type(
        FileTypes.GFF, "modifications",
        name="GFF file",
        description="Base modification GFF file")
    parser.add_input_file_type(
        FileTypes.GFF, "alignmentSummary",
        name="GFF file",
        description="Alignment summary GFF")

    # Output
    parser.add_output_file_type(
        FileTypes.GFF, "gff_out",
        name="GFF file",
        description="Modified alignment summary file",
        default_name="alignment_summary_with_basemods.gff")
    return parser
+
def main(argv=sys.argv):
    """
    Command-line entry point: configure logging and dispatch through
    pbparser_runner.

    :param argv: full argument vector, including the program name
    :return: exit code reported by pbparser_runner
    """
    mp = get_parser()
    logFormat = '%(asctime)s [%(levelname)s] %(message)s'
    logging.basicConfig(level=logging.INFO, format=logFormat)
    # Attach a stdout handler to the root logger alongside the basicConfig one
    stdOutHandler = logging.StreamHandler(sys.stdout)
    logging.Logger.root.addHandler(stdOutHandler)
    log = logging.getLogger()
    return pbparser_runner(
        argv=argv[1:],
        parser=mp,
        args_runner_func=args_runner,
        contract_runner_func=resolved_tool_contract_runner,
        alog=log,
        setup_log_func=setup_log)


if __name__ == "__main__":
    # Propagate the runner's exit code to the shell instead of dropping it
    sys.exit(main())
diff --git a/kineticsTools/tree_predict.c b/kineticsTools/tree_predict.c
new file mode 100644
index 0000000..ae394f0
--- /dev/null
+++ b/kineticsTools/tree_predict.c
@@ -0,0 +1,212 @@
+// Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following
+// disclaimer in the documentation and/or other materials provided
+// with the distribution.
+//
+// * Neither the name of Pacific Biosciences nor the names of its
+// contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// Description:
+// Fast GBM tree predict routines
+//
+// Author: Patrick Marks (pmarks at pacificbiosciences.com)
+//
+
+#include <assert.h>
+#ifndef __APPLE__
+#include <malloc.h>
+#endif
+#include <string.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdint.h>
+
+// Trivial exported entry point: ignores its argument and always returns 1.
+int init_native(int a)
+{
+    (void)a;  // the argument is not used
+    return 1;
+}
+
+#ifndef isnan
+// Fallback used only when no isnan macro is already defined.
+// NaN is the only value that compares unequal to itself.
+// `static inline` guarantees a definition in this translation unit (a plain
+// C99 `inline` function emits no external definition), and the result is an
+// int because `bool` is not declared without <stdbool.h>.
+static inline int isnan(double x) {
+    return x != x;
+}
+#endif
+
+#ifndef min
+// Smaller of two ints. `static inline` guarantees a definition in this
+// translation unit; a plain C99 `inline` function provides no external
+// definition, so any call the compiler does not inline would fail to link.
+static inline int min(int x, int y) {
+    return (x > y) ? y : x;
+}
+#endif
+
+/// Score nCtxs observations with a tree ensemble: radPredF[i] starts at initialValue and accumulates one leaf value per tree.
+/// Per-node arrays (left, right, missing, splitCode, splitVar) are indexed by t*treeSize + node; child links are relative to the tree start.
+/// splitVar == -1 marks a leaf, whose splitCode is the value added to the score. dataMatrix is indexed [variable][observation].
+/// varTypes[v] == 0 means variable v is continuous; otherwise it is categorical and cSplits holds maxCSplitSize indicators per split.
+void innerPredict(float radPredF[], float **dataMatrix, int nCtxs, int left[], int right[], int missing[], float splitCode[], int splitVar[], int cSplits[], int varTypes[], float initialValue, int treeSize, int numTrees, int maxCSplitSize)
+{
+    // Tile sizes for the blocked loops below (presumably chosen for cache locality).
+    int tStep = 50;
+    int obsStep = 60;
+    // Every score starts from initialValue.
+    for(int i = 0; i < nCtxs; i++)
+    {
+        radPredF[i] = initialValue;
+    }
+    // The outer two loops pick a (tree tile, observation tile); the inner two visit every (tree, observation) pair inside it.
+    for (int t0 = 0; t0 < numTrees; t0 += tStep)
+    {
+        for (int obs0 = 0; obs0 < nCtxs; obs0 += obsStep)
+        {
+            for (int t = t0; t < min(t0 + tStep, numTrees); t++)
+            {
+                int offset = t * treeSize; // first node slot of tree t
+
+                for (int iObs = obs0; iObs < min(obs0 + obsStep, nCtxs); iObs++)
+                {
+                    int iCurrentNode = 0; // node index relative to offset; 0 is the root
+                    while (splitVar[offset + iCurrentNode] != -1)
+                    {
+                        float dX = dataMatrix[splitVar[offset + iCurrentNode]][iObs];
+                        // missing? a NaN input follows the node's "missing" branch
+                        if (isnan(dX))
+                        {
+                            iCurrentNode = missing[offset + iCurrentNode];
+                        }
+                        // continuous? go left when the value is below the split threshold, right otherwise
+                        else if (varTypes[splitVar[offset + iCurrentNode]] == 0)
+                        {
+                            if (dX < splitCode[offset + iCurrentNode])
+                            {
+                                iCurrentNode = left[offset + iCurrentNode];
+                            }
+                            else
+                            {
+                                iCurrentNode = right[offset + iCurrentNode];
+                            }
+                        }
+                        else // categorical: splitCode selects a row of cSplits, the level (int)dX selects the entry in it
+                        {
+                            int iCatSplitIndicator = cSplits[((int)splitCode[offset + iCurrentNode]*maxCSplitSize) + (int)dX];
+                            if (iCatSplitIndicator == -1) // indicator -1 sends the level left
+                            {
+                                iCurrentNode = left[offset + iCurrentNode];
+                            }
+                            else if (iCatSplitIndicator == 1) // indicator 1 sends the level right
+                            {
+                                iCurrentNode = right[offset + iCurrentNode];
+                            }
+                            else // categorical level not present in training
+                            {
+                                iCurrentNode = missing[offset + iCurrentNode];
+                            }
+                        }
+                    }
+                    radPredF[iObs] += (float)splitCode[offset + iCurrentNode]; // add the prediction
+                } // iObs
+            } // t
+        } // obs0
+    } // t0
+}
+
+
+// Maps a 3-bit (possibly modified) base code to its canonical base code (0-3).
+static uint32_t modToCanonicalMap[8] = { 0, 1, 2, 3, 0, 1, 1, 1 };
+
+//
+// Score nCtxs packed sequence contexts with a tree ensemble.
+//
+//   ctxSize      number of context positions; a split variable >= ctxSize
+//                refers to position (var - ctxSize) read as a canonical base
+//   radPredF     output, length nCtxs: initialValue plus one leaf value per tree
+//   contextPack  one 64-bit word per observation, packed 4 bits per context
+//                position (so at most 16 positions fit); the low 3 bits of
+//                each slot are the base code
+//   left, right  child node indices, already including the tree offset
+//   splitCode    internal nodes: a 32-bit mask with one indicator bit per base
+//                code, stored in the float's bytes; leaves: the float value
+//                added to the score
+//   splitVar     split variable per node; a negative value marks a leaf
+//   treeSize     node slots per tree; tree t starts at node t*treeSize
+//   numTrees     number of trees
+//   missing, varTypes, maxCSplitSize   accepted for interface compatibility
+//                but not used by this routine
+//
+void innerPredictCtx(int ctxSize, float radPredF[], uint64_t contextPack[], int nCtxs, int left[], int right[], int missing[], float splitCode[], int16_t splitVar[], int varTypes[], float initialValue, int treeSize, int numTrees, int maxCSplitSize)
+{
+    // Tile sizes for the blocked loops below.
+    const int tStep = 20;
+    const int obsStep = 1000;
+
+    for (int i = 0; i < nCtxs; i++)
+    {
+        radPredF[i] = initialValue;
+    }
+
+    for (int t0 = 0; t0 < numTrees; t0 += tStep)
+    {
+        // Clamp the tree tile to numTrees once rather than on every iteration.
+        const int tEnd = (t0 + tStep < numTrees) ? t0 + tStep : numTrees;
+
+        for (int obs0 = 0; obs0 < nCtxs; obs0 += obsStep)
+        {
+            const int obsEnd = (obs0 + obsStep < nCtxs) ? obs0 + obsStep : nCtxs;
+
+            for (int t = t0; t < tEnd; t++)
+            {
+                int offset = t * treeSize;
+
+                for (int iObs = obs0; iObs < obsEnd; iObs++)
+                {
+                    uint64_t ctx = contextPack[iObs];
+
+                    int currentNode = offset;
+                    while (splitVar[currentNode] >= 0)
+                    {
+                        int currentVar = splitVar[currentNode];
+                        int ctxPos = currentVar;
+
+                        // Canonical feature means feature over canonical bases A,C,G,T
+                        int isCanonicalFeature = currentVar >= ctxSize;
+
+                        if (isCanonicalFeature)
+                            ctxPos = currentVar - ctxSize;
+
+                        // context is packed 4 bits per slot, lower 3 bits are the modified base code
+                        uint32_t dX = (uint32_t)((ctx >> (4*ctxPos)) & 0x7);
+
+                        if (isCanonicalFeature)
+                            // Need the canonical base -- convert
+                            // from the general base back to the canonical base
+                            dX = modToCanonicalMap[dX];
+
+                        // split code contains packed indicators for each categorical level.
+                        // Copy the bits out with memcpy: reading the float array through a
+                        // uint32_t pointer would break the strict-aliasing rule.
+                        uint32_t splitPack;
+                        memcpy(&splitPack, &splitCode[currentNode], sizeof splitPack);
+                        uint32_t ind = (splitPack >> dX) & 0x1;
+
+                        // Child indices come precomputed with the tree offset:
+                        // indicator 0 goes left, 1 goes right.
+                        currentNode = (ind == 0) ? left[currentNode] : right[currentNode];
+                    }
+                    radPredF[iObs] += splitCode[currentNode]; // add the prediction
+                } // iObs
+            } // t
+        } // obs0
+    } // t0
+}
diff --git a/setup.py b/setup.py
new file mode 100755
index 0000000..91c87bd
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,29 @@
+from setuptools import setup, Extension, find_packages
+import os
+import sys
+
+setup(
+ name='kineticsTools',
+ version='0.5.1',
+ author='Pacific Biosciences',
+ author_email='devnet at pacificbiosciences.com',
+ license=open('LICENSES.txt').read(),
+ packages=find_packages('.'),
+ package_dir={'': '.'},
+ package_data={'kineticsTools': ['resources/*.h5']},
+ ext_modules=[Extension('kineticsTools/tree_predict', ['kineticsTools/tree_predict.c'],
+ extra_compile_args=["-O3", "-shared", "-std=c99"],
+ export_symbols=["innerPredict", "innerPredictCtx", "init_native"])],
+ zip_safe=False,
+ install_requires=[
+ 'pbcore >= 1.2.2',
+ 'numpy >= 1.6.0',
+ 'h5py >= 1.3.0',
+ 'scipy >= 0.9.0',
+ 'pbcommand >= 0.2.0',
+ ],
+ entry_points={'console_scripts': [
+ "ipdSummary = kineticsTools.ipdSummary:main",
+ "summarizeModifications = kineticsTools.summarizeModifications:main",
+ ]},
+)
diff --git a/strand-conventions.txt b/strand-conventions.txt
new file mode 100644
index 0000000..4a4c7ab
--- /dev/null
+++ b/strand-conventions.txt
@@ -0,0 +1,33 @@
+
+ Note on IPD classifier training conventions
+ ===========================================
+
+ Getting the strand and context correct is notoriously confusing.
+ In the output of the basemods package (kineticsTools), strand refers to the
+ _template_ strand, because that's where the modifications that we're detecting
+ are. The PacBio basecaller and dye/base mapping always work in the product strand.
+
+ The IPD model itself is trained to take sequences in the template strand and make predictions about
+ the IPD that will be observed when synthesizing the product strand. In KEC, we will be working entirely
+ with the product strand sequence. We set up the Kinetic Model code to accept product strand sequences and give
+ product strand predictions.
+
+ Here's a reference for the context windows and strands:
+
+ Case 1: the + strand is the product strand:
+ - The input sequence to pass to the classifier is the - strand sequence, indexed according to the top numbers
+ - The IPD GBM model prediction is for the G incorporation in the product strand synthesis
+
+ Case 2: the - strand is the product strand:
+ - The input sequence to pass to the classifier is the + strand sequence, indexed according to the bottom numbers
+ - The IPD GBM model prediction is for the C incorporation in the product strand synthesis
+
+
+ strand sequence pol motion
+ 14 4 0
+ | | |
+ - 3'-xxxxxxxxxNNNNNNNNNNCNNNNxxxxx-5' <-
+ + 5'-xxxxxxxxxNNNNNNNNNNGNNNNxxxxx-3' ->
+ | | |
+ 0 10 14
+
diff --git a/test/cram/case-ctrl.t b/test/cram/case-ctrl.t
new file mode 100644
index 0000000..7442a3b
--- /dev/null
+++ b/test/cram/case-ctrl.t
@@ -0,0 +1,37 @@
+Case-control of a job against itself---shouldn't find any differences.
+
+ $ . $TESTDIR/portability.sh
+
+Load in data:
+
+ $ DATA=$TESTDIR/../data
+ $ INPUT=$DATA/p4-c2-lambda-mod-decode.cmp.h5
+ $ REFERENCE=$DATA/lambda/sequence/lambda.fasta
+
+Run basic ipdSummary:
+
+ $ ipdSummary --numWorkers 1 --csv tmp.csv --gff tmp.gff --summary_h5 tmp.h5 --control $INPUT --reference $REFERENCE $INPUT
+
+Look at output csv file:
+
+ $ head -3 tmp.csv
+ refName,tpl,strand,base,score,pvalue,caseMean,controlMean,caseStd,controlStd,ipdRatio,testStatistic,coverage,controlCoverage,caseCoverage
+ "lambda_NEB3011",*,?,?,*,*,*,*,*,*,1.000,0.0,3,3,3 (glob)
+ "lambda_NEB3011",*,?,?,*,*,*,*,*,*,1.000,0.0,3,3,3 (glob)
+
+ $ linecount tmp.csv
+ 7603
+
+Look at output gff file:
+
+ $ cat tmp.gff
+ ##gff-version 3
+ ##source ipdSummary * (glob)
+ ##source-commandline * (glob)
+ ##sequence-region lambda_NEB3011 1 48502
+
+What about the IPD ratio H5 file?
+
+ $ h5ls -r tmp.h5
+ / Group
+ /lambda_NEB3011 Dataset {48502}
diff --git a/test/cram/detection.t b/test/cram/detection.t
new file mode 100644
index 0000000..b8d2fe6
--- /dev/null
+++ b/test/cram/detection.t
@@ -0,0 +1,76 @@
+Test basic detection mode of ipdSummary.
+
+ $ . $TESTDIR/portability.sh
+
+Load in data:
+
+ $ DATA=$TESTDIR/../data
+ $ INPUT=$DATA/p4-c2-lambda-mod-decode.cmp.h5
+ $ REFERENCE=$DATA/lambda/sequence/lambda.fasta
+
+Run basic ipdSummary:
+
+ $ ipdSummary --pvalue 0.001 --numWorkers 1 --csv tmp.csv --gff tmp.gff --summary_h5 tmp.h5 --reference $REFERENCE $INPUT
+
+Look at output csv file:
+
+ $ head -3 tmp.csv
+ refName,tpl,strand,base,score,tMean,tErr,modelPrediction,ipdRatio,coverage
+ "lambda_NEB3011",13190,1,T,1,0.909,0.252,1.126,0.808,3
+ "lambda_NEB3011",13190,0,A,3,1.120,0.096,1.125,0.996,3
+
+ $ linecount tmp.csv
+ 7603
+
+Look at output gff file:
+
+ $ cat tmp.gff
+ ##gff-version 3
+ ##source ipdSummary * (glob)
+ ##source-commandline * (glob)
+ ##sequence-region lambda_NEB3011 1 48502
+ lambda_NEB3011\tkinModCall\tmodified_base\t14060\t14060\t31\t-\t.\tcoverage=49;context=ACGTTATTGCGGAACTTACAACCGCTCAGGCATTTGCTGCA;IPDRatio=2.15 (esc)
+ lambda_NEB3011\tkinModCall\tmodified_base\t14657\t14657\t34\t+\t.\tcoverage=155;context=CGGCACAGCCGGGCGATGTGCTGCTGTGCTGTTTTGGTTCA;IPDRatio=1.54 (esc)
+ lambda_NEB3011\tkinModCall\tmodified_base\t14743\t14743\t35\t-\t.\tcoverage=173;context=TACCTCTCTCGTTTGCTCAGTTGTTCAGGAATATGGTGCAG;IPDRatio=1.54 (esc)
+ lambda_NEB3011\tkinModCall\tmodified_base\t14769\t14769\t31\t-\t.\tcoverage=168;context=GTGTGCGTCGCTGCCATTTGTCGGTGTACCTCTCTCGTTTG;IPDRatio=1.56 (esc)
+ lambda_NEB3011\tkinModCall\tmodified_base\t14800\t14800\t34\t-\t.\tcoverage=173;context=GCGCGCCATGCCCGGTGACGCCAGAGGGAGTGTGTGCGTCG;IPDRatio=1.68 (esc)
+ lambda_NEB3011\tkinModCall\tmodified_base\t14831\t14831\t32\t+\t.\tcoverage=161;context=CATGGCGCGCATCTGCCTTTACGGGGATTTACAACGATTTG;IPDRatio=1.55 (esc)
+ lambda_NEB3011\tkinModCall\tmodified_base\t14834\t14834\t32\t+\t.\tcoverage=166;context=GGCGCGCATCTGCCTTTACGGGGATTTACAACGATTTGGTC;IPDRatio=1.70 (esc)
+ lambda_NEB3011\tkinModCall\tmodified_base\t14841\t14841\t33\t+\t.\tcoverage=162;context=ATCTGCCTTTACGGGGATTTACAACGATTTGGTCGCCGCAT;IPDRatio=1.59 (esc)
+ lambda_NEB3011\tkinModCall\tmodified_base\t14847\t14847\t35\t-\t.\tcoverage=172;context=AGGTCGATGCGGCGACCAAATCGTTGTAAATCCCCGTAAAG;IPDRatio=1.83 (esc)
+ lambda_NEB3011\tkinModCall\tmodified_base\t14864\t14864\t46\t-\t.\tcoverage=166;context=CCCCCGTTTTCACACGAAGGTCGATGCGGCGACCAAATCGT;IPDRatio=1.71 (esc)
+ lambda_NEB3011\tkinModCall\tmodified_base\t14884\t14884\t31\t-\t.\tcoverage=173;context=CAGTGCCCGGATGGCTTCAGCCCCCGTTTTCACACGAAGGT;IPDRatio=2.12 (esc)
+ lambda_NEB3011\tkinModCall\tmodified_base\t14885\t14885\t33\t-\t.\tcoverage=166;context=CCAGTGCCCGGATGGCTTCAGCCCCCGTTTTCACACGAAGG;IPDRatio=2.51 (esc)
+ lambda_NEB3011\tkinModCall\tmodified_base\t14909\t14909\t36\t+\t.\tcoverage=162;context=AGCCATCCGGGCACTGGCCACACAGCTCCCGGCGTTTCGTC;IPDRatio=1.76 (esc)
+ lambda_NEB3011\tkinModCall\tmodified_base\t14983\t14983\t210\t+\t.\tcoverage=160;context=TTGCCGGGCGGGACGTCAGCACGTCCGGGTTAACGGCGCAG;IPDRatio=6.76 (esc)
+ lambda_NEB3011\tkinModCall\tmodified_base\t14991\t14991\t32\t-\t.\tcoverage=172;context=TCATGTAACTGCGCCGTTAACCCGGACGTGCTGACGTCCCG;IPDRatio=1.59 (esc)
+ lambda_NEB3011\tkinModCall\tmodified_base\t14992\t14992\t209\t-\t.\tcoverage=162;context=CTCATGTAACTGCGCCGTTAACCCGGACGTGCTGACGTCCC;IPDRatio=6.64 (esc)
+ lambda_NEB3011\tkinModCall\tmodified_base\t14997\t14997\t60\t-\t.\tcoverage=140;context=AGAGTCTCATGTAACTGCGCCGTTAACCCGGACGTGCTGAC;IPDRatio=2.29 (esc)
+ lambda_NEB3011\tkinModCall\tmodified_base\t15006\t15006\t39\t-\t.\tcoverage=170;context=CCATCAGGCAGAGTCTCATGTAACTGCGCCGTTAACCCGGA;IPDRatio=1.74 (esc)
+ lambda_NEB3011\tkinModCall\tmodified_base\t15008\t15008\t47\t+\t.\tcoverage=162;context=CGGGTTAACGGCGCAGTTACATGAGACTCTGCCTGATGGCG;IPDRatio=1.74 (esc)
+ lambda_NEB3011\tkinModCall\tmodified_base\t15039\t15039\t50\t-\t.\tcoverage=169;context=CCGGCGACTCTGGGAACAATATGAATTACAGCGCCATCAGG;IPDRatio=1.82 (esc)
+ lambda_NEB3011\tkinModCall\tmodified_base\t15066\t15066\t37\t+\t.\tcoverage=161;context=CCCAGAGTCGCCGGGGCCAAGTCAGGTGGCGTATTCCAGAT;IPDRatio=1.69 (esc)
+ lambda_NEB3011\tkinModCall\tmodified_base\t15074\t15074\t33\t-\t.\tcoverage=171;context=CCAGGACAATCTGGAATACGCCACCTGACTTGGCCCCGGCG;IPDRatio=1.80 (esc)
+ lambda_NEB3011\tkinModCall\tmodified_base\t15078\t15078\t33\t-\t.\tcoverage=168;context=GCCCCCAGGACAATCTGGAATACGCCACCTGACTTGGCCCC;IPDRatio=1.62 (esc)
+ lambda_NEB3011\tkinModCall\tmodified_base\t15095\t15095\t32\t-\t.\tcoverage=167;context=ATCCGGCAATGGCGGCAGCCCCCAGGACAATCTGGAATACG;IPDRatio=1.59 (esc)
+ lambda_NEB3011\tkinModCall\tmodified_base\t15118\t15118\t32\t-\t.\tcoverage=163;context=GGTGGCTCCGGCGGTAAAGAATGATCCGGCAATGGCGGCAG;IPDRatio=1.54 (esc)
+ lambda_NEB3011\tkinModCall\tmodified_base\t15121\t15121\t31\t-\t.\tcoverage=149;context=AAGGGTGGCTCCGGCGGTAAAGAATGATCCGGCAATGGCGG;IPDRatio=1.58 (esc)
+ lambda_NEB3011\tkinModCall\tmodified_base\t15124\t15124\t34\t-\t.\tcoverage=170;context=TGCAAGGGTGGCTCCGGCGGTAAAGAATGATCCGGCAATGG;IPDRatio=1.57 (esc)
+ lambda_NEB3011\tkinModCall\tmodified_base\t15195\t15195\t33\t-\t.\tcoverage=164;context=AGCACCATACTGGCACCGAGAGAAAACAGGATGCCGGTCAT;IPDRatio=1.60 (esc)
+ lambda_NEB3011\tkinModCall\tmodified_base\t15239\t15239\t33\t+\t.\tcoverage=157;context=TGGTGTGGCGCAGATGCTGGCACCGAAAGCCAGAACTCCCC;IPDRatio=1.65 (esc)
+ lambda_NEB3011\tkinModCall\tmodified_base\t15261\t15261\t38\t-\t.\tcoverage=167;context=CCGTTATCCGTTGTCTGTATACGGGGAGTTCTGGCTTTCGG;IPDRatio=1.73 (esc)
+ lambda_NEB3011\tkinModCall\tmodified_base\t15291\t15291\t31\t+\t.\tcoverage=158;context=ACGGATAACGGTAAGCAGAACACCTATTTCTCCTCACTGGA;IPDRatio=1.68 (esc)
+ lambda_NEB3011\tkinModCall\tmodified_base\t15292\t15292\t41\t-\t.\tcoverage=166;context=ATCCAGTGAGGAGAAATAGGTGTTCTGCTTACCGTTATCCG;IPDRatio=1.61 (esc)
+ lambda_NEB3011\tkinModCall\tmodified_base\t15311\t15311\t31\t-\t.\tcoverage=169;context=TGCCCTGGGCAACCATGTTATCCAGTGAGGAGAAATAGGTG;IPDRatio=1.47 (esc)
+ lambda_NEB3011\tkinModCall\tmodified_base\t15402\t15402\t31\t-\t.\tcoverage=169;context=TGACCACCGTCCCCTTCGTCTGCCGTGCTGATCTCCTGAGA;IPDRatio=1.45 (esc)
+ lambda_NEB3011\tkinModCall\tmodified_base\t15566\t15566\t32\t+\t.\tcoverage=118;context=GAAGGACAACCTGAAGTCCACGCAGTTGCTGAGTGTGATCG;IPDRatio=1.79 (esc)
+ lambda_NEB3011\tkinModCall\tmodified_base\t16035\t16035\t32\t-\t.\tcoverage=38;context=ATCCTGCGCATCCGGATATTAAACGGGCGCGGCGGCAGGTT;IPDRatio=1.96 (esc)
+
+ $ linecount tmp.gff
+ 40
+
+What about the H5 file?
+
+ $ h5ls -r tmp.h5
+ / Group
+ /lambda_NEB3011 Dataset {48502}
diff --git a/test/cram/detection_bam.t b/test/cram/detection_bam.t
new file mode 100644
index 0000000..0724a2d
--- /dev/null
+++ b/test/cram/detection_bam.t
@@ -0,0 +1,56 @@
+Test detection and identification modes of ipdSummary using .bam file as input.
+
+ $ . $TESTDIR/portability.sh
+
+Load in data:
+
+ $ DATA=/mnt/secondary-siv/testdata/kineticsTools
+ $ INPUT=$DATA/Hpyl_1_5000.bam
+ $ REFERENCE=/mnt/secondary-siv/references/Helicobacter_pylori_J99/sequence/Helicobacter_pylori_J99.fasta
+
+Run basic ipdSummary:
+
+ $ ipdSummary --gff tmp1.gff --csv tmp1.csv --numWorkers 12 --pvalue 0.001 --identify m6A,m4C --reference $REFERENCE --referenceWindows="gi|12057207|gb|AE001439.1|:0-5000" $INPUT
+
+Look at output csv file:
+
+ $ head -3 tmp1.csv
+ refName,tpl,strand,base,score,tMean,tErr,modelPrediction,ipdRatio,coverage
+ "gi|12057207|gb|AE001439.1|",1,0,A,10,2.387,0.464,1.710,1.396,29
+ "gi|12057207|gb|AE001439.1|",1,1,T,1,0.492,0.075,0.602,0.817,57
+
+ $ linecount tmp1.csv
+ 10001
+
+Look at output gff file:
+
+ $ linecount tmp1.gff
+ 274
+ $ cat tmp1.gff | head -20
+ ##gff-version 3
+ ##source ipdSummary v2.0
+ ##source-commandline * (glob)
+ ##sequence-region gi|12057207|gb|AE001439.1| 1 1643831
+ gi|12057207|gb|AE001439.1|\tkinModCall\tm6A\t35\t35\t187\t-\t.\tcoverage=118;context=TTTAAGGGCGTTTTATGCCTAAATTTAAAAAATGATGCTGT;IPDRatio=5.68;identificationQv=196 (esc)
+ gi|12057207|gb|AE001439.1|\tkinModCall\tm4C\t60\t60\t49\t-\t.\tcoverage=112;context=AAAAAGCTCGCTCAAAAACCCTTGATTTAAGGGCGTTTTAT;IPDRatio=2.58;identificationQv=33 (esc)
+ gi|12057207|gb|AE001439.1|\tkinModCall\tm6A\t89\t89\t223\t+\t.\tcoverage=139;context=AGCGAGCTTTTTGCTCAAAGAATCCAAGATAGCGTTTAAAA;IPDRatio=5.69;identificationQv=187 (esc)
+ gi|12057207|gb|AE001439.1|\tkinModCall\tm6A\t91\t91\t217\t-\t.\tcoverage=143;context=ATTTTTAAACGCTATCTTGGATTCTTTGAGCAAAAAGCTCG;IPDRatio=6.34;identificationQv=214 (esc)
+ gi|12057207|gb|AE001439.1|\tkinModCall\tmodified_base\t113\t113\t41\t+\t.\tcoverage=132;context=CAAGATAGCGTTTAAAAATTTAGGGGTGTTAGGCTCAGCGT;IPDRatio=1.69 (esc)
+ gi|12057207|gb|AE001439.1|\tkinModCall\tmodified_base\t115\t115\t33\t+\t.\tcoverage=147;context=AGATAGCGTTTAAAAATTTAGGGGTGTTAGGCTCAGCGTAG;IPDRatio=1.88 (esc)
+ gi|12057207|gb|AE001439.1|\tkinModCall\tm6A\t122\t122\t222\t-\t.\tcoverage=158;context=GCAAACTCTACGCTGAGCCTAACACCCCTAAATTTTTAAAC;IPDRatio=6.51;identificationQv=204 (esc)
+ gi|12057207|gb|AE001439.1|\tkinModCall\tm6A\t232\t232\t221\t+\t.\tcoverage=173;context=AGCGTAAAATCGCCTTTTCCATGCTCCCTAATCGCTTGAAA;IPDRatio=5.90;identificationQv=209 (esc)
+ gi|12057207|gb|AE001439.1|\tkinModCall\tm6A\t233\t233\t282\t-\t.\tcoverage=183;context=ATTTCAAGCGATTAGGGAGCATGGAAAAGGCGATTTTACGC;IPDRatio=6.43;identificationQv=262 (esc)
+ gi|12057207|gb|AE001439.1|\tkinModCall\tm6A\t241\t241\t267\t+\t.\tcoverage=178;context=TCGCCTTTTCCATGCTCCCTAATCGCTTGAAATCCCAGTCT;IPDRatio=5.57;identificationQv=234 (esc)
+ gi|12057207|gb|AE001439.1|\tkinModCall\tm6A\t248\t248\t239\t-\t.\tcoverage=185;context=ATTTAAAAGACTGGGATTTCAAGCGATTAGGGAGCATGGAA;IPDRatio=6.25;identificationQv=223 (esc)
+ gi|12057207|gb|AE001439.1|\tkinModCall\tm6A\t274\t274\t229\t-\t.\tcoverage=190;context=TGAGATTGACGCTCTCATCGAACCGCATTTAAAAGACTGGG;IPDRatio=6.83;identificationQv=220 (esc)
+ gi|12057207|gb|AE001439.1|\tkinModCall\tm6A\t277\t277\t272\t+\t.\tcoverage=188;context=AGTCTTTTAAATGCGGTTCGATGAGAGCGTCAATCTCATTG;IPDRatio=7.50;identificationQv=257 (esc)
+ gi|12057207|gb|AE001439.1|\tkinModCall\tm4C\t312\t312\t37\t-\t.\tcoverage=204;context=GCTTTAAGCCTTTTTAATGGCGTGTTAGAAAAAATCAATGA;IPDRatio=1.88;identificationQv=3 (esc)
+ gi|12057207|gb|AE001439.1|\tkinModCall\tm6A\t373\t373\t393\t+\t.\tcoverage=219;context=TAATCTTTTTTTCTTCTAACATGCTGGAAGCGATTTTTTTA;IPDRatio=7.11;identificationQv=353 (esc)
+ gi|12057207|gb|AE001439.1|\tkinModCall\tm6A\t374\t374\t337\t-\t.\tcoverage=221;context=TTAAAAAAATCGCTTCCAGCATGTTAGAAGAAAAAAAGATT;IPDRatio=6.06;identificationQv=323 (esc)
+
+Now try limiting the number of alignments:
+
+ $ ipdSummary --gff tmp2.gff --csv tmp2.csv --numWorkers 12 --pvalue 0.001 --identify m6A,m4C --maxAlignments 100 --reference $REFERENCE --referenceWindows="gi|12057207|gb|AE001439.1|:0-5000" $INPUT
+
+ $ N_DIFF=`diff tmp1.gff tmp2.gff | wc --lines`
+ $ python -c "assert 100 < ${N_DIFF}, ${N_DIFF}"
diff --git a/test/cram/detection_bam_dataset.t b/test/cram/detection_bam_dataset.t
new file mode 100644
index 0000000..11c070b
--- /dev/null
+++ b/test/cram/detection_bam_dataset.t
@@ -0,0 +1,56 @@
+Test detection and identification modes of ipdSummary using .xml dataset file as input. Results should be identical to those using the equivalent .bam file. This will also be tested for a split dataset (two non-overlapping .bam files in one .xml).
+
+ $ . $TESTDIR/portability.sh
+
+Load in data:
+
+ $ DATA=/mnt/secondary-siv/testdata/kineticsTools
+ $ INPUT=$DATA/Hpyl_1_5000.xml
+ $ REFERENCE=/mnt/secondary-siv/references/Helicobacter_pylori_J99/sequence/Helicobacter_pylori_J99.fasta
+
+Run basic ipdSummary:
+
+ $ ipdSummary --outfile tmp_xml1 --numWorkers 12 --pvalue 0.001 --identify m6A,m4C --reference $REFERENCE --referenceWindows="gi|12057207|gb|AE001439.1|:0-5000" $INPUT
+
+Look at output csv file:
+
+ $ head -3 tmp_xml1.csv
+ refName,tpl,strand,base,score,tMean,tErr,modelPrediction,ipdRatio,coverage
+ "gi|12057207|gb|AE001439.1|",1,0,A,10,2.387,0.464,1.710,1.396,29
+ "gi|12057207|gb|AE001439.1|",1,1,T,1,0.492,0.075,0.602,0.817,57
+
+ $ linecount tmp_xml1.csv
+ 10001
+
+Look at output gff file:
+
+ $ linecount tmp_xml1.gff
+ 274
+ $ cat tmp_xml1.gff | head -20
+ ##gff-version 3
+ ##source ipdSummary v2.0
+ ##source-commandline * (glob)
+ ##sequence-region gi|12057207|gb|AE001439.1| 1 1643831
+ gi|12057207|gb|AE001439.1|\tkinModCall\tm6A\t35\t35\t187\t-\t.\tcoverage=118;context=TTTAAGGGCGTTTTATGCCTAAATTTAAAAAATGATGCTGT;IPDRatio=5.68;identificationQv=196 (esc)
+ gi|12057207|gb|AE001439.1|\tkinModCall\tm4C\t60\t60\t49\t-\t.\tcoverage=112;context=AAAAAGCTCGCTCAAAAACCCTTGATTTAAGGGCGTTTTAT;IPDRatio=2.58;identificationQv=33 (esc)
+ gi|12057207|gb|AE001439.1|\tkinModCall\tm6A\t89\t89\t223\t+\t.\tcoverage=139;context=AGCGAGCTTTTTGCTCAAAGAATCCAAGATAGCGTTTAAAA;IPDRatio=5.69;identificationQv=187 (esc)
+ gi|12057207|gb|AE001439.1|\tkinModCall\tm6A\t91\t91\t217\t-\t.\tcoverage=143;context=ATTTTTAAACGCTATCTTGGATTCTTTGAGCAAAAAGCTCG;IPDRatio=6.34;identificationQv=214 (esc)
+ gi|12057207|gb|AE001439.1|\tkinModCall\tmodified_base\t113\t113\t41\t+\t.\tcoverage=132;context=CAAGATAGCGTTTAAAAATTTAGGGGTGTTAGGCTCAGCGT;IPDRatio=1.69 (esc)
+ gi|12057207|gb|AE001439.1|\tkinModCall\tmodified_base\t115\t115\t33\t+\t.\tcoverage=147;context=AGATAGCGTTTAAAAATTTAGGGGTGTTAGGCTCAGCGTAG;IPDRatio=1.88 (esc)
+ gi|12057207|gb|AE001439.1|\tkinModCall\tm6A\t122\t122\t222\t-\t.\tcoverage=158;context=GCAAACTCTACGCTGAGCCTAACACCCCTAAATTTTTAAAC;IPDRatio=6.51;identificationQv=204 (esc)
+ gi|12057207|gb|AE001439.1|\tkinModCall\tm6A\t232\t232\t221\t+\t.\tcoverage=173;context=AGCGTAAAATCGCCTTTTCCATGCTCCCTAATCGCTTGAAA;IPDRatio=5.90;identificationQv=209 (esc)
+ gi|12057207|gb|AE001439.1|\tkinModCall\tm6A\t233\t233\t282\t-\t.\tcoverage=183;context=ATTTCAAGCGATTAGGGAGCATGGAAAAGGCGATTTTACGC;IPDRatio=6.43;identificationQv=262 (esc)
+ gi|12057207|gb|AE001439.1|\tkinModCall\tm6A\t241\t241\t267\t+\t.\tcoverage=178;context=TCGCCTTTTCCATGCTCCCTAATCGCTTGAAATCCCAGTCT;IPDRatio=5.57;identificationQv=234 (esc)
+ gi|12057207|gb|AE001439.1|\tkinModCall\tm6A\t248\t248\t239\t-\t.\tcoverage=185;context=ATTTAAAAGACTGGGATTTCAAGCGATTAGGGAGCATGGAA;IPDRatio=6.25;identificationQv=223 (esc)
+ gi|12057207|gb|AE001439.1|\tkinModCall\tm6A\t274\t274\t229\t-\t.\tcoverage=190;context=TGAGATTGACGCTCTCATCGAACCGCATTTAAAAGACTGGG;IPDRatio=6.83;identificationQv=220 (esc)
+ gi|12057207|gb|AE001439.1|\tkinModCall\tm6A\t277\t277\t272\t+\t.\tcoverage=188;context=AGTCTTTTAAATGCGGTTCGATGAGAGCGTCAATCTCATTG;IPDRatio=7.50;identificationQv=257 (esc)
+ gi|12057207|gb|AE001439.1|\tkinModCall\tm4C\t312\t312\t37\t-\t.\tcoverage=204;context=GCTTTAAGCCTTTTTAATGGCGTGTTAGAAAAAATCAATGA;IPDRatio=1.88;identificationQv=3 (esc)
+ gi|12057207|gb|AE001439.1|\tkinModCall\tm6A\t373\t373\t393\t+\t.\tcoverage=219;context=TAATCTTTTTTTCTTCTAACATGCTGGAAGCGATTTTTTTA;IPDRatio=7.11;identificationQv=353 (esc)
+ gi|12057207|gb|AE001439.1|\tkinModCall\tm6A\t374\t374\t337\t-\t.\tcoverage=221;context=TTAAAAAAATCGCTTCCAGCATGTTAGAAGAAAAAAAGATT;IPDRatio=6.06;identificationQv=323 (esc)
+
+Now try with a split dataset:
+
+ $ INPUT=$DATA/Hpyl_1_5000_split.xml
+ $ ipdSummary --gff tmp_xml2.gff --csv tmp_xml2.csv --numWorkers 12 --pvalue 0.001 --identify m6A,m4C --reference $REFERENCE --referenceWindows="gi|12057207|gb|AE001439.1|:0-5000" $INPUT
+ $ linecount tmp_xml2.gff
+ 274
diff --git a/test/cram/detection_bam_lossless.t b/test/cram/detection_bam_lossless.t
new file mode 100644
index 0000000..3f0c610
--- /dev/null
+++ b/test/cram/detection_bam_lossless.t
@@ -0,0 +1,50 @@
+Test detection and identification modes of ipdSummary using .bam file as input, with lossless encoding of pulse information.
+
+ $ . $TESTDIR/portability.sh
+
+Load in data:
+
+ $ DATA=/mnt/secondary-siv/testdata/kineticsTools
+ $ INPUT=$DATA/Mjan_1_5000_lossless.bam
+ $ export REF_DIR=/mnt/secondary-siv/references
+ $ export REF_SEQ=${REF_DIR}/Methanocaldococcus_jannaschii_DSM2661/sequence/Methanocaldococcus_jannaschii_DSM2661.fasta
+
+Run basic ipdSummary:
+
+ $ ipdSummary --gff tmp1.gff --csv tmp1.csv --numWorkers 12 --pvalue 0.001 --identify m6A,m4C --reference $REF_SEQ $INPUT
+
+Look at output csv file:
+
+ $ head -3 tmp1.csv
+ refName,tpl,strand,base,score,tMean,tErr,modelPrediction,ipdRatio,coverage
+ "gi|6626255|gb|L77117.1|",1,0,T,3,0.865,0.347,0.870,0.995,14
+ "gi|6626255|gb|L77117.1|",1,1,A,8,1.338,0.475,0.825,1.621,15
+
+ $ linecount tmp1.csv
+ 12182
+
+Look at output gff file:
+
+ $ linecount tmp1.gff
+ 66
+ $ cat tmp1.gff | head -20
+ ##gff-version 3
+ ##source ipdSummary v2.0
+ ##source-commandline * (glob)
+ ##sequence-region gi|6626255|gb|L77117.1| 1 1664970
+ ##sequence-region gi|1500644|gb|L77118.1|MII2CG 1 58407
+ ##sequence-region gi|1500688|gb|L77119.1|MII1CG 1 16550
+ gi|6626255|gb|L77117.1|\tkinModCall\tm6A\t424\t424\t171\t-\t.\tcoverage=116;context=TACGCTACATACGCTCCTCCATCTAATGCAGGAGCAAATTT;IPDRatio=4.81;identificationQv=153 (esc)
+ gi|6626255|gb|L77117.1|\tkinModCall\tmodified_base\t738\t738\t33\t+\t.\tcoverage=169;context=AATTAAAATCAGACCGTTTCGGAATGGAAAGATTAATGTAA;IPDRatio=1.63 (esc)
+ gi|6626255|gb|L77117.1|\tkinModCall\tm6A\t845\t845\t263\t-\t.\tcoverage=186;context=TGATTTTAATTTTGATTTCCATCGTGAAGTAATCCAAGTCG;IPDRatio=7.76;identificationQv=242 (esc)
+ gi|6626255|gb|L77117.1|\tkinModCall\tmodified_base\t916\t916\t37\t+\t.\tcoverage=210;context=TTGATGCTTTATTTGGATGTTTGAAGAATTAAAATCAGACC;IPDRatio=1.73 (esc)
+ gi|6626255|gb|L77117.1|\tkinModCall\tmodified_base\t929\t929\t31\t-\t.\tcoverage=213;context=TCCATTCCGAAACGGTCTGATTTTAATTCTTCAAACATCCA;IPDRatio=1.46 (esc)
+ gi|6626255|gb|L77117.1|\tkinModCall\tmodified_base\t988\t988\t43\t+\t.\tcoverage=197;context=CCTTCAAGTTCAAAATTTTTCCCCATAATTAAAATCAGACC;IPDRatio=1.67 (esc)
+ gi|6626255|gb|L77117.1|\tkinModCall\tmodified_base\t994\t994\t32\t+\t.\tcoverage=197;context=AGTTCAAAATTTTTCCCCATAATTAAAATCAGACCGTTTCG;IPDRatio=1.57 (esc)
+ gi|6626255|gb|L77117.1|\tkinModCall\tm6A\t1126\t1126\t305\t-\t.\tcoverage=187;context=GGTCTGATTTTAATTGTTCCATCAACAAACGCAGTTATTGA;IPDRatio=5.99;identificationQv=289 (esc)
+ gi|6626255|gb|L77117.1|\tkinModCall\tmodified_base\t1186\t1186\t34\t+\t.\tcoverage=153;context=ACAGCATTATAACAATTGATAGACAGAAATTCAGAATTAAA;IPDRatio=1.81 (esc)
+ gi|6626255|gb|L77117.1|\tkinModCall\tmodified_base\t1188\t1188\t37\t+\t.\tcoverage=155;context=AGCATTATAACAATTGATAGACAGAAATTCAGAATTAAAAT;IPDRatio=1.65 (esc)
+ gi|6626255|gb|L77117.1|\tkinModCall\tmodified_base\t1450\t1450\t36\t-\t.\tcoverage=193;context=TTTACCTGTGAGTGTTGTAGTTCCAAGTAGATATTTCCATT;IPDRatio=1.55 (esc)
+ gi|6626255|gb|L77117.1|\tkinModCall\tmodified_base\t1611\t1611\t38\t+\t.\tcoverage=190;context=ATAATAATACTTTAACGCTTCTTTTAAATTAAAATCAGACC;IPDRatio=1.64 (esc)
+ gi|6626255|gb|L77117.1|\tkinModCall\tmodified_base\t1633\t1633\t33\t-\t.\tcoverage=175;context=GTAATAATTTCCATTCCGAAACGGTCTGATTTTAATTTAAA;IPDRatio=1.56 (esc)
+ gi|6626255|gb|L77117.1|\tkinModCall\tmodified_base\t1781\t1781\t33\t+\t.\tcoverage=257;context=AAATCAGACCGTTTCGGAATGGAAATTTTTTATCGAAACCT;IPDRatio=1.41 (esc)
diff --git a/test/cram/identify.t b/test/cram/identify.t
new file mode 100644
index 0000000..da19d43
--- /dev/null
+++ b/test/cram/identify.t
@@ -0,0 +1,79 @@
+Test basic mode of ipdSummary.
+
+ $ . $TESTDIR/portability.sh
+
+Load in data:
+
+ $ DATA=$TESTDIR/../data
+ $ INPUT=$DATA/p4-c2-lambda-mod-decode.cmp.h5
+ $ REFERENCE=$DATA/lambda/sequence/lambda.fasta
+
+Run basic ipdSummary:
+
+ $ ipdSummary --numWorkers 1 --pvalue 0.001 --identify m6A,m4C --csv tmp.csv --gff tmp.gff --summary_h5 tmp.h5 --reference $REFERENCE $INPUT
+
+Look at output csv file:
+
+ $ head -3 tmp.csv
+ refName,tpl,strand,base,score,tMean,tErr,modelPrediction,ipdRatio,coverage
+ "lambda_NEB3011",13190,0,A,1,0.872,0.230,1.125,0.776,3
+ "lambda_NEB3011",13190,1,T,1,0.909,0.252,1.126,0.808,3
+
+
+ $ linecount tmp.csv
+ 7603
+
+Look at output gff file:
+
+ $ cat tmp.gff
+ ##gff-version 3
+ ##source ipdSummary * (glob)
+ ##source-commandline * (glob)
+ ##sequence-region lambda_NEB3011 1 48502
+ lambda_NEB3011\tkinModCall\tmodified_base\t14657\t14657\t34\t+\t.\tcoverage=155;context=CGGCACAGCCGGGCGATGTGCTGCTGTGCTGTTTTGGTTCA;IPDRatio=1.55 (esc)
+ lambda_NEB3011\tkinModCall\tmodified_base\t14743\t14743\t34\t-\t.\tcoverage=173;context=TACCTCTCTCGTTTGCTCAGTTGTTCAGGAATATGGTGCAG;IPDRatio=1.54 (esc)
+ lambda_NEB3011\tkinModCall\tm4C\t14756\t14756\t21\t-\t.\tcoverage=165;context=CCATTTGTCGGTGTACCTCTCTCGTTTGCTCAGTTGTTCAG;IPDRatio=1.32;identificationQv=24 (esc)
+ lambda_NEB3011\tkinModCall\tmodified_base\t14769\t14769\t32\t-\t.\tcoverage=168;context=GTGTGCGTCGCTGCCATTTGTCGGTGTACCTCTCTCGTTTG;IPDRatio=1.56 (esc)
+ lambda_NEB3011\tkinModCall\tmodified_base\t14800\t14800\t32\t-\t.\tcoverage=173;context=GCGCGCCATGCCCGGTGACGCCAGAGGGAGTGTGTGCGTCG;IPDRatio=1.66 (esc)
+ lambda_NEB3011\tkinModCall\tmodified_base\t14805\t14805\t31\t-\t.\tcoverage=167;context=CAGATGCGCGCCATGCCCGGTGACGCCAGAGGGAGTGTGTG;IPDRatio=1.64 (esc)
+ lambda_NEB3011\tkinModCall\tmodified_base\t14831\t14831\t31\t+\t.\tcoverage=161;context=CATGGCGCGCATCTGCCTTTACGGGGATTTACAACGATTTG;IPDRatio=1.52 (esc)
+ lambda_NEB3011\tkinModCall\tmodified_base\t14834\t14834\t32\t+\t.\tcoverage=166;context=GGCGCGCATCTGCCTTTACGGGGATTTACAACGATTTGGTC;IPDRatio=1.71 (esc)
+ lambda_NEB3011\tkinModCall\tmodified_base\t14841\t14841\t32\t+\t.\tcoverage=162;context=ATCTGCCTTTACGGGGATTTACAACGATTTGGTCGCCGCAT;IPDRatio=1.58 (esc)
+ lambda_NEB3011\tkinModCall\tmodified_base\t14847\t14847\t36\t-\t.\tcoverage=172;context=AGGTCGATGCGGCGACCAAATCGTTGTAAATCCCCGTAAAG;IPDRatio=1.85 (esc)
+ lambda_NEB3011\tkinModCall\tmodified_base\t14864\t14864\t45\t-\t.\tcoverage=166;context=CCCCCGTTTTCACACGAAGGTCGATGCGGCGACCAAATCGT;IPDRatio=1.70 (esc)
+ lambda_NEB3011\tkinModCall\tm4C\t14884\t14884\t30\t-\t.\tcoverage=173;context=CAGTGCCCGGATGGCTTCAGCCCCCGTTTTCACACGAAGGT;IPDRatio=2.10;identificationQv=19 (esc)
+ lambda_NEB3011\tkinModCall\tmodified_base\t14885\t14885\t33\t-\t.\tcoverage=166;context=CCAGTGCCCGGATGGCTTCAGCCCCCGTTTTCACACGAAGG;IPDRatio=2.52 (esc)
+ lambda_NEB3011\tkinModCall\tmodified_base\t14909\t14909\t35\t+\t.\tcoverage=162;context=AGCCATCCGGGCACTGGCCACACAGCTCCCGGCGTTTCGTC;IPDRatio=1.76 (esc)
+ lambda_NEB3011\tkinModCall\tm6A\t14983\t14983\t208\t+\t.\tcoverage=160;context=TTGCCGGGCGGGACGTCAGCACGTCCGGGTTAACGGCGCAG;IPDRatio=6.71;identificationQv=191 (esc)
+ lambda_NEB3011\tkinModCall\tm6A\t14992\t14992\t207\t-\t.\tcoverage=162;context=CTCATGTAACTGCGCCGTTAACCCGGACGTGCTGACGTCCC;IPDRatio=6.56;identificationQv=186 (esc)
+ lambda_NEB3011\tkinModCall\tm4C\t14997\t14997\t62\t-\t.\tcoverage=140;context=AGAGTCTCATGTAACTGCGCCGTTAACCCGGACGTGCTGAC;IPDRatio=2.32;identificationQv=11 (esc)
+ lambda_NEB3011\tkinModCall\tm4C\t14998\t14998\t22\t-\t.\tcoverage=172;context=CAGAGTCTCATGTAACTGCGCCGTTAACCCGGACGTGCTGA;IPDRatio=1.51;identificationQv=41 (esc)
+ lambda_NEB3011\tkinModCall\tmodified_base\t15006\t15006\t40\t-\t.\tcoverage=170;context=CCATCAGGCAGAGTCTCATGTAACTGCGCCGTTAACCCGGA;IPDRatio=1.75 (esc)
+ lambda_NEB3011\tkinModCall\tmodified_base\t15008\t15008\t47\t+\t.\tcoverage=162;context=CGGGTTAACGGCGCAGTTACATGAGACTCTGCCTGATGGCG;IPDRatio=1.73 (esc)
+ lambda_NEB3011\tkinModCall\tmodified_base\t15039\t15039\t51\t-\t.\tcoverage=169;context=CCGGCGACTCTGGGAACAATATGAATTACAGCGCCATCAGG;IPDRatio=1.83 (esc)
+ lambda_NEB3011\tkinModCall\tm6A\t15041\t15041\t29\t-\t.\tcoverage=161;context=CCCCGGCGACTCTGGGAACAATATGAATTACAGCGCCATCA;IPDRatio=1.54;identificationQv=3 (esc)
+ lambda_NEB3011\tkinModCall\tmodified_base\t15066\t15066\t38\t+\t.\tcoverage=161;context=CCCAGAGTCGCCGGGGCCAAGTCAGGTGGCGTATTCCAGAT;IPDRatio=1.70 (esc)
+ lambda_NEB3011\tkinModCall\tmodified_base\t15074\t15074\t33\t-\t.\tcoverage=171;context=CCAGGACAATCTGGAATACGCCACCTGACTTGGCCCCGGCG;IPDRatio=1.81 (esc)
+ lambda_NEB3011\tkinModCall\tmodified_base\t15078\t15078\t33\t-\t.\tcoverage=168;context=GCCCCCAGGACAATCTGGAATACGCCACCTGACTTGGCCCC;IPDRatio=1.62 (esc)
+ lambda_NEB3011\tkinModCall\tmodified_base\t15089\t15089\t32\t-\t.\tcoverage=167;context=CAATGGCGGCAGCCCCCAGGACAATCTGGAATACGCCACCT;IPDRatio=1.57 (esc)
+ lambda_NEB3011\tkinModCall\tmodified_base\t15095\t15095\t31\t-\t.\tcoverage=167;context=ATCCGGCAATGGCGGCAGCCCCCAGGACAATCTGGAATACG;IPDRatio=1.57 (esc)
+ lambda_NEB3011\tkinModCall\tmodified_base\t15118\t15118\t32\t-\t.\tcoverage=163;context=GGTGGCTCCGGCGGTAAAGAATGATCCGGCAATGGCGGCAG;IPDRatio=1.55 (esc)
+ lambda_NEB3011\tkinModCall\tmodified_base\t15124\t15124\t35\t-\t.\tcoverage=170;context=TGCAAGGGTGGCTCCGGCGGTAAAGAATGATCCGGCAATGG;IPDRatio=1.57 (esc)
+ lambda_NEB3011\tkinModCall\tmodified_base\t15195\t15195\t32\t-\t.\tcoverage=164;context=AGCACCATACTGGCACCGAGAGAAAACAGGATGCCGGTCAT;IPDRatio=1.59 (esc)
+ lambda_NEB3011\tkinModCall\tmodified_base\t15239\t15239\t33\t+\t.\tcoverage=157;context=TGGTGTGGCGCAGATGCTGGCACCGAAAGCCAGAACTCCCC;IPDRatio=1.65 (esc)
+ lambda_NEB3011\tkinModCall\tmodified_base\t15261\t15261\t39\t-\t.\tcoverage=167;context=CCGTTATCCGTTGTCTGTATACGGGGAGTTCTGGCTTTCGG;IPDRatio=1.73 (esc)
+ lambda_NEB3011\tkinModCall\tmodified_base\t15292\t15292\t39\t-\t.\tcoverage=166;context=ATCCAGTGAGGAGAAATAGGTGTTCTGCTTACCGTTATCCG;IPDRatio=1.57 (esc)
+ lambda_NEB3011\tkinModCall\tm6A\t15381\t15381\t20\t-\t.\tcoverage=169;context=GCCGTGCTGATCTCCTGAGAAACCACGCGTGACCCCACGCG;IPDRatio=1.40;identificationQv=13 (esc)
+ lambda_NEB3011\tkinModCall\tmodified_base\t15402\t15402\t32\t-\t.\tcoverage=169;context=TGACCACCGTCCCCTTCGTCTGCCGTGCTGATCTCCTGAGA;IPDRatio=1.47 (esc)
+ lambda_NEB3011\tkinModCall\tmodified_base\t15566\t15566\t32\t+\t.\tcoverage=118;context=GAAGGACAACCTGAAGTCCACGCAGTTGCTGAGTGTGATCG;IPDRatio=1.79 (esc)
+ lambda_NEB3011\tkinModCall\tm6A\t15704\t15704\t21\t-\t.\tcoverage=84;context=CCTGCTCACCAGCCCGGAACACCACCGTGACACCGGATATG;IPDRatio=1.47;identificationQv=3 (esc)
+ lambda_NEB3011\tkinModCall\tmodified_base\t16035\t16035\t31\t-\t.\tcoverage=38;context=ATCCTGCGCATCCGGATATTAAACGGGCGCGGCGGCAGGTT;IPDRatio=1.93 (esc)
+ lambda_NEB3011\tkinModCall\tm6A\t16380\t16380\t21\t-\t.\tcoverage=10;context=CATTTATCCACATCCGCCGCACCAAGACGTTTCCCCATGCC;IPDRatio=6.03;identificationQv=5 (esc)
+ lambda_NEB3011\tkinModCall\tm6A\t16658\t16658\t21\t-\t.\tcoverage=6;context=GGGCGCTGAAGCTGTAGCGGAACGGCGCGCCATCATCCGGC;IPDRatio=2.78;identificationQv=20 (esc)
+
+
+What about the H5 file?
+
+ $ h5ls -r tmp.h5
+ / Group
+ /lambda_NEB3011 Dataset {48502}
diff --git a/test/cram/long_running/README.txt b/test/cram/long_running/README.txt
new file mode 100644
index 0000000..6a19abc
--- /dev/null
+++ b/test/cram/long_running/README.txt
@@ -0,0 +1,42 @@
+====================================================
+README for /mnt/secondary-siv/testdata/kineticsTools
+====================================================
+
+Most of these files are derived from Tyson Clark's P6 chemistry validation
+experiments. Bsub is an amplified control.
+
+Bsub:
+/mnt/data3/vol56/2530923/0003
+/mnt/data3/vol56/2530923/0004
+
+Cagg:
+/mnt/data3/vol56/2530926/0003
+/mnt/data3/vol56/2530926/0004
+/mnt/data3/vol56/2530928/0003
+/mnt/data3/vol56/2530928/0004
+
+Hpyl:
+/mnt/data3/vol56/2530926/0007
+/mnt/data3/vol56/2530926/0008
+/mnt/data3/vol56/2530928/0006
+/mnt/data3/vol56/2530928/0005
+
+Mjan:
+/mnt/data3/vol56/2530924/0007
+/mnt/data3/vol56/2530924/0008
+/mnt/data3/vol56/2530928/0001
+/mnt/data3/vol56/2530928/0002
+
+Method to generate alignment files (python-like pseudo-code):
+ bam_files = []
+ for i, file_name in enumerate(h5_files):
+ call("bax2bam subreads.1.bax.h5 -o unaligned.1")
+ call("pbalign unaligned." + str(i) + ".subreads.bam /ref/seq/dir aligned." + str(i) + "1.bam --nproc=12 --seed=1 --minAccuracy=0.75 --minLength=50 --concordant --algorithmOptions=' -minMatch 12 -bestn 10 -minPctIdentity 70.0 -useQuality -minRawSubreadScore 800'")
+ bam_files.append("aligned." + str(i) + "1.bam")
+ call("~nechols/bin/bamtools merge -out aligned3.bam -in " +
+ " -in ".join(bam_files))
+ call("samtools index aligned3.bam")
+
+Running ipdSummary on these inputs takes 1-3 hours on our cluster. This
+can be significantly reduced by using parallelization, but this introduces
+stochastic behavior for some jobs.
diff --git a/test/cram/long_running/detect_and_identify_Bsub.t b/test/cram/long_running/detect_and_identify_Bsub.t
new file mode 100644
index 0000000..7b7b439
--- /dev/null
+++ b/test/cram/long_running/detect_and_identify_Bsub.t
@@ -0,0 +1,31 @@
+
+
+Run base modification detection on B. subtilis P6 chemistry validation data
+
+ $ . $TESTDIR/../portability.sh
+
+ $ export DATA_DIR=/mnt/secondary-siv/testdata/kineticsTools
+ $ export BAMFILE=${DATA_DIR}/Bsub_aligned.subreads.bam
+ $ export REF_DIR=/mnt/secondary-siv/references
+ $ export REF_SEQ=${REF_DIR}/B_subtilis_strW23/sequence/B_subtilis_strW23.fasta
+
+ $ ipdSummary ${BAMFILE} --reference ${REF_SEQ} --gff tst_Bsub.gff --csv tst_Bsub.csv --numWorkers 12 --pvalue 0.001 --identify m6A,m4C
+
+ $ linecount tst_Bsub.csv
+ 8055333
+
+This is an amplified control but it will still find some "modifications".
+
+ $ linecount tst_Bsub.gff
+ 25513
+
+Most of these can't be positively identified, however.
+
+ $ grep -c m4C tst_Bsub.gff
+ 3141
+
+ $ grep -c m6A tst_Bsub.gff
+ 573
+
+ $ grep -c modified_base tst_Bsub.gff
+ 21797
diff --git a/test/cram/long_running/detect_and_identify_Cagg.t b/test/cram/long_running/detect_and_identify_Cagg.t
new file mode 100644
index 0000000..aac1d6c
--- /dev/null
+++ b/test/cram/long_running/detect_and_identify_Cagg.t
@@ -0,0 +1,26 @@
+
+
+Run base modification detection on C. aggregans P6 chemistry validation data
+
+ $ . $TESTDIR/../portability.sh
+
+ $ export DATA_DIR=/mnt/secondary-siv/testdata/kineticsTools
+ $ export BAMFILE=${DATA_DIR}/Cagg_aligned.subreads.bam
+ $ export REF_DIR=/mnt/secondary/Smrtanalysis/current/common/references
+ $ export REF_SEQ=${REF_DIR}/Chloroflexus_aggregans_DSM9485/sequence/Chloroflexus_aggregans_DSM9485.fasta
+
+ $ ipdSummary ${BAMFILE} --reference ${REF_SEQ} --gff tst_Cagg.gff --csv tst_Cagg.csv --numWorkers 12 --pvalue 0.001 --identify m6A,m4C
+
+ $ linecount tst_Cagg.csv
+ 9369863
+
+This one actually has real modifications (and lots of them).
+
+ $ linecount tst_Cagg.gff
+ 177331
+ $ grep -c m4C tst_Cagg.gff
+ 35050
+ $ grep -c m6A tst_Cagg.gff
+ 136593
+ $ grep -c modified_base tst_Cagg.gff
+ 5686
diff --git a/test/cram/long_running/detect_and_identify_Hpyl.t b/test/cram/long_running/detect_and_identify_Hpyl.t
new file mode 100644
index 0000000..8067142
--- /dev/null
+++ b/test/cram/long_running/detect_and_identify_Hpyl.t
@@ -0,0 +1,28 @@
+
+Run base modification detection on H. pylori P6 chemistry validation data.
+
+ $ . $TESTDIR/../portability.sh
+
+ $ export DATA_DIR=/mnt/secondary-siv/testdata/kineticsTools
+ $ export BAMFILE=${DATA_DIR}/Hpyl_aligned.subreads.bam
+ $ export REF_DIR=/mnt/secondary-siv/references
+ $ export REF_SEQ=${REF_DIR}/Helicobacter_pylori_J99/sequence/Helicobacter_pylori_J99.fasta
+
+ $ ipdSummary ${BAMFILE} --reference ${REF_SEQ} --gff tst_Hpyl.gff --csv tst_Hpyl.csv --numWorkers 12 --pvalue 0.001 --identify m6A,m4C
+
+ $ linecount tst_Hpyl.csv
+ 3287635
+
+This one also has lots of modifications, mostly m6A.
+
+ $ linecount tst_Hpyl.gff
+ 80503
+
+ $ grep -c m4C tst_Hpyl.gff
+ 10508
+
+ $ grep -c m6A tst_Hpyl.gff
+ 57749
+
+ $ grep -c modified_base tst_Hpyl.gff
+ 12244
diff --git a/test/cram/long_running/detect_and_identify_Mjan.t b/test/cram/long_running/detect_and_identify_Mjan.t
new file mode 100644
index 0000000..3c5f9e1
--- /dev/null
+++ b/test/cram/long_running/detect_and_identify_Mjan.t
@@ -0,0 +1,28 @@
+
+Run base modification detection on M. jannaschii P6 chemistry validation data.
+
+ $ . $TESTDIR/../portability.sh
+
+ $ export DATA_DIR=/mnt/secondary-siv/testdata/kineticsTools
+ $ export BAMFILE=${DATA_DIR}/Mjan_aligned.subreads.bam
+ $ export REF_DIR=/mnt/secondary/Smrtanalysis/current/common/references
+ $ export REF_SEQ=${REF_DIR}/Methanocaldococcus_jannaschii_DSM2661/sequence/Methanocaldococcus_jannaschii_DSM2661.fasta
+
+ $ ipdSummary ${BAMFILE} --reference ${REF_SEQ} --gff tst_Mjan.gff --csv tst_Mjan.csv --numWorkers 12 --pvalue 0.001 --identify m6A,m4C
+
+ $ linecount tst_Mjan.csv
+ 3479855
+
+This one has relatively few modifications, 60% of which are identifiable.
+
+ $ linecount tst_Mjan.gff
+ 18654
+
+ $ grep -c m4C tst_Mjan.gff
+ 3328
+
+ $ grep -c m6A tst_Mjan.gff
+ 9506
+
+ $ grep -c modified_base tst_Mjan.gff
+ 5816
diff --git a/test/cram/long_running/run_on_cluster.py b/test/cram/long_running/run_on_cluster.py
new file mode 100755
index 0000000..860e481
--- /dev/null
+++ b/test/cram/long_running/run_on_cluster.py
@@ -0,0 +1,14 @@
+#!/usr/bin/env python
+
+import subprocess
+import os.path
+
+dir_name = os.path.dirname(__file__)
+for org_code in ["Bsub", "Cagg", "Hpyl", "Mjan"]:
+ script_name = "test_kineticsTools_%s.sh" % org_code
+ with open(script_name, "w") as f:
+ f.write("#!/bin/sh\n")
+ f.write("module load cram\n")
+ f.write("time cram %s/detect_and_identify_%s.t\n" % (dir_name, org_code))
+ subprocess.call(["chmod","755", "%s" % script_name])
+ subprocess.call(["qsub", "-cwd", "-pe", "smp", "12", "%s" % script_name])
diff --git a/test/cram/methyl-fraction-case-ctrl.t b/test/cram/methyl-fraction-case-ctrl.t
new file mode 100644
index 0000000..25e9fd1
--- /dev/null
+++ b/test/cram/methyl-fraction-case-ctrl.t
@@ -0,0 +1,35 @@
+Test methyl-fraction mode of ipdSummary with a control dataset.
+
+ $ . $TESTDIR/portability.sh
+
+Load in data:
+
+ $ DATA=$TESTDIR/../data
+ $ INPUT=$DATA/p4-c2-lambda-mod-decode.cmp.h5
+ $ REFERENCE=$DATA/lambda/sequence/lambda.fasta
+
+Run basic ipdSummary:
+
+ $ ipdSummary --numWorkers 1 --methylFraction --csv tmp.csv --gff tmp.gff --summary_h5 tmp.h5 --control $INPUT --reference $REFERENCE $INPUT
+
+Look at output csv file:
+
+ $ head -3 tmp.csv
+ refName,tpl,strand,base,score,pvalue,caseMean,controlMean,caseStd,controlStd,ipdRatio,testStatistic,coverage,controlCoverage,caseCoverage,frac,fracLow,fracUp
+ "lambda_NEB3011",*,?,?,*,*,*,*,*,*,1.000,0.0,*,*,*,,, (glob)
+ "lambda_NEB3011",*,?,?,*,*,*,*,*,*,1.000,0.0,*,*,*,,, (glob)
+
+Look at output gff file:
+
+ $ cat tmp.gff
+ ##gff-version 3
+ ##source ipdSummary * (glob)
+ ##source-commandline * (glob)
+ ##sequence-region lambda_NEB3011 1 48502
+
+
+What about the H5 file?
+
+ $ h5ls -r tmp.h5
+ / Group
+ /lambda_NEB3011 Dataset {48502}
diff --git a/test/cram/methyl-fraction-identify.t.off b/test/cram/methyl-fraction-identify.t.off
new file mode 100644
index 0000000..dbc1126
--- /dev/null
+++ b/test/cram/methyl-fraction-identify.t.off
@@ -0,0 +1,77 @@
+Test basic mode of ipdSummary.py.
+
+ $ . $TESTDIR/portability.sh
+
+Load in data:
+
+ $ DATA=$TESTDIR/../data
+ $ INPUT=$DATA/p4-c2-lambda-mod-decode.cmp.h5
+ $ REFERENCE=$DATA/lambda/sequence/lambda.fasta
+
+Run basic ipdSummary.py:
+
+ $ ipdSummary.py --pvalue 0.001 --numWorkers 1 --methylFraction --identify m6A,m4C --csv tmp.csv --gff tmp.gff --summary_h5 tmp.h5 --reference $REFERENCE $INPUT
+
+Look at output csv file:
+
+ $ head -3 tmp.csv
+ refName,tpl,strand,base,score,tMean,tErr,modelPrediction,ipdRatio,coverage,frac,fracLow,fracUp
+ "lambda_NEB3011",13190,0,A,3,1.119,0.096,1.125,0.995,3,,,
+ "lambda_NEB3011",13190,1,T,2,0.952,0.256,1.126,0.846,3,,,
+
+ $ grep 14756 tmp.csv
+ "lambda_NEB3011",14756,0,G,0,0.209,0.030,0.288,0.726,164,,,
+ "lambda_NEB3011",14756,1,C,23,0.692,0.049,0.519,1.333,165,0.370,0.154,0.614
+
+Look at output gff file:
+
+ $ sed 's/\t/ /g' tmp.gff
+ ##gff-version 3
+ ##source ipdSummary.py v2.0
+ ##source-commandline /home/UNIXHOME/dalexander/.virtualenvs/VE/bin/ipdSummary.py --pvalue 0.001 --numWorkers 1 --methylFraction --identify m6A,m4C --csv tmp.csv --gff tmp.gff --summary_h5 tmp.h5 --reference /home/UNIXHOME/dalexander/Projects/software/smrtanalysis/bioinformatics/tools/kineticsTools/test/cram/../data/lambda/sequence/lambda.fasta /home/UNIXHOME/dalexander/Projects/software/smrtanalysis/bioinformatics/tools/kineticsTools/test/cram/../data/p4-c2-lambda-mod-decode.cmp.h5
+ ##sequence-region ref000001 1 48502
+ lambda_NEB3011 kinModCall modified_base 14060 14060 31 - . coverage=49;context=ACGTTATTGCGGAACTTACAACCGCTCAGGCATTTGCTGCA;IPDRatio=2.16
+ lambda_NEB3011 kinModCall modified_base 14657 14657 35 + . coverage=155;context=CGGCACAGCCGGGCGATGTGCTGCTGTGCTGTTTTGGTTCA;IPDRatio=1.56
+ lambda_NEB3011 kinModCall modified_base 14743 14743 32 - . coverage=173;context=TACCTCTCTCGTTTGCTCAGTTGTTCAGGAATATGGTGCAG;IPDRatio=1.51
+ lambda_NEB3011 kinModCall m4C 14756 14756 23 - . coverage=165;context=CCATTTGTCGGTGTACCTCTCTCGTTTGCTCAGTTGTTCAG;IPDRatio=1.33;frac=0.370;fracLow=0.154;fracUp=0.614;identificationQv=25
+ lambda_NEB3011 kinModCall modified_base 14769 14769 32 - . coverage=168;context=GTGTGCGTCGCTGCCATTTGTCGGTGTACCTCTCTCGTTTG;IPDRatio=1.57
+ lambda_NEB3011 kinModCall modified_base 14800 14800 34 - . coverage=173;context=GCGCGCCATGCCCGGTGACGCCAGAGGGAGTGTGTGCGTCG;IPDRatio=1.70
+ lambda_NEB3011 kinModCall modified_base 14831 14831 33 + . coverage=161;context=CATGGCGCGCATCTGCCTTTACGGGGATTTACAACGATTTG;IPDRatio=1.57
+ lambda_NEB3011 kinModCall modified_base 14834 14834 31 + . coverage=166;context=GGCGCGCATCTGCCTTTACGGGGATTTACAACGATTTGGTC;IPDRatio=1.69
+ lambda_NEB3011 kinModCall modified_base 14841 14841 32 + . coverage=162;context=ATCTGCCTTTACGGGGATTTACAACGATTTGGTCGCCGCAT;IPDRatio=1.58
+ lambda_NEB3011 kinModCall modified_base 14847 14847 35 - . coverage=172;context=AGGTCGATGCGGCGACCAAATCGTTGTAAATCCCCGTAAAG;IPDRatio=1.86
+ lambda_NEB3011 kinModCall modified_base 14864 14864 46 - . coverage=166;context=CCCCCGTTTTCACACGAAGGTCGATGCGGCGACCAAATCGT;IPDRatio=1.71
+ lambda_NEB3011 kinModCall m4C 14884 14884 30 - . coverage=173;context=CAGTGCCCGGATGGCTTCAGCCCCCGTTTTCACACGAAGGT;IPDRatio=2.18;frac=0.313;fracLow=0.169;fracUp=0.450;identificationQv=19
+ lambda_NEB3011 kinModCall modified_base 14909 14909 35 + . coverage=162;context=AGCCATCCGGGCACTGGCCACACAGCTCCCGGCGTTTCGTC;IPDRatio=1.77
+ lambda_NEB3011 kinModCall m6A 14983 14983 210 + . coverage=160;context=TTGCCGGGCGGGACGTCAGCACGTCCGGGTTAACGGCGCAG;IPDRatio=6.76;frac=0.982;fracLow=0.883;fracUp=0.992;identificationQv=194
+ lambda_NEB3011 kinModCall m6A 14992 14992 208 - . coverage=162;context=CTCATGTAACTGCGCCGTTAACCCGGACGTGCTGACGTCCC;IPDRatio=6.67;frac=0.854;fracLow=0.723;fracUp=0.934;identificationQv=194
+ lambda_NEB3011 kinModCall m4C 14997 14997 61 - . coverage=140;context=AGAGTCTCATGTAACTGCGCCGTTAACCCGGACGTGCTGAC;IPDRatio=2.35;frac=0.475;fracLow=0.080;fracUp=0.877;identificationQv=21
+ lambda_NEB3011 kinModCall m4C 14998 14998 23 - . coverage=172;context=CAGAGTCTCATGTAACTGCGCCGTTAACCCGGACGTGCTGA;IPDRatio=1.53;frac=0.336;fracLow=0.097;fracUp=0.553;identificationQv=48
+ lambda_NEB3011 kinModCall modified_base 15008 15008 47 + . coverage=162;context=CGGGTTAACGGCGCAGTTACATGAGACTCTGCCTGATGGCG;IPDRatio=1.73
+ lambda_NEB3011 kinModCall modified_base 15039 15039 51 - . coverage=169;context=CCGGCGACTCTGGGAACAATATGAATTACAGCGCCATCAGG;IPDRatio=1.83
+ lambda_NEB3011 kinModCall m6A 15041 15041 28 - . coverage=161;context=CCCCGGCGACTCTGGGAACAATATGAATTACAGCGCCATCA;IPDRatio=1.52;frac=0.207;fracLow=0.070;fracUp=0.355;identificationQv=4
+ lambda_NEB3011 kinModCall modified_base 15066 15066 37 + . coverage=161;context=CCCAGAGTCGCCGGGGCCAAGTCAGGTGGCGTATTCCAGAT;IPDRatio=1.68
+ lambda_NEB3011 kinModCall modified_base 15074 15074 33 - . coverage=171;context=CCAGGACAATCTGGAATACGCCACCTGACTTGGCCCCGGCG;IPDRatio=1.79
+ lambda_NEB3011 kinModCall modified_base 15078 15078 33 - . coverage=168;context=GCCCCCAGGACAATCTGGAATACGCCACCTGACTTGGCCCC;IPDRatio=1.61
+ lambda_NEB3011 kinModCall modified_base 15089 15089 32 - . coverage=167;context=CAATGGCGGCAGCCCCCAGGACAATCTGGAATACGCCACCT;IPDRatio=1.56
+ lambda_NEB3011 kinModCall modified_base 15095 15095 33 - . coverage=167;context=ATCCGGCAATGGCGGCAGCCCCCAGGACAATCTGGAATACG;IPDRatio=1.59
+ lambda_NEB3011 kinModCall modified_base 15118 15118 32 - . coverage=163;context=GGTGGCTCCGGCGGTAAAGAATGATCCGGCAATGGCGGCAG;IPDRatio=1.54
+ lambda_NEB3011 kinModCall modified_base 15121 15121 31 - . coverage=149;context=AAGGGTGGCTCCGGCGGTAAAGAATGATCCGGCAATGGCGG;IPDRatio=1.58
+ lambda_NEB3011 kinModCall modified_base 15124 15124 35 - . coverage=170;context=TGCAAGGGTGGCTCCGGCGGTAAAGAATGATCCGGCAATGG;IPDRatio=1.57
+ lambda_NEB3011 kinModCall modified_base 15195 15195 33 - . coverage=164;context=AGCACCATACTGGCACCGAGAGAAAACAGGATGCCGGTCAT;IPDRatio=1.60
+ lambda_NEB3011 kinModCall modified_base 15239 15239 33 + . coverage=157;context=TGGTGTGGCGCAGATGCTGGCACCGAAAGCCAGAACTCCCC;IPDRatio=1.64
+ lambda_NEB3011 kinModCall modified_base 15261 15261 38 - . coverage=167;context=CCGTTATCCGTTGTCTGTATACGGGGAGTTCTGGCTTTCGG;IPDRatio=1.73
+ lambda_NEB3011 kinModCall modified_base 15291 15291 31 + . coverage=158;context=ACGGATAACGGTAAGCAGAACACCTATTTCTCCTCACTGGA;IPDRatio=1.67
+ lambda_NEB3011 kinModCall modified_base 15292 15292 41 - . coverage=166;context=ATCCAGTGAGGAGAAATAGGTGTTCTGCTTACCGTTATCCG;IPDRatio=1.61
+ lambda_NEB3011 kinModCall m6A 15381 15381 21 - . coverage=169;context=GCCGTGCTGATCTCCTGAGAAACCACGCGTGACCCCACGCG;IPDRatio=1.41;frac=0.170;fracLow=0.086;fracUp=0.242;identificationQv=13
+ lambda_NEB3011 kinModCall modified_base 15402 15402 33 - . coverage=169;context=TGACCACCGTCCCCTTCGTCTGCCGTGCTGATCTCCTGAGA;IPDRatio=1.48
+ lambda_NEB3011 kinModCall m4C 15439 15439 30 + . coverage=164;context=GTCAGGTTGTGGTGATTGGTCGCTGATGCAAAATGTTTTAT;IPDRatio=1.57;frac=0.627;fracLow=0.346;fracUp=0.896;identificationQv=3
+ lambda_NEB3011 kinModCall modified_base 15566 15566 31 + . coverage=118;context=GAAGGACAACCTGAAGTCCACGCAGTTGCTGAGTGTGATCG;IPDRatio=1.78
+ lambda_NEB3011 kinModCall m6A 16380 16380 21 - . coverage=10;context=CATTTATCCACATCCGCCGCACCAAGACGTTTCCCCATGCC;IPDRatio=6.05;identificationQv=5
+ lambda_NEB3011 kinModCall m6A 16658 16658 21 - . coverage=6;context=GGGCGCTGAAGCTGTAGCGGAACGGCGCGCCATCATCCGGC;IPDRatio=2.78;identificationQv=21
+
+What about the H5 file?
+
+ $ h5ls -r tmp.h5
+ / Group
+ /ref000001 Dataset {48502}
diff --git a/test/cram/portability.sh b/test/cram/portability.sh
new file mode 100755
index 0000000..cea9fdf
--- /dev/null
+++ b/test/cram/portability.sh
@@ -0,0 +1,4 @@
+#
+# This is a portable alternative to "wc", which differs b/w GNU and BSD
+#
+alias linecount="awk 'END{print NR}'"
diff --git a/test/cram/version.t b/test/cram/version.t
new file mode 100644
index 0000000..0dd4ff3
--- /dev/null
+++ b/test/cram/version.t
@@ -0,0 +1,32 @@
+A simple test of the version and help options:
+
+ $ ipdSummary --version
+ 2.2
+
+ $ ipdSummary
+ usage: ipdSummary [-h] [-v] [--emit-tool-contract]
+ [--resolved-tool-contract RESOLVED_TOOL_CONTRACT]
+ [--log-level {DEBUG,INFO,WARNING,ERROR,CRITICAL}] [--debug]
+ --reference REFERENCE [--gff GFF] [--csv CSV]
+ [--numWorkers NUMWORKERS] [--pvalue PVALUE]
+ [--maxLength MAXLENGTH] [--identify IDENTIFY]
+ [--methylFraction] [--outfile OUTFILE] [--m5Cgff M5CGFF]
+ [--m5Cclassifer M5CCLASSIFIER] [--csv_h5 CSV_H5]
+ [--pickle PICKLE] [--summary_h5 SUMMARY_H5]
+ [--ms_csv MS_CSV] [--control CONTROL] [--useLDA]
+ [--paramsPath PARAMSPATH] [--minCoverage MINCOVERAGE]
+ [--maxQueueSize MAXQUEUESIZE] [--maxCoverage MAXCOVERAGE]
+ [--mapQvThreshold MAPQVTHRESHOLD] [--ipdModel IPDMODEL]
+ [--modelIters MODELITERS] [--cap_percentile CAP_PERCENTILE]
+ [--methylMinCov METHYLMINCOV]
+ [--identifyMinCov IDENTIFYMINCOV]
+ [--maxAlignments MAXALIGNMENTS]
+ [-w REFERENCEWINDOWSASSTRING]
+ [--refContigIndex REFCONTIGINDEX]
+ [-W REFERENCEWINDOWSASSTRING]
+ [--skipUnrecognizedContigs SKIPUNRECOGNIZEDCONTIGS]
+ [--alignmentSetRefWindows] [--threaded] [--profile]
+ [--usePdb] [--seed RANDOMSEED] [--verbose]
+ alignment_set
+ ipdSummary: error: too few arguments
+ [2]
diff --git a/test/data/c2-c2-lambda-mod-decode.cmp.h5 b/test/data/c2-c2-lambda-mod-decode.cmp.h5
new file mode 100644
index 0000000..4851790
Binary files /dev/null and b/test/data/c2-c2-lambda-mod-decode.cmp.h5 differ
diff --git a/test/data/lambda/reference.info.xml b/test/data/lambda/reference.info.xml
new file mode 100644
index 0000000..4ba0dd7
--- /dev/null
+++ b/test/data/lambda/reference.info.xml
@@ -0,0 +1,19 @@
+<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
+<reference_info xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" version="v1.4.0" last_modified="2012-08-09T16:20:05-0700" id="lambda" xsi:schemaLocation="http://www.w3.org/2001/XMLSchema-instance">
+ <reference>
+ <description>lambda</description>
+ <file format="text/fasta">./sequence/lambda.fasta</file>
+ <index_file type="indexer">./sequence/lambda.fasta.index</index_file>
+ <index_file type="sawriter">./sequence/lambda.fasta.sa</index_file>
+ <index_file type="sam_idx">./sequence/lambda.fasta.fai</index_file>
+ <max_contig_length>48502</max_contig_length>
+ <num_contigs>1</num_contigs>
+ <type>sample</type>
+ </reference>
+ <contigs>
+ <contig length="48502" id="ref000001" displayName="lambda_NEB3011">
+ <digest type="md5">a1319ff90e994c8190a4fe6569d0822a</digest>
+ <header>lambda_NEB3011</header>
+ </contig>
+ </contigs>
+</reference_info>
diff --git a/test/data/lambda/sequence/lambda.fasta b/test/data/lambda/sequence/lambda.fasta
new file mode 100644
index 0000000..8cf0bce
--- /dev/null
+++ b/test/data/lambda/sequence/lambda.fasta
@@ -0,0 +1,810 @@
+>lambda_NEB3011
+GGGCGGCGACCTCGCGGGTTTTCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCG
+TTCTTCTTCGTCATAACTTAATGTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAACG
+ACAGGTGCTGAAAGCGAGCTTTTTGGCCTCTGTCGTTTCCTTTCTCTGTTTTTGTCCGTG
+GAATGAACAATGGAAGTCAACAAAAAGCAGCTGGCTGACATTTTCGGTGCGAGTATCCGT
+ACCATTCAGAACTGGCAGGAACAGGGAATGCCCGTTCTGCGAGGCGGTGGCAAGGGTAAT
+GAGGTGCTTTATGACTCTGCCGCCGTCATAAAATGGTATGCCGAAAGGGATGCTGAAATT
+GAGAACGAAAAGCTGCGCCGGGAGGTTGAAGAACTGCGGCAGGCCAGCGAGGCAGATCTC
+CAGCCAGGAACTATTGAGTACGAACGCCATCGACTTACGCGTGCGCAGGCCGACGCACAG
+GAACTGAAGAATGCCAGAGACTCCGCTGAAGTGGTGGAAACCGCATTCTGTACTTTCGTG
+CTGTCGCGGATCGCAGGTGAAATTGCCAGTATTCTCGACGGGCTCCCCCTGTCGGTGCAG
+CGGCGTTTTCCGGAACTGGAAAACCGACATGTTGATTTCCTGAAACGGGATATCATCAAA
+GCCATGAACAAAGCAGCCGCGCTGGATGAACTGATACCGGGGTTGCTGAGTGAATATATC
+GAACAGTCAGGTTAACAGGCTGCGGCATTTTGTCCGCGCCGGGCTTCGCTCACTGTTCAG
+GCCGGAGCCACAGACCGCCGTTGAATGGGCGGATGCTAATTACTATCTCCCGAAAGAATC
+CGCATACCAGGAAGGGCGCTGGGAAACACTGCCCTTTCAGCGGGCCATCATGAATGCGAT
+GGGCAGCGACTACATCCGTGAGGTGAATGTGGTGAAGTCTGCCCGTGTCGGTTATTCCAA
+AATGCTGCTGGGTGTTTATGCCTACTTTATAGAGCATAAGCAGCGCAACACCCTTATCTG
+GTTGCCGACGGATGGTGATGCCGAGAACTTTATGAAAACCCACGTTGAGCCGACTATTCG
+TGATATTCCGTCGCTGCTGGCGCTGGCCCCGTGGTATGGCAAAAAGCACCGGGATAACAC
+GCTCACCATGAAGCGTTTCACTAATGGGCGTGGCTTCTGGTGCCTGGGCGGTAAAGCGGC
+AAAAAACTACCGTGAAAAGTCGGTGGATGTGGCGGGTTATGATGAACTTGCTGCTTTTGA
+TGATGATATTGAACAGGAAGGCTCTCCGACGTTCCTGGGTGACAAGCGTATTGAAGGCTC
+GGTCTGGCCAAAGTCCATCCGTGGCTCCACGCCAAAAGTGAGAGGCACCTGTCAGATTGA
+GCGTGCAGCCAGTGAATCCCCGCATTTTATGCGTTTTCATGTTGCCTGCCCGCATTGCGG
+GGAGGAGCAGTATCTTAAATTTGGCGACAAAGAGACGCCGTTTGGCCTCAAATGGACGCC
+GGATGACCCCTCCAGCGTGTTTTATCTCTGCGAGCATAATGCCTGCGTCATCCGCCAGCA
+GGAGCTGGACTTTACTGATGCCCGTTATATCTGCGAAAAGACCGGGATCTGGACCCGTGA
+TGGCATTCTCTGGTTTTCGTCATCCGGTGAAGAGATTGAGCCACCTGACAGTGTGACCTT
+TCACATCTGGACAGCGTACAGCCCGTTCACCACCTGGGTGCAGATTGTCAAAGACTGGAT
+GAAAACGAAAGGGGATACGGGAAAACGTAAAACCTTCGTAAACACCACGCTCGGTGAGAC
+GTGGGAGGCGAAAATTGGCGAACGTCCGGATGCTGAAGTGATGGCAGAGCGGAAAGAGCA
+TTATTCAGCGCCCGTTCCTGACCGTGTGGCTTACCTGACCGCCGGTATCGACTCCCAGCT
+GGACCGCTACGAAATGCGCGTATGGGGATGGGGGCCGGGTGAGGAAAGCTGGCTGATTGA
+CCGGCAGATTATTATGGGCCGCCACGACGATGAACAGACGCTGCTGCGTGTGGATGAGGC
+CATCAATAAAACCTATACCCGCCGGAATGGTGCAGAAATGTCGATATCCCGTATCTGCTG
+GGATACTGGCGGGATTGACCCGACCATTGTGTATGAACGCTCGAAAAAACATGGGCTGTT
+CCGGGTGATCCCCATTAAAGGGGCATCCGTCTACGGAAAGCCGGTGGCCAGCATGCCACG
+TAAGCGAAACAAAAACGGGGTTTACCTTACCGAAATCGGTACGGATACCGCGAAAGAGCA
+GATTTATAACCGCTTCACACTGACGCCGGAAGGGGATGAACCGCTTCCCGGTGCCGTTCA
+CTTCCCGAATAACCCGGATATTTTTGATCTGACCGAAGCGCAGCAGCTGACTGCTGAAGA
+GCAGGTCGAAAAATGGGTGGATGGCAGGAAAAAAATACTGTGGGACAGCAAAAAGCGACG
+CAATGAGGCACTCGACTGCTTCGTTTATGCGCTGGCGGCGCTGCGCATCAGTATTTCCCG
+CTGGCAGCTGGATCTCAGTGCGCTGCTGGCGAGCCTGCAGGAAGAGGATGGTGCAGCAAC
+CAACAAGAAAACACTGGCAGATTACGCCCGTGCCTTATCCGGAGAGGATGAATGACGCGA
+CAGGAAGAACTTGCCGCTGCCCGTGCGGCACTGCATGACCTGATGACAGGTAAACGGGTG
+GCAACAGTACAGAAAGACGGACGAAGGGTGGAGTTTACGGCCACTTCCGTGTCTGACCTG
+AAAAAATATATTGCAGAGCTGGAAGTGCAGACCGGCATGACACAGCGACGCAGGGGACCT
+GCAGGATTTTATGTATGAAAACGCCCACCATTCCCACCCTTCTGGGGCCGGACGGCATGA
+CATCGCTGCGCGAATATGCCGGTTATCACGGCGGTGGCAGCGGATTTGGAGGGCAGTTGC
+GGTCGTGGAACCCACCGAGTGAAAGTGTGGATGCAGCCCTGTTGCCCAACTTTACCCGTG
+GCAATGCCCGCGCAGACGATCTGGTACGCAATAACGGCTATGCCGCCAACGCCATCCAGC
+TGCATCAGGATCATATCGTCGGGTCTTTTTTCCGGCTCAGTCATCGCCCAAGCTGGCGCT
+ATCTGGGCATCGGGGAGGAAGAAGCCCGTGCCTTTTCCCGCGAGGTTGAAGCGGCATGGA
+AAGAGTTTGCCGAGGATGACTGCTGCTGCATTGACGTTGAGCGAAAACGCACGTTTACCA
+TGATGATTCGGGAAGGTGTGGCCATGCACGCCTTTAACGGTGAACTGTTCGTTCAGGCCA
+CCTGGGATACCAGTTCGTCGCGGCTTTTCCGGACACAGTTCCGGATGGTCAGCCCGAAGC
+GCATCAGCAACCCGAACAATACCGGCGACAGCCGGAACTGCCGTGCCGGTGTGCAGATTA
+ATGACAGCGGTGCGGCGCTGGGATATTACGTCAGCGAGGACGGGTATCCTGGCTGGATGC
+CGCAGAAATGGACATGGATACCCCGTGAGTTACCCGGCGGGCGCGCCTCGTTCATTCACG
+TTTTTGAACCCGTGGAGGACGGGCAGACTCGCGGTGCAAATGTGTTTTACAGCGTGATGG
+AGCAGATGAAGATGCTCGACACGCTGCAGAACACGCAGCTGCAGAGCGCCATTGTGAAGG
+CGATGTATGCCGCCACCATTGAGAGTGAGCTGGATACGCAGTCAGCGATGGATTTTATTC
+TGGGCGCGAACAGTCAGGAGCAGCGGGAAAGGCTGACCGGCTGGATTGGTGAAATTGCCG
+CGTATTACGCCGCAGCGCCGGTCCGGCTGGGAGGCGCAAAAGTACCGCACCTGATGCCGG
+GTGACTCACTGAACCTGCAGACGGCTCAGGATACGGATAACGGCTACTCCGTGTTTGAGC
+AGTCACTGCTGCGGTATATCGCTGCCGGGCTGGGTGTCTCGTATGAGCAGCTTTCCCGGA
+ATTACGCCCAGATGAGCTACTCCACGGCACGGGCCAGTGCGAACGAGTCGTGGGCGTACT
+TTATGGGGCGGCGAAAATTCGTCGCATCCCGTCAGGCGAGCCAGATGTTTCTGTGCTGGC
+TGGAAGAGGCCATCGTTCGCCGCGTGGTGACGTTACCTTCAAAAGCGCGCTTCAGTTTTC
+AGGAAGCCCGCAGTGCCTGGGGGAACTGCGACTGGATAGGCTCCGGTCGTATGGCCATCG
+ATGGTCTGAAAGAAGTTCAGGAAGCGGTGATGCTGATAGAAGCCGGACTGAGTACCTACG
+AGAAAGAGTGCGCAAAACGCGGTGACGACTATCAGGAAATTTTTGCCCAGCAGGTCCGTG
+AAACGATGGAGCGCCGTGCAGCCGGTCTTAAACCGCCCGCCTGGGCGGCTGCAGCATTTG
+AATCCGGGCTGCGACAATCAACAGAGGAGGAGAAGAGTGACAGCAGAGCTGCGTAATCTC
+CCGCATATTGCCAGCATGGCCTTTAATGAGCCGCTGATGCTTGAACCCGCCTATGCGCGG
+GTTTTCTTTTGTGCGCTTGCAGGCCAGCTTGGGATCAGCAGCCTGACGGATGCGGTGTCC
+GGCGACAGCCTGACTGCCCAGGAGGCACTCGCGACGCTGGCATTATCCGGTGATGATGAC
+GGACCACGACAGGCCCGCAGTTATCAGGTCATGAACGGCATCGCCGTGCTGCCGGTGTCC
+GGCACGCTGGTCAGCCGGACGCGGGCGCTGCAGCCGTACTCGGGGATGACCGGTTACAAC
+GGCATTATCGCCCGTCTGCAACAGGCTGCCAGCGATCCGATGGTGGACGGCATTCTGCTC
+GATATGGACACGCCCGGCGGGATGGTGGCGGGGGCATTTGACTGCGCTGACATCATCGCC
+CGTGTGCGTGACATAAAACCGGTATGGGCGCTTGCCAACGACATGAACTGCAGTGCAGGT
+CAGTTGCTTGCCAGTGCCGCCTCCCGGCGTCTGGTCACGCAGACCGCCCGGACAGGCTCC
+ATCGGCGTCATGATGGCTCACAGTAATTACGGTGCTGCGCTGGAGAAACAGGGTGTGGAA
+ATCACGCTGATTTACAGCGGCAGCCATAAGGTGGATGGCAACCCCTACAGCCATCTTCCG
+GATGACGTCCGGGAGACACTGCAGTCCCGGATGGACGCAACCCGCCAGATGTTTGCGCAG
+AAGGTGTCGGCATATACCGGCCTGTCCGTGCAGGTTGTGCTGGATACCGAGGCTGCAGTG
+TACAGCGGTCAGGAGGCCATTGATGCCGGACTGGCTGATGAACTTGTTAACAGCACCGAT
+GCGATCACCGTCATGCGTGATGCACTGGATGCACGTAAATCCCGTCTCTCAGGAGGGCGA
+ATGACCAAAGAGACTCAATCAACAACTGTTTCAGCCACTGCTTCGCAGGCTGACGTTACT
+GACGTGGTGCCAGCGACGGAGGGCGAGAACGCCAGCGCGGCGCAGCCGGACGTGAACGCG
+CAGATCACCGCAGCGGTTGCGGCAGAAAACAGCCGCATTATGGGGATCCTCAACTGTGAG
+GAGGCTCACGGACGCGAAGAACAGGCACGCGTGCTGGCAGAAACCCCCGGTATGACCGTG
+AAAACGGCCCGCCGCATTCTGGCCGCAGCACCACAGAGTGCACAGGCGCGCAGTGACACT
+GCGCTGGATCGTCTGATGCAGGGGGCACCGGCACCGCTGGCTGCAGGTAACCCGGCATCT
+GATGCCGTTAACGATTTGCTGAACACACCAGTGTAAGGGATGTTTATGACGAGCAAAGAA
+ACCTTTACCCATTACCAGCCGCAGGGCAACAGTGACCCGGCTCATACCGCAACCGCGCCC
+GGCGGATTGAGTGCGAAAGCGCCTGCAATGACCCCGCTGATGCTGGACACCTCCAGCCGT
+AAGCTGGTTGCGTGGGATGGCACCACCGACGGTGCTGCCGTTGGCATTCTTGCGGTTGCT
+GCTGACCAGACCAGCACCACGCTGACGTTCTACAAGTCCGGCACGTTCCGTTATGAGGAT
+GTGCTCTGGCCGGAGGCTGCCAGCGACGAGACGAAAAAACGGACCGCGTTTGCCGGAACG
+GCAATCAGCATCGTTTAACTTTACCCTTCATCACTAAAGGCCGCCTGTGCGGCTTTTTTT
+ACGGGATTTTTTTATGTCGATGTACACAACCGCCCAACTGCTGGCGGCAAATGAGCAGAA
+ATTTAAGTTTGATCCGCTGTTTCTGCGTCTCTTTTTCCGTGAGAGCTATCCCTTCACCAC
+GGAGAAAGTCTATCTCTCACAAATTCCGGGACTGGTAAACATGGCGCTGTACGTTTCGCC
+GATTGTTTCCGGTGAGGTTATCCGTTCCCGTGGCGGCTCCACCTCTGAATTTACGCCGGG
+ATATGTCAAGCCGAAGCATGAAGTGAATCCGCAGATGACCCTGCGTCGCCTGCCGGATGA
+AGATCCGCAGAATCTGGCGGACCCGGCTTACCGCCGCCGTCGCATCATCATGCAGAACAT
+GCGTGACGAAGAGCTGGCCATTGCTCAGGTCGAAGAGATGCAGGCAGTTTCTGCCGTGCT
+TAAGGGCAAATACACCATGACCGGTGAAGCCTTCGATCCGGTTGAGGTGGATATGGGCCG
+CAGTGAGGAGAATAACATCACGCAGTCCGGCGGCACGGAGTGGAGCAAGCGTGACAAGTC
+CACGTATGACCCGACCGACGATATCGAAGCCTACGCGCTGAACGCCAGCGGTGTGGTGAA
+TATCATCGTGTTCGATCCGAAAGGCTGGGCGCTGTTCCGTTCCTTCAAAGCCGTCAAGGA
+GAAGCTGGATACCCGTCGTGGCTCTAATTCCGAGCTGGAGACAGCGGTGAAAGACCTGGG
+CAAAGCGGTGTCCTATAAGGGGATGTATGGCGATGTGGCCATCGTCGTGTATTCCGGACA
+GTACGTGGAAAACGGCGTCAAAAAGAACTTCCTGCCGGACAACACGATGGTGCTGGGGAA
+CACTCAGGCACGCGGTCTGCGCACCTATGGCTGCATTCAGGATGCGGACGCACAGCGCGA
+AGGCATTAACGCCTCTGCCCGTTACCCGAAAAACTGGGTGACCACCGGCGATCCGGCGCG
+TGAGTTCACCATGATTCAGTCAGCACCGCTGATGCTGCTGGCTGACCCTGATGAGTTCGT
+GTCCGTACAACTGGCGTAATCATGGCCCTTCGGGGCCATTGTTTCTCTGTGGAGGAGTCC
+ATGACGAAAGATGAACTGATTGCCCGTCTCCGCTCGCTGGGTGAACAACTGAACCGTGAT
+GTCAGCCTGACGGGGACGAAAGAAGAACTGGCGCTCCGTGTGGCAGAGCTGAAAGAGGAG
+CTTGATGACACGGATGAAACTGCCGGTCAGGACACCCCTCTCAGCCGGGAAAATGTGCTG
+ACCGGACATGAAAATGAGGTGGGATCAGCGCAGCCGGATACCGTGATTCTGGATACGTCT
+GAACTGGTCACGGTCGTGGCACTGGTGAAGCTGCATACTGATGCACTTCACGCCACGCGG
+GATGAACCTGTGGCATTTGTGCTGCCGGGAACGGCGTTTCGTGTCTCTGCCGGTGTGGCA
+GCCGAAATGACAGAGCGCGGCCTGGCCAGAATGCAATAACGGGAGGCGCTGTGGCTGATT
+TCGATAACCTGTTCGATGCTGCCATTGCCCGCGCCGATGAAACGATACGCGGGTACATGG
+GAACGTCAGCCACCATTACATCCGGTGAGCAGTCAGGTGCGGTGATACGTGGTGTTTTTG
+ATGACCCTGAAAATATCAGCTATGCCGGACAGGGCGTGCGCGTTGAAGGCTCCAGCCCGT
+CCCTGTTTGTCCGGACTGATGAGGTGCGGCAGCTGCGGCGTGGAGACACGCTGACCATCG
+GTGAGGAAAATTTCTGGGTAGATCGGGTTTCGCCGGATGATGGCGGAAGTTGTCATCTCT
+GGCTTGGACGGGGCGTACCGCCTGCCGTTAACCGTCGCCGCTGAAAGGGGGATGTATGGC
+CATAAAAGGTCTTGAGCAGGCCGTTGAAAACCTCAGCCGTATCAGCAAAACGGCGGTGCC
+TGGTGCCGCCGCAATGGCCATTAACCGCGTTGCTTCATCCGCGATATCGCAGTCGGCGTC
+ACAGGTTGCCCGTGAGACAAAGGTACGCCGGAAACTGGTAAAGGAAAGGGCCAGGCTGAA
+AAGGGCCACGGTCAAAAATCCGCAGGCCAGAATCAAAGTTAACCGGGGGGATTTGCCCGT
+AATCAAGCTGGGTAATGCGCGGGTTGTCCTTTCGCGCCGCAGGCGTCGTAAAAAGGGGCA
+GCGTTCATCCCTGAAAGGTGGCGGCAGCGTGCTTGTGGTGGGTAACCGTCGTATTCCCGG
+CGCGTTTATTCAGCAACTGAAAAATGGCCGGTGGCATGTCATGCAGCGTGTGGCTGGGAA
+AAACCGTTACCCCATTGATGTGGTGAAAATCCCGATGGCGGTGCCGCTGACCACGGCGTT
+TAAACAAAATATTGAGCGGATACGGCGTGAACGTCTTCCGAAAGAGCTGGGCTATGCGCT
+GCAGCATCAACTGAGGATGGTAATAAAGCGATGAAACATACTGAACTCCGTGCAGCCGTA
+CTGGATGCACTGGAGAAGCATGACACCGGGGCGACGTTTTTTGATGGTCGCCCCGCTGTT
+TTTGATGAGGCGGATTTTCCGGCAGTTGCCGTTTATCTCACCGGCGCTGAATACACGGGC
+GAAGAGCTGGACAGCGATACCTGGCAGGCGGAGCTGCATATCGAAGTTTTCCTGCCTGCT
+CAGGTGCCGGATTCAGAGCTGGATGCGTGGATGGAGTCCCGGATTTATCCGGTGATGAGC
+GATATCCCGGCACTGTCAGATTTGATCACCAGTATGGTGGCCAGCGGCTATGACTACCGG
+CGCGACGATGATGCGGGCTTGTGGAGTTCAGCCGATCTGACTTATGTCATTACCTATGAA
+ATGTGAGGACGCTATGCCTGTACCAAATCCTACAATGCCGGTGAAAGGTGCCGGGACCAC
+CCTGTGGGTTTATAAGGGGAGCGGTGACCCTTACGCGAATCCGCTTTCAGACGTTGACTG
+GTCGCGTCTGGCAAAAGTTAAAGACCTGACGCCCGGCGAACTGACCGCTGAGTCCTATGA
+CGACAGCTATCTCGATGATGAAGATGCAGACTGGACTGCGACCGGGCAGGGGCAGAAATC
+TGCCGGAGATACCAGCTTCACGCTGGCGTGGATGCCCGGAGAGCAGGGGCAGCAGGCGCT
+GCTGGCGTGGTTTAATGAAGGCGATACCCGTGCCTATAAAATCCGCTTCCCGAACGGCAC
+GGTCGATGTGTTCCGTGGCTGGGTCAGCAGTATCGGTAAGGCGGTGACGGCGAAGGAAGT
+GATCACCCGCACGGTGAAAGTCACCAATGTGGGACGTCCGTCGATGGCAGAAGATCGCAG
+CACGGTAACAGCGGCAACCGGCATGACCGTGACGCCTGCCAGCACCTCGGTGGTGAAAGG
+GCAGAGCACCACGCTGACCGTGGCCTTCCAGCCGGAGGGCGTAACCGACAAGAGCTTTCG
+TGCGGTGTCTGCGGATAAAACAAAAGCCACCGTGTCGGTCAGTGGTATGACCATCACCGT
+GAACGGCGTTGCTGCAGGCAAGGTCAACATTCCGGTTGTATCCGGTAATGGTGAGTTTGC
+TGCGGTTGCAGAAATTACCGTCACCGCCAGTTAATCCGGAGAGTCAGCGATGTTCCTGAA
+AACCGAATCATTTGAACATAACGGTGTGACCGTCACGCTTTCTGAACTGTCAGCCCTGCA
+GCGCATTGAGCATCTCGCCCTGATGAAACGGCAGGCAGAACAGGCGGAGTCAGACAGCAA
+CCGGAAGTTTACTGTGGAAGACGCCATCAGAACCGGCGCGTTTCTGGTGGCGATGTCCCT
+GTGGCATAACCATCCGCAGAAGACGCAGATGCCGTCCATGAATGAAGCCGTTAAACAGAT
+TGAGCAGGAAGTGCTTACCACCTGGCCCACGGAGGCAATTTCTCATGCTGAAAACGTGGT
+GTACCGGCTGTCTGGTATGTATGAGTTTGTGGTGAATAATGCCCCTGAACAGACAGAGGA
+CGCCGGGCCCGCAGAGCCTGTTTCTGCGGGAAAGTGTTCGACGGTGAGCTGAGTTTTGCC
+CTGAAACTGGCGCGTGAGATGGGGCGACCCGACTGGCGTGCCATGCTTGCCGGGATGTCA
+TCCACGGAGTATGCCGACTGGCACCGCTTTTACAGTACCCATTATTTTCATGATGTTCTG
+CTGGATATGCACTTTTCCGGGCTGACGTACACCGTGCTCAGCCTGTTTTTCAGCGATCCG
+GATATGCATCCGCTGGATTTCAGTCTGCTGAACCGGCGCGAGGCTGACGAAGAGCCTGAA
+GATGATGTGCTGATGCAGAAAGCGGCAGGGCTTGCCGGAGGTGTCCGCTTTGGCCCGGAC
+GGGAATGAAGTTATCCCCGCTTCCCCGGATGTGGCGGACATGACGGAGGATGACGTAATG
+CTGATGACAGTATCAGAAGGGATCGCAGGAGGAGTCCGGTATGGCTGAACCGGTAGGCGA
+TCTGGTCGTTGATTTGAGTCTGGATGCGGCCAGATTTGACGAGCAGATGGCCAGAGTCAG
+GCGTCATTTTTCTGGTACGGAAAGTGATGCGAAAAAAACAGCGGCAGTCGTTGAACAGTC
+GCTGAGCCGACAGGCGCTGGCTGCACAGAAAGCGGGGATTTCCGTCGGGCAGTATAAAGC
+CGCCATGCGTATGCTGCCTGCACAGTTCACCGACGTGGCCACGCAGCTTGCAGGCGGGCA
+AAGTCCGTGGCTGATCCTGCTGCAACAGGGGGGGCAGGTGAAGGACTCCTTCGGCGGGAT
+GATCCCCATGTTCAGGGGGCTTGCCGGTGCGATCACCCTGCCGATGGTGGGGGCCACCTC
+GCTGGCGGTGGCGACCGGTGCGCTGGCGTATGCCTGGTATCAGGGCAACTCAACCCTGTC
+CGATTTCAACAAAACGCTGGTCCTTTCCGGCAATCAGGCGGGACTGACGGCAGATCGTAT
+GCTGGTCCTGTCCAGAGCCGGGCAGGCGGCAGGGCTGACGTTTAACCAGACCAGCGAGTC
+ACTCAGCGCACTGGTTAAGGCGGGGGTAAGCGGTGAGGCTCAGATTGCGTCCATCAGCCA
+GAGTGTGGCGCGTTTCTCCTCTGCATCCGGCGTGGAGGTGGACAAGGTCGCTGAAGCCTT
+CGGGAAGCTGACCACAGACCCGACGTCGGGGCTGACGGCGATGGCTCGCCAGTTCCATAA
+CGTGTCGGCGGAGCAGATTGCGTATGTTGCTCAGTTGCAGCGTTCCGGCGATGAAGCCGG
+GGCATTGCAGGCGGCGAACGAGGCCGCAACGAAAGGGTTTGATGACCAGACCCGCCGCCT
+GAAAGAGAACATGGGCACGCTGGAGACCTGGGCAGACAGGACTGCGCGGGCATTCAAATC
+CATGTGGGATGCGGTGCTGGATATTGGTCGTCCTGATACCGCGCAGGAGATGCTGATTAA
+GGCAGAGGCTGCGTATAAGAAAGCAGACGACATCTGGAATCTGCGCAAGGATGATTATTT
+TGTTAACGATGAAGCGCGGGCGCGTTACTGGGATGATCGTGAAAAGGCCCGTCTTGCGCT
+TGAAGCCGCCCGAAAGAAGGCTGAGCAGCAGACTCAACAGGACAAAAATGCGCAGCAGCA
+GAGCGATACCGAAGCGTCACGGCTGAAATATACCGAAGAGGCGCAGAAGGCTTACGAACG
+GCTGCAGACGCCGCTGGAGAAATATACCGCCCGTCAGGAAGAACTGAACAAGGCACTGAA
+AGACGGGAAAATCCTGCAGGCGGATTACAACACGCTGATGGCGGCGGCGAAAAAGGATTA
+TGAAGCGACGCTGAAAAAGCCGAAACAGTCCAGCGTGAAGGTGTCTGCGGGCGATCGTCA
+GGAAGACAGTGCTCATGCTGCCCTGCTGACGCTTCAGGCAGAACTCCGGACGCTGGAGAA
+GCATGCCGGAGCAAATGAGAAAATCAGCCAGCAGCGCCGGGATTTGTGGAAGGCGGAGAG
+TCAGTTCGCGGTACTGGAGGAGGCGGCGCAACGTCGCCAGCTGTCTGCACAGGAGAAATC
+CCTGCTGGCGCATAAAGATGAGACGCTGGAGTACAAACGCCAGCTGGCTGCACTTGGCGA
+CAAGGTTACGTATCAGGAGCGCCTGAACGCGCTGGCGCAGCAGGCGGATAAATTCGCACA
+GCAGCAACGGGCAAAACGGGCCGCCATTGATGCGAAAAGCCGGGGGCTGACTGACCGGCA
+GGCAGAACGGGAAGCCACGGAACAGCGCCTGAAGGAACAGTATGGCGATAATCCGCTGGC
+GCTGAATAACGTCATGTCAGAGCAGAAAAAGACCTGGGCGGCTGAAGACCAGCTTCGCGG
+GAACTGGATGGCAGGCCTGAAGTCCGGCTGGAGTGAGTGGGAAGAGAGCGCCACGGACAG
+TATGTCGCAGGTAAAAAGTGCAGCCACGCAGACCTTTGATGGTATTGCACAGAATATGGC
+GGCGATGCTGACCGGCAGTGAGCAGAACTGGCGCAGCTTCACCCGTTCCGTGCTGTCCAT
+GATGACAGAAATTCTGCTTAAGCAGGCAATGGTGGGGATTGTCGGGAGTATCGGCAGCGC
+CATTGGCGGGGCTGTTGGTGGCGGCGCATCCGCGTCAGGCGGTACAGCCATTCAGGCCGC
+TGCGGCGAAATTCCATTTTGCAACCGGAGGATTTACGGGAACCGGCGGCAAATATGAGCC
+AGCGGGGATTGTTCACCGTGGTGAGTTTGTCTTCACGAAGGAGGCAACCAGCCGGATTGG
+CGTGGGGAATCTTTACCGGCTGATGCGCGGCTATGCCACCGGCGGTTATGTCGGTACACC
+GGGCAGCATGGCAGACAGCCGGTCGCAGGCGTCCGGGACGTTTGAGCAGAATAACCATGT
+GGTGATTAACAACGACGGCACGAACGGGCAGATAGGTCCGGCTGCTCTGAAGGCGGTGTA
+TGACATGGCCCGCAAGGGTGCCCGTGATGAAATTCAGACACAGATGCGTGATGGTGGCCT
+GTTCTCCGGAGGTGGACGATGAAGACCTTCCGCTGGAAAGTGAAACCCGGTATGGATGTG
+GCTTCGGTCCCTTCTGTAAGAAAGGTGCGCTTTGGTGATGGCTATTCTCAGCGAGCGCCT
+GCCGGGCTGAATGCCAACCTGAAAACGTACAGCGTGACGCTTTCTGTCCCCCGTGAGGAG
+GCCACGGTACTGGAGTCGTTTCTGGAAGAGCACGGGGGCTGGAAATCCTTTCTGTGGACG
+CCGCCTTATGAGTGGCGGCAGATAAAGGTGACCTGCGCAAAATGGTCGTCGCGGGTCAGT
+ATGCTGCGTGTTGAGTTCAGCGCAGAGTTTGAACAGGTGGTGAACTGATGCAGGATATCC
+GGCAGGAAACACTGAATGAATGCACCCGTGCGGAGCAGTCGGCCAGCGTGGTGCTCTGGG
+AAATCGACCTGACAGAGGTCGGTGGAGAACGTTATTTTTTCTGTAATGAGCAGAACGAAA
+AAGGTGAGCCGGTCACCTGGCAGGGGCGACAGTATCAGCCGTATCCCATTCAGGGGAGCG
+GTTTTGAACTGAATGGCAAAGGCACCAGTACGCGCCCCACGCTGACGGTTTCTAACCTGT
+ACGGTATGGTCACCGGGATGGCGGAAGATATGCAGAGTCTGGTCGGCGGAACGGTGGTCC
+GGCGTAAGGTTTACGCCCGTTTTCTGGATGCGGTGAACTTCGTCAACGGAAACAGTTACG
+CCGATCCGGAGCAGGAGGTGATCAGCCGCTGGCGCATTGAGCAGTGCAGCGAACTGAGCG
+CGGTGAGTGCCTCCTTTGTACTGTCCACGCCGACGGAAACGGATGGCGCTGTTTTTCCGG
+GACGTATCATGCTGGCCAACACCTGCACCTGGACCTATCGCGGTGACGAGTGCGGTTATA
+GCGGTCCGGCTGTCGCGGATGAATATGACCAGCCAACGTCCGATATCACGAAGGATAAAT
+GCAGCAAATGCCTGAGCGGTTGTAAGTTCCGCAATAACGTCGGCAACTTTGGCGGCTTCC
+TTTCCATTAACAAACTTTCGCAGTAAATCCCATGACACAGACAGAATCAGCGATTCTGGC
+GCACGCCCGGCGATGTGCGCCAGCGGAGTCGTGCGGCTTCGTGGTAAGCACGCCGGAGGG
+GGAAAGATATTTCCCCTGCGTGAATATCTCCGGTGAGCCGGAGGCGTATTTCCGTATGTC
+GCCGGAAGACTGGCTGCAGGCAGAAATGCAGGGTGAGATTGTGGCGCTGGTCCACAGCCA
+CCCCGGTGGTCTGCCCTGGCTGAGTGAGGCCGACCGGCGGCTGCAGGTGCAGAGTGATTT
+GCCGTGGTGGCTGGTCTGCCGGGGGACGATTCATAAGTTCCGCTGTGTGCCGCATCTCAC
+CGGGCGGCGCTTTGAGCACGGTGTGACGGACTGTTACACACTGTTCCGGGATGCTTATCA
+TCTGGCGGGGATTGAGATGCCGGACTTTCATCGTGAGGATGACTGGTGGCGTAACGGCCA
+GAATCTCTATCTGGATAATCTGGAGGCGACGGGGCTGTATCAGGTGCCGTTGTCAGCGGC
+ACAGCCGGGCGATGTGCTGCTGTGCTGTTTTGGTTCATCAGTGCCGAATCACGCCGCAAT
+TTACTGCGGCGACGGCGAGCTGCTGCACCATATTCCTGAACAACTGAGCAAACGAGAGAG
+GTACACCGACAAATGGCAGCGACGCACACACTCCCTCTGGCGTCACCGGGCATGGCGCGC
+ATCTGCCTTTACGGGGATTTACAACGATTTGGTCGCCGCATCGACCTTCGTGTGAAAACG
+GGGGCTGAAGCCATCCGGGCACTGGCCACACAGCTCCCGGCGTTTCGTCAGAAACTGAGC
+GACGGCTGGTATCAGGTACGGATTGCCGGGCGGGACGTCAGCACGTCCGGGTTAACGGCG
+CAGTTACATGAGACTCTGCCTGATGGCGCTGTAATTCATATTGTTCCCAGAGTCGCCGGG
+GCCAAGTCAGGTGGCGTATTCCAGATTGTCCTGGGGGCTGCCGCCATTGCCGGATCATTC
+TTTACCGCCGGAGCCACCCTTGCAGCATGGGGGGCAGCCATTGGGGCCGGTGGTATGACC
+GGCATCCTGTTTTCTCTCGGTGCCAGTATGGTGCTCGGTGGTGTGGCGCAGATGCTGGCA
+CCGAAAGCCAGAACTCCCCGTATACAGACAACGGATAACGGTAAGCAGAACACCTATTTC
+TCCTCACTGGATAACATGGTTGCCCAGGGCAATGTTCTGCCTGTTCTGTACGGGGAAATG
+CGCGTGGGGTCACGCGTGGTTTCTCAGGAGATCAGCACGGCAGACGAAGGGGACGGTGGT
+CAGGTTGTGGTGATTGGTCGCTGATGCAAAATGTTTTATGTGAAACCGCCTGCGGGCGGT
+TTTGTCATTTATGGAGCGTGAGGAATGGGTAAAGGAAGCAGTAAGGGGCATACCCCGCGC
+GAAGCGAAGGACAACCTGAAGTCCACGCAGTTGCTGAGTGTGATCGATGCCATCAGCGAA
+GGGCCGATTGAAGGTCCGGTGGATGGCTTAAAAAGCGTGCTGCTGAACAGTACGCCGGTG
+CTGGACACTGAGGGGAATACCAACATATCCGGTGTCACGGTGGTGTTCCGGGCTGGTGAG
+CAGGAGCAGACTCCGCCGGAGGGATTTGAATCCTCCGGCTCCGAGACGGTGCTGGGTACG
+GAAGTGAAATATGACACGCCGATCACCCGCACCATTACGTCTGCAAACATCGACCGTCTG
+CGCTTTACCTTCGGTGTACAGGCACTGGTGGAAACCACCTCAAAGGGTGACAGGAATCCG
+TCGGAAGTCCGCCTGCTGGTTCAGATACAACGTAACGGTGGCTGGGTGACGGAAAAAGAC
+ATCACCATTAAGGGCAAAACCACCTCGCAGTATCTGGCCTCGGTGGTGATGGGTAACCTG
+CCGCCGCGCCCGTTTAATATCCGGATGCGCAGGATGACGCCGGACAGCACCACAGACCAG
+CTGCAGAACAAAACGCTCTGGTCGTCATACACTGAAATCATCGATGTGAAACAGTGCTAC
+CCGAACACGGCACTGGTCGGCGTGCAGGTGGACTCGGAGCAGTTCGGCAGCCAGCAGGTG
+AGCCGTAATTATCATCTGCGCGGGCGTATTCTGCAGGTGCCGTCGAACTATAACCCGCAG
+ACGCGGCAATACAGCGGTATCTGGGACGGAACGTTTAAACCGGCATACAGCAACAACATG
+GCCTGGTGTCTGTGGGATATGCTGACCCATCCGCGCTACGGCATGGGGAAACGTCTTGGT
+GCGGCGGATGTGGATAAATGGGCGCTGTATGTCATCGGCCAGTACTGCGACCAGTCAGTG
+CCGGACGGCTTTGGCGGCACGGAGCCGCGCATCACCTGTAATGCGTACCTGACCACACAG
+CGTAAGGCGTGGGATGTGCTCAGCGATTTCTGCTCGGCGATGCGCTGTATGCCGGTATGG
+AACGGGCAGACGCTGACGTTCGTGCAGGACCGACCGTCGGATAAGACGTGGACCTATAAC
+CGCAGTAATGTGGTGATGCCGGATGATGGCGCGCCGTTCCGCTACAGCTTCAGCGCCCTG
+AAGGACCGCCATAATGCCGTTGAGGTGAACTGGATTGACCCGAACAACGGCTGGGAGACG
+GCGACAGAGCTTGTTGAAGATACGCAGGCCATTGCCCGTTACGGTCGTAATGTTACGAAG
+ATGGATGCCTTTGGCTGTACCAGCCGGGGGCAGGCACACCGCGCCGGGCTGTGGCTGATT
+AAAACAGAACTGCTGGAAACGCAGACCGTGGATTTCAGCGTCGGCGCAGAAGGGCTTCGC
+CATGTACCGGGCGATGTTATTGAAATCTGCGATGATGACTATGCCGGTATCAGCACCGGT
+GGTCGTGTGCTGGCGGTGAACAGCCAGACCCGGACGCTGACGCTCGACCGTGAAATCACG
+CTGCCATCCTCCGGTACCGCGCTGATAAGCCTGGTTGACGGAAGTGGCAATCCGGTCAGC
+GTGGAGGTTCAGTCCGTCACCGACGGCGTGAAGGTAAAAGTGAGCCGTGTTCCTGACGGT
+GTTGCTGAATACAGCGTATGGGAGCTGAAGCTGCCGACGCTGCGCCAGCGACTGTTCCGC
+TGCGTGAGTATCCGTGAGAACGACGACGGCACGTATGCCATCACCGCCGTGCAGCATGTG
+CCGGAAAAAGAGGCCATCGTGGATAACGGGGCGCACTTTGACGGCGAACAGAGTGGCACG
+GTGAATGGTGTCACGCCGCCAGCGGTGCAGCACCTGACCGCAGAAGTCACTGCAGACAGC
+GGGGAATATCAGGTGCTGGCGCGATGGGACACACCGAAGGTGGTGAAGGGCGTGAGTTTC
+CTGCTCCGTCTGACCGTAACAGCGGACGACGGCAGTGAGCGGCTGGTCAGCACGGCCCGG
+ACGACGGAAACCACATACCGCTTCACGCAACTGGCGCTGGGGAACTACAGGCTGACAGTC
+CGGGCGGTAAATGCGTGGGGGCAGCAGGGCGATCCGGCGTCGGTATCGTTCCGGATTGCC
+GCACCGGCAGCACCGTCGAGGATTGAGCTGACGCCGGGCTATTTTCAGATAACCGCCACG
+CCGCATCTTGCCGTTTATGACCCGACGGTACAGTTTGAGTTCTGGTTCTCGGAAAAGCAG
+ATTGCGGATATCAGACAGGTTGAAACCAGCACGCGTTATCTTGGTACGGCGCTGTACTGG
+ATAGCCGCCAGTATCAATATCAAACCGGGCCATGATTATTACTTTTATATCCGCAGTGTG
+AACACCGTTGGCAAATCGGCATTCGTGGAGGCCGTCGGTCGGGCGAGCGATGATGCGGAA
+GGTTACCTGGATTTTTTCAAAGGCAAGATAACCGAATCCCATCTCGGCAAGGAGCTGCTG
+GAAAAAGTCGAGCTGACGGAGGATAACGCCAGCAGACTGGAGGAGTTTTCGAAAGAGTGG
+AAGGATGCCAGTGATAAGTGGAATGCCATGTGGGCTGTCAAAATTGAGCAGACCAAAGAC
+GGCAAACATTATGTCGCGGGTATTGGCCTCAGCATGGAGGACACGGAGGAAGGCAAACTG
+AGCCAGTTTCTGGTTGCCGCCAATCGTATCGCATTTATTGACCCGGCAAACGGGAATGAA
+ACGCCGATGTTTGTGGCGCAGGGCAACCAGATATTCATGAACGACGTGTTCCTGAAGCGC
+CTGACGGCCCCCACCATTACCAGCGGCGGCAATCCTCCGGCCTTTTCCCTGACACCGGAC
+GGAAAGCTGACCGCTAAAAATGCGGATATCAGTGGCAGTGTGAATGCGAACTCCGGGACG
+CTCAGTAATGTGACGATAGCTGAAAACTGTACGATAAACGGTACGCTGAGGGCGGAAAAA
+ATCGTCGGGGACATTGTAAAGGCGGCGAGCGCGGCTTTTCCGCGCCAGCGTGAAAGCAGT
+GTGGACTGGCCGTCAGGTACCCGTACTGTCACCGTGACCGATGACCATCCTTTTGATCGC
+CAGATAGTGGTGCTTCCGCTGACGTTTCGCGGAAGTAAGCGTACTGTCAGCGGCAGGACA
+ACGTATTCGATGTGTTATCTGAAAGTACTGATGAACGGTGCGGTGATTTATGATGGCGCG
+GCGAACGAGGCGGTACAGGTGTTCTCCCGTATTGTTGACATGCCAGCGGGTCGGGGAAAC
+GTGATCCTGACGTTCACGCTTACGTCCACACGGCATTCGGCAGATATTCCGCCGTATACG
+TTTGCCAGCGATGTGCAGGTTATGGTGATTAAGAAACAGGCGCTGGGCATCAGCGTGGTC
+TGAGTGTGTTACAGAGGTTCGTCCGGGAACGGGCGTTTTATTATAAAACAGTGAGAGGTG
+AACGATGCGTAATGTGTGTATTGCCGTTGCTGTCTTTGCCGCACTTGCGGTGACAGTCAC
+TCCGGCCCGTGCGGAAGGTGGACATGGTACGTTTACGGTGGGCTATTTTCAAGTGAAACC
+GGGTACATTGCCGTCGTTGTCGGGCGGGGATACCGGTGTGAGTCATCTGAAAGGGATTAA
+CGTGAAGTACCGTTATGAGCTGACGGACAGTGTGGGGGTGATGGCTTCCCTGGGGTTCGC
+CGCGTCGAAAAAGAGCAGCACAGTGATGACCGGGGAGGATACGTTTCACTATGAGAGCCT
+GCGTGGACGTTATGTGAGCGTGATGGCCGGACCGGTTTTACAAATCAGTAAGCAGGTCAG
+TGCGTACGCCATGGCCGGAGTGGCTCACAGTCGGTGGTCCGGCAGTACAATGGATTACCG
+TAAGACGGAAATCACTCCCGGGTATATGAAAGAGACGACCACTGCCAGGGACGAAAGTGC
+AATGCGGCATACCTCAGTGGCGTGGAGTGCAGGTATACAGATTAATCCGGCAGCGTCCGT
+CGTTGTTGATATTGCTTATGAAGGCTCCGGCAGTGGCGACTGGCGTACTGACGGATTCAT
+CGTTGGGGTCGGTTATAAATTCTGATTAGCCAGGTAACACAGTGTTATGACAGCCCGCCG
+GAACCGGTGGGCTTTTTTGTGGGGTGAATATGGCAGTAAAGATTTCAGGAGTCCTGAAAG
+ACGGCACAGGAAAACCGGTACAGAACTGCACCATTCAGCTGAAAGCCAGACGTAACAGCA
+CCACGGTGGTGGTGAACACGGTGGGCTCAGAGAATCCGGATGAAGCCGGGCGTTACAGCA
+TGGATGTGGAGTACGGTCAGTACAGTGTCATCCTGCAGGTTGACGGTTTTCCACCATCGC
+ACGCCGGGACCATCACCGTGTATGAAGATTCACAACCGGGGACGCTGAATGATTTTCTCT
+GTGCCATGACGGAGGATGATGCCCGGCCGGAGGTGCTGCGTCGTCTTGAACTGATGGTGG
+AAGAGGTGGCGCGTAACGCGTCCGTGGTGGCACAGAGTACGGCAGACGCGAAGAAATCAG
+CCGGCGATGCCAGTGCATCAGCTGCTCAGGTCGCGGCCCTTGTGACTGATGCAACTGACT
+CAGCACGCGCCGCCAGCACGTCCGCCGGACAGGCTGCATCGTCAGCTCAGGAAGCGTCCT
+CCGGCGCAGAAGCGGCATCAGCAAAGGCCACTGAAGCGGAAAAAAGTGCCGCAGCCGCAG
+AGTCCTCAAAAAACGCGGCGGCCACCAGTGCCGGTGCGGCGAAAACGTCAGAAACGAATG
+CTGCAGCGTCACAACAATCAGCCGCCACGTCTGCCTCCACCGCGGCCACGAAAGCGTCAG
+AGGCCGCCACTTCAGCACGAGATGCGGTGGCCTCAAAAGAGGCAGCAAAATCATCAGAAA
+CGAACGCATCATCAAGTGCCGGTCGTGCAGCTTCCTCGGCAACGGCGGCAGAAAATTCTG
+CCAGGGCGGCAAAAACGTCCGAGACGAATGCCAGGTCATCTGAAACAGCAGCGGAACGGA
+GCGCCTCTGCCGCGGCAGACGCAAAAACAGCGGCGGCGGGGAGTGCGTCAACGGCATCCA
+CGAAGGCGACAGAGGCTGCGGGAAGTGCGGTATCAGCATCGCAGAGCAAAAGTGCGGCAG
+AAGCGGCGGCAATACGTGCAAAAAATTCGGCAAAACGTGCAGAAGATATAGCTTCAGCTG
+TCGCGCTTGAGGATGCGGACACAACGAGAAAGGGGATAGTGCAGCTCAGCAGTGCAACCA
+ACAGCACGTCTGAAACGCTTGCTGCAACGCCAAAGGCGGTTAAGGTGGTAATGGATGAAA
+CGAACAGAAAAGCCCACTGGACAGTCCGGCACTGACCGGAACGCCAACAGCACCAACCGC
+GCTCAGGGGAACAAACAATACCCAGATTGCGAACACCGCTTTTGTACTGGCCGCGATTGC
+AGATGTTATCGACGCGTCACCTGACGCACTGAATACGCTGAATGAACTGGCCGCAGCGCT
+CGGGAATGATCCAGATTTTGCTACCACCATGACTAACGCGCTTGCGGGTAAACAACCGAA
+GAATGCGACACTGACGGCGCTGGCAGGGCTTTCCACGGCGAAAAATAAATTACCGTATTT
+TGCGGAAAATGATGCCGCCAGCCTGACTGAACTGACTCAGGTTGGCAGGGATATTCTGGC
+AAAAAATTCCGTTGCAGATGTTCTTGAATACCTTGGGGCCGGTGAGAATTCGGCCTTTCC
+GGCAGGTGCGCCGATCCCGTGGCCATCAGATATCGTTCCGTCTGGCTACGTCCTGATGCA
+GGGGCAGGCGTTTGACAAATCAGCCTACCCAAAACTTGCTGTCGCGTATCCATCGGGTGT
+GCTTCCTGATATGCGAGGCTGGACAATCAAGGGGAAACCCGCCAGCGGTCGTGCTGTATT
+GTCTCAGGAACAGGATGGAATTAAGTCGCACACCCACAGTGCCAGTGCATCCGGTACGGA
+TTTGGGGACGAAAACCACATCGTCGTTTGATTACGGGACGAAAACAACAGGCAGTTTCGA
+TTACGGCACCAAATCGACGAATAACACGGGGGCTCATGCTCACAGTCTGAGCGGTTCAAC
+AGGGGCCGCGGGTGCTCATGCCCACACAAGTGGTTTAAGGATGAACAGTTCTGGCTGGAG
+TCAGTATGGAACAGCAACCATTACAGGAAGTTTATCCACAGTTAAAGGAACCAGCACACA
+GGGTATTGCTTATTTATCGAAAACGGACAGTCAGGGCAGCCACAGTCACTCATTGTCCGG
+TACAGCCGTGAGTGCCGGTGCACATGCGCATACAGTTGGTATTGGTGCGCACCAGCATCC
+GGTTGTTATCGGTGCTCATGCCCATTCTTTCAGTATTGGTTCACACGGACACACCATCAC
+CGTTAACGCTGCGGGTAACGCGGAAAACACCGTCAAAAACATTGCATTTAACTATATTGT
+GAGGCTTGCATAATGGCATTCAGAATGAGTGAACAACCACGGACCATAAAAATTTATAAT
+CTGCTGGCCGGAACTAATGAATTTATTGGTGAAGGTGACGCATATATTCCGCCTCATACC
+GGTCTGCCTGCAAACAGTACCGATATTGCACCGCCAGATATTCCGGCTGGCTTTGTGGCT
+GTTTTCAACAGTGATGAGGCATCGTGGCATCTCGTTGAAGACCATCGGGGTAAAACCGTC
+TATGACGTGGCTTCCGGCGACGCGTTATTTATTTCTGAACTCGGTCCGTTACCGGAAAAT
+TTTACCTGGTTATCGCCGGGAGGGGAATATCAGAAGTGGAACGGCACAGCCTGGGTGAAG
+GATACGGAAGCAGAAAAACTGTTCCGGATCCGGGAGGCGGAAGAAACAAAAAAAAGCCTG
+ATGCAGGTAGCCAGTGAGCATATTGCGCCGCTTCAGGATGCTGCAGATCTGGAAATTGCA
+ACGAAGGAAGAAACCTCGTTGCTGGAAGCCTGGAAGAAGTATCGGGTGTTGCTGAACCGT
+GTTGATACATCAACTGCACCTGATATTGAGTGGCCTGCTGTCCCTGTTATGGAGTAATCG
+TTTTGTGATATGCCGCAGAAACGTTGTATGAAATAACGTTCTGCGGTTAGTTAGTATATT
+GTAAAGCTGAGTATTGGTTTATTTGGCGATTATTATCTTCAGGAGAATAATGGAAGTTCT
+ATGACTCAATTGTTCATAGTGTTTACATCACCGCCAATTGCTTTTAAGACTGAACGCATG
+AAATATGGTTTTTCGTCATGTTTTGAGTCTGCTGTTGATATTTCTAAAGTCGGTTTTTTT
+TCTTCGTTTTCTCTAACTATTTTCCATGAAATACATTTTTGATTATTATTTGAATCAATT
+CCAATTACCTGAAGTCTTTCATCTATAATTGGCATTGTATGTATTGGTTTATTGGAGTAG
+ATGCTTGCTTTTCTGAGCCATAGCTCTGATATCCAAATGAAGCCATAGGCATTTGTTATT
+TTGGCTCTGTCAGCTGCATAACGCCAAAAAATATATTTATCTGCTTGATCTTCAAATGTT
+GTATTGATTAAATCAATTGGATGGAATTGTTTATCATAAAAAATTAATGTTTGAATGTGA
+TAACCGTCCTTTAAAAAAGTCGTTTCTGCAAGCTTGGCTGTATAGTCAACTAACTCTTCT
+GTCGAAGTGATATTTTTAGGCTTATCTACCAGTTTTAGACGCTCTTTAATATCTTCAGGA
+ATTATTTTATTGTCATATTGTATCATGCTAAATGACAATTTGCTTATGGAGTAATCTTTT
+AATTTTAAATAAGTTATTCTCCTGGCTTCATCAAATAAAGAGTCGAATGATGTTGGCGAA
+ATCACATCGTCACCCATTGGATTGTTTATTTGTATGCCAAGAGAGTTACAGCAGTTATAC
+ATTCTGCCATAGATTATAGCTAAGGCATGTAATAATTCGTAATCTTTTAGCGTATTAGCG
+ACCCATCGTCTTTCTGATTTAATAATAGATGATTCAGTTAAATATGAAGGTAATTTCTTT
+TGTGCAAGTCTGACTAACTTTTTTATACCAATGTTTAACATACTTTCATTTGTAATAAAC
+TCAATGTCATTTTCTTCAATGTAAGATGAAATAAGAGTAGCCTTTGCCTCGCTATACATT
+TCTAAATCGCCTTGTTTTTCTATCGTATTGCGAGAATTTTTAGCCCAAGCCATTAATGGA
+TCATTTTTCCATTTTTCAATAACATTATTGTTATACCAAATGTCATATCCTATAATCTGG
+TTTTTGTTTTTTTGAATAATAAATGTTACTGTTCTTGCGGTTTGGAGGAATTGATTCAAA
+TTCAAGCGAAATAATTCAGGGTCAAAATATGTATCAATGCAGCATTTGAGCAAGTGCGAT
+AAATCTTTAAGTCTTCTTTCCCATGGTTTTTTAGTCATAAAACTCTCCATTTTGATAGGT
+TGCATGCTAGATGCTGATATATTTTAGAGGTGATAAAATTAACTGCTTAACTGTCAATGT
+AATACAAGTTGTTTGATCTTTGCAATGATTCTTATCAGAAACCATATAGTAAATTAGTTA
+CACAGGAAATTTTTAATATTATTATTATCATTCATTATGTATTAAAATTAGAGTTGTGGC
+TTGGCTCTGCTAACACGTTGCTCATAGGAGATATGGTAGAGCCGCAGACACGTCGTATGC
+AGGAACGTGCTGCGGCTGGCTGGTGAACTTCCGATAGTGCGGGTGTTGAATGATTTCCAG
+TTGCTACCGATTTTACATATTTTTTGCATGAGAGAATTTGTACCACCTCCCACCGACCAT
+CTATGACTGTACGCCACTGTCCCTAGGACTGCTATGTGCCGGAGCGGACATTACAAACGT
+CCTTCTCGGTGCATGCCACTGTTGCCAATGACCTGCCTAGGAATTGGTTAGCAAGTTACT
+ACCGGATTTTGTAAAAACAGCCCTCCTCATATAAAAAGTATTCGTTCACTTCCGATAAGC
+GTCGTAATTTTCTATCTTTCATCATATTCTAGATCCCTCTGAAAAAATCTTCCGAGTTTG
+CTAGGCACTGATACATAACTCTTTTCCAATAATTGGGGAAGTCATTCAAATCTATAATAG
+GTTTCAGATTTGCTTCAATAAATTCTGACTGTAGCTGCTGAAACGTTGCGGTTGAACTAT
+ATTTCCTTATAACTTTTACGAAAGAGTTTCTTTGAGTAATCACTTCACTCAAGTGCTTCC
+CTGCCTCCAAACGATACCTGTTAGCAATATTTAATAGCTTGAAATGATGAAGAGCTCTGT
+GTTTGTCTTCCTGCCTCCAGTTCGCCGGGCATTCAACATAAAAACTGATAGCACCCGGAG
+TTCCGGAAACGAAATTTGCATATACCCATTGCTCACGAAAAAAAATGTCCTTGTCGATAT
+AGGGATGAATCGCTTGGTGTACCTCATCTACTGCGAAAACTTGACCTTTCTCTCCCATAT
+TGCAGTCGCGGCACGATGGAACTAAATTAATAGGCATCACCGAAAATTCAGGATAATGTG
+CAATAGGAAGAAAATGATCTATATTTTTTGTCTGTCCTATATCACCACAAAATGGACATT
+TTTCACCTGATGAAACAAGCATGTCATCGTAATATGTTCTAGCGGGTTTGTTTTTATCTC
+GGAGATTATTTTCATAAAGCTTTTCTAATTTAACCTTTGTCAGGTTACCAACTACTAAGG
+TTGTAGGCTCAAGAGGGTGTGTCCTGTCGTAGGTAAATAACTGACCTGTCGAGCTTAATA
+TTCTATATTGTTGTTCTTTCTGCAAAAAAGTGGGGAAGTGAGTAATGAAATTATTTCTAA
+CATTTATCTGCATCATACCTTCCGAGCATTTATTAAGCATTTCGCTATAAGTTCTCGCTG
+GAAGAGGTAGTTTTTTCATTGTACTTTACCTTCATCTCTGTTCATTATCATCGCTTTTAA
+AACGGTTCGACCTTCTAATCCTATCTGACCATTATAATTTTTTAGAATGGTTTCATAAGA
+AAGCTCTGAATCAACGGACTGCGATAATAAGTGGTGGTATCCAGAATTTGTCACTTCAAG
+TAAAAACACCTCACGAGTTAAAACACCTAAGTTCTCACCGAATGTCTCAATATCCGGACG
+GATAATATTTATTGCTTCTCTTGACCGTAGGACTTTCCACATGCAGGATTTTGGAACCTC
+TTGCAGTACTACTGGGGAATGAGTTGCAATTATTGCTACACCATTGCGTGCATCGAGTAA
+GTCGCTTAATGTTCGTAAAAAAGCAGAGAGCAAAGGTGGATGCAGATGAACCTCTGGTTC
+ATCGAATAAAACTAATGACTTTTCGCCAACGACATCTACTAATCTTGTGATAGTAAATAA
+AACAATTGCATGTCCAGAGCTCATTCGAAGCAGATATTTCTGGATATTGTCATAAAACAA
+TTTAGTGAATTTATCATCGTCCACTTGAATCTGTGGTTCATTACGTCTTAACTCTTCATA
+TTTAGAAATGAGGCTGATGAGTTCCATATTTGAAAAGTTTTCATCACTACTTAGTTTTTT
+GATAGCTTCAAGCCAGAGTTGTCTTTTTCTATCTACTCTCATACAACCAATAAATGCTGA
+AATGAATTCTAAGCGGAGATCGCCTAGTGATTTTAAACTATTGCTGGCAGCATTCTTGAG
+TCCAATATAAAAGTATTGTGTACCTTTTGCTGGGTCAGGTTGTTCTTTAGGAGGAGTAAA
+AGGATCAAATGCACTAAACGAAACTGAAACAAGCGATCGAAAATATCCCTTTGGGATTCT
+TGACTCGATAAGTCTATTATTTTCAGAGAAAAAATATTCATTGTTTTCTGGGTTGGTGAT
+TGCACCAATCATTCCATTCAAAATTGTTGTTTTACCACACCCATTCCGCCCGATAAAAGC
+ATGAATGTTCGTGCTGGGCATAGAATTAACCGTCACCTCAAAAGGTATAGTTAAATCACT
+GAATCCGGGAGCACTTTTTCTATTAAATGAAAAGTGGAAATCTGACAATTCTGGCAAACC
+ATTTAACACACGTGCGAACTGTCCATGAATTTCTGAAAGAGTTACCCCTCTAAGTAATGA
+GGTGTTAAGGACGCTTTCATTTTCAATGTCGGCTAATCGATTTGGCCATACTACTAAATC
+CTGAATAGCTTTAAGAAGGTTATGTTTAAAACCATCGCTTAATTTGCTGAGATTAACATA
+GTAGTCAATGCTTTCACCTAAGGAAAAAAACATTTCAGGGAGTTGACTGAATTTTTTATC
+TATTAATGAATAAGTGCTTACTTCTTCTTTTTGACCTACAAAACCAATTTTAACATTTCC
+GATATCGCATTTTTCACCATGCTCATCAAAGACAGTAAGATAAAACATTGTAACAAAGGA
+ATAGTCATTCCAACCATCTGCTCGTAGGAATGCCTTATTTTTTTCTACTGCAGGAATATA
+CCCGCCTCTTTCAATAACACTAAACTCCAACATATAGTAACCCTTAATTTTATTAAAATA
+ACCGCAATTTATTTGGCGGCAACACAGGATCTCTCTTTTAAGTTACTCTCTATTACATAC
+GTTTTCCATCTAAAAATTAGTAGTATTGAACTTAACGGGGCATCGTATTGTAGTTTTCCA
+TATTTAGCTTTCTGCTTCCTTTTGGATAACCCACTGTTATTCATGTTGCATGGTGCACTG
+TTTATACCAACGATATAGTCTATTAATGCATATATAGTATCGCCGAACGATTAGCTCTTC
+AGGCTTCTGAAGAAGCGTTTCAAGTACTAATAAGCCGATAGATAGCCACGGACTTCGTAG
+CCATTTTTCATAAGTGTTAACTTCCGCTCCTCGCTCATAACAGACATTCACTACAGTTAT
+GGCGGAAAGGTATGCATGCTGGGTGTGGGGAAGTCGTGAAAGAAAAGAAGTCAGCTGCGT
+CGTTTGACATCACTGCTATCTTCTTACTGGTTATGCAGGTCGTAGTGGGTGGCACACAAA
+GCTTTGCACTGGATTGCGAGGCTTTGTGCTTCTCTGGAGTGCGACAGGTTTGATGACAAA
+AAATTAGCGCAAGAAGACAAAAATCACCTTGCGCTAATGCTCTGTTACAGGTCACTAATA
+CCATCTAAGTAGTTGATTCATAGTGACTGCATATGTTGTGTTTTACAGTATTATGTAGTC
+TGTTTTTTATGCAAAATCTAATTTAATATATTGATATTTATATCATTTTACGTTTCTCGT
+TCAGCTTTTTTATACTAAGTTGGCATTATAAAAAAGCATTGCTTATCAATTTGTTGCAAC
+GAACAGGTCACTATCAGTCAAAATAAAATCATTATTTGATTTCAATTTTGTCCCACTCCC
+TGCCTCTGTCATCACGATACTGTGATGCCATGGTGTCCGACTTATGCCCGAGAAGATGTT
+GAGCAAACTTATCGCTTATCTGCTTCTCATAGAGTCTTGCAGACAAACTGCGCAACTCGT
+GAAAGGTAGGCGGATCCCCTTCGAAGGAAAGACCTGATGCTTTTCGTGCGCGCATAAAAT
+ACCTTGATACTGTGCCGGATGAAAGCGGTTCGCGACGAGTAGATGCAATTATGGTTTCTC
+CGCCAAGAATCTCTTTGCATTTATCAAGTGTTTCCTTCATTGATATTCCGAGAGCATCAA
+TATGCAATGCTGTTGGGATGGCAATTTTTACGCCTGTTTTGCTTTGCTCGACATAAAGAT
+ATCCATCTACGATATCAGACCACTTCATTTCGCATAAATCACCAACTCGTTGCCCGGTAA
+CAACAGCCAGTTCCATTGCAAGTCTGAGCCAACATGGTGATGATTCTGCTGCTTGATAAA
+TTTTCAGGTATTCGTCAGCCGTAAGTCTTGATCTCCTTACCTCTGATTTTGCTGCGCGAG
+TGGCAGCGACATGGTTTGTTGTTATATGGCCTTCAGCTATTGCCTCTCGGAATGCATCGC
+TCAGTGTTGATCTGATTAACTTGGCTGACGCCGCCTTGCCCTCGTCTATGTATCCATTGA
+GCATTGCCGCAATTTCTTTTGTGGTGATGTCTTCAAGTGGAGCATCAGGCAGACCCCTCC
+TTATTGCTTTAATTTTGCTCATGTAATTTATGAGTGTCTTCTGCTTGATTCCTCTGCTGG
+CCAGGATTTTTTCGTAGCGATCAAGCCATGAATGTAACGTAACGGAATTATCACTGTTGA
+TTCTCGCTGTCAGAGGCTTGTGTTTGTGTCCTGAAAATAACTCAATGTTGGCCTGTATAG
+CTTCAGTGATTGCGATTCGCCTGTCTCTGCCTAATCCAAACTCTTTACCCGTCCTTGGGT
+CCCTGTAGCAGTAATATCCATTGTTTCTTATATAAAGGTTAGGGGGTAAATCCCGGCGCT
+CATGACTTCGCCTTCTTCCCATTTCTGATCCTCTTCAAAAGGCCACCTGTTACTGGTCGA
+TTTAAGTCAACCTTTACCGCTGATTCGTGGAACAGATACTCTCTTCCATCCTTAACCGGA
+GGTGGGAATATCCTGCATTCCCGAACCCATCGACGAACTGTTTCAAGGCTTCTTGGACGT
+CGCTGGCGTGCGTTCCACTCCTGAAGTGTCAAGTACATCGCAAAGTCTCCGCAATTACAC
+GCAAGAAAAAACCGCCATCAGGCGGCTTGGTGTTCTTTCAGTTCTTCAATTCGAATATTG
+GTTACGTCTGCATGTGCTATCTGCGCCCATATCATCCAGTGGTCGTAGCAGTCGTTGATG
+TTCTCCGCTTCGATAACTCTGTTGAATGGCTCTCCATTCCATTCTCCTGTGACTCGGAAG
+TGCATTTATCATCTCCATAAAACAAAACCCGCCGTAGCGAGTTCAGATAAAATAAATCCC
+CGCGAGTGCGAGGATTGTTATGTAATATTGGGTTTAATCATCTATATGTTTTGTACAGAG
+AGGGCAAGTATCGTTTCCACCGTACTCGTGATAATAATTTTGCACGGTATCAGTCATTTC
+TCGCACATTGCAGAATGGGGATTTGTCTTCATTAGACTTATAAACCTTCATGGAATATTT
+GTATGCCGACTCTATATCTATACCTTCATCTACATAAACACCTTCGTGATGTCTGCATGG
+AGACAAGACACCGGATCTGCACAACATTGATAACGCCCAATCTTTTTGCTCAGACTCTAA
+CTCATTGATACTCATTTATAAACTCCTTGCAATGTATGTCGTTTCAGCTAAACGGTATCA
+GCAATGTTTATGTAAAGAAACAGTAAGATAATACTCAACCCGATGTTTGAGTACGGTCAT
+CATCTGACACTACAGACTCTGGCATCGCTGTGAAGACGACGCGAAATTCAGCATTTTCAC
+AAGCGTTATCTTTTACAAAACCGATCTCACTCTCCTTTGATGCGAATGCCAGCGTCAGAC
+ATCATATGCAGATACTCACCTGCATCCTGAACCCATTGACCTCCAACCCCGTAATAGCGA
+TGCGTAATGATGTCGATAGTTACTAACGGGTCTTGTTCGATTAACTGCCGCAGAAACTCT
+TCCAGGTCACCAGTGCAGTGCTTGATAACAGGAGTCTTCCCAGGATGGCGAACAACAAGA
+AACTGGTTTCCGTCTTCACGGACTTCGTTGCTTTCCAGTTTAGCAATACGCTTACTCCCA
+TCCGAGATAACACCTTCGTAATACTCACGCTGCTCGTTGAGTTTTGATTTTGCTGTTTCA
+AGCTCAACACGCAGTTTCCCTACTGTTAGCGCAATATCCTCGTTCTCCTGGTCGCGGCGT
+TTGATGTATTGCTGGTTTCTTTCCCGTTCATCCAGCAGTTCCAGCACAATCGATGGTGTT
+ACCAATTCATGGAAAAGGTCTGCGTCAAATCCCCAGTCGTCATGCATTGCCTGCTCTGCC
+GCTTCACGCAGTGCCTGAGAGTTAATTTCGCTCACTTCGAACCTCTCTGTTTACTGATAA
+GTTCCAGATCCTCCTGGCAACTTGCACAAGTCCGACAACCCTGAACGACCAGGCGTCTTC
+GTTCATCTATCGGATCGCCACACTCACAACAATGAGTGGCAGATATAGCCTGGTGGTTCA
+GGCGGCGCATTTTTATTGCTGTGTTGCGCTGTAATTCTTCTATTTCTGATGCTGAATCAA
+TGATGTCTGCCATCTTTCATTAATCCCTGAACTGTTGGTTAATACGCTTGAGGGTGAATG
+CGAATAATAAAAAAGGAGCCTGTAGCTCCCTGATGATTTTGCTTTTCATGTTCATCGTTC
+CTTAAAGACGCCGTTTAACATGCCGATTGCCAGGCTTAAATGAGTCGGTGTGAATCCCAT
+CAGCGTTACCGTTTCGCGGTGCTTCTTCAGTACGCTACGGCAAATGTCATCGACGTTTTT
+ATCCGGAAACTGCTGTCTGGCTTTTTTTGATTTCAGAATTAGCCTGACGGGCAATGCTGC
+GAAGGGCGTTTTCCTGCTGAGGTGTCATTGAACAAGTCCCATGTCGGCAAGCATAAGCAC
+ACAGAATATGAAGCCCGCTGCCAGAAAAATGCATTCCGTGGTTGTCATACCTGGTTTCTC
+TCATCTGCTTCTGCTTTCGCCACCATCATTTCCAGCTTTTGTGAAAGGGATGCGGCTAAC
+GTATGAAATTCTTCGTCTGTTTCTACTGGTATTGGCACAAACCTGATTCCAATTTGAGCA
+AGGCTATGTGCCATCTCGATACTCGTTCTTAACTCAACAGAAGATGCTTTGTGCATACAG
+CCCCTCGTTTATTATTTATCTCCTCAGCCAGCCGCTGTGCTTTCAGTGGATTTCGGATAA
+CAGAAAGGCCGGGAAATACCCAGCCTCGCTTTGTAACGGAGTAGACGAAAGTGATTGCGC
+CTACCCGGATATTATCGTGAGGATGCGTCATCGCCATTGCTCCCCAAATACAAAACCAAT
+TTCAGCCAGTGCCTCGTCCATTTTTTCGATGAACTCCGGCACGATCTCGTCAAAACTCGC
+CATGTACTTTTCATCCCGCTCAATCACGACATAATGCAGGCCTTCACGCTTCATACGCGG
+GTCATAGTTGGCAAAGTACCAGGCATTTTTTCGCGTCACCCACATGCTGTACTGCACCTG
+GGCCATGTAAGCTGACTTTATGGCCTCGAAACCACCGAGCCGGAACTTCATGAAATCCCG
+GGAGGTAAACGGGCATTTCAGTTCAAGGCCGTTGCCGTCACTGCATAAACCATCGGGAGA
+GCAGGCGGTACGCATACTTTCGTCGCGATAGATGATCGGGGATTCAGTAACATTCACGCC
+GGAAGTGAATTCAAACAGGGTTCTGGCGTCGTTCTCGTACTGTTTTCCCCAGGCCAGTGC
+TTTAGCGTTAACTTCCGGAGCCACACCGGTGCAAACCTCAGCAAGCAGGGTGTGGAAGTA
+GGACATTTTCATGTCAGGCCACTTCTTTCCGGAGCGGGGTTTTGCTATCACGTTGTGAAC
+TTCTGAAGCGGTGATGACGCCGAGCCGTAATTTGTGCCACGCATCATCCCCCTGTTCGAC
+AGCTCTCACATCGATCCCGGTACGCTGCAGGATAATGTCCGGTGTCATGCTGCCACCTTC
+TGCTCTGCGGCTTTCTGTTTCAGGAATCCAAGAGCTTTTACTGCTTCGGCCTGTGTCAGT
+TCTGACGATGCACGAATGTCGCGGCGAAATATCTGGGAACAGAGCGGCAATAAGTCGTCA
+TCCCATGTTTTATCCAGGGCGATCAGCAGAGTGTTAATCTCCTGCATGGTTTCATCGTTA
+ACCGGAGTGATGTCGCGTTCCGGCTGACGTTCTGCAGTGTATGCAGTATTTTCGACAATG
+CGCTCGGCTTCATCCTTGTCATAGATACCAGCAAATCCGAAGGCCAGACGGGCACACTGA
+ATCATGGCTTTATGACGTAACATCCGTTTGGGATGCGACTGCCACGGCCCCGTGATTTCT
+CTGCCTTCGCGAGTTTTGAATGGTTCGCGGCGGCATTCATCCATCCATTCGGTAACGCAG
+ATCGGATGATTACGGTCCTTGCGGTAAATCCGGCATGTACAGGATTCATTGTCCTGCTCA
+AAGTCCATGCCATCAAACTGCTGGTTTTCATTGATGATGCGGGACCAGCCATCAACGCCC
+ACCACCGGAACGATGCCATTCTGCTTATCAGGAAAGGCGTAAATTTCTTTCGTCCACGGA
+TTAAGGCCGTACTGGTTGGCAACGATCAGTAATGCGATGAACTGCGCATCGCTGGCATCA
+CCTTTAAATGCCGTCTGGCGAAGAGTGGTGATCAGTTCCTGTGGGTCGACAGAATCCATG
+CCGACACGTTCAGCCAGCTTCCCAGCCAGCGTTGCGAGTGCAGTACTCATTCGTTTTATA
+CCTCTGAATCAATATCAACCTGGTGGTGAGCAATGGTTTCAACCATGTACCGGATGTGTT
+CTGCCATGCGCTCCTGAAACTCAACATCGTCATCAAACGCACGGGTAATGGATTTTTTGC
+TGGCCCCGTGGCGTTGCAAATGATCGATGCATAGCGATTCAAACAGGTGCTGGGGCAGGC
+CTTTTTCCATGTCGTCTGCCAGTTCTGCCTCTTTCTCTTCACGGGCGAGCTGCTGGTAGT
+GACGCGCCCAGCTCTGAGCCTCAAGACGATCCTGAATGTAATAAGCGTTCATGGCTGAAC
+TCCTGAAATAGCTGTGAAAATATCGCCCGCGAAATGCCGGGCTGATTAGGAAAACAGGAA
+AGGGGGTTAGTGAATGCTTTTGCTTGATCTCAGTTTCAGTATTAATATCCATTTTTTATA
+AGCGTCGACGGCTTCACGAAACATCTTTTCATCGCCAATAAAAGTGGCGATAGTGAATTT
+AGTCTGGATAGCCATAAGTGTTTGATCCATTCTTTGGGACTCCTGGCTGATTAAGTATGT
+CGATAAGGCGTTTCCATCCGTCACGTAATTTACGGGTGATTCGTTCAAGTAAAGATTCGG
+AAGGGCAGCCAGCAACAGGCCACCCTGCAATGGCATATTGCATGGTGTGCTCCTTATTTA
+TACATAACGAAAAACGCCTCGAGTGAAGCGTTATTGGTATGCGGTAAAACCGCACTCAGG
+CGGCCTTGATAGTCATATCATCTGAATCAAATATTCCTGATGTATCGATATCGGTAATTC
+TTATTCCTTCGCTACCATCCATTGGAGGCCATCCTTCCTGACCATTTCCATCATTCCAGT
+CGAACTCACACACAACACCATATGCATTTAAGTCGCTTGAAATTGCTATAAGCAGAGCAT
+GTTGCGCCAGCATGATTAATACAGCATTTAATACAGAGCCGTGTTTATTGAGTCGGTATT
+CAGAGTCTGACCAGAAATTATTAATCTGGTGAAGTTTTTCCTCTGTCATTACGTCATGGT
+CGATTTCAATTTCTATTGATGCTTTCCAGTCGTAATCAATGATGTATTTTTTGATGTTTG
+ACATCTGTTCATATCCTCACAGATAAAAAATCGCCCTCACACTGGAGGGCAAAGAAGATT
+TCCAATAATCAGAACAAGTCGGCTCCTGTTTAGTTACGAGCGACATTGCTCCGTGTATTC
+ACTCGTTGGAATGAATACACAGTGCAGTGTTTATTCTGTTATTTATGCCAAAAATAAAGG
+CCACTATCAGGCAGCTTTGTTGTTCTGTTTACCAAGTTCTCTGGCAATCATTGCCGTCGT
+TCGTATTGCCCATTTATCGACATATTTCCCATCTTCCATTACAGGAAACATTTCTTCAGG
+CTTAACCATGCATTCCGATTGCAGCTTGCATCCATTGCATCGCTTGAATTGTCCACACCA
+TTGATTTTTATCAATAGTCGTAGTCATACGGATAGTCCTGGTATTGTTCCATCACATCCT
+GAGGATGCTCTTCGAACTCTTCAAATTCTTCTTCCATATATCACCTTAAATAGTGGATTG
+CGGTAGTAAAGATTGTGCCTGTCTTTTAACCACATCAGGCTCGGTGGTTCTCGTGTACCC
+CTACAGCGAGAAATCGGATAAACTATTACAACCCCTACAGTTTGATGAGTATAGAAATGG
+ATCCACTCGTTATTCTCGGACGAGTGTTCAGTAATGAACCTCTGGAGAGAACCATGTATA
+TGATCGTTATCTGGGTTGGACTTCTGCTTTTAAGCCCAGATAACTGGCCTGAATATGTTA
+ATGAGAGAATCGGTATTCCTCATGTGTGGCATGTTTTCGTCTTTGCTCTTGCATTTTCGC
+TAGCAATTAATGTGCATCGATTATCAGCTATTGCCAGCGCCAGATATAAGCGATTTAAGC
+TAAGAAAACGCATTAAGATGCAAAACGATAAAGTGCGATCAGTAATTCAAAACCTTACAG
+AAGAGCAATCTATGGTTTTGTGCGCAGCCCTTAATGAAGGCAGGAAGTATGTGGTTACAT
+CAAAACAATTCCCATACATTAGTGAGTTGATTGAGCTTGGTGTGTTGAACAAAACTTTTT
+CCCGATGGAATGGAAAGCATATATTATTCCCTATTGAGGATATTTACTGGACTGAATTAG
+TTGCCAGCTATGATCCATATAATATTGAGATAAAGCCAAGGCCAATATCTAAGTAACTAG
+ATAAGAGGAATCGATTTTCCCTTAATTTTCTGGCGTCCACTGCATGTTATGCCGCGTTCG
+CCAGGCTTGCTGTACCATGTGCGCTGATTCTTGCGCTCAATACGTTGCAGGTTGCTTTCA
+ATCTGTTTGTGGTATTCAGCCAGCACTGTAAGGTCTATCGGATTTAGTGCGCTTTCTACT
+CGTGATTTCGGTTTGCGATTCAGCGAGAGAATAGGGCGGTTAACTGGTTTTGCGCTTACC
+CCAACCAACAGGGGATTTGCTGCTTTCCATTGAGCCTGTTTCTCTGCGCGACGTTCGCGG
+CGGCGTGTTTGTGCATCCATCTGGATTCTCCTGTCAGTTAGCTTTGGTGGTGTGTGGCAG
+TTGTAGTCCTGAACGAAAACCCCCCGCGATTGGCACATTGGCAGCTAATCCGGAATCGCA
+CTTACGGCCAATGCTTCGTTTCGTATCACACACCCCAAAGCCTTCTGCTTTGAATGCTGC
+CCTTCTTCAGGGCTTAATTTTTAAGAGCGTCACCTTCATGGTGGTCAGTGCGTCCTGCTG
+ATGTGCTCAGTATCACCGCCAGTGGTATTTATGTCAACACCGCCAGAGATAATTTATCAC
+CGCAGATGGTTATCTGTATGTTTTTTATATGAATTTATTTTTTGCAGGGGGGCATTGTTT
+GGTAGGTGAGAGATCTGAATTGCTATGTTTAGTGAGTTGTATCTATTTATTTTTCAATAA
+ATACAATTGGTTATGTGTTTTGGGGGCGATCGTGAGGCAAAGAAAACCCGGCGCTGAGGC
+CGGGTTATTCTTGTTCTCTGGTCAAATTATATAGTTGGAAAACAAGGATGCATATATGAA
+TGAACGATGCAGAGGCAATGCCGATGGCGATAGTGGGTATCATGTAGCCGCTTATGCTGG
+AAAGAAGCAATAACCCGCAGAAAAACAAAGCTCCAAGCTCAACAAAACTAAGGGCATAGA
+CAATAACTACCGATGTCATATACCCATACTCTCTAATCTTGGCCAGTCGGCGCGTTCTGC
+TTCCGATTAGAAACGTCAAGGCAGCAATCAGGATTGCAATCATGGTTCCTGCATATGATG
+ACAATGTCGCCCCAAGACCATCTCTATGAGCTGAAAAAGAAACACCAGGAATGTAGTGGC
+GGAAAAGGAGATAGCAAATGCTTACGATAACGTAAGGAATTATTACTATGTAAACACCAG
+GCATGATTCTGTTCCGCATAATTACTCCTGATAATTAATCCTTAACTTTGCCCACCTGCC
+TTTTAAAACATTCCAGTATATCACTTTTCATTCTTGCGTAGCAATATGCCATCTCTTCAG
+CTATCTCAGCATTGGTGACCTTGTTCAGAGGCGCTGAGAGATGGCCTTTTTCTGATAGAT
+AATGTTCTGTTAAAATATCTCCGGCCTCATCTTTTGCCCGCAGGCTAATGTCTGAAAATT
+GAGGTGACGGGTTAAAAATAATATCCTTGGCAACCTTTTTTATATCCCTTTTAAATTTTG
+GCTTAATGACTATATCCAATGAGTCAAAAAGCTCCCCTTCAATATCTGTTGCCCCTAAGA
+CCTTTAATATATCGCCAAATACAGGTAGCTTGGCTTCTACCTTCACCGTTGTTCGGCCGA
+TGAAATGCATATGCATAACATCGTCTTTGGTGGTTCCCCTCATCAGTGGCTCTATCTGAA
+CGCGCTCTCCACTGCTTAATGACATTCCTTTCCCGATTAAAAAATCTGTCAGATCGGATG
+TGGTCGGCCCGAAAACAGTTCTGGCAAAACCAATGGTGTCGCCTTCAACAAACAAAAAAG
+ATGGGAATCCCAATGATTCGTCATCTGCGAGGCTGTTCTTAATATCTTCAACTGAAGCTT
+TAGAGCGATTTATCTTCTGAACCAGACTCTTGTCATTTGTTTTGGTAAAGAGAAAAGTTT
+TTCCATCGATTTTATGAATATACAAATAATTGGAGCCAACCTGCAGGTGATGATTATCAG
+CCAGCAGAGAATTAAGGAAAACAGACAGGTTTATTGAGCGCTTATCTTTCCCTTTATTTT
+TGCTGCGGTAAGTCGCATAAAAACCATTCTTCATAATTCAATCCATTTACTATGTTATGT
+TCTGAGGGGAGTGAAAATTCCCCTAATTCGATGAAGATTCTTGCTCAATTGTTATCAGCT
+ATGCGCCGACCAGAACACCTTGCCGATCAGCCAAACGTCTCTTCAGGCCACTGACTAGCG
+ATAACTTTCCCCACAACGGAACAACTCTCATTGCATGGGATCATTGGGTACTGTGGGTTT
+AGTGGTTGTAAAAACACCTGACCGCTATCCCTGATCAGTTTCTTGAAGGTAAACTCATCA
+CCCCCAAGTCTGGCTATGCAGAAATCACCTGGCTCAACAGCCTGCTCAGGGTCAACGAGA
+ATTAACATTCCGTCAGGAAAGCTTGGCTTGGAGCCTGTTGGTGCGGTCATGGAATTACCT
+TCAACCTCAAGCCAGAATGCAGAATCACTGGCTTTTTTGGTTGTGCTTACCCATCTCTCC
+GCATCACCTTTGGTAAAGGTTCTAAGCTTAGGTGAGAACATCCCTGCCTGAACATGAGAA
+AAAACAGGGTACTCATACTCACTTCTAAGTGACGGCTGCATACTAACCGCTTCATACATC
+TCGTAGATTTCTCTGGCGATTGAAGGGCTAAATTCTTCAACGCTAACTTTGAGAATTTTT
+GTAAGCAATGCGGCGTTATAAGCATTTAATGCATTGATGCCATTAAATAAAGCACCAACG
+CCTGACTGCCCCATCCCCATCTTGTCTGCGACAGATTCCTGGGATAAGCCAAGTTCATTT
+TTCTTTTTTTCATAAATTGCTTTAAGGCGACGTGCGTCCTCAAGCTGCTCTTGTGTTAAT
+GGTTTCTTTTTTGTGCTCATACGTTAAATCTATCACCGCAAGGGATAAATATCTAACACC
+GTGCGTGTTGACTATTTTACCTCTGGCGGTGATAATGGTTGCATGTACTAAGGAGGTTGT
+ATGGAACAACGCATAACCCTGAAAGATTATGCAATGCGCTTTGGGCAAACCAAGACAGCT
+AAAGATCTCGGCGTATATCAAAGCGCGATCAACAAGGCCATTCATGCAGGCCGAAAGATT
+TTTTTAACTATAAACGCTGATGGAAGCGTTTATGCGGAAGAGGTAAAGCCCTTCCCGAGT
+AACAAAAAAACAACAGCATAAATAACCCCGCTCTTACACATTCCAGCCCTGAAAAAGGGC
+ATCAAATTAAACCACACCTATGGTGTATGCATTTATTTGCATACATTCAATCAATTGTTA
+TCTAAGGAAATACTTACATATGGTTCGTGCAAACAAACGCAACGAGGCTCTACGAATCGA
+GAGTGCGTTGCTTAACAAAATCGCAATGCTTGGAACTGAGAAGACAGCGGAAGCTGTGGG
+CGTTGATAAGTCGCAGATCAGCAGGTGGAAGAGGGACTGGATTCCAAAGTTCTCAATGCT
+GCTTGCTGTTCTTGAATGGGGGGTCGTTGACGACGACATGGCTCGATTGGCGCGACAAGT
+TGCTGCGATTCTCACCAATAAAAAACGCCCGGCGGCAACCGAGCGTTCTGAACAAATCCA
+GATGGAGTTCTGAGGTCATTACTGGATCTATCAACAGGAGTCATTATGACAAATACAGCA
+AAAATACTCAACTTCGGCAGAGGTAACTTTGCCGGACAGGAGCGTAATGTGGCAGATCTC
+GATGATGGTTACGCCAGACTATCAAATATGCTGCTTGAGGCTTATTCGGGCGCAGATCTG
+ACCAAGCGACAGTTTAAAGTGCTGCTTGCCATTCTGCGTAAAACCTATGGGTGGAATAAA
+CCAATGGACAGAATCACCGATTCTCAACTTAGCGAGATTACAAAGTTACCTGTCAAACGG
+TGCAATGAAGCCAAGTTAGAACTCGTCAGAATGAATATTATCAAGCAGCAAGGCGGCATG
+TTTGGACCAAATAAAAACATCTCAGAATGGTGCATCCCTCAAAACGAGGGAAAATCCCCT
+AAAACGAGGGATAAAACATCCCTCAAATTGGGGGATTGCTATCCCTCAAAACAGGGGGAC
+ACAAAAGACACTATTACAAAAGAAAAAAGAAAAGATTATTCGTCAGAGAATTCTGGCGAA
+TCCTCTGACCAGCCAGAAAACGACCTTTCTGTGGTGAAACCGGATGCTGCAATTCAGAGC
+GGCAGCAAGTGGGGGACAGCAGAAGACCTGACCGCCGCAGAGTGGATGTTTGACATGGTG
+AAGACTATCGCACCATCAGCCAGAAAACCGAATTTTGCTGGGTGGGCTAACGATATCCGC
+CTGATGCGTGAACGTGACGGACGTAACCACCGCGACATGTGTGTGCTGTTCCGCTGGGCA
+TGCCAGGACAACTTCTGGTCCGGTAACGTGCTGAGCCCGGCCAAACTCCGCGATAAGTGG
+ACCCAACTCGAAATCAACCGTAACAAGCAACAGGCAGGCGTGACAGCCAGCAAACCAAAA
+CTCGACCTGACAAACACAGACTGGATTTACGGGGTGGATCTATGAAAAACATCGCCGCAC
+AGATGGTTAACTTTGACCGTGAGCAGATGCGTCGGATCGCCAACAACATGCCGGAACAGT
+ACGACGAAAAGCCGCAGGTACAGCAGGTAGCGCAGATCATCAACGGTGTGTTCAGCCAGT
+TACTGGCAACTTTCCCGGCGAGCCTGGCTAACCGTGACCAGAACGAAGTGAACGAAATCC
+GTCGCCAGTGGGTTCTGGCTTTTCGGGAAAACGGGATCACCACGATGGAACAGGTTAACG
+CAGGAATGCGCGTAGCCCGTCGGCAGAATCGACCATTTCTGCCATCACCCGGGCAGTTTG
+TTGCATGGTGCCGGGAAGAAGCATCCGTTACCGCCGGACTGCCAAACGTCAGCGAGCTGG
+TTGATATGGTTTACGAGTATTGCCGGAAGCGAGGCCTGTATCCGGATGCGGAGTCTTATC
+CGTGGAAATCAAACGCGCACTACTGGCTGGTTACCAACCTGTATCAGAACATGCGGGCCA
+ATGCGCTTACTGATGCGGAATTACGCCGTAAGGCCGCAGATGAGCTTGTCCATATGACTG
+CGAGAATTAACCGTGGTGAGGCGATCCCTGAACCAGTAAAACAACTTCCTGTCATGGGCG
+GTAGACCTCTAAATCGTGCACAGGCTCTGGCGAAGATCGCAGAAATCAAAGCTAAGTTCG
+GACTGAAAGGAGCAAGTGTATGACGGGCAAAGAGGCAATTATTCATTACCTGGGGACGCA
+TAATAGCTTCTGTGCGCCGGACGTTGCCGCGCTAACAGGCGCAACAGTAACCAGCATAAA
+TCAGGCCGCGGCTAAAATGGCACGGGCAGGTCTTCTGGTTATCGAAGGTAAGGTCTGGCG
+AACGGTGTATTACCGGTTTGCTACCAGGGAAGAACGGGAAGGAAAGATGAGCACGAACCT
+GGTTTTTAAGGAGTGTCGCCAGAGTGCCGCGATGAAACGGGTATTGGCGGTATATGGAGT
+TAAAAGATGACCATCTACATTACTGAGCTAATAACAGGCCTGCTGGTAATCGCAGGCCTT
+TTTATTTGGGGGAGAGGGAAGTCATGAAAAAACTAACCTTTGAAATTCGATCTCCAGCAC
+ATCAGCAAAACGCTATTCACGCAGTACAGCAAATCCTTCCAGACCCAACCAAACCAATCG
+TAGTAACCATTCAGGAACGCAACCGCAGCTTAGACCAAAACAGGAAGCTATGGGCCTGCT
+TAGGTGACGTCTCTCGTCAGGTTGAATGGCATGGTCGCTGGCTGGATGCAGAAAGCTGGA
+AGTGTGTGTTTACCGCAGCATTAAAGCAGCAGGATGTTGTTCCTAACCTTGCCGGGAATG
+GCTTTGTGGTAATAGGCCAGTCAACCAGCAGGATGCGTGTAGGCGAATTTGCGGAGCTAT
+TAGAGCTTATACAGGCATTCGGTACAGAGCGTGGCGTTAAGTGGTCAGACGAAGCGAGAC
+TGGCTCTGGAGTGGAAAGCGAGATGGGGAGACAGGGCTGCATGATAAATGTCGTTAGTTT
+CTCCGGTGGCAGGACGTCAGCATATTTGCTCTGGCTAATGGAGCAAAAGCGACGGGCAGG
+TAAAGACGTGCATTACGTTTTCATGGATACAGGTTGTGAACATCCAATGACATATCGGTT
+TGTCAGGGAAGTTGTGAAGTTCTGGGATATACCGCTCACCGTATTGCAGGTTGATATCAA
+CCCGGAGCTTGGACAGCCAAATGGTTATACGGTATGGGAACCAAAGGATATTCAGACGCG
+AATGCCTGTTCTGAAGCCATTTATCGATATGGTAAAGAAATATGGCACTCCATACGTCGG
+CGGCGCGTTCTGCACTGACAGATTAAAACTCGTTCCCTTCACCAAATACTGTGATGACCA
+TTTCGGGCGAGGGAATTACACCACGTGGATTGGCATCAGAGCTGATGAACCGAAGCGGCT
+AAAGCCAAAGCCTGGAATCAGATATCTTGCTGAACTGTCAGACTTTGAGAAGGAAGATAT
+CCTCGCATGGTGGAAGCAACAACCATTCGATTTGCAAATACCGGAACATCTCGGTAACTG
+CATATTCTGCATTAAAAAATCAACGCAAAAAATCGGACTTGCCTGCAAAGATGAGGAGGG
+ATTGCAGCGTGTTTTTAATGAGGTCATCACGGGATCCCATGTGCGTGACGGACATCGGGA
+AACGCCAAAGGAGATTATGTACCGAGGAAGAATGTCGCTGGACGGTATCGCGAAAATGTA
+TTCAGAAAATGATTATCAAGCCCTGTATCAGGACATGGTACGAGCTAAAAGATTCGATAC
+CGGCTCTTGTTCTGAGTCATGCGAAATATTTGGAGGGCAGCTTGATTTCGACTTCGGGAG
+GGAAGCTGCATGATGCGATGTTATCGGTGCGGTGAATGCAAAGAAGATAACCGCTTCCGA
+CCAAATCAACCTTACTGGAATCGATGGTGTCTCCGGTGTGAAAGAACACCAACAGGGGTG
+TTACCACTACCGCAGGAAAAGGAGGACGTGTGGCGAGACAGCGACGAAGTATCACCGACA
+TAATCTGCGAAAACTGCAAATACCTTCCAACGAAACGCACCAGAAATAAACCCAAGCCAA
+TCCCAAAAGAATCTGACGTAAAAACCTTCAACTACACGGCTCACCTGTGGGATATCCGGT
+GGCTAAGACGTCGTGCGAGGAAAACAAGGTGATTGACCAAAATCGAAGTTACGAACAAGA
+AAGCGTCGAGCGAGCTTTAACGTGCGCTAACTGCGGTCAGAAGCTGCATGTGCTGGAAGT
+TCACGTGTGTGAGCACTGCTGCGCAGAACTGATGAGCGATCCGAATAGCTCGATGCACGA
+GGAAGAAGATGATGGCTAAACCAGCGCGAAGACGATGTAAAAACGATGAATGCCGGGAAT
+GGTTTCACCCTGCATTCGCTAATCAGTGGTGGTGCTCTCCAGAGTGTGGAACCAAGATAG
+CACTCGAACGACGAAGTAAAGAACGCGAAAAAGCGGAAAAAGCAGCAGAGAAGAAACGAC
+GACGAGAGGAGCAGAAACAGAAAGATAAACTTAAGATTCGAAAACTCGCCTTAAAGCCCC
+GCAGTTACTGGATTAAACAAGCCCAACAAGCCGTAAACGCCTTCATCAGAGAAAGAGACC
+GCGACTTACCATGTATCTCGTGCGGAACGCTCACGTCTGCTCAGTGGGATGCCGGACATT
+ACCGGACAACTGCTGCGGCACCTCAACTCCGATTTAATGAACGCAATATTCACAAGCAAT
+GCGTGGTGTGCAACCAGCACAAAAGCGGAAATCTCGTTCCGTATCGCGTCGAACTGATTA
+GCCGCATCGGGCAGGAAGCAGTAGACGAAATCGAATCAAACCATAACCGCCATCGCTGGA
+CTATCGAAGAGTGCAAGGCGATCAAGGCAGAGTACCAACAGAAACTCAAAGACCTGCGAA
+ATAGCAGAAGTGAGGCCGCATGACGTTCTCAGTAAAAACCATTCCAGACATGCTCGTTGA
+AACATACGGAAATCAGACAGAAGTAGCACGCAGACTGAAATGTAGTCGCGGTACGGTCAG
+AAAATACGTTGATGATAAAGACGGGAAAATGCACGCCATCGTCAACGACGTTCTCATGGT
+TCATCGCGGATGGAGTGAAAGAGATGCGCTATTACGAAAAAATTGATGGCAGCAAATACC
+GAAATATTTGGGTAGTTGGCGATCTGCACGGATGCTACACGAACCTGATGAACAAACTGG
+ATACGATTGGATTCGACAACAAAAAAGACCTGCTTATCTCGGTGGGCGATTTGGTTGATC
+GTGGTGCAGAGAACGTTGAATGCCTGGAATTAATCACATTCCCCTGGTTCAGAGCTGTAC
+GTGGAAACCATGAGCAAATGATGATTGATGGCTTATCAGAGCGTGGAAACGTTAATCACT
+GGCTGCTTAATGGCGGTGGCTGGTTCTTTAATCTCGATTACGACAAAGAAATTCTGGCTA
+AAGCTCTTGCCCATAAAGCAGATGAACTTCCGTTAATCATCGAACTGGTGAGCAAAGATA
+AAAAATATGTTATCTGCCACGCCGATTATCCCTTTGACGAATACGAGTTTGGAAAGCCAG
+TTGATCATCAGCAGGTAATCTGGAACCGCGAACGAATCAGCAACTCACAAAACGGGATCG
+TGAAAGAAATCAAAGGCGCGGACACGTTCATCTTTGGTCATACGCCAGCAGTGAAACCAC
+TCAAGTTTGCCAACCAAATGTATATCGATACCGGCGCAGTGTTCTGCGGAAACCTAACAT
+TGATTCAGGTACAGGGAGAAGGCGCATGAGACTCGAAAGCGTAGCTAAATTTCATTCGCC
+AAAAAGCCCGATGATGAGCGACTCACCACGGGCCACGGCTTCTGACTCTCTTTCCGGTAC
+TGATGTGATGGCTGCTATGGGGATGGCGCAATCACAAGCCGGATTCGGTATGGCTGCATT
+CTGCGGTAAGCACGAACTCAGCCAGAACGACAAACAAAAGGCTATCAACTATCTGATGCA
+ATTTGCACACAAGGTATCGGGGAAATACCGTGGTGTGGCAAAGCTTGAAGGAAATACTAA
+GGCAAAGGTACTGCAAGTGCTCGCAACATTCGCTTATGCGGATTATTGCCGTAGTGCCGC
+GACGCCGGGGGCAAGATGCAGAGATTGCCATGGTACAGGCCGTGCGGTTGATATTGCCAA
+AACAGAGCTGTGGGGGAGAGTTGTCGAGAAAGAGTGCGGAAGATGCAAAGGCGTCGGCTA
+TTCAAGGATGCCAGCAAGCGCAGCATATCGCGCTGTGACGATGCTAATCCCAAACCTTAC
+CCAACCCACCTGGTCACGCACTGTTAAGCCGCTGTATGACGCTCTGGTGGTGCAATGCCA
+CAAAGAAGAGTCAATCGCAGACAACATTTTGAATGCGGTCACACGTTAGCAGCATGATTG
+CCACGGATGGCAACATATTAACGGCATGATATTGACTTATTGAATAAAATTGGGTAAATT
+TGACTCAACGATGGGTTAATTCGCTCGTTGTGGTAGTGAGATGAAAAGAGGCGGCGCTTA
+CTACCGATTCCGCCTAGTTGGTCACTTCGACGTATCGTCTGGAACTCCAACCATCGCAGG
+CAGAGAGGTCTGCAAAATGCAATCCCGAAACAGTTCGCAGGTAATAGTTAGAGCCTGCAT
+AACGGTTTCGGGATTTTTTATATCTGCACAACAGGTAAGAGCATTGAGTCGATAATCGTG
+AAGAGTCGGCGAGCCTGGTTAGCCAGTGCTCTTTCCGTTGTGCTGAATTAAGCGAATACC
+GGAAGCAGAACCGGATCACCAAATGCGTACAGGCGTCATCGCCGCCCAGCAACAGCACAA
+CCCAAACTGAGCCGTAGCCACTGTCTGTCCTGAATTCATTAGTAATAGTTACGCTGCGGC
+CTTTTACACATGACCTTCGTGAAAGCGGGTGGCAGGAGGTCGCGCTAACAACCTCCTGCC
+GTTTTGCCCGTGCATATCGGTCACGAACAAATCTGATTACTAAACACAGTAGCCTGGATT
+TGTTCTATCAGTAATCGACCTTATTCCTAATTAAATAGAGCAAATCCCCTTATTGGGGGT
+AAGACATGAAGATGCCAGAAAAACATGACCTGTTGGCCGCCATTCTCGCGGCAAAGGAAC
+AAGGCATCGGGGCAATCCTTGCGTTTGCAATGGCGTACCTTCGCGGCAGATATAATGGCG
+GTGCGTTTACAAAAACAGTAATCGACGCAACGATGTGCGCCATTATCGCCTAGTTCATTC
+GTGACCTTCTCGACTTCGCCGGACTAAGTAGCAATCTCGCTTATATAACGAGCGTGTTTA
+TCGGCTACATCGGTACTGACTCGATTGGTTCGCTTATCAAACGCTTCGCTGCTAAAAAAG
+CCGGAGTAGAAGATGGTAGAAATCAATAATCAACGTAAGGCGTTCCTCGATATGCTGGCG
+TGGTCGGAGGGAACTGATAACGGACGTCAGAAAACCAGAAATCATGGTTATGACGTCATT
+GTAGGCGGAGAGCTATTTACTGATTACTCCGATCACCCTCGCAAACTTGTCACGCTAAAC
+CCAAAACTCAAATCAACAGGCGCCGGACGCTACCAGCTTCTTTCCCGTTGGTGGGATGCC
+TACCGCAAGCAGCTTGGCCTGAAAGACTTCTCTCCGAAAAGTCAGGACGCTGTGGCATTG
+CAGCAGATTAAGGAGCGTGGCGCTTTACCTATGATTGATCGTGGTGATATCCGTCAGGCA
+ATCGACCGTTGCAGCAATATCTGGGCTTCACTGCCGGGCGCTGGTTATGGTCAGTTCGAG
+CATAAGGCTGACAGCCTGATTGCAAAATTCAAAGAAGCGGGCGGAACGGTCAGAGAGATT
+GATGTATGAGCAGAGTCACCGCGATTATCTCCGCTCTGGTTATCTGCATCATCGTCTGCC
+TGTCATGGGCTGTTAATCATTACCGTGATAACGCCATTACCTACAAAGCCCAGCGCGACA
+AAAATGCCAGAGAACTGAAGCTGGCGAACGCGGCAATTACTGACATGCAGATGCGTCAGC
+GTGATGTTGCTGCGCTCGATGCAAAATACACGAAGGAGTTAGCTGATGCTAAAGCTGAAA
+ATGATGCTCTGCGTGATGATGTTGCCGCTGGTCGTCGTCGGTTGCACATCAAAGCAGTCT
+GTCAGTCAGTGCGTGAAGCCACCACCGCCTCCGGCGTGGATAATGCAGCCTCCCCCCGAC
+TGGCAGACACCGCTGAACGGGATTATTTCACCCTCAGAGAGAGGCTGATCACTATGCAAA
+AACAACTGGAAGGAACCCAGAAGTATATTAATGAGCAGTGCAGATAGAGTTGCCCATATC
+GATGGGCAACTCATGCAATTATTGTGAGCAATACACACGCGCTTCCAGCGGAGTATAAAT
+GCCTAAAGTAATAAAACCGAGCAATCCATTTACGAATGTTTGCTGGGTTTCTGTTTTAAC
+AACATTTTCTGCGCCGCCACAAATTTTGGCTGCATCGACAGTTTTCTTCTGCCCAATTCC
+AGAAACGAAGAAATGATGGGTGATGGTTTCCTTTGGTGCTACTGCTGCCGGTTTGTTTTG
+AACAGTAAACGTCTGTTGAGCACATCCTGTAATAAGCAGGGCCAGCGCAGTAGCGAGTAG
+CATTTTTTTCATGGTGTTATTCCCGATGCTTTTTGAAGTTCGCAGAATCGTATGTGTAGA
+AAATTAAACAAACCCTAAACAATGAGTTGAAATTTCATATTGTTAATATTTATTAATGTA
+TGTCAGGTGCGATGAATCGTCATTGTATTCCCGGATTAACTATGTCCACAGCCCTGACGG
+GGAACTTCTCTGCGGGAGTGTCCGGGAATAATTAAAACGATGCACACAGGGTTTAGCGCG
+TACACGTATTGCATTATGCCAACGCCCCGGTGCTGACACGGAAGAAACCGGACGTTATGA
+TTTAGCGTGGAAAGATTTGTGTAGTGTTCTGAATGCTCTCAGTAAATAGTAATGAATTAT
+CAAAGGTATAGTAATATCTTTTATGTTCATGGATATTTGTAACCCATCGGAAAACTCCTG
+CTTTAGCAAGATTTTCCCTGTATTGCTGAAATGTGATTTCTCTTGATTTCAACCTATCAT
+AGGACGTTTCTATAAGATGCGTGTTTCTTGAGAATTTAACATTTACAACCTTTTTAAGTC
+CTTTTATTAACACGGTGTTATCGTTTTCTAACACGATGTGAATATTATCTGTGGCTAGAT
+AGTAAATATAATGTGAGACGTTGTGACGTTTTAGTTCAGAATAAAACAATTCACAGTCTA
+AATCTTTTCGCACTTGATCGAATATTTCTTTAAAAATGGCAACCTGAGCCATTGGTAAAA
+CCTTCCATGTGATACGAGGGCGCGTAGTTTGCATTATCGTTTTTATCGTTTCAATCTGGT
+CTGACCTCCTTGTGTTTTGTTGATGATTTATGTCAAATATTAGGAATGTTTTCACTTAAT
+AGTATTGGTTGCGTAACAAAGTGCGGTCCTGCTGGCATTCTGGAGGGAAATACAACCGAC
+AGATGTATGTAAGGCCAACGTGCTCAAATCTTCATACAGAAAGATTTGAAGTAATATTTT
+AACCGCTAGATGAAGAGCAAGCGCATGGAGCGACAAAATGAATAAAGAACAATCTGCTGA
+TGATCCCTCCGTGGATCTGATTCGTGTAAAAAATATGCTTAATAGCACCATTTCTATGAG
+TTACCCTGATGTTGTAATTGCATGTATAGAACATAAGGTGTCTCTGGAAGCATTCAGAGC
+AATTGAGGCAGCGTTGGTGAAGCACGATAATAATATGAAGGATTATTCCCTGGTGGTTGA
+CTGATCACCATAACTGCTAATCATTCAAACTATTTAGTCTGTGACAGAGCCAACACGCAG
+TCTGTCACTGTCAGGAAAGTGGTAAAACTGCAACTCAATTACTGCAATGCCCTCGTAATT
+AAGTGAATTTACAATATCGTCCTGTTCGGAGGGAAGAACGCGGGATGTTCATTCTTCATC
+ACTTTTAATTGATGTATATGCTCTCTTTTCTGACGTTAGTCTCCGACGGCAGGCTTCAAT
+GACCCAGGCTGAGAAATTCCCGGACCCTTTTTGCTCAAGAGCGATGTTAATTTGTTCAAT
+CATTTGGTTAGGAAAGCGGATGTTGCGGGTTGTTGTTCTGCGGGTTCTGTTCTTCGTTGA
+CATGAGGTTGCCCCGTATTCAGTGTCGCTGATTTGTATTGTCTGAAGTTGTTTTTACGTT
+AAGTTGATGCAGATCAATTAATACGATACCTGCGTCATAATTGATTATTTGACGTGGTTT
+GATGGCCTCCACGCACGTTGTGATATGTAGATGATAATCATTATCACTTTACGGGTCCTT
+TCCGGTGATCCGACAGGTTACG
diff --git a/test/data/lambda/sequence/lambda.fasta.fai b/test/data/lambda/sequence/lambda.fasta.fai
new file mode 100644
index 0000000..7bc16a4
--- /dev/null
+++ b/test/data/lambda/sequence/lambda.fasta.fai
@@ -0,0 +1 @@
+lambda_NEB3011 48502 16 60 61
diff --git a/test/data/p4-c2-lambda-mod-decode.cmp.h5 b/test/data/p4-c2-lambda-mod-decode.cmp.h5
new file mode 100644
index 0000000..b38af22
Binary files /dev/null and b/test/data/p4-c2-lambda-mod-decode.cmp.h5 differ
diff --git a/test/detectionMethylFractionTest.py b/test/detectionMethylFractionTest.py
new file mode 100755
index 0000000..2bef408
--- /dev/null
+++ b/test/detectionMethylFractionTest.py
@@ -0,0 +1,52 @@
+import logging
+import os
+import platform
+import unittest
+from pbcore.io import CmpH5Reader
+from kineticsTools.ReferenceUtils import ReferenceWindow
+from kineticsTools.KineticWorker import KineticWorker
+from kineticsTools.ipdModel import IpdModel
+
+from test import TestSetup
+
+
+class TestDetectionMethylFraction(TestSetup):
+
+ # We inherit the setup method from test.py.
+ # If you need to customize your dataset, we should set up some different conventions
+ # def setUp(self):
+
+ def getOpts(self):
+ opts = self.basicOpts()
+ opts.identify = False
+ opts.methylFraction = True
+ return opts
+
+ def testSmallDecode(self):
+ """
+ Test modified fraction estimation in detection mode around a known modification in lambda
+ """
+
+ # First methylated A in lambda:
+ # strand motif onTarget seqid tpl
+ # 0 GCACNNNNNNGTT On 1 14983
+
+ start = 14900
+ end = 15100
+ referenceWindow = ReferenceWindow(1, "lambda_NEB3011", start, end)
+ bounds = (start, end)
+
+ self.kw._prepForReferenceWindow(referenceWindow)
+ kinetics = self.kw._summarizeReferenceRegion(bounds, True, False)
+
+ # Verify that we detect m6A mods at 14982 and 14991
+ m6AMods = [{'frac': x['frac'], 'fracLow': x['fracLow'], 'fracUp': x['fracUp'], 'tpl': x['tpl'], 'strand': x['strand']}
+ for x in kinetics if x.has_key('frac') and x['tpl'] in (14982, 14991)]
+ print m6AMods
+
+ for mod in m6AMods:
+ self.assertGreater(mod["frac"], 0.5)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/test/methyFractionTest.py b/test/methyFractionTest.py
new file mode 100755
index 0000000..7bdd812
--- /dev/null
+++ b/test/methyFractionTest.py
@@ -0,0 +1,51 @@
+import logging
+import os
+import platform
+import unittest
+from pbcore.io import CmpH5Reader
+from kineticsTools.ReferenceUtils import ReferenceWindow
+from kineticsTools.KineticWorker import KineticWorker
+from kineticsTools.ipdModel import IpdModel
+
+from test import TestSetup
+
+
+class TestMethylFraction(TestSetup):
+
+ # We inherit the setup method from test.py.
+ # If you need to customize your dataset, we should set up some different conventions
+ # def setUp(self):
+
+ def getOpts(self):
+ opts = self.basicOpts()
+ opts.methylFraction = True
+ return opts
+
+ def testSmallDecode(self):
+ """
+ Test a modification decode around a known modification in lambda
+ """
+
+ # First methylated A in lambda:
+ # strand motif onTarget seqid tpl
+ # 0 GCACNNNNNNGTT On 1 14983
+
+ start = 14900
+ end = 15100
+ referenceWindow = ReferenceWindow(1, "lambda_NEB3011", start, end)
+ bounds = (start, end)
+
+ self.kw._prepForReferenceWindow(referenceWindow)
+ kinetics = self.kw._summarizeReferenceRegion(bounds, True, True)
+ mods = self.kw._decodePositiveControl(kinetics, bounds)
+ print mods
+
+ # Verify that we detect m6A mods at 14982 and 14991
+ m6AMods = [x for x in mods if x['modification'] == 'm6A' and x['tpl'] in (14982, 14991)]
+
+ for mod in m6AMods:
+ self.assertGreater(mod["frac"], 0.5)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/test/speed.py b/test/speed.py
new file mode 100755
index 0000000..18652f5
--- /dev/null
+++ b/test/speed.py
@@ -0,0 +1,27 @@
+import logging
+import os
+import platform
+import unittest
+from pbcore.io import CmpH5Reader
+from kineticsTools.KineticWorker import KineticWorker
+from kineticsTools.ipdModel import IpdModel
+
+from test import TestSetup
+
+
+class TestSpeed(TestSetup):
+
+ def testSpeed(self):
+
+ contig = self.contigs[0].sequence
+ snippetFunc = self.ipdModel.snippetFunc(1, 3, 9)
+ ipdFunc = self.ipdModel.predictIpdFunc(1)
+
+ snips = [snippetFunc(x, 0) for x in xrange(1000)]
+
+ pFast = self.ipdModel.gbmModel.getPredictions(snips)
+ #pSlow = self.ipdModel.gbmModel.getPredictionsSlow(snips)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/test/test.py b/test/test.py
new file mode 100755
index 0000000..8d7708e
--- /dev/null
+++ b/test/test.py
@@ -0,0 +1,181 @@
+import logging
+import os
+import platform
+import unittest
+from pbcore.io import AlignmentSet
+from kineticsTools.KineticWorker import KineticWorker
+from kineticsTools.ipdModel import IpdModel
+from kineticsTools.ReferenceUtils import ReferenceUtils, ReferenceWindow
+
+
+class TestSetup(unittest.TestCase):
+
+ def getOpts(self):
+ """Derived tests can override this to customize behaviour"""
+ return self.basicOpts()
+
+ def basicOpts(self):
+ """Mock up some options for the kinetic worker"""
+ class opts:
+
+ def __init__(self):
+ self.mapQvThreshold = -1
+ self.cap_percentile = 99.0
+ self.minCoverage = 3
+ self.subread_norm = True
+ self.maxCoverage = 200
+ self.identify = True
+ self.methylFraction = False
+ self.pvalue = 0.01
+ self.modsToCall = ['H', 'J', 'K']
+ # Bug 23546: need to set values for these two new flags:
+ self.identifyMinCov = 5
+ self.methylMinCov = 10
+ self.useLDA = False
+ self.maxAlignments = 1500
+ self.randomSeed = None
+
+ return opts()
+
+ def setUp(self):
+
+ # Load the lambda genome from our sample data
+
+ dataDir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
+ resourcesDir = os.path.join(os.path.dirname(os.path.abspath(__file__)), '../kineticsTools/resources')
+ ref = os.path.join(dataDir, 'lambda', 'sequence', 'lambda.fasta')
+ cmpFile = os.path.join(dataDir, "p4-c2-lambda-mod-decode.cmp.h5")
+
+ self.cmpH5 = AlignmentSet(cmpFile, referenceFastaFname=ref)
+ self.contigs = ReferenceUtils.loadReferenceContigs(ref, self.cmpH5)
+ self.ipdModel = IpdModel(self.contigs, os.path.join(resourcesDir, "P4-C2.h5"))
+
+ # Create a functional KineticWorker object that can be poked at manually.
+ self.kw = KineticWorker(self.ipdModel)
+
+ # Put in our cmp.h5 - this is normally supplied by the Worker superclass
+ self.kw.caseCmpH5 = self.cmpH5
+ self.kw.controlCmpH5 = None
+
+ self.kw.options = self.getOpts()
+
+ def runTest(self):
+ pass
+
+
+class TestBasic(TestSetup):
+
+ def _testIpdModel(self):
+
+ contig = self.contigs[0].sequence
+
+ snippetFunc = self.ipdModel.snippetFunc(1, 10, 10)
+ snip = snippetFunc(0, 0)
+
+ print "Got snippet at pos 0: %s" % snip
+ print "First 10 bases of lambda: %s" % (contig[0:10])
+
+ lastPos = len(contig) - 1
+ snip = snippetFunc(lastPos, 0)
+
+ print "Got snippet at pos %d: %s" % (lastPos, snip)
+ print "Last 10 bases of lambda: %s" % contig[-10:]
+
+ def _testSpeed(self):
+
+ contig = self.contigs[0].sequence
+ snippetFunc = self.ipdModel.snippetFunc(1, 3, 8)
+ ipdFunc = self.ipdModel.predictIpdFunc(1)
+
+ snips = [snippetFunc(x, 0) for x in xrange(10000)]
+
+ pFast = self.ipdModel.gbmModel.getPredictions(snips)
+ pSlow = self.ipdModel.gbmModel.getPredictionsSlow(snips)
+
+ def testCompareNullToGbm(self):
+ """
+ Check the null model against a few hard-coded contexts
+ """
+
+ contig = self.contigs[0].sequence
+ snippetFunc = self.ipdModel.snippetFunc(1, 4, 10)
+ ipdFunc = self.ipdModel.predictIpdFunc(1)
+ ipdModelFunc = self.ipdModel.predictIpdFuncModel(1)
+
+ for (pos, tplStrand) in [(3, 0), (10, 0), (20, 0), (30, 0), (31, 0), (32, 0), (33, 0), (34, 0)]:
+ snip = snippetFunc(pos, tplStrand)
+
+ print "Pos: %d, TplStrand: %d" % (pos, tplStrand)
+ print "Got ctx: %s" % snip
+ #print "From lambda: %s" % (contig[(pos - 4):(pos + 11)])
+
+ print "Lut prediction: %f" % ipdFunc(pos, tplStrand)
+
+ gbmPred = self.ipdModel.gbmModel.getPredictionsSlow([snip])[0]
+ print "Gbm prediction: %f" % gbmPred
+
+ gbmPred = self.ipdModel.gbmModel.getPredictions([snip])[0]
+ print "Gbm prediction fast: %f" % gbmPred
+
+ gbmSnippetPred = ipdModelFunc(pos, tplStrand)
+ print "Gbm pred via predictIpdFuncModel: %f" % gbmSnippetPred
+
+ if snip[4] == 'A':
+ snip2 = snip[0:4] + 'H' + snip[5:]
+ snip3 = snip[0:9] + 'H' + snip[10:]
+ gbmPred = self.ipdModel.gbmModel.getPredictionsSlow([snip2, snip3])
+ print "Methylated prediction: %s -> %f" % (snip2, gbmPred[0])
+ print "Methylated prediction: %s -> %f" % (snip3, gbmPred[1])
+
+ gbmPred = self.ipdModel.gbmModel.getPredictions([snip2, snip3])
+ print "Methylated prediction fast: %s -> %f" % (snip2, gbmPred[0])
+ print "Methylated prediction fast: %s -> %f" % (snip3, gbmPred[1])
+
+ if snip[4] == 'C':
+ snip2 = snip[0:4] + 'J' + snip[5:]
+ snip3 = snip[0:9] + 'J' + snip[10:]
+ gbmPred = self.ipdModel.gbmModel.getPredictionsSlow([snip2, snip3])
+ print "Methylated prediction: %s -> %f" % (snip2, gbmPred[0])
+ print "Methylated prediction: %s -> %f" % (snip3, gbmPred[1])
+
+ gbmPred = self.ipdModel.gbmModel.getPredictions([snip2, snip3])
+ print "Methylated prediction fast: %s -> %f" % (snip2, gbmPred[0])
+ print "Methylated prediction fast: %s -> %f" % (snip3, gbmPred[1])
+
+ print ""
+
+ def testSmallDecode(self):
+ """
+ Test a modification decode around a known modification in lambda
+ """
+
+ # First methylated A in lambda:
+ # strand motif onTarget seqid tpl
+ # 0 GCACNNNNNNGTT On 1 14983
+
+ start = 14900
+ end = 15100
+ referenceWindow = ReferenceWindow(1, "lambda_NEB3011", start, end)
+ bounds = (start, end)
+
+ self.kw._prepForReferenceWindow(referenceWindow)
+ kinetics = self.kw._summarizeReferenceRegion(bounds, False, True)
+ mods = self.kw._decodePositiveControl(kinetics, bounds)
+ print mods
+
+ # Verify that we detect m6A mods at 14982 and 14991
+ m6AMods = [x for x in mods if x['modification'] == 'm6A' and x['tpl'] in (14982, 14991)]
+ self.assertEqual(len(m6AMods), 2)
+
+ def _testReferenceBoundary(self):
+ start = 0
+ end = 400
+ referenceWindow = (1, start, end)
+ bounds = (start, end)
+
+ res = self.kw.onChunk(referenceWindow)
+ # print res
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/test/test_ReferenceUtils.py b/test/test_ReferenceUtils.py
new file mode 100644
index 0000000..1c204b4
--- /dev/null
+++ b/test/test_ReferenceUtils.py
@@ -0,0 +1,73 @@
+
+import logging
+import unittest
+import os.path
+
+from kineticsTools.ReferenceUtils import ReferenceUtils
+from pbcore.io import AlignmentSet
+
+big_data_dir = "/mnt/secondary-siv/testdata/kineticsTools"
+ref_dir = "/mnt/secondary-siv/references"
+
+logging.basicConfig()
+log = logging.getLogger()
+
+@unittest.skipUnless(os.path.isdir(big_data_dir), "Shared data folder missing")
+class ReferenceUtilsTest (unittest.TestCase):
+ def setUp (self):
+ pass
+
+ def test_cmph5 (self):
+ base_dir = os.path.dirname(os.path.abspath(__file__))
+ dataDir = os.path.join(base_dir,'data')
+ resourcesDir = os.path.join(base_dir, '../kineticsTools/resources')
+ refFile = os.path.join(dataDir, 'lambda', 'sequence', 'lambda.fasta')
+ cmpFile = os.path.join(dataDir, "p4-c2-lambda-mod-decode.cmp.h5")
+ ds = AlignmentSet(cmpFile, referenceFastaFname=refFile)
+ contigs = ReferenceUtils.loadReferenceContigs(refFile, ds)
+ self.assertEquals(len(contigs), 1)
+ self.assertEquals(contigs[0].cmph5ID, 1)
+ chemistry = ReferenceUtils.loadAlignmentChemistry(ds)
+ self.assertEquals(chemistry, "P4-C2")
+
+ def test_bam (self):
+ bamFile = os.path.join(big_data_dir, "Hpyl_1_5000.bam")
+ refFile = os.path.join(ref_dir, "Helicobacter_pylori_J99", "sequence",
+ "Helicobacter_pylori_J99.fasta")
+ ds = AlignmentSet(bamFile, referenceFastaFname=refFile)
+ contigs = ReferenceUtils.loadReferenceContigs(refFile, ds)
+ self.assertEquals(len(contigs), 1)
+ self.assertEquals(contigs[0].cmph5ID, 0)
+ chemistry = ReferenceUtils.loadAlignmentChemistry(ds)
+ self.assertEquals(chemistry, "P6-C4")
+
+ def test_dataset (self):
+ pass # TODO
+
+ def test_parseReferenceWindow (self):
+ window = "gi|12057207|gb|AE001439.1|:1-5000"
+ bamFile = os.path.join(big_data_dir, "Hpyl_1_5000.bam")
+ refFile = os.path.join(ref_dir, "Helicobacter_pylori_J99", "sequence",
+ "Helicobacter_pylori_J99.fasta")
+ alnFile = AlignmentSet(bamFile, referenceFastaFname=refFile)
+ win = ReferenceUtils.parseReferenceWindow(window,
+ alnFile.referenceInfo)
+ self.assertEquals([win.refId, win.start, win.end], [0, 1, 5000])
+
+ def test_createReferenceWindows (self):
+ bamFile = os.path.join(big_data_dir, "Hpyl_1_5000.bam")
+ ds = AlignmentSet(bamFile, referenceFastaFname=None)
+ refInfoTable = ds.referenceInfoTable
+ windows = ReferenceUtils.createReferenceWindows(refInfoTable)
+ self.assertEqual(len(windows), 1)
+ w = windows[0]
+ self.assertEqual(w.refId, 0)
+ self.assertEqual(w.refName, 'gi|12057207|gb|AE001439.1|')
+ self.assertEqual(w.start, 0)
+ self.assertEqual(w.end, 1643831)
+
+ def test_enumerateChunks (self):
+ pass # TODO
+
+if __name__ == "__main__":
+ unittest.main()
diff --git a/test/test_inputs.py b/test/test_inputs.py
new file mode 100644
index 0000000..6c2ca5a
--- /dev/null
+++ b/test/test_inputs.py
@@ -0,0 +1,147 @@
+
+"""
+Test for BAM file and AlignmentSet support.
+"""
+
+import logging
+import os
+import platform
+import unittest
+from pbcore.io import AlignmentSet
+from kineticsTools.KineticWorker import KineticWorker
+from kineticsTools.ipdModel import IpdModel
+from kineticsTools.ReferenceUtils import ReferenceUtils, ReferenceWindow
+
+logging.basicConfig()
+log = logging.getLogger()
+
+# FIXME
+data_dir = "/mnt/secondary-siv/testdata/kineticsTools"
+
+class _TestBase(object):
+ """
+ Common test functionality. All input type tests should inherit from this,
+ and yield identical results.
+ """
+
+ def getOpts(self):
+ """Derived tests can override this to customize behaviour"""
+ return self.basicOpts()
+
+ def basicOpts(self):
+ """Mock up some options for the kinetic worker"""
+ class opts:
+ def __init__(self):
+ self.mapQvThreshold = -1
+ self.cap_percentile = 99.0
+ self.minCoverage = 3
+ self.subread_norm = True
+ self.maxCoverage = 200
+ self.identify = True
+ self.methylFraction = False
+ self.pvalue = 0.01
+ self.modsToCall = ['H', 'J', 'K']
+ # Bug 23546: need to set values for these two new flags:
+ self.identifyMinCov = 5
+ self.methylMinCov = 10
+ self.useLDA = False
+ self.maxAlignments = 1500
+ self.randomSeed = None
+ return opts()
+
+ def getAlignments (self):
+ raise NotImplementedError()
+
+ def getReference (self):
+ refDir = "/mnt/secondary-siv/references"
+ return os.path.join(refDir, "Helicobacter_pylori_J99", "sequence",
+ "Helicobacter_pylori_J99.fasta")
+
+ def setUp(self):
+ self.cmpH5 = None
+ resourcesDir = os.path.join(os.path.dirname(os.path.abspath(__file__)), '../kineticsTools/resources')
+ ref = self.getReference()
+ alnFile = self.getAlignments()
+ assert os.path.exists(alnFile) and os.path.exists(ref)
+
+ self.ds = AlignmentSet(alnFile, referenceFastaFname=ref)
+ self.contigs = ReferenceUtils.loadReferenceContigs(ref, self.ds)
+ self.ipdModel = IpdModel(self.contigs, os.path.join(resourcesDir, "P6-C4.h5"))
+ # Create a functional KineticWorker object that can be poked at
+ self.kw = KineticWorker(self.ipdModel)
+ # Put in our cmp.h5 - this is normally supplied by the Worker
+ self.kw.caseCmpH5 = self.ds
+ self.kw.controlCmpH5 = None
+
+ self.kw.options = self.getOpts()
+
+ def test_private_api (self):
+ start = 50
+ end = 100
+ REF_GROUP_ID = "gi|12057207|gb|AE001439.1|"
+ referenceWindow = ReferenceWindow(0, REF_GROUP_ID, start, end)
+ bounds = (start, end)
+ rir = list(self.kw.caseCmpH5.readsInRange(referenceWindow.refName,
+ referenceWindow.start, referenceWindow.end))
+ self.assertEqual(len(rir), 301)
+ chunks = self.kw._fetchChunks(REF_GROUP_ID, (start, end),
+ self.kw.caseCmpH5)
+ factor = 1.0 / self.ds.readGroupTable[0].FrameRate
+ rawIpds = self.kw._loadRawIpds(rir, start, end, factor)
+ logging.critical(len(rawIpds))
+ # XXX note that this is very dependent on the exact order of reads
+ # found by readsInRange(), which may be altered by changes to the
+ # implementation of the dataset API. It should, however, remain
+ # consistent across equivalent input types.
+ # XXX 2015-08-28 disabling this for now because it will change if the
+ # dataset contains multiple .bam files
+ #self.assertEqual("%.4f" % rawIpds[0][2], "0.2665")
+ log.info(rawIpds)
+ chunks = self.kw._chunkRawIpds(rawIpds)
+ #log.critical(chunks)
+
+ def testSmallDecode (self):
+ """Test for known modifications near the start of H. pylori genome"""
+ # XXX should have mods on 60- (m4C), 89+ (m6A), 91- (m6A)
+ start = 50
+ end = 100
+ REF_GROUP_ID = "gi|12057207|gb|AE001439.1|"
+ referenceWindow = ReferenceWindow(0, REF_GROUP_ID, start, end)
+ bounds = (start, end)
+
+ self.kw._prepForReferenceWindow(referenceWindow)
+ kinetics = self.kw._summarizeReferenceRegion(bounds, False, True)
+ mods = self.kw._decodePositiveControl(kinetics, bounds)
+ log.info(mods)
+
+ # Verify that we detect m6A mods at template positions 88 and 90
+ m6AMods = [x for x in mods if x['modification'] == 'm6A' and x['tpl'] in (88, 90) ]
+ self.assertEqual(len(m6AMods), 2)
+ m4CMods = [x for x in mods if x['modification'] == 'm4C' and x['tpl'] in (59,) ]
+ self.assertEqual(len(m4CMods), 1)
+ for x in mods:
+ if x['strand'] == 0:
+ self.assertEqual(x['tpl'], 88)
+ else:
+ self.assertTrue(x['tpl'] in [59,90])
+
+@unittest.skipUnless(os.path.isdir(data_dir), "Missing test data directory")
+class TestBam(_TestBase, unittest.TestCase):
+ def getAlignments (self):
+ return os.path.join(data_dir, "Hpyl_1_5000.bam")
+
+
+@unittest.skipUnless(os.path.isdir(data_dir), "Missing test data directory")
+class TestDataset (TestBam, unittest.TestCase):
+ def getAlignments (self):
+ return os.path.join(data_dir, "Hpyl_1_5000.xml")
+
+
+@unittest.skipUnless(os.path.isdir(data_dir), "Missing test data directory")
+class TestSplitDataset(_TestBase, unittest.TestCase):
+ def getAlignments (self):
+ return os.path.join(data_dir, "Hpyl_1_5000_split.xml")
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/test/test_tool_contract.py b/test/test_tool_contract.py
new file mode 100755
index 0000000..f7dc385
--- /dev/null
+++ b/test/test_tool_contract.py
@@ -0,0 +1,110 @@
+
+"""
+Tests for end-to-end tool contract support in kineticsTools.ipdSummary,
+including consistency checks on output files.
+"""
+
+import unittest
+import logging
+import os.path
+import csv
+
+import pbcommand.testkit
+
+os.environ["PACBIO_TEST_ENV"] = "1" # turns off --verbose
+
+DATA_DIR = "/mnt/secondary-siv/testdata/kineticsTools"
+REF_DIR = "/mnt/secondary-siv/references/Helicobacter_pylori_J99"
+
+
+class Constants(object):
+ N_LINES_GFF = 338
+ N_LINES_CSV = 13357
+ INITIAL_LINES_CSV = """\
+refName,tpl,strand,base,score,tMean,tErr,modelPrediction,ipdRatio,coverage
+"gi|12057207|gb|AE001439.1|",1,0,A,10,2.387,0.464,1.710,1.396,29
+"gi|12057207|gb|AE001439.1|",1,1,T,1,0.492,0.075,0.602,0.817,57"""
+ INITIAL_LINES_GFF = """\
+gi|12057207|gb|AE001439.1|\tkinModCall\tm6A\t35\t35\t187\t-\t.\tcoverage=118;context=TTTAAGGGCGTTTTATGCCTAAATTTAAAAAATGATGCTGT;IPDRatio=5.68;identificationQv=196
+gi|12057207|gb|AE001439.1|\tkinModCall\tm4C\t60\t60\t49\t-\t.\tcoverage=112;context=AAAAAGCTCGCTCAAAAACCCTTGATTTAAGGGCGTTTTAT;IPDRatio=2.58;identificationQv=33
+gi|12057207|gb|AE001439.1|\tkinModCall\tm6A\t89\t89\t223\t+\t.\tcoverage=139;context=AGCGAGCTTTTTGCTCAAAGAATCCAAGATAGCGTTTAAAA;IPDRatio=5.69;identificationQv=187"""
+
+@unittest.skipUnless(os.path.isdir(DATA_DIR) and os.path.isdir(REF_DIR),
+ "%s or %s not available" % (DATA_DIR, REF_DIR))
+class TestIpdSummary(pbcommand.testkit.PbTestApp):
+ DRIVER_BASE = "python -m kineticsTools.ipdSummary "
+ REQUIRES_PBCORE = True
+ MAX_NPROC = 8
+ RESOLVED_NPROC = 8
+ INPUT_FILES = [
+ os.path.join(DATA_DIR, "Hpyl_1_5000.xml"),
+ os.path.join(REF_DIR, "sequence", "Helicobacter_pylori_J99.fasta"),
+ ]
+ TASK_OPTIONS = {
+ "kinetics_tools.task_options.identify": "m6A,m4C",
+ "kinetics_tools.task_options.max_length": 3000000000,
+ "kinetics_tools.task_options.compute_methyl_fraction": False,
+ "kinetics_tools.task_options.pvalue": 0.001
+ }
+
+ def run_after(self, rtc, output_dir):
+ gff_file = os.path.join(output_dir, rtc.task.output_files[0])
+ csv_file = os.path.join(output_dir, rtc.task.output_files[1])
+ def lc(fn): return len(open(fn).readlines())
+ self.assertEqual(lc(gff_file), Constants.N_LINES_GFF)
+ self.assertEqual(lc(csv_file), Constants.N_LINES_CSV)
+ def head(fn,n): return "\n".join( open(fn).read().splitlines()[0:n] )
+ self.assertEqual(head(csv_file, 3), Constants.INITIAL_LINES_CSV)
+ def head2(fn,n):
+ out = []
+ i = 0
+ for line in open(fn).read().splitlines():
+ if line[0] != '#':
+ out.append(line)
+ i += 1
+ if i == n:
+ break
+ return "\n".join(out)
+ self.assertEqual(head2(gff_file, 3), Constants.INITIAL_LINES_GFF)
+
+
+@unittest.skipUnless(os.path.isdir(DATA_DIR) and os.path.isdir(REF_DIR),
+ "%s or %s not available" % (DATA_DIR, REF_DIR))
+class TestIpdSummaryChunk(TestIpdSummary):
+ """
+ This test is identical to the above except for using an AlignmentSet with
+ filters applied as input. We want the output to actually be within the
+ range specified by the filters, not a seemingly arbitrary larger range
+ around it.
+ """
+ INPUT_FILES = [
+ os.path.join(DATA_DIR, "Hpyl_1_5000_chunk.xml"),
+ os.path.join(REF_DIR, "sequence", "Helicobacter_pylori_J99.fasta"),
+ ]
+
+ def run_after(self, rtc, output_dir):
+ gff_file = os.path.join(output_dir, rtc.task.output_files[0])
+ csv_file = os.path.join(output_dir, rtc.task.output_files[1])
+ logging.critical(gff_file)
+ logging.critical(csv_file)
+ with open(csv_file) as f:
+ records = [ r for r in csv.DictReader(f) ]
+ logging.critical("start=%s end=%s" % (records[0]['tpl'],
+ records[-1]["tpl"]))
+ self.assertEqual(records[0]["tpl"], "1001")
+ self.assertEqual(records[-1]["tpl"], "1050")
+
+
+@unittest.skipUnless(os.path.isdir(DATA_DIR) and os.path.isdir(REF_DIR),
+ "%s or %s not available" % (DATA_DIR, REF_DIR))
+class TestSummarizeModifications(pbcommand.testkit.PbTestApp):
+ DRIVER_BASE = "python -m kineticsTools.summarizeModifications"
+ REQUIRES_PBCORE = True
+ INPUT_FILES = [
+ os.path.join(DATA_DIR, "Hpyl_1_5000_modifications.gff"),
+ os.path.join(DATA_DIR, "Hpyl_1_5000_alignment_summary.gff"),
+ ]
+
+
+if __name__ == "__main__":
+ unittest.main()
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/kineticstools.git
More information about the debian-med-commit
mailing list