[med-svn] [pbalign] 01/08: Imported Upstream version 0.3.0
Afif Elghraoui
afif at moszumanska.debian.org
Sat Dec 24 02:17:06 UTC 2016
This is an automated email from the git hooks/post-receive script.
afif pushed a commit to branch master
in repository pbalign.
commit e5305172e075f437aac290706c104c5fd9ecce7a
Author: Afif Elghraoui <afif at debian.org>
Date: Fri Dec 23 17:14:01 2016 -0800
Imported Upstream version 0.3.0
---
LICENSES.txt | 34 +-
Makefile | 11 +-
README.md | 18 +-
doc/howto.rst | 7 +-
pbalign/__init__.py | 4 +-
pbalign/alignservice/align.py | 6 +-
pbalign/alignservice/blasr.py | 105 ++++--
pbalign/bampostservice.py | 137 +++++++
pbalign/ccs.py | 32 ++
pbalign/filterservice.py | 33 +-
pbalign/forquiverservice/__init__.py | 34 --
pbalign/forquiverservice/forquiver.py | 99 -----
pbalign/forquiverservice/loadchemistry.py | 77 ----
pbalign/forquiverservice/loadpulses.py | 103 ------
pbalign/forquiverservice/repack.py | 87 -----
pbalign/forquiverservice/sort.py | 89 -----
pbalign/options.py | 416 ++++++++++++++--------
pbalign/pbalignfiles.py | 41 ++-
pbalign/pbalignrunner.py | 247 ++++++-------
pbalign/tasks/__init__.py | 0
pbalign/tasks/consolidate_alignments.py | 96 +++++
pbalign/tools/createChemistryHeader.py | 4 +-
pbalign/utils/fileutil.py | 35 +-
pbalign/utils/progutil.py | 2 +-
setup.py | 6 +-
tests/cram/bam.t | 14 +
tests/cram/ccs.t | 15 +
tests/cram/dataset_tiny.t | 38 ++
tests/cram/filtercriteria_hitpolicy.t | 78 ++++
tests/cram/path_with_space.t | 9 +
tests/cram/setup.sh | 21 ++
tests/cram/test_pbalign.t | 276 --------------
tests/cram/xml.t | 38 ++
tests/cram_h5/ccs.t | 51 +++
tests/cram_h5/filter_adapter_only.t | 11 +
tests/cram_h5/inputs_outputs.t | 130 +++++++
tests/cram_h5/pulsefile.t | 23 ++
tests/cram_h5/setup.sh | 22 ++
tests/{cram => cram_h5}/test_mask_aligned_reads.t | 2 +-
tests/data/1.config | 3 +-
tests/data/ecoli_lp.fofn | 4 +-
tests/data/lambda_bax.fofn | 2 +-
tests/data/reference_lambda.xml | 14 +
tests/data/subreads_dataset1.xml | 21 ++
tests/data/subreads_dataset2.xml | 10 +
tests/data/test_bas.fofn | 2 +-
tests/data/test_ccs.fofn | 2 +-
tests/data/test_filterAdapterOnly.fofn | 2 +-
tests/data/test_leftmost_query.fasta | 2 +-
tests/data/test_pulseFile.fofn | 8 +-
tests/out/readme | 3 +-
tests/unit/nose.cfg | 17 +
tests/unit/test_fileutil.py | 63 ++--
tests/unit/test_options.py | 109 +++---
tests/unit/test_pbalign.py | 37 +-
tests/unit/test_pbalignfiles.py | 31 +-
tests/unit/test_referenceInfo.py | 3 +-
tests/unit/test_setpath.py | 25 ++
tests/unit/test_tool_contract.py | 91 +++++
tests/unit/test_unrolled.py | 87 +++++
tests/unit/test_xmlout.py | 38 ++
tests/unit_h5/nose.cfg | 17 +
tests/{unit => unit_h5}/test_filterservice.py | 11 +-
tests/{unit => unit_h5}/test_forquiverservice.py | 11 +-
tests/{unit => unit_h5}/test_gmap.py | 8 +-
tests/{unit => unit_h5}/test_loadpulsesservice.py | 8 +-
tests/{unit => unit_h5}/test_pbalign.py | 11 +-
tests/{unit => unit_h5}/test_repackservice.py | 7 +-
tests/{unit => unit_h5}/test_rgnh5io.py | 10 +-
tests/unit_h5/test_setpath.py | 27 ++
tests/{unit => unit_h5}/test_sortservice.py | 5 +-
71 files changed, 1837 insertions(+), 1303 deletions(-)
diff --git a/LICENSES.txt b/LICENSES.txt
index 8aee62e..4ec79b5 100644
--- a/LICENSES.txt
+++ b/LICENSES.txt
@@ -1,33 +1,27 @@
-Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
+Copyright (c) 2016, Pacific Biosciences of California, Inc.
All rights reserved.
Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-
+modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
-
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
-
* Neither the name of Pacific Biosciences nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
-NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
-GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
-BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
-WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
-MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS CONTRIBUTORS
-BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
-BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
-WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
-OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
-IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
+NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
+CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
+NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
+ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/Makefile b/Makefile
index 83c2e54..757d0a3 100644
--- a/Makefile
+++ b/Makefile
@@ -17,13 +17,20 @@ develop:
test:
# Unit tests
- find tests/unit -name "*.py" | xargs nosetests
+ #find tests/unit -name "*.py" | xargs nosetests
+ nosetests --verbose tests/unit/*.py
# End-to-end tests
@echo pbalign cram tests require blasr installed.
find tests/cram -name "*.t" | xargs cram
+h5test:
+ # Tests for pre-3.0 smrtanalysis when default file formats are *.h5
+ @echo pbalign h5 tests require blasr, samtoh5, loadPulses, samFilter and etc installed.
+ nosetests --verbose tests/unit_h5/*.py
+ find tests/cram_h5 -name "*.t" | xargs cram -v
+
doc:
- sphinx-apidoc -T -f -o doc src/ && cd doc && make html
+ sphinx-apidoc -T -f -o doc pbalign/ && cd doc && make html
docs: doc
diff --git a/README.md b/README.md
index 3c8d149..a4464f8 100644
--- a/README.md
+++ b/README.md
@@ -1,17 +1,5 @@
-###pbalign maps PacBio reads to reference sequences.###
+pbalign maps PacBio reads to reference sequences.
+Want to know how to install and run pbalign?
-**Q: Want to know how to install and run pbalign?**
-
-A: Please refer to [pbalign readme document](https://github.com/PacificBiosciences/pbalign/blob/master/doc/howto.rst)
-
-
-**Q: 'pbalign.py' does not work?**
-
-A: The main script has been changed from 'pbalign.py' to 'pbalign'. Please use 'pbalign' instead.
-
-
-**Q: Can pbalign handle large datasets with many movies?**
-
-A: pbalign is not designed to handle large datasets, you should follow a [divide and conquer way](https://github.com/PacificBiosciences/pbalign/wiki/Tutorial:-How-to-divide-and-conquer-large-datasets-using-pbalign) to align many movies to a reference.
-
+Please refer to https://github.com/PacificBiosciences/pbalign/blob/master/doc/howto.rst
diff --git a/doc/howto.rst b/doc/howto.rst
index 1bbcf3e..0cf6656 100644
--- a/doc/howto.rst
+++ b/doc/howto.rst
@@ -37,8 +37,7 @@ required,
The following software is optionally required if ``--forQuiver`` option
will be used to convert the output Compare HDF5 file to be compatible
with Quiver.
-- ``pbh5tools.cmph5tools``, a PacBio Bioinformatics tools that manipulates
-Compare HDF5 files.
+- ``pbh5tools.cmph5tools``, a PacBio Bioinformatics tool that manipulates Compare HDF5 files.
- ``h5repack``, a HDF5 tool to compress and repack HDF5 files.
The default aligner that pbalign uses is ``blasr``. If you want to use
@@ -140,9 +139,9 @@ Before installing pbcore, you may need to install numpy and h5py from ::
, or if you have root permission on Ubuntu, do ::
- $ git install numpy
+ $ pip install numpy
$ sudo apt-get install libhdf5-serial-dev
- $ git install h5py
+ $ pip install h5py
To install pbcore, execute ::
diff --git a/pbalign/__init__.py b/pbalign/__init__.py
index 76a6c57..2842f3b 100755
--- a/pbalign/__init__.py
+++ b/pbalign/__init__.py
@@ -34,7 +34,7 @@
from __future__ import absolute_import
-_changelist = "$Change: 141024 $"
+_changelist = "$Change: 173392 $"
def _get_changelist(perforce_str):
@@ -60,7 +60,7 @@ def get_dir():
"""Return lib directory."""
return op.dirname(op.realpath(__file__))
-VERSION = (0, 2, 0, get_changelist())
+VERSION = (0, 3, 0)
def get_version():
diff --git a/pbalign/alignservice/align.py b/pbalign/alignservice/align.py
index eb39eb2..f8333ae 100755
--- a/pbalign/alignservice/align.py
+++ b/pbalign/alignservice/align.py
@@ -38,6 +38,7 @@ from copy import copy
from pbalign.options import importDefaultOptions
from pbalign.utils.tempfileutil import TempFileManager
from pbalign.service import Service
+from pbalign.utils.fileutil import getFileFormat, FILE_FORMATS
class AlignService (Service):
@@ -184,8 +185,11 @@ class AlignService (Service):
self._tempFileManager,
self._fileNames.isWithinRepository)
+ outFormat = getFileFormat(self._fileNames.outputFileName)
+ suffix = ".bam" if (outFormat == FILE_FORMATS.BAM or
+ outFormat == FILE_FORMATS.XML) else ".sam"
self._fileNames.alignerSamOut = self._tempFileManager.\
- RegisterNewTmpFile(suffix=".sam")
+ RegisterNewTmpFile(suffix=suffix)
# Generate and execute cmd.
try:
diff --git a/pbalign/alignservice/blasr.py b/pbalign/alignservice/blasr.py
index 3ec94af..1372e37 100755
--- a/pbalign/alignservice/blasr.py
+++ b/pbalign/alignservice/blasr.py
@@ -35,7 +35,7 @@
from __future__ import absolute_import
from pbalign.alignservice.align import AlignService
-from pbalign.utils.fileutil import FILE_FORMATS, real_upath
+from pbalign.utils.fileutil import FILE_FORMATS, real_upath, getFileFormat
import logging
@@ -96,54 +96,60 @@ class BlasrService(AlignService):
ignoredBinaryOptions = ['-m', '-out', '-V']
ignoredUnitaryOptions = ['-h', '--help', '--version',
- '-v', '-vv', '-sam']
+ '-v', '-vv', '--sam', '--bam']
items = self.__parseAlgorithmOptionItems(options.algorithmOptions)
i = 0
try:
while i < len(items):
infoMsg, errMsg, item = "", "", items[i]
- if item == "-sa":
+ if item == "--sa":
val = real_upath(items[i+1])
if fileNames.sawriterFileName != val:
infoMsg = "Over write sa file with {0}".format(val)
fileNames.sawriterFileName = val
- elif item == "-regionTable":
+ elif item == "--regionTable":
val = real_upath(items[i+1])
if fileNames.regionTable != val:
infoMsg = "Over write regionTable with {0}.\n"\
.format(val)
fileNames.regionTable = val
- elif item == "-bestn":
+ elif item == "--bestn":
val = int(items[i+1])
if options.maxHits is not None and \
int(options.maxHits) != val:
- errMsg = "blasr -bestn specified within " + \
+ errMsg = "blasr --bestn specified within " + \
"--algorithmOptions is equivalent to " + \
"--maxHits. Conflicting values of " + \
- "--algorithmOptions '-bestn' and " +\
+ "--algorithmOptions '--bestn' and " +\
"--maxHits have been found."
else:
options.maxHits = val
- elif item == "-minMatch":
+ elif item == "--minMatch":
val = int(items[i+1])
if options.minAnchorSize is not None and \
int(options.minAnchorSize) != val:
- errMsg = "blasr -minMatch specified within " + \
+ errMsg = "blasr --minMatch specified within " + \
"--algorithmOptions is equivalent to " + \
"--minAnchorSize. Conflicting values " + \
- "of --algorithmOptions '-minMatch' and " + \
+ "of --algorithmOptions '--minMatch' and " + \
"--minAnchorSize have been found."
else:
options.minAnchorSize = val
- elif item == "-nproc":
+ elif item == "--maxMatch":
+ val = int(items[i+1])
+ if options.maxMatch is not None and \
+ int(options.maxMatch) != val:
+ infoMsg = "Override maxMatch with {n}.".format(n=val)
+ options.maxMatch = val
+ elif item == "--nproc":
val = int(items[i+1])
# The number of threads is not critical.
if options.nproc is None or \
int(options.nproc) != val:
infoMsg = "Over write nproc with {n}.".format(n=val)
options.nproc = val
- elif item == "-noSplitSubreads":
+ elif item == "--noSplitSubreads":
if not options.noSplitSubreads:
infoMsg = "Over write noSplitSubreads with True."
logging.info(self.name +
@@ -151,22 +157,25 @@ class BlasrService(AlignService):
options.noSplitSubreads = True
del items[i]
continue
- elif item == "-concordant":
+ elif item == "--concordant":
if not options.concordant:
infoMsg = "Over writer concordant with True."
logging.info(self.name +
": Resolve algorithmOptions. " + infoMsg)
options.concordant = True
del items[i]
- elif "-useccs" in item: # -useccs, -useccsall, -useccsdenovo
- val = item.lstrip('-')
+ elif "--useccs" in item: # --useccs, --useccsall, --useccsdenovo
+ val = item.lstrip('--')
if options.useccs != val and options.useccs is not None:
errMsg = "Found conflicting options in " + \
"--algorithmOptions '{v}' \nand --useccs={u}"\
.format(v=item, u=options.useccs)
else:
options.useccs = val
- elif item == "-seed" or item == "-randomSeed":
+ elif item == "--unaligned":
+ val = str(items[i+1])
+ options.unaligned = val
+ elif item == "--seed" or item == "--randomSeed":
val = int(items[i+1])
if options.seed is None or int(options.seed) != val:
infoMsg = "Overwrite random seed with {0}.".format(val)
@@ -195,6 +204,15 @@ class BlasrService(AlignService):
logging.error(errMsg + str(e))
raise ValueError(errMsg + str(e))
+ # Existing suffix array always uses match size 8.
+ # When BLASR search option -minMatch is less than 8, suffix array needs
+ # to be created on the fly.
+ if (options.minAnchorSize is not None and options.minAnchorSize != "" and
+ int(options.minAnchorSize) < 8):
+ logging.warning("Suffix array must be recreated on the fly when " +
+ "minMatch < 8, which may take a long time.")
+ fileNames.sawriterFileName = None
+
# Update algorithmOptions when resolve is done
options.algorithmOptions = " ".join(items)
return options
@@ -211,57 +229,84 @@ class BlasrService(AlignService):
Output:
a command-line string which can be used in bash.
"""
- cmdStr = "blasr {queryFile} {targetFile} -sam -out {outFile} ".format(
+ cmdStr = "blasr {queryFile} {targetFile} --out {outFile} ".format(
queryFile=fileNames.queryFileName,
targetFile=fileNames.targetFileName,
outFile=fileNames.alignerSamOut)
+ if getFileFormat(fileNames.alignerSamOut) == FILE_FORMATS.BAM:
+ cmdStr += " --bam "
+ else:
+ cmdStr += " --sam "
+
if ((fileNames.sawriterFileName is not None) and
(fileNames.sawriterFileName != "")):
- cmdStr += " -sa {sawriter} ".format(
+ cmdStr += " --sa {sawriter} ".format(
sawriter=fileNames.sawriterFileName)
if ((fileNames.regionTable != "") and
(fileNames.regionTable is not None)):
- cmdStr += " -regionTable {regionTable} ".format(
+ cmdStr += " --regionTable {regionTable} ".format(
regionTable=fileNames.regionTable)
if options.maxHits is not None and options.maxHits != "":
- cmdStr += " -bestn {n}".format(n=options.maxHits)
+ cmdStr += " --bestn {n}".format(n=options.maxHits)
if (options.minAnchorSize is not None and
options.minAnchorSize != ""):
- cmdStr += " -minMatch {0} ".format(options.minAnchorSize)
+ cmdStr += " --minMatch {0} ".format(options.minAnchorSize)
+
+ if (options.maxMatch is not None and options.maxMatch != ""):
+ cmdStr += " --maxMatch {0} ".format(options.maxMatch)
if options.nproc is not None and options.nproc != "":
- cmdStr += " -nproc {0} ".format(options.nproc)
+ cmdStr += " --nproc {0} ".format(options.nproc)
+ # Specify filter criteria and hit policy.
if options.minLength is not None:
- cmdStr += " -minSubreadLength {n} -minReadLength {n} ".\
+ cmdStr += " --minSubreadLength {n} --minAlnLength {n} ".\
format(n=options.minLength)
+ if options.maxDivergence is not None:
+ maxDivergence = int(options.maxDivergence if options.maxDivergence
+ > 1.0 else (options.maxDivergence * 100))
+ cmdStr += " --minPctSimilarity {0}".format(100 - maxDivergence)
+
+ if options.minAccuracy is not None:
+ minAccuracy = int(options.minAccuracy if options.minAccuracy > 1.0
+ else (options.minAccuracy * 100))
+ cmdStr += " --minPctAccuracy {0}".format(minAccuracy)
+
+ if options.scoreCutoff is not None:
+ cmdStr += " --maxScore {0}".format(options.scoreCutoff)
+
+ cmdStr += " --hitPolicy {0} ".format(options.hitPolicy)
+
if options.noSplitSubreads:
- cmdStr += " -noSplitSubreads "
+ cmdStr += " --noSplitSubreads "
if options.concordant:
- cmdStr += " -concordant "
+ cmdStr += " --concordant "
if options.seed is not None and options.seed != 0:
- cmdStr += " -randomSeed {0} ".format(options.seed)
+ cmdStr += " --randomSeed {0} ".format(options.seed)
- if options.hitPolicy == "randombest":
- cmdStr += " -placeRepeatsRandomly "
+ #if options.hitPolicy == "randombest":
+ # cmdStr += " --placeRepeatsRandomly "
if options.useccs is not None and options.useccs != "":
- cmdStr += " -{0} ".format(options.useccs)
+ cmdStr += " --{0} ".format(options.useccs)
# When input is a FASTA file, blasr -clipping = soft
if fileNames.inputFileFormat == FILE_FORMATS.FASTA:
- cmdStr += " -clipping soft "
+ cmdStr += " --clipping soft "
if options.algorithmOptions is not None:
cmdStr += " {0} ".format(options.algorithmOptions)
+ if options.unaligned is not None:
+ cmdStr += " --unaligned {f} --noPrintUnalignedSeqs".format(f=options.unaligned)
+
return cmdStr
def _preProcess(self, inputFileName, referenceFile=None,
diff --git a/pbalign/bampostservice.py b/pbalign/bampostservice.py
new file mode 100755
index 0000000..15cf937
--- /dev/null
+++ b/pbalign/bampostservice.py
@@ -0,0 +1,137 @@
+#!/usr/bin/env python
+###############################################################################
+# Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
+# NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+# ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+###############################################################################
+
+"""This script defines BamPostService, which
+ * calls 'samtools sort' to sort out.bam, and
+ * calls 'samtools index' to make out.bai index, and
+ * calls 'makePbi.py' to make out.pbi index file.
+"""
+
+# Author: Yuan Li
+
+from __future__ import absolute_import
+import logging
+from pbalign.service import Service
+from pbalign.utils.progutil import Execute
+
+
+class BamPostService(Service):
+
+ """Sort a bam, makes bam index and PacBio index."""
+ @property
+ def name(self):
+ """Name of bam post service."""
+ return "BamPostService"
+
+ @property
+ def progName(self):
+ return "samtools"
+
+ @property
+ def cmd(self):
+ return ""
+
+ def __init__(self, filenames):
+ """Initialize a BamPostService object.
+ Input - unsortedBamFile: a filtered, unsorted bam file
+ refFasta : a reference fasta file
+ Output - sortedBamFile: sorted BAM file
+ outBaiFile: index BAI file
+ """
+ self.refFasta = filenames.targetFileName
+
+ # filtered, unsorted bam file.
+ self.unsortedBamFile = filenames.filteredSam
+ self.outBamFile = filenames.outBamFileName
+ self.outBaiFile = filenames.outBaiFileName
+ self.outPbiFile = filenames.outPbiFileName
+
+ def _sortbam(self, unsortedBamFile, sortedBamFile):
+ """Sort unsortedBamFile and output sortedBamFile."""
+ if not sortedBamFile.endswith(".bam"):
+ raise ValueError("sorted bam file name %s must end with .bam" %
+ sortedBamFile)
+ sortedPrefix = sortedBamFile[0:-4]
+ cmd = 'samtools --version'
+ _samtoolsversion = ["0","1","19"]
+ try:
+ _out, _code, _msg = Execute(self.name, cmd)
+ if "samtools" in _out[0]:
+ _samtoolsversion = str(_out[0][8:]).strip().split('.')
+ else:
+ pass
+ except Exception:
+ pass
+ _stvmajor = int(_samtoolsversion[0])
+ if _stvmajor >= 1:
+ cmd = 'samtools sort -m 4G -o {sortedBamFile} {unsortedBamFile}'.format(
+ sortedBamFile=sortedBamFile, unsortedBamFile=unsortedBamFile)
+ else:
+ cmd = 'samtools sort -m 4G {unsortedBamFile} {prefix}'.format(
+ unsortedBamFile=unsortedBamFile, prefix=sortedPrefix)
+ Execute(self.name, cmd)
+
+ def _makebai(self, sortedBamFile, outBaiFile):
+ """Build *.bai index file."""
+ cmd = 'samtools --version'
+ _samtoolsversion = ["0","1","19"]
+ try:
+ _out, _code, _msg = Execute(self.name, cmd)
+ if "samtools" in _out[0]:
+ _samtoolsversion = str(_out[0][8:]).strip().split('.')
+ else:
+ pass
+ except Exception:
+ pass
+ _stvmajor = int(_samtoolsversion[0])
+ _stvminor = int(_samtoolsversion[1])
+ if _stvmajor == 1 and _stvminor == 2:
+ # only for 1.2
+ cmd = "samtools index {sortedBamFile}".format(
+ sortedBamFile=sortedBamFile)
+ else:
+ cmd = "samtools index {sortedBamFile} {outBaiFile}".format(
+ sortedBamFile=sortedBamFile, outBaiFile=outBaiFile)
+ Execute(self.name, cmd)
+
+ def _makepbi(self, sortedBamFile):
+ """Generate *.pbi PacBio BAM index."""
+ cmd = "pbindex %s" % sortedBamFile
+ Execute(self.name, cmd)
+
+ def run(self):
+ """ Run the BAM post-processing service. """
+ logging.info(self.name + ": Sort and build index for a bam file.")
+ self._sortbam(unsortedBamFile=self.unsortedBamFile,
+ sortedBamFile=self.outBamFile)
+ self._makebai(sortedBamFile=self.outBamFile,
+ outBaiFile=self.outBaiFile)
+ self._makepbi(sortedBamFile=self.outBamFile)
diff --git a/pbalign/ccs.py b/pbalign/ccs.py
new file mode 100644
index 0000000..ad3b4a4
--- /dev/null
+++ b/pbalign/ccs.py
@@ -0,0 +1,32 @@
+
+"""
+VERY thin wrapper on top of pbalign to provide a tool contract-driven task
+specific to CCS reads.
+"""
+
+import sys
+
+from pbcommand.models import FileTypes
+
+from pbalign import pbalignrunner
+import pbalign.options
+
+class Constants(pbalign.options.Constants):
+ TOOL_ID = "pbalign.tasks.pbalign_ccs"
+ DRIVER_EXE = "python -m pbalign.ccs --resolved-tool-contract"
+ INPUT_FILE_TYPE = FileTypes.DS_CCS
+ OUTPUT_FILE_TYPE = FileTypes.DS_ALIGN_CCS
+ # some modified defaults
+ ALGORITHM_OPTIONS_DEFAULT = "--minMatch 12 --bestn 10 --minPctSimilarity 70.0"
+
+def get_parser():
+ return pbalign.options.get_contract_parser(Constants, ccs_mode=True)
+
+def main(argv=sys.argv):
+ return pbalignrunner.main(
+ argv=argv,
+ get_parser_func=get_parser,
+ contract_runner_func=pbalignrunner.resolved_tool_contract_runner_ccs)
+
+if __name__ == "__main__":
+ sys.exit(main())
diff --git a/pbalign/filterservice.py b/pbalign/filterservice.py
index 9d23215..bd65240 100755
--- a/pbalign/filterservice.py
+++ b/pbalign/filterservice.py
@@ -37,8 +37,7 @@ in an input SAM file according to filtering criteria."""
from __future__ import absolute_import
import logging
from pbalign.service import Service
-from pbalign.utils.fileutil import isExist
-
+from pbalign.utils.fileutil import getFileFormat, FILE_FORMATS, isExist
class FilterService(Service):
""" Call samFilter to filter low quality hits and apply multiple hits
@@ -54,22 +53,22 @@ class FilterService(Service):
return "samFilter"
def __init__(self, inSamFile, refFile, outSamFile,
- alnServiceName, scoreSign, options,
+ alignerName, scoreSign, options,
adapterGffFile=None):
"""Initialize a FilterService object.
Input:
- inSamFile: an input SAM file
+ inSamFile: an input SAM/BAM file
refFile : the reference FASTA file
- outSAM : an output SAM file
+ outSAM : an output SAM/BAM file
alnServiceName: the name of the align service
scoreSign: score sign of the aligner, can be -1 or 1
options : pbalign options
adapterGffFile: a GFF file storing all the adapters
"""
- self.inSamFile = inSamFile
+ self.inSamFile = inSamFile # sam|bam
self.refFile = refFile
- self.outSamFile = outSamFile
- self.alnServiceName = alnServiceName
+ self.outSamFile = outSamFile # sam|bam
+ self.alignerName = alignerName
self.scoreSign = scoreSign
self.options = options
self.adapterGffFile = adapterGffFile
@@ -78,23 +77,32 @@ class FilterService(Service):
def cmd(self):
"""String of a command-line to execute."""
return self._toCmd(self.inSamFile, self.refFile,
- self.outSamFile, self.alnServiceName,
+ self.outSamFile, self.alignerName,
self.scoreSign, self.options,
self.adapterGffFile)
def _toCmd(self, inSamFile, refFile, outSamFile,
- alnServiceName, scoreSign, options, adapterGffFile):
+ alignerName, scoreSign, options, adapterGffFile):
""" Generate a samFilter command line from options.
Input:
inSamFile : the input SAM file
refFile : the reference FASTA file
outSamFile: the output SAM file
- alnServiceName: aligner service name
+ alignerName: aligner service name
scoreSign : score sign, can be -1 or 1
options : argument options
Output:
a command-line string
"""
+ # blasr supports in-line alignment filtration,
+ # no need to call samFilter at all.
+ if alignerName == "blasr" and \
+ not self.options.filterAdapterOnly:
+ cmdStr = "rm -f {outFile} && ln -s {inFile} {outFile}".format(
+ inFile=inSamFile, outFile=outSamFile)
+ return cmdStr
+
+ # if aligner is not blasr, call samFilter instead
cmdStr = self.progName + \
" {inSamFile} {refFile} {outSamFile} ".format(
inSamFile=inSamFile,
@@ -121,7 +129,7 @@ class FilterService(Service):
cmdStr += " -scoreSign {0}".format(scoreSign)
else:
logging.error("{0}'s score sign is neither 1 nor -1.".format(
- alnServiceName))
+ alignerName))
if options.scoreCutoff is not None:
cmdStr += " -scoreCutoff {0}".format(options.scoreCutoff)
@@ -133,6 +141,7 @@ class FilterService(Service):
isExist(adapterGffFile):
cmdStr += " -filterAdapterOnly {gffFile}".format(
gffFile=adapterGffFile)
+
return cmdStr
def run(self):
diff --git a/pbalign/forquiverservice/__init__.py b/pbalign/forquiverservice/__init__.py
deleted file mode 100755
index 175bd6a..0000000
--- a/pbalign/forquiverservice/__init__.py
+++ /dev/null
@@ -1,34 +0,0 @@
-#!/usr/bin/env python
-###############################################################################
-# Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
-#
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-# * Redistributions of source code must retain the above copyright
-# notice, this list of conditions and the following disclaimer.
-# * Redistributions in binary form must reproduce the above copyright
-# notice, this list of conditions and the following disclaimer in the
-# documentation and/or other materials provided with the distribution.
-# * Neither the name of Pacific Biosciences nor the names of its
-# contributors may be used to endorse or promote products derived from
-# this software without specific prior written permission.
-#
-# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
-# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
-# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
-# NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
-# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
-# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
-# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
-# ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-###############################################################################
-
-# Author: Yuan Li
-"""Initialization."""
-from __future__ import absolute_import
diff --git a/pbalign/forquiverservice/forquiver.py b/pbalign/forquiverservice/forquiver.py
deleted file mode 100755
index e478daa..0000000
--- a/pbalign/forquiverservice/forquiver.py
+++ /dev/null
@@ -1,99 +0,0 @@
-#!/usr/bin/env python
-###############################################################################
-# Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
-#
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-# * Redistributions of source code must retain the above copyright
-# notice, this list of conditions and the following disclaimer.
-# * Redistributions in binary form must reproduce the above copyright
-# notice, this list of conditions and the following disclaimer in the
-# documentation and/or other materials provided with the distribution.
-# * Neither the name of Pacific Biosciences nor the names of its
-# contributors may be used to endorse or promote products derived from
-# this software without specific prior written permission.
-#
-# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
-# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
-# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
-# NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
-# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
-# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
-# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
-# ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-###############################################################################
-
-"""This script defines ForQuiverService, which post-processes a cmp.h5 file so
-that it can be used by quiver directly. ForQuiverService sorts the file, loads
-pulse information to it and finally repacks it.."""
-
-# Author: Yuan Li
-
-from __future__ import absolute_import
-import logging
-from pbalign.forquiverservice.sort import SortService
-from pbalign.forquiverservice.loadpulses import LoadPulsesService
-from pbalign.forquiverservice.loadchemistry import LoadChemistryService
-from pbalign.forquiverservice.repack import RepackService
-
-
-class ForQuiverService(object):
- """
- Uses SortService, LoadPulsesService, LoadChemistryService,
- RepackService to post process a cmp.h5 file so that the file can
- be used by quiver directly.
- """
- @property
- def name(self):
- """Name of ForQuiverService."""
- return "ForQuiverService"
-
- def __init__(self, fileNames, options):
- """Initialize a ForQuiverService object.
-
- Input:
- fileNames : pbalign file names
- options : pbalign options
-
- """
- self.fileNames = fileNames
- self.options = options
- self._loadpulsesService = LoadPulsesService(
- self.fileNames.pulseFileName,
- self.fileNames.outputFileName,
- self.options)
- self._loadchemistryService = LoadChemistryService(
- self.fileNames.pulseFileName,
- self.fileNames.outputFileName,
- self.options)
- self._sortService = SortService(
- self.fileNames.outputFileName,
- self.options)
- self._repackService = RepackService(
- self.fileNames.outputFileName,
- self.fileNames.outputFileName + ".TMP")
-
- def run(self):
- """ Run the ForQuiver service."""
- logging.info(self.name + ": Sort.")
-
- self._sortService.checkAvailability()
- self._sortService.run()
-
- logging.info(self.name + ": LoadPulses.")
- self._loadpulsesService.checkAvailability()
- self._loadpulsesService.run()
-
- logging.info(self.name + ": LoadChemistry.")
- self._loadchemistryService.checkAvailability()
- self._loadchemistryService.run()
-
- logging.info(self.name + ": Repack.")
- self._repackService.checkAvailability()
- self._repackService.run()
diff --git a/pbalign/forquiverservice/loadchemistry.py b/pbalign/forquiverservice/loadchemistry.py
deleted file mode 100755
index 5f6516f..0000000
--- a/pbalign/forquiverservice/loadchemistry.py
+++ /dev/null
@@ -1,77 +0,0 @@
-#!/usr/bin/env python
-###############################################################################
-# Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
-#
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-# * Redistributions of source code must retain the above copyright
-# notice, this list of conditions and the following disclaimer.
-# * Redistributions in binary form must reproduce the above copyright
-# notice, this list of conditions and the following disclaimer in the
-# documentation and/or other materials provided with the distribution.
-# * Neither the name of Pacific Biosciences nor the names of its
-# contributors may be used to endorse or promote products derived from
-# this software without specific prior written permission.
-#
-# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
-# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
-# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
-# NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
-# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
-# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
-# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
-# ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-###############################################################################
-
-from __future__ import absolute_import
-import logging
-from pbalign.service import Service
-
-class LoadChemistryService(Service):
- @property
- def name(self):
- return "LoadChemistryService"
-
- @property
- def progName(self):
- return "loadChemistry.py"
-
- # Quiver only uses the following five metrics.
-
- def __init__(self, basFofnFile, cmpFile, options):
- """
- Input:
- basFofnFile: the input BASE.H5 (or fofn) files
- cmpFile : an input CMP.H5 file
- options : pbalign options
- """
- self.basFofnFile = basFofnFile
- self.cmpFile = cmpFile
- self.options = options
-
- @property
- def cmd(self):
- """String of a command-line to execute."""
- return self._toCmd(self.basFofnFile, self.cmpFile)
-
- def _toCmd(self, basFofnFile, cmpFile):
- """
- Generate a loadChemistry command line.
- """
- cmdStr = self.progName + \
- " {basFofnFile} {cmpFile} ".format(
- basFofnFile=basFofnFile, cmpFile=cmpFile)
-
- return cmdStr
-
- def run(self):
- """Run the loadChemistry service."""
- logging.info(self.name + ": Load pulses using {progName}.".
- format(progName=self.progName))
- return self._execute()
diff --git a/pbalign/forquiverservice/loadpulses.py b/pbalign/forquiverservice/loadpulses.py
deleted file mode 100755
index 98c71b8..0000000
--- a/pbalign/forquiverservice/loadpulses.py
+++ /dev/null
@@ -1,103 +0,0 @@
-#!/usr/bin/env python
-###############################################################################
-# Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
-#
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-# * Redistributions of source code must retain the above copyright
-# notice, this list of conditions and the following disclaimer.
-# * Redistributions in binary form must reproduce the above copyright
-# notice, this list of conditions and the following disclaimer in the
-# documentation and/or other materials provided with the distribution.
-# * Neither the name of Pacific Biosciences nor the names of its
-# contributors may be used to endorse or promote products derived from
-# this software without specific prior written permission.
-#
-# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
-# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
-# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
-# NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
-# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
-# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
-# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
-# ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-###############################################################################
-
-"""Define LoadPulseService class, which calls loadPulses to load PacBio
-pulse metrics to a cmp.h5 file. Five metrics, inclduing DeletionQV,
-DeletionTag, InsertionQV, MergeQV, and SubstitutionQV, are loaded by default
-unless --metrics is specified."""
-
-# Author: Yuan Li
-
-from __future__ import absolute_import
-import logging
-from pbalign.service import Service
-
-
-class LoadPulsesService(Service):
- """
- LoadPulsesService calls loadPulses to load PacBio pulse information to a
- cmp.h5 file.
- """
- @property
- def name(self):
- """Name of LoadPulsesService."""
- return "LoadPulsesService"
-
- @property
- def progName(self):
- """Program to call."""
- return "loadPulses"
-
- # Quiver only uses the following five metrics.
-
- def __init__(self, basFofnFile, cmpFile, options):
- """Initialize a LoadPulsesService object.
- Input:
- basFofnFile: the input BASE.H5 (or fofn) files
- cmpFile : an input CMP.H5 file
- options : pbalign options
- """
- self.basFofnFile = basFofnFile
- self.cmpFile = cmpFile
- self.options = options
-
- @property
- def cmd(self):
- """String of a command-line to execute."""
- return self._toCmd(self.basFofnFile, self.cmpFile)
-
- def _toCmd(self, basFofnFile, cmpFile):
- """Generate a loadPulses command line.
-
- Input:
- basFofnFile: a BAX/PLX.H5 (or fofn) file with pulses
- cmpFile : an input CMP.H5 file
- Output:
- a command-line string
-
- """
- cmdStr = self.progName + \
- " {basFofnFile} {cmpFile} ".format(
- basFofnFile=basFofnFile, cmpFile=cmpFile)
-
- metrics = self.options.metrics.replace(" ", "")
- cmdStr += " -metrics {metrics} ".format(metrics=metrics)
-
- if self.options.byread:
- cmdStr += " -byread "
-
- return cmdStr
-
- def run(self):
- """Run the loadPulses service."""
- logging.info(self.name + ": Load pulses using {progName}.".
- format(progName=self.progName))
- return self._execute()
diff --git a/pbalign/forquiverservice/repack.py b/pbalign/forquiverservice/repack.py
deleted file mode 100755
index 5c0f210..0000000
--- a/pbalign/forquiverservice/repack.py
+++ /dev/null
@@ -1,87 +0,0 @@
-#!/usr/bin/env python
-###############################################################################
-# Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
-#
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-# * Redistributions of source code must retain the above copyright
-# notice, this list of conditions and the following disclaimer.
-# * Redistributions in binary form must reproduce the above copyright
-# notice, this list of conditions and the following disclaimer in the
-# documentation and/or other materials provided with the distribution.
-# * Neither the name of Pacific Biosciences nor the names of its
-# contributors may be used to endorse or promote products derived from
-# this software without specific prior written permission.
-#
-# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
-# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
-# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
-# NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
-# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
-# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
-# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
-# ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-###############################################################################
-
-"""Define RepackService class, which calls h5repack to repack a cmp.h5 file."""
-
-# Author: Yuan Li
-
-from __future__ import absolute_import
-import logging
-from pbalign.service import Service
-
-
-class RepackService(Service):
- """RepackService calls h5repack to repack a PacBio cmp.h5 file."""
- @property
- def name(self):
- """Name of RepackService."""
- return "RepackService"
-
- @property
- def progName(self):
- """Program to call."""
- return "h5repack"
-
- def __init__(self, cmpFile, tmpcmpFile):
- """Initialize a RepackService object.
-
- Input:
- cmpFile : an input CMP.H5 file
- tmpcmpFile : a temporary CMP.H5 file
-
- """
- self.cmpFile = cmpFile
- self.tmpcmpFile = tmpcmpFile
-
- @property
- def cmd(self):
- """String of a command-line to execute."""
- return self._toCmd(self.cmpFile, self.tmpcmpFile)
-
- def _toCmd(self, cmpFile, tmpcmpFile):
- """Generate a h5repack command line.
-
- Input:
- cmpFile : an input CMP.H5 file
- options : pbalign options
- Output:
- a command-line string
-
- """
- cmdStr = self.progName + " -f GZIP=1 {i} {o} && mv {o} {i}".\
- format(i=cmpFile, o=tmpcmpFile)
- return cmdStr
-
- def run(self):
- """Run the Repack service."""
- logging.info(self.name + ": Repack a cmp.h5 file using {progName}.".
- format(progName=self.progName))
- return self._execute()
diff --git a/pbalign/forquiverservice/sort.py b/pbalign/forquiverservice/sort.py
deleted file mode 100755
index 750601e..0000000
--- a/pbalign/forquiverservice/sort.py
+++ /dev/null
@@ -1,89 +0,0 @@
-#!/usr/bin/env python
-###############################################################################
-# Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
-#
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-# * Redistributions of source code must retain the above copyright
-# notice, this list of conditions and the following disclaimer.
-# * Redistributions in binary form must reproduce the above copyright
-# notice, this list of conditions and the following disclaimer in the
-# documentation and/or other materials provided with the distribution.
-# * Neither the name of Pacific Biosciences nor the names of its
-# contributors may be used to endorse or promote products derived from
-# this software without specific prior written permission.
-#
-# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
-# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
-# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
-# NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
-# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
-# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
-# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
-# ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-###############################################################################
-
-"""This script defines SortService, which calls cmph5tools.py sort to sort a
-cmp.h5 file."""
-
-# Author: Yuan Li
-
-from __future__ import absolute_import
-import logging
-from pbalign.service import Service
-
-
-class SortService(Service):
- """Call cmph5tools.py sort to sort a PacBio cmp.h5 file."""
- @property
- def name(self):
- """Name of SortService."""
- return "SortService"
-
- @property
- def progName(self):
- """Program to call."""
- return "cmph5tools.py"
-
- def __init__(self, cmpFile, options):
- """Initialize a SortService object.
- Input:
- cmpFile : an input CMP.H5 file
- options : pbalign options
- """
- self.cmpFile = cmpFile
- self.options = options
-
- @property
- def cmd(self):
- """String of a command-line to execute."""
- return self._toCmd(self.cmpFile, self.options)
-
- def _toCmd(self, cmpFile, options):
- """Generate a cmph5tools.py sort command line.
-
- Input:
- cmpFile : an input CMP.H5 file
- options : pbalign options
- Output:
- a command-line string
-
- """
- cmdStr = self.progName
- if options.verbosity > 1:
- cmdStr += " -vv "
-
- cmdStr += " sort --deep --inPlace {cmpFile} ".format(cmpFile=cmpFile)
- return cmdStr
-
- def run(self):
- """Run the sort service."""
- logging.info(self.name + ": Sort a cmp.h5 file using {progName}.".
- format(progName=self.progName))
- return self._execute()
diff --git a/pbalign/options.py b/pbalign/options.py
index 9cb1e81..acb1f9f 100755
--- a/pbalign/options.py
+++ b/pbalign/options.py
@@ -34,8 +34,40 @@
from __future__ import absolute_import
import argparse
+import logging
from copy import copy
+from pbcommand.models import FileTypes, SymbolTypes, ResourceTypes, get_pbparser
+from pbcommand.common_options import add_debug_option, add_base_options
+
+from pbalign.__init__ import get_version
+
+log = logging.getLogger(__name__)
+
+
+class Constants(object):
+ TOOL_ID = "pbalign.tasks.pbalign"
+ INPUT_FILE_TYPE = FileTypes.DS_SUBREADS
+ OUTPUT_FILE_TYPE = FileTypes.DS_ALIGN
+ OUTPUT_FILE_NAME = "mapped"
+ ALGORITHM_OPTIONS_ID = "pbalign.task_options.algorithm_options"
+ ALGORITHM_OPTIONS_DEFAULT = "--minMatch 12 --bestn 10 --minPctSimilarity 70.0"
+ MIN_ACCURACY_ID = "pbalign.task_options.min_accuracy"
+ MIN_LENGTH_ID = "pbalign.task_options.min_length"
+ NO_SPLIT_ID = "pbalign.task_options.no_split_subreads"
+ CONCORDANT_ID = "pbalign.task_options.concordant"
+ HIT_POLICY_ID = "pbalign.task_options.hit_policy"
+ DRIVER_EXE = "pbalign --resolved-tool-contract "
+ VERSION = get_version()
+ PARSER_DESC = """\
+Mapping PacBio sequences to references using an algorithm selected from a
+selection of supported command-line alignment algorithms. Input can be a
+fasta, pls.h5, bas.h5 or ccs.h5 file or a fofn (file of file names). Output
+can be in SAM or BAM format. If output is BAM format, aligner can
+only be blasr and QVs will be loaded automatically.
+
+NOTE that pbalign no longer supports CMP.H5 in 3.0."""
+
# The first candidate 'blasr' is the default.
ALGORITHM_CANDIDATES = ('blasr', 'bowtie', 'gmap')
@@ -57,15 +89,17 @@ DEFAULT_OPTIONS = {"regionTable": None,
# Aligner options
"maxHits": 10,
"minAnchorSize": 12,
+ "maxMatch": 30,
"noSplitSubreads": False,
"concordant": False,
+ "unaligned": None,
"algorithmOptions": None,
"useccs": None,
# Filter options
- "maxDivergence": 30,
- "minAccuracy": 70,
+ "maxDivergence": 30.0,
+ "minAccuracy": 70.0,
"minLength": 50,
- "scoreFunction": SCOREFUNCTION_CANDIDATES[0],
+ #"scoreFunction": SCOREFUNCTION_CANDIDATES[0],
"scoreCutoff": None,
"hitPolicy": HITPOLICY_CANDIDATES[0],
"filterAdapterOnly": False,
@@ -78,39 +112,30 @@ DEFAULT_OPTIONS = {"regionTable": None,
# Miscellaneous options
"nproc": 8,
"seed": 1,
- "tmpDir": "/scratch"}
-
-
-def constructOptionParser(parser=None):
- """Constrct and return an argument parser.
-
- If a parser is specified, use it. Otherwise, create a parser instead.
- Add PBAlignRunner arguments to construct the parser, and finally
- return it.
+ "tmpDir": "/tmp"}
+def constructOptionParser(parser, C=Constants, ccs_mode=False):
"""
- desc = "Mapping PacBio sequences to references using an algorithm \n"
- desc += "selected from a selection of supported command-line alignment\n"
- desc += "algorithms. Input can be a fasta, pls.h5, bas.h5 or ccs.h5\n"
- desc += "file or a fofn (file of file names). Output is in either\n"
- desc += "cmp.h5 or sam format.\n"
-
- if (parser is None):
- parser = argparse.ArgumentParser()
-
- parser.description = desc
- parser.argument_default = argparse.SUPPRESS
- parser.formatter_class = argparse.RawTextHelpFormatter
+ Add PBAlignRunner arguments to the parser.
+ """
+ # save reference to PbParser
+ p = parser
+ tcp = p.tool_contract_parser
+ parser = parser.arg_parser.parser
+ #parser.argument_default = argparse.SUPPRESS
+ parser.formatter_class = argparse.ArgumentDefaultsHelpFormatter
+ add_debug_option(parser)
# Optional input.
- parser.add_argument("--regionTable",
+ input_group = parser.add_argument_group("Optional input arguments")
+ input_group.add_argument("--regionTable",
dest="regionTable",
type=str,
default=None,
action="store",
help="Specify a region table for filtering reads.")
- parser.add_argument("--configFile",
+ input_group.add_argument("--configFile",
dest="configFile",
default=None,
type=str,
@@ -120,7 +145,7 @@ def constructOptionParser(parser=None):
helpstr = "When input reads are in fasta format and output is a cmp.h5\n" + \
"this option can specify pls.h5 or bas.h5 or \n" + \
"FOFN files from which pulse metrics can be loaded for Quiver."
- parser.add_argument("--pulseFile",
+ input_group.add_argument("--pulseFile",
dest="pulseFile",
default=None,
type=str,
@@ -128,9 +153,9 @@ def constructOptionParser(parser=None):
help=helpstr)
# Chose an aligner.
+ align_group = parser.add_argument_group("Alignment options")
helpstr = "Select an aligorithm from {0}.\n".format(ALGORITHM_CANDIDATES)
- helpstr += "Default algorithm is {0}.".format(DEFAULT_OPTIONS["algorithm"])
- parser.add_argument("--algorithm",
+ align_group.add_argument("--algorithm",
dest="algorithm",
type=str,
action="store",
@@ -140,9 +165,8 @@ def constructOptionParser(parser=None):
# Aligner options.
helpstr = "The maximum number of matches of each read to the \n" + \
- "reference sequence that will be evaluated. Default\n" + \
- "value is {0}.".format(DEFAULT_OPTIONS["maxHits"])
- parser.add_argument("--maxHits",
+ "reference sequence that will be evaluated."
+ align_group.add_argument("--maxHits",
dest="maxHits",
type=int,
default=None, # Set as None instead of a real number.
@@ -150,9 +174,8 @@ def constructOptionParser(parser=None):
help=helpstr)
helpstr = "The minimum anchor size defines the length of the read\n" + \
- "that must match against the reference sequence. Default\n" + \
- "value is {0}.".format(DEFAULT_OPTIONS["minAnchorSize"])
- parser.add_argument("--minAnchorSize",
+ "that must match against the reference sequence."
+ align_group.add_argument("--minAnchorSize",
dest="minAnchorSize",
type=int,
default=None, # Set as None to avoid conflicts with
@@ -160,6 +183,11 @@ def constructOptionParser(parser=None):
action="store",
help=helpstr)
+ helpstr = "BLASR maxMatch option. (Will be overriden if is also set in algorithmOptions)"
+ align_group.add_argument("--maxMatch", dest="maxMatch", type=int,
+ default=DEFAULT_OPTIONS["maxMatch"],
+ action="store", help=helpstr)
+
# Aligner options: Use ccs or not?
helpstr = "Map the ccsSequence to the genome first, then align\n" + \
"subreads to the interval that the CCS reads mapped to.\n" + \
@@ -167,7 +195,7 @@ def constructOptionParser(parser=None):
" the template.\n" + \
" useccsall: maps all subreads.\n" + \
" useccsdenovo: maps ccs only."
- parser.add_argument("--useccs",
+ align_group.add_argument("--useccs",
type=str,
choices=["useccs", "useccsall", "useccsdenovo"],
action="store",
@@ -175,24 +203,32 @@ def constructOptionParser(parser=None):
help=helpstr)
helpstr = "Do not split reads into subreads even if subread \n" + \
- "regions are available. Default value is {0}."\
- .format(DEFAULT_OPTIONS["noSplitSubreads"])
- parser.add_argument("--noSplitSubreads",
+ "regions are available."
+ align_group.add_argument("--noSplitSubreads",
dest="noSplitSubreads",
default=DEFAULT_OPTIONS["noSplitSubreads"],
action="store_true",
help=helpstr)
+ if not ccs_mode:
+ tcp.add_boolean(C.NO_SPLIT_ID, "noSplitSubreads",
+ default=DEFAULT_OPTIONS["noSplitSubreads"],
+ name="Align unsplit polymerase reads",
+ description=helpstr)
helpstr = "Map subreads of a ZMW to the same genomic location.\n"
- parser.add_argument("--concordant",
+ align_group.add_argument("--concordant",
dest="concordant",
default=DEFAULT_OPTIONS["concordant"],
action="store_true",
help=helpstr)
-
- helpstr = "Number of threads. Default value is {v}."\
- .format(v=DEFAULT_OPTIONS["nproc"])
- parser.add_argument("--nproc",
+ if not ccs_mode:
+ tcp.add_boolean(C.CONCORDANT_ID, "concordant",
+ default=DEFAULT_OPTIONS["concordant"],
+ name="Concordant alignment",
+ description="Map subreads of a ZMW to the same genomic location")
+
+ helpstr = "Number of threads."
+ align_group.add_argument("--nproc",
type=int,
dest="nproc",
default=DEFAULT_OPTIONS["nproc"],
@@ -200,18 +236,24 @@ def constructOptionParser(parser=None):
action="store",
help=helpstr)
- parser.add_argument("--algorithmOptions",
+ align_group.add_argument("--algorithmOptions",
type=str,
dest="algorithmOptions",
default=None,
action="append",
help="Pass alignment options through.")
+ # XXX the arguments used in SMRTpipe 2.3 are different from the defaults
+ # for the command line tool
+ tcp.add_str(C.ALGORITHM_OPTIONS_ID, "algorithmOptions",
+ default=C.ALGORITHM_OPTIONS_DEFAULT,
+ name="Algorithm options",
+ description="List of space-separated arguments passed to BLASR")
# Filtering criteria and hit policy.
+ filter_group = parser.add_argument_group("Filter criteria options")
helpstr = "The maximum allowed percentage divergence of a read \n" + \
- "from the reference sequence. Default value is {0}." \
- .format(DEFAULT_OPTIONS["maxDivergence"])
- parser.add_argument("--maxDivergence",
+ "from the reference sequence."
+ filter_group.add_argument("--maxDivergence",
dest="maxDivergence",
type=float,
default=DEFAULT_OPTIONS["maxDivergence"],
@@ -219,39 +261,45 @@ def constructOptionParser(parser=None):
action="store",
help=helpstr)
- helpstr = "The minimum percentage accuracy of alignments that\n" + \
- "will be evaluated. Default value is {v}." \
- .format(v=DEFAULT_OPTIONS["minAccuracy"])
- parser.add_argument("--minAccuracy",
+ helpstr = "The minimum concordance of alignments that\n" + \
+ "will be evaluated."
+ filter_group.add_argument("--minAccuracy",
dest="minAccuracy",
type=float,
default=DEFAULT_OPTIONS["minAccuracy"],
#default=70,
action="store",
help=helpstr)
+ tcp.add_float(C.MIN_ACCURACY_ID, "minAccuracy",
+ default=DEFAULT_OPTIONS["minAccuracy"],
+ name="Min. concordance",
+ description="Minimum required alignment concordance")
helpstr = "The minimum aligned read length of alignments that\n" + \
- "will be evaluated. Default value is {v}." \
- .format(v=DEFAULT_OPTIONS["minLength"])
- parser.add_argument("--minLength",
+ "will be evaluated."
+ filter_group.add_argument("--minLength",
dest="minLength",
type=int,
default=DEFAULT_OPTIONS["minLength"],
action="store",
help=helpstr)
-
- helpstr = "Specify a score function for evaluating alignments.\n"
- helpstr += " alignerscore : aligner's score in the SAM tag 'as'.\n"
- helpstr += " editdist : edit distance between read and reference.\n"
- helpstr += " blasrscore : blasr's default score function.\n"
- helpstr += "Default value is {0}.".format(DEFAULT_OPTIONS["scoreFunction"])
- parser.add_argument("--scoreFunction",
- dest="scoreFunction",
- type=str,
- choices=SCOREFUNCTION_CANDIDATES,
- default=DEFAULT_OPTIONS["scoreFunction"],
- action="store",
- help=helpstr)
+ tcp.add_int(C.MIN_LENGTH_ID, "minLength",
+ default=DEFAULT_OPTIONS["minLength"],
+ name="Min. length",
+ description="Minimum required alignment length")
+
+ #helpstr = "Specify a score function for evaluating alignments.\n"
+ #helpstr += " alignerscore : aligner's score in the SAM tag 'as'.\n"
+ #helpstr += " editdist : edit distance between read and reference.\n"
+ #helpstr += " blasrscore : blasr's default score function.\n"
+ #helpstr += "Default value is {0}.".format(DEFAULT_OPTIONS["scoreFunction"])
+ #filter_group.add_argument("--scoreFunction",
+ # dest="scoreFunction",
+ # type=str,
+ # choices=SCOREFUNCTION_CANDIDATES,
+ # default=DEFAULT_OPTIONS["scoreFunction"],
+ # action="store",
+ # help=helpstr)
#" userscore : user-defined score matrix (by -scoreMatrix).\n")
#parser.add_argument("--scoreMatrix",
# dest="scoreMatrix",
@@ -275,7 +323,7 @@ def constructOptionParser(parser=None):
# "= -5 (match),\n"
# "mismatch = 6.\n")
- parser.add_argument("--scoreCutoff",
+ filter_group.add_argument("--scoreCutoff",
dest="scoreCutoff",
type=int,
default=None,
@@ -288,89 +336,98 @@ def constructOptionParser(parser=None):
" allbest : selects all the best score hits.\n" + \
" randombest: selects a random hit from all best score hits.\n" + \
" leftmost : selects a hit which has the best score and the\n" + \
- " smallest mapping coordinate in any reference.\n" + \
- "Default value is {v}.".format(v=DEFAULT_OPTIONS["hitPolicy"])
- parser.add_argument("--hitPolicy",
+ " smallest mapping coordinate in any reference.\n"
+ filter_group.add_argument("--hitPolicy",
dest="hitPolicy",
type=str,
choices=HITPOLICY_CANDIDATES,
default=DEFAULT_OPTIONS["hitPolicy"],
action="store",
help=helpstr)
+ tcp.add_str(C.HIT_POLICY_ID, "hitPolicy",
+ default=DEFAULT_OPTIONS["hitPolicy"],
+ name="Hit policy",
+ description=helpstr)
helpstr = "If specified, do not report adapter-only hits using\n" + \
"annotations with the reference entry."
- parser.add_argument("--filterAdapterOnly",
+ filter_group.add_argument("--filterAdapterOnly",
dest="filterAdapterOnly",
default=DEFAULT_OPTIONS["filterAdapterOnly"],
action="store_true",
help=helpstr)
# Output.
- helpstr = "Specify the ReadType attribute in the cmp.h5 output.\n" + \
- "Default value is {v}.".format(v=DEFAULT_OPTIONS["readType"])
- parser.add_argument("--readType",
+ # CMP H5 output has been deprecated, let's hide associated options.
+ cmph5_group = parser.add_argument_group("Options for cmp.h5")
+ helpstr = "Specify the ReadType attribute in the cmp.h5 output.\n"
+ cmph5_group.add_argument("--readType",
dest="readType",
type=str,
action="store",
default=DEFAULT_OPTIONS["readType"],
help=argparse.SUPPRESS)
- #help=helpstr)
helpstr = "The output cmp.h5 file which will be sorted, loaded\n" + \
"with pulse QV information, and repacked, so that it \n" + \
"can be consumed by quiver directly. This requires\n" + \
"the input file to be in PacBio bas/pls.h5 format,\n" + \
- "and --useccs must be None. Default value is False."
- parser.add_argument("--forQuiver",
+ "and --useccs must be None."
+ cmph5_group.add_argument("--forQuiver",
dest="forQuiver",
action="store_true",
default=DEFAULT_OPTIONS["forQuiver"],
- help=helpstr)
+ help=argparse.SUPPRESS)
helpstr = "Similar to --forQuiver, the only difference is that \n" + \
- "--useccs can be specified. Default value is False."
- parser.add_argument("--loadQVs",
+ "--useccs can be specified."
+ cmph5_group.add_argument("--loadQVs",
dest="loadQVs",
action="store_true",
default=DEFAULT_OPTIONS["loadQVs"],
- help=helpstr)
+ help=argparse.SUPPRESS)
helpstr = "Load pulse information using -byread option instead\n" + \
"of -bymetric. Only works when --forQuiver or \n" + \
- "--loadQVs are set. Default value is False."
- parser.add_argument("--byread",
+ "--loadQVs are set."
+ cmph5_group.add_argument("--byread",
dest="byread",
action="store_true",
default=DEFAULT_OPTIONS["byread"],
- help=helpstr)
+ help=argparse.SUPPRESS)
helpstr = "Load the specified (comma-delimited list of) metrics\n" + \
"instead of the default metrics required by quiver.\n" + \
"This option only works when --forQuiver or \n" + \
- "--loadQVs are set. Default: {m}".\
- format(m=DEFAULT_OPTIONS["metrics"])
- parser.add_argument("--metrics",
+ "--loadQVs are set."
+ cmph5_group.add_argument("--metrics",
dest="metrics",
type=str,
action="store",
default=DEFAULT_OPTIONS["metrics"],
- help=helpstr)
+ help=argparse.SUPPRESS)
# Miscellaneous.
+ misc_group = parser.add_argument_group("Miscellaneous options")
+ helpstr = "Output names of unaligned reads to specified file."
+ misc_group.add_argument("--unaligned",
+ dest="unaligned",
+ type=str,
+ action="store",
+ default=DEFAULT_OPTIONS["unaligned"],
+ help=helpstr)
+
helpstr = "Initialize the random number generator with a none-zero \n" + \
- "integer. Zero means that current system time is used.\n" + \
- "Default value is {v}.".format(v=DEFAULT_OPTIONS["seed"])
- parser.add_argument("--seed",
+ "integer. Zero means that current system time is used.\n"
+ misc_group.add_argument("--seed",
dest="seed",
type=int,
default=DEFAULT_OPTIONS["seed"],
action="store",
help=helpstr)
- helpstr = "Specify a directory for saving temporary files.\n" + \
- "Default is {v}.".format(v=DEFAULT_OPTIONS["tmpDir"])
- parser.add_argument("--tmpDir",
+ helpstr = "Specify a directory for saving temporary files.\n"
+ misc_group.add_argument("--tmpDir",
dest="tmpDir",
type=str,
action="store",
@@ -378,31 +435,11 @@ def constructOptionParser(parser=None):
help=helpstr)
# Keep all temporary & intermediate files.
- parser.add_argument("--keepTmpFiles",
+ misc_group.add_argument("--keepTmpFiles",
dest="keepTmpFiles",
action="store_true",
default=False,
help=argparse.SUPPRESS)
-
- # Required options: inputs and outputs.
- helpstr = "The input file can be a fasta, plx.h5, bax.h5, ccs.h5\n" + \
- "file or a fofn."
- parser.add_argument("inputFileName",
- type=str,
- action="store",
- help=helpstr)
-
- helpstr = "Either a reference fasta file or a reference repository."
- parser.add_argument("referencePath",
- type=str,
- action="store",
- help=helpstr)
-
- parser.add_argument("outputFileName",
- type=str,
- action="store",
- help="The output cmp.h5 or sam file.")
-
return parser
@@ -479,44 +516,109 @@ def importDefaultOptions(parsedOptions, additionalDefaults=DEFAULT_OPTIONS):
return newOptions, infoMsg.rstrip(', ')
-def parseOptions(argumentList, parser=None):
- """Parse a list of arguments, return options and an info message.
-
- If a parser is not specified, create a new parser, otherwise, use
- the specifed parser. If there exists a config file, import options
- from the config file and finally overwrite these options with
- options from the argument list.
-
+class _ArgParser(argparse.ArgumentParser):
"""
- # Obtain a constructed argument parser.
- parser = constructOptionParser(parser)
-
- # Parse argumentList for the first time in order to
- # get a config file.
- options = parser.parse_args(args=argumentList)
-
- # Import options from the specified config file, if it exists.
- configOptions, infoMsg = importConfigOptions(options)
-
- # Parse argumentList for the second time in order to
- # overwrite config options with options in argumentList.
- newOptions = copy(configOptions)
- newOptions.algorithmOptions = None
- newOptions = parser.parse_args(namespace=newOptions, args=argumentList)
-
- # Overwrite config algorithmOptions if it is specified in argumentList
- if newOptions.algorithmOptions is None:
- if configOptions.algorithmOptions is not None:
- newOptions.algorithmOptions = configOptions.algorithmOptions
- else:
- newOptions.algorithmOptions = \
- " ".join(newOptions.algorithmOptions)
-
- # Return the updated options and an info message.
- return parser, newOptions, infoMsg
-
-#if __name__ == "__main__":
-# import sys
-# parser = argparse.ArgumentParser()
-# parser, options, info = parseOptions(argumentList = sys.argv[1:],
-# parser=parser)
+ Substitute for the standard argument parser, where parse_args is
+ extended to facilitate the use of config files.
+ """
+ def parse_args(self, args=None, namespace=None):
+ options = super(_ArgParser, self).parse_args(args=args,
+ namespace=namespace)
+
+ # Import options from the specified config file, if it exists.
+ configOptions, infoMsg = importConfigOptions(options)
+
+ # Parse argumentList for the second time in order to
+ # overwrite config options with options in argumentList.
+ newOptions = copy(configOptions)
+ newOptions.algorithmOptions = None
+ newOptions = super(_ArgParser, self).parse_args(namespace=newOptions,
+ args=args)
+
+ # Overwrite config algorithmOptions if it is specified in argumentList
+ if newOptions.algorithmOptions is None:
+ if configOptions.algorithmOptions is not None:
+ newOptions.algorithmOptions = configOptions.algorithmOptions
+ else:
+ newOptions.algorithmOptions = \
+ " ".join(newOptions.algorithmOptions)
+
+ # FIXME gross hack to work around the problem of passing this
+ # parameter from a resolved tool contract
+ def unquote(s):
+ if s[0] in ["'", '"'] and s[-1] in ["'", '"']:
+ return s[1:-1]
+ return s
+ if newOptions.algorithmOptions is not None:
+ newOptions.algorithmOptions = unquote(newOptions.algorithmOptions)
+
+ # Return the updated options and an info message.
+ return newOptions #parser, newOptions, infoMsg
+
+def get_contract_parser(C=Constants, ccs_mode=False):
+ """
+ Create and populate the combined tool contract/argument parser. This
+ method can optionally be overridden with a different Constants object for
+ defining additional tasks (e.g. CCS alignment).
+ """
+ p = get_pbparser(
+ tool_id=C.TOOL_ID,
+ version=C.VERSION,
+ name=C.TOOL_ID,
+ description=C.PARSER_DESC,
+ driver_exe=C.DRIVER_EXE,
+ nproc=SymbolTypes.MAX_NPROC,
+ resource_types=(ResourceTypes.TMP_DIR,),
+ default_level="WARN")
+ p.arg_parser.parser = _ArgParser(
+ description=C.PARSER_DESC,
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ p.arg_parser.parser.version = C.VERSION
+ p.arg_parser.parser.add_argument('--version',
+ action="version",
+ help="show program's version number and exit")
+ add_base_options(p.arg_parser.parser)
+
+ # Required options: inputs and outputs.
+ p.add_input_file_type(C.INPUT_FILE_TYPE, "inputFileName",
+ "Subread DataSet", "SubreadSet or unaligned .bam")
+ p.add_input_file_type(FileTypes.DS_REF, "referencePath",
+ "ReferenceSet", "Reference DataSet or FASTA file")
+ p.add_output_file_type(C.OUTPUT_FILE_TYPE, "outputFileName",
+ name="Alignments",
+ description="Alignment results dataset",
+ default_name=C.OUTPUT_FILE_NAME)
+ constructOptionParser(p, ccs_mode=ccs_mode)
+ p.arg_parser.parser.add_argument(
+ "--profile", action="store_true",
+ help="Print runtime profile at exit")
+ return p
+
+def resolved_tool_contract_to_args(resolved_tool_contract):
+ rtc = resolved_tool_contract
+ p = get_contract_parser().arg_parser.parser
+ args = [
+ rtc.task.input_files[0],
+ rtc.task.input_files[1],
+ rtc.task.output_files[0],
+ "--nproc", str(resolved_tool_contract.task.nproc),
+ "--minAccuracy", str(rtc.task.options[Constants.MIN_ACCURACY_ID]),
+ "--minLength", str(rtc.task.options[Constants.MIN_LENGTH_ID]),
+ "--hitPolicy", str(rtc.task.options[Constants.HIT_POLICY_ID]),
+ "--tmpDir", rtc.task.tmpdir_resources[0].path,
+ "--log-level", rtc.task.log_level,
+ ]
+ if rtc.task.options[Constants.ALGORITHM_OPTIONS_ID]:
+ # FIXME this is gross: if I don't quote the options, the parser chokes;
+ # if I do quote them, the quotes get propagated, so I have to strip
+ # them off later
+ args.extend([
+ "--algorithmOptions=\"%s\"" %
+ rtc.task.options[Constants.ALGORITHM_OPTIONS_ID],
+ ])
+ if rtc.task.options.get(Constants.CONCORDANT_ID, False):
+ args.append("--concordant")
+ if rtc.task.options.get(Constants.NO_SPLIT_ID, False):
+ args.append("--noSplitSubreads")
+ log.info("Converted command line: 'pbalign {a}'".format(a=" ".join(args)))
+ return p.parse_args(args)
diff --git a/pbalign/pbalignfiles.py b/pbalign/pbalignfiles.py
index 9bf4ad9..7d401bc 100755
--- a/pbalign/pbalignfiles.py
+++ b/pbalign/pbalignfiles.py
@@ -33,14 +33,15 @@
"""This script defines class PBALignFiles."""
from __future__ import absolute_import
-import logging
from pbalign.utils.fileutil import checkInputFile, getRealFileFormat, \
checkOutputFile, checkReferencePath, checkRegionTableFile, \
getFileFormat, FILE_FORMATS
class PBAlignFiles:
+
"""PBAlignFiles contains files that will be used by pbalign."""
+
def __init__(self, inputFileName=None, referencePath=None,
outputFileName=None, regionTable=None,
pulseFileName=None):
@@ -80,8 +81,12 @@ class PBAlignFiles:
self.targetFileName = None
self.sawriterFileName = None
self.isWithinRepository = False
- self.alignerSamOut = None # The sam output file by an aligner
- self.filteredSam = None # The filtered sam file.
+ self.alignerSamOut = None # The sam/bam output file by an aligner
+ self.filteredSam = None # The filtered sam/bam file.
+
+ self.outBamFileName = None # filtered, sorted bam
+ self.outBaiFileName = None # output *.bai bam index
+ self.outPbiFileName = None # output *.pbi pacbio bam index
# There might be an adapter file in the reference repository in
# directory 'annotations', which can be used by the
@@ -99,7 +104,7 @@ class PBAlignFiles:
# the absolute and expanded path. Validate file format.
if inputFileName is not None and inputFileName != "":
self.inputFileName = checkInputFile(inputFileName)
- self.inputFileFormat = getRealFileFormat(inputFileName)
+ self.inputFileFormat = getRealFileFormat(self.inputFileName)
def SetPulseFileName(self, inputFileName, pulseFileName):
"""Verify and assign the pulse file from which pulses can be
@@ -133,10 +138,17 @@ class PBAlignFiles:
def SetOutputFileName(self, outputFileName):
"""Validate the user-specified output file and get the absolute and
- expanded path.
+ expanded path. If output file format is XML or BAM, set output BAM
+ filename, BAM index bai file and PacBio BAM index pbi file.
"""
if outputFileName is not None and outputFileName != "":
self.outputFileName = checkOutputFile(outputFileName)
+ if getFileFormat(self.outputFileName) in [FILE_FORMATS.BAM,
+ FILE_FORMATS.XML]:
+ prefix = str(self.outputFileName[0:-3])
+ self.outBamFileName = prefix + "bam"
+ self.outBaiFileName = self.outBamFileName + ".bai"
+ self.outPbiFileName = self.outBamFileName + ".pbi"
def SetRegionTable(self, regionTable):
"""Validate the user-specified region table and get the absolute and
@@ -167,16 +179,25 @@ class PBAlignFiles:
desc += "Output file: {o}\n".format(o=self.outputFileName)
desc += "Query file : {q}\n".format(q=self.queryFileName)
desc += "Target file: {t}\n".format(t=self.targetFileName)
- desc += "Suffix array file: {s}\n".format(s=self.sawriterFileName)
- desc += "regionTable:{s}\n".format(s=self.regionTable)
+ if self.sawriterFileName is not None:
+ desc += "Suffix array file: {s}\n".format(s=self.sawriterFileName)
+ if self.regionTable is not None:
+ desc += "Region table:{s}\n".format(s=self.regionTable)
if self.pulseFileName is not None:
desc += "Pulse files: {s}\n".format(s=self.pulseFileName)
- desc += "Aligner's SAM out: {t}\n".format(t=self.alignerSamOut)
- desc += "Filtered SAM file: {t}\n".format(t=self.filteredSam)
+ desc += "Aligner's SAM/BAM out: {t}\n".format(t=self.alignerSamOut)
+ desc += "Filtered SAM/BAM file: {t}\n".format(t=self.filteredSam)
if self.adapterGffFileName is not None:
desc += "Adapter GFF file: {t}\n".format(
t=self.adapterGffFileName)
+ if self.outBamFileName is not None:
+ desc += "Out bam file: {b}\n".format(b=self.outBamFileName)
+ if self.outBaiFileName is not None:
+ desc += "Bam index file: {b}\n".format(b=self.outBaiFileName)
+ if self.outPbiFileName is not None:
+ desc += "PacBio Bam index file: {p}\n".format(p=self.outPbiFileName)
+
return desc
-#if __name__ == "__main__":
+# if __name__ == "__main__":
# p = PBAlignFiles("lambda.fasta", "lambda_ref.fasta", "tmp.sam")
diff --git a/pbalign/pbalignrunner.py b/pbalign/pbalignrunner.py
index afd2a18..9583287 100755
--- a/pbalign/pbalignrunner.py
+++ b/pbalign/pbalignrunner.py
@@ -34,19 +34,26 @@
PBAlignRunner uses AlignService to align PacBio reads in FASTA/BASE/PULSE/FOFN
formats to reference sequences, then uses FilterServices to filter out
alignments that do not satisfy filtering criteria, and finally generates a SAM
-or CMP.H5 file.
+or BAM file.
"""
# Author: Yuan Li
+import functools
import logging
import time
import sys
import shutil
+from pbcommand.cli import pbparser_runner
+from pbcommand.utils import setup_log
+from pbcore.util.ToolRunner import PBToolRunner
+from pbcore.io import (AlignmentSet, ConsensusAlignmentSet)
+
from pbalign.__init__ import get_version
-from pbalign.options import parseOptions, ALGORITHM_CANDIDATES
+from pbalign.options import (ALGORITHM_CANDIDATES, get_contract_parser,
+ resolved_tool_contract_to_args)
from pbalign.alignservice.blasr import BlasrService
from pbalign.alignservice.bowtie import BowtieService
from pbalign.alignservice.gmap import GMAPService
@@ -54,35 +61,40 @@ from pbalign.utils.fileutil import getFileFormat, FILE_FORMATS, real_ppath
from pbalign.utils.tempfileutil import TempFileManager
from pbalign.pbalignfiles import PBAlignFiles
from pbalign.filterservice import FilterService
-from pbalign.forquiverservice.forquiver import ForQuiverService
-from pbcore.util.Process import backticks
-from pbcore.util.ToolRunner import PBToolRunner
-
+from pbalign.bampostservice import BamPostService
class PBAlignRunner(PBToolRunner):
+
"""Tool runner."""
- def __init__(self, argumentList):
+ def __init__(self, args=None, argumentList=(),
+ output_dataset_type=AlignmentSet):
"""Initialize a PBAlignRunner object.
argumentList is a list of arguments, such as:
['--debug', '--maxHits', '10', 'in.fasta', 'ref.fasta', 'out.sam']
"""
desc = "Utilities for aligning PacBio reads to reference sequences."
+ if args is None: # FIXME unit testing hack
+ args = get_contract_parser().arg_parser.parser.parse_args(argumentList)
+ self.args = args
+ # args.verbosity is computed by counting # of 'v's in '-vv...'.
+ # However in parseOptions, arguments are parsed twice to import config
+ # options and then overwrite them with argumentList (e.g. command-line)
+ # options.
+ #self.args.verbosity = 1 if (self.args.verbosity is None) else \
+ # (int(self.args.verbosity) / 2 + 1)
super(PBAlignRunner, self).__init__(desc)
- self._argumentList = argumentList
+ self._output_dataset_type = output_dataset_type
self._alnService = None
self._filterService = None
self.fileNames = PBAlignFiles()
self._tempFileManager = TempFileManager()
- self.parser, self.args, _infoMsg = parseOptions(
- argumentList=self._argumentList, parser=self.parser)
- # args.verbosity is computed by counting # of 'v's in '-vv...'.
- # However in parseOptions, arguments are parsed twice to import config
- # options and then overwrite them with argumentList (e.g. command-line)
- # options.
- self.args.verbosity = 0 if (self.args.verbosity is None) else \
- int(self.args.verbosity) / 2
+ def _setupParsers(self, description):
+ pass
+
+ def _addStandardArguments(self):
+ pass
def getVersion(self):
"""Return version."""
@@ -132,24 +144,20 @@ class PBAlignRunner(PBToolRunner):
args.readType = "CCS"
if args.forQuiver:
- if args.useccs is not None:
- errMsg = "Options --forQuiver and --useccs should not " + \
- "be used together, since Quiver is not designed to " + \
- "polish ccs reads. if you want to align ccs reads" + \
- "in cmp.h5 format with pulse QVs loaded, use " + \
- "--loadQVs with --useccs instead."
+ logging.warning("Option --forQuiver has been deprecated in 3.0")
+
+ outFormat = getFileFormat(fileNames.outputFileName)
+
+ if outFormat == FILE_FORMATS.CMP:
+ errMsg = "pbalign no longer supports CMP.H5 Output in 3.0."
+ raise IOError(errMsg)
+
+ if outFormat == FILE_FORMATS.BAM or outFormat == FILE_FORMATS.XML:
+ if args.algorithm != "blasr":
+ errMsg = "Must choose blasr in order to output a bam file."
raise ValueError(errMsg)
- args.loadQVs = True
-
- if args.loadQVs:
- if fileNames.pulseFileName is None:
- errMsg = "The input file has to be in bas/pls/ccs.h5 " + \
- "format, or --pulseFile needs to be specified, "
- if getFileFormat(fileNames.outputFileName) != FILE_FORMATS.CMP:
- errMsg = "The output file has to be in cmp.h5 format, "
- if errMsg != "":
- errMsg += "in order to load pulse QVs."
- logging.error(errMsg)
+ if args.filterAdapterOnly:
+ errMsg = "-filterAdapter does not work when out format is BAM."
raise ValueError(errMsg)
def _parseArgs(self):
@@ -159,12 +167,12 @@ class PBAlignRunner(PBToolRunner):
"""
pass
- def _output(self, inSam, refFile, outFile, readType=None, smrtTitle=False):
- """Generate a sam or a cmp.h5 file.
+ def _output(self, inSam, refFile, outFile, readType=None):
+        """Generate a SAM or BAM file.
Input:
- inSam : an input SAM file. (e.g. fileName.filteredSam)
+ inSam : an input SAM/BAM file. (e.g. fileName.filteredSam)
refFile : the reference file. (e.g. fileName.targetFileName)
- outFile : the output SAM or CMP.H5 file.
+ outFile : the output SAM/BAM file
(i.e. fileName.outputFileName)
readType: standard or cDNA or CCS (can be None if not specified)
Output:
@@ -172,34 +180,37 @@ class PBAlignRunner(PBToolRunner):
"""
output, errCode, errMsg = "", 0, ""
- if getFileFormat(outFile) == FILE_FORMATS.SAM:
- #`mv inSam outFile`
+ outFormat = getFileFormat(outFile)
+
+ if outFormat == FILE_FORMATS.BAM:
+ pass # Nothing to be done
+ if outFormat == FILE_FORMATS.SAM:
logging.info("OutputService: Genearte the output SAM file.")
- logging.debug("OutputService: Move {src} as {dst}".format(
- src=inSam, dst=outFile))
+ logging.debug("OutputService: Move %s as %s", inSam, outFile)
try:
shutil.move(real_ppath(inSam), real_ppath(outFile))
except shutil.Error as e:
- output, errCode, errMsg = "", 1, str(e)
- elif getFileFormat(outFile) == FILE_FORMATS.CMP:
- #`samtoh5 inSam outFile -readType readType
- logging.info("OutputService: Genearte the output CMP.H5 " +
- "file using samtoh5.")
- prog = "samtoh5"
- cmd = "samtoh5 {samFile} {refFile} {outFile}".format(
- samFile=inSam, refFile=refFile, outFile=outFile)
- if readType is not None:
- cmd += " -readType {0} ".format(readType)
- if smrtTitle:
- cmd += " -smrtTitle "
- # Execute the command line
- logging.debug("OutputService: Call \"{0}\"".format(cmd))
- output, errCode, errMsg = backticks(cmd)
-
- if errCode != 0:
- errMsg = prog + " returned a non-zero exit status." + errMsg
+ output, errCode, errMsg = "", 1, "Exited with error: " + str(e)
+ logging.error(errMsg)
+ raise RuntimeError(errMsg)
+ elif outFormat == FILE_FORMATS.CMP:
+ errMsg = "pbalign no longer supports CMP.H5 Output in 3.0."
logging.error(errMsg)
- raise RuntimeError(errMsg)
+ raise IOError(errMsg)
+ elif outFormat == FILE_FORMATS.XML:
+ logging.info("OutputService: Generating the output XML file %s %s",
+ inSam, outFile)
+ # Create {out}.xml, given {out}.bam
+ outBam = str(outFile[0:-3]) + "bam"
+ aln = None
+ # FIXME This should really be more automatic
+ if readType == "CCS":
+ self._output_dataset_type = ConsensusAlignmentSet
+ aln = self._output_dataset_type(real_ppath(outBam))
+ for res in aln.externalResources:
+ res.reference = refFile
+ aln.write(outFile)
+
return output, errCode, errMsg
def _cleanUp(self, realDelete=False):
@@ -207,26 +218,13 @@ class PBAlignRunner(PBToolRunner):
logging.debug("Clean up temporary files and directories.")
self._tempFileManager.CleanUp(realDelete)
-# def _setupLogging(self):
-# LOG_FORMAT = "%(asctime)s [%(levelname)s] %(message)s"
-# if self.args.verbosity >= 2:
-# print "logLevel = debug"
-# logLevel = logging.DEBUG
-# elif self.args.verbosity == 1:
-# print "logLevel = info"
-# logLevel = logging.INFO
-# else:
-# print "logLevel = warn"
-# logLevel = logging.WARN
-# logging.basicConfig(level=logLevel, format=LOG_FORMAT)
-
def run(self):
"""
The main function, it is called by PBToolRunner.start().
"""
startTime = time.time()
- logging.info("pbalign version: {version}".format(version=get_version()))
- logging.debug("Original arguments: " + str(self._argumentList))
+ logging.info("pbalign version: %s", get_version())
+ #logging.debug("Original arguments: " + str(self._argumentList))
# Create an AlignService by algorithm name.
self._alnService = self._createAlignService(self.args.algorithm,
@@ -238,69 +236,76 @@ class PBAlignRunner(PBToolRunner):
self._makeSane(self.args, self.fileNames)
# Run align service.
- try:
- self._alnService.run()
- except RuntimeError:
- return 1
+ self._alnService.run()
- # Create a temporary filtered SAM file as output for FilterService.
+ # Create a temporary filtered SAM/BAM file as output for FilterService.
+ outFormat = getFileFormat(self.fileNames.outputFileName)
+ suffix = ".bam" if outFormat in \
+ [FILE_FORMATS.BAM, FILE_FORMATS.XML] else ".sam"
self.fileNames.filteredSam = self._tempFileManager.\
- RegisterNewTmpFile(suffix=".sam")
+ RegisterNewTmpFile(suffix=suffix)
- # Call filter service.
+ # Call filter service on SAM or BAM file.
self._filterService = FilterService(self.fileNames.alignerSamOut,
self.fileNames.targetFileName,
self.fileNames.filteredSam,
- self._alnService.name,
+ self.args.algorithm,
+ #self._alnService.name,
self._alnService.scoreSign,
self.args,
self.fileNames.adapterGffFileName)
- try:
- self._filterService.run()
- except RuntimeError:
- return 1
-
- # Output all hits either in SAM or CMP.H5.
- try:
- useSmrtTitle = False
- if (self.args.algorithm != "blasr" or
- self.fileNames.inputFileFormat == FILE_FORMATS.FASTA):
- useSmrtTitle = True
-
- self._output(
- self.fileNames.filteredSam,
- self.fileNames.targetFileName,
- self.fileNames.outputFileName,
- self.args.readType,
- useSmrtTitle)
- except RuntimeError:
- return 1
-
- # Call post service for quiver.
- if self.args.forQuiver or self.args.loadQVs:
- postService = ForQuiverService(self.fileNames,
- self.args)
- try:
- postService.run()
- except RuntimeError:
- return 1
+ self._filterService.run()
+
+ # Sort bam before output
+ if outFormat in [FILE_FORMATS.BAM, FILE_FORMATS.XML]:
+ # Sort/make index for BAM output.
+ BamPostService(self.fileNames).run()
+
+        # Output all hits in SAM or BAM format.
+ self._output(
+ inSam=self.fileNames.filteredSam,
+ refFile=self.fileNames.targetFileName,
+ outFile=self.fileNames.outputFileName,
+ readType=self.args.readType)
# Delete temporay files anyway to make
self._cleanUp(False if (hasattr(self.args, "keepTmpFiles") and
- self.args.keepTmpFiles is True) else True)
+ self.args.keepTmpFiles is True) else True)
endTime = time.time()
logging.info("Total time: {:.2f} s.".format(float(endTime - startTime)))
return 0
-
-def main():
- pbobj = PBAlignRunner(sys.argv[1:])
- return pbobj.start()
-
-
-if __name__ == "__main__":
- # For testing PBAlignRunner.
+def args_runner(args, output_dataset_type=AlignmentSet):
+ """args runner"""
# PBAlignRunner inherits PBToolRunner. So PBAlignRunner.start() parses args,
# sets up logging and finally returns run().
+ return PBAlignRunner(args, output_dataset_type=output_dataset_type).start()
+
+def _resolved_tool_contract_runner(output_dataset_type,
+ resolved_tool_contract):
+ """
+ Template function for running from a tool contract with an explicitly
+ specified output dataset type.
+ """
+ args = resolved_tool_contract_to_args(resolved_tool_contract)
+ return args_runner(args, output_dataset_type=output_dataset_type)
+
+resolved_tool_contract_runner = functools.partial(
+ _resolved_tool_contract_runner, AlignmentSet)
+resolved_tool_contract_runner_ccs = functools.partial(
+ _resolved_tool_contract_runner, ConsensusAlignmentSet)
+
+def main(argv=sys.argv, get_parser_func=get_contract_parser,
+ contract_runner_func=resolved_tool_contract_runner):
+ """Main, supporting both args runner and tool contract runner."""
+ return pbparser_runner(
+ argv=argv[1:],
+ parser=get_parser_func(),
+ args_runner_func=args_runner,
+ contract_runner_func=contract_runner_func,
+ alog=logging.getLogger(__name__),
+ setup_log_func=setup_log)
+
+if __name__ == "__main__":
sys.exit(main())
diff --git a/pbalign/tasks/__init__.py b/pbalign/tasks/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/pbalign/tasks/consolidate_alignments.py b/pbalign/tasks/consolidate_alignments.py
new file mode 100644
index 0000000..59f74ff
--- /dev/null
+++ b/pbalign/tasks/consolidate_alignments.py
@@ -0,0 +1,96 @@
+
+# FIXME this should probably live somewhere more general, e.g. pbdataset?
+
+"""
+Consolidate AlignmentSet .bam files
+"""
+
+import functools
+import tempfile
+import logging
+import os.path as op
+import os
+import sys
+
+from pbcommand.models import get_pbparser, FileTypes, ResourceTypes
+from pbcommand.cli import pbparser_runner
+from pbcommand.utils import setup_log
+from pbcore.io import openDataSet
+
+
+class Constants(object):
+ TOOL_ID = "pbalign.tasks.consolidate_alignments"
+ VERSION = "0.1.0"
+ DRIVER = "python -m pbalign.tasks.consolidate_alignments --resolved-tool-contract "
+ CONSOLIDATE_ID = "pbalign.task_options.consolidate_aligned_bam"
+ N_FILES_ID = "pbalign.task_options.consolidate_n_files"
+
+
+def get_parser():
+ p = get_pbparser(Constants.TOOL_ID,
+ Constants.VERSION,
+ "AlignmentSet consolidate",
+ __doc__,
+ Constants.DRIVER,
+ is_distributed=True,
+ resource_types=(ResourceTypes.TMP_DIR,))
+
+ p.add_input_file_type(FileTypes.DS_ALIGN, "align_in", "Input AlignmentSet",
+ "Gathered AlignmentSet to consolidate")
+ p.add_output_file_type(FileTypes.DS_ALIGN,
+ "ds_out",
+ "Alignments",
+ description="Alignment results dataset",
+ default_name="combined")
+ p.add_boolean(Constants.CONSOLIDATE_ID, "consolidate",
+ default=False,
+ name="Consolidate .bam",
+ description="Merge chunked/gathered .bam files")
+ p.add_int(Constants.N_FILES_ID, "consolidate_n_files",
+ default=1,
+ name="Number of .bam files",
+ description="Number of .bam files to create in consolidate mode")
+ return p
+
+
+def run_consolidate(dataset_file, output_file, consolidate, n_files):
+ with openDataSet(dataset_file) as ds_in:
+ # XXX shouldn't the file count check be done elsewhere?
+ if consolidate and len(ds_in.toExternalFiles()) != 1:
+ new_resource_file = op.splitext(output_file)[0] + ".bam" # .fasta?
+ ds_in.consolidate(new_resource_file, numFiles=n_files)
+ ds_in.newUuid()
+ ds_in.write(output_file)
+ return 0
+
+
+def args_runner(args):
+ return run_consolidate(
+ dataset_file=args.align_in,
+ output_file=args.ds_out,
+ consolidate=args.consolidate,
+ n_files=args.consolidate_n_files)
+
+
+def rtc_runner(rtc):
+ tempfile.tempdir = rtc.task.tmpdir_resources[0].path
+ return run_consolidate(
+ dataset_file=rtc.task.input_files[0],
+ output_file=rtc.task.output_files[0],
+ consolidate=rtc.task.options[Constants.CONSOLIDATE_ID],
+ n_files=rtc.task.options[Constants.N_FILES_ID])
+
+
+def main(argv=sys.argv):
+ logging.basicConfig(level=logging.DEBUG)
+ log = logging.getLogger()
+ return pbparser_runner(argv[1:],
+ get_parser(),
+ args_runner,
+ rtc_runner,
+ log,
+ setup_log)
+
+
+if __name__ == '__main__':
+ sys.exit(main())
diff --git a/pbalign/tools/createChemistryHeader.py b/pbalign/tools/createChemistryHeader.py
index 434d98c..ee71da7 100755
--- a/pbalign/tools/createChemistryHeader.py
+++ b/pbalign/tools/createChemistryHeader.py
@@ -28,8 +28,8 @@ def format_rgds_entries(rgds_entries):
rgds_strings = {}
for rg_id in rgds_entries:
- rgds_string = ("BINDINGKIT:{b};SEQUENCINGKIT:{s};"
- "SOFTWAREVERSION:{v}"
+ rgds_string = ("BINDINGKIT={b};SEQUENCINGKIT={s};"
+ "SOFTWAREVERSION={v}"
.format(b=rgds_entries[rg_id][0],
s=rgds_entries[rg_id][1],
v=rgds_entries[rg_id][2]))
diff --git a/pbalign/utils/fileutil.py b/pbalign/utils/fileutil.py
index 02afbb4..0ff5245 100755
--- a/pbalign/utils/fileutil.py
+++ b/pbalign/utils/fileutil.py
@@ -39,6 +39,8 @@ import os.path as op
import logging
from xml.etree import ElementTree as ET
from pbcore.util.Process import backticks
+from pbcore.io import DataSet, ReferenceSet
+
def enum(**enums):
"""Simulate enum."""
@@ -48,16 +50,19 @@ FILE_FORMATS = enum(FASTA="FASTA", PLS="PLS_H5", PLX="PLX_H5",
BAS="BAS_H5", BAX="BAX_H5", FOFN="FOFN",
SAM="SAM", CMP="CMP_H5", RGN="RGN_H5",
SA="SA", XML="XML", UNKNOWN="UNKNOWN",
- CCS="CCS_H5")
+ CCS="CCS_H5", BAM="BAM")
VALID_INPUT_FORMATS = (FILE_FORMATS.FASTA, FILE_FORMATS.PLS,
FILE_FORMATS.PLX, FILE_FORMATS.BAS,
FILE_FORMATS.BAX, FILE_FORMATS.FOFN,
- FILE_FORMATS.CCS)
+ FILE_FORMATS.CCS, FILE_FORMATS.BAM,
+ FILE_FORMATS.XML)
VALID_REGIONTABLE_FORMATS = (FILE_FORMATS.RGN, FILE_FORMATS.FOFN)
-VALID_OUTPUT_FORMATS = (FILE_FORMATS.CMP, FILE_FORMATS.SAM)
+VALID_OUTPUT_FORMATS = (FILE_FORMATS.CMP, FILE_FORMATS.SAM,
+ FILE_FORMATS.BAM, FILE_FORMATS.XML)
+
def real_ppath(fn):
"""Return real 'python-style' path of a file.
@@ -99,6 +104,7 @@ def isExist(ff):
_output, errCode, _errMsg = backticks(cmd)
return (errCode == 0)
+
def isValidInputFormat(ff):
"""Return True if ff is a valid input file format."""
return ff in VALID_INPUT_FORMATS
@@ -127,6 +133,8 @@ def getFileFormat(filename):
return FILE_FORMATS.FASTA
elif ext in [".sam"]:
return FILE_FORMATS.SAM
+ elif ext in [".bam"]:
+ return FILE_FORMATS.BAM
elif ext in [".sa"]:
return FILE_FORMATS.SA
elif ext in [".fofn"]:
@@ -173,7 +181,6 @@ def getFileFormatsFromFOFN(fofnname):
fs = getFilesFromFOFN(fofnname)
return [getFileFormat(f) for f in fs]
-
def checkInputFile(filename, validFormats=VALID_INPUT_FORMATS):
"""
Check whether an input file has the valid file format and exists.
@@ -207,6 +214,7 @@ def checkInputFile(filename, validFormats=VALID_INPUT_FORMATS):
raise IOError(errMsg)
else:
fileListRet.append(f)
+
return real_upath(filename)
@@ -238,7 +246,7 @@ def checkOutputFile(filename):
"""
filename = real_ppath(filename)
if not isValidOutputFormat(getFileFormat(filename)):
- errMsg = "The output file format can only be SAM or CMP.H5."
+ errMsg = "The output file format can only be CMP.H5, SAM, BAM or XML."
logging.error(errMsg)
raise ValueError(errMsg)
try:
@@ -252,7 +260,9 @@ def checkOutputFile(filename):
class ReferenceInfo:
+
"""Parse reference.info.xml in reference path."""
+
def __init__(self, fileName):
fileName = real_ppath(fileName)
if getFileFormat(fileName) != FILE_FORMATS.XML:
@@ -325,11 +335,12 @@ class ReferenceInfo:
def checkReferencePath(inRefpath):
"""Validate input reference path.
Check whether the input reference path exists or not.
- Input : can be a FASTA file or a reference repository.
+    Input : can be a FASTA file, an XML file or a reference repository.
Output: [refpath, FASTA_file, None, False, gff], if input is a FASTA file,
and it is not located within a reference repository.
[refpath, FASTA_file, SA_file, True, gff], if input is a FASTA
file, and it is located within a reference repository.
+            [refpath, FASTA_file, None, False, None], if input is an XML
[refpath, FASTA_file, SA_file, True, gff], if input is a reference
repository produced by PacBio referenceUploader.
"""
@@ -348,6 +359,16 @@ def checkReferencePath(inRefpath):
# reference repository
refinfoxml = op.join(op.split(op.dirname(refpath))[0],
"reference.info.xml")
+ elif getFileFormat(refpath) == FILE_FORMATS.XML:
+ fastaFiles = ReferenceSet(refpath).toFofn()
+ if len(fastaFiles) != 1:
+ errMsg = refpath + " must contain exactly one reference"
+ logging.error(errMsg)
+ raise Exception(errMsg)
+ fastaFile = fastaFiles[0][5:] if fastaFiles[0].startswith('file:') \
+ else fastaFiles[0]
+ refinfoxml = op.join(op.split(op.dirname(refpath))[0],
+ "reference.info.xml")
else:
refinfoxml = op.join(refpath, "reference.info.xml")
@@ -387,7 +408,7 @@ def checkReferencePath(inRefpath):
return real_upath(refpath), real_upath(fastaFile), sawriterFile, \
isWithinRepository, adapterGffFile
-#if __name__ == "__main__":
+# if __name__ == "__main__":
# refPath = "/opt/smrtanalysis" + \
# "/common/references/lambda/"
# refpath, faFile, saFile, isWithinRepository = checkReferencePath(refPath)
diff --git a/pbalign/utils/progutil.py b/pbalign/utils/progutil.py
index 3e6f0f7..7ad523b 100755
--- a/pbalign/utils/progutil.py
+++ b/pbalign/utils/progutil.py
@@ -60,7 +60,7 @@ def Execute(name, cmd):
errCode: the error code (zero means normal exit)
errMsg : the error message
"""
- logging.debug(name + ": Call \"{0}\"".format(cmd))
+ logging.info(name + ": Call \"{0}\"".format(cmd))
output, errCode, errMsg = backticks(cmd)
if errCode != 0:
errMsg = name + " returned a non-zero exit status. " + errMsg
diff --git a/setup.py b/setup.py
index 340abed..ffd0d5e 100755
--- a/setup.py
+++ b/setup.py
@@ -5,7 +5,7 @@ import sys
setup(
name = 'pbalign',
- version='0.2.0',
+ version='0.3.0',
author='Pacific Biosciences',
author_email='devnet at pacificbiosciences.com',
license='LICENSE.txt',
@@ -14,8 +14,10 @@ setup(
zip_safe = False,
install_requires=[
'pbcore >= 0.8.5',
- 'pysam'
+ 'pbcommand >= 0.2.0',
+ 'pysam',
],
+ test_requires=['pbtestdata'],
entry_points={'console_scripts': [
'pbalign=pbalign.pbalignrunner:main',
'maskAlignedReads.py = pbalign.tools.mask_aligned_reads:main',
diff --git a/tests/cram/bam.t b/tests/cram/bam.t
new file mode 100644
index 0000000..5d54f4f
--- /dev/null
+++ b/tests/cram/bam.t
@@ -0,0 +1,14 @@
+Set up
+ $ . $TESTDIR/setup.sh
+
+#Test pbalign with bam in bam out
+ $ Q=/pbi/dept/secondary/siv/testdata/pbalign-unittest/data/test_bam/tiny_bam.fofn
+ $ T=/pbi/dept/secondary/siv/references/lambda/sequence/lambda.fasta
+ $ O=tiny_bam.bam
+ $ pbalign $Q $T $O >/dev/null
+
+#Call samtools index to check whether out.bam is sorted or not
+ $ samtools index $O $TMP1.bai && ls $TMP1.bai >/dev/null && echo $?
+ 0
+
+
diff --git a/tests/cram/ccs.t b/tests/cram/ccs.t
new file mode 100644
index 0000000..bba0bed
--- /dev/null
+++ b/tests/cram/ccs.t
@@ -0,0 +1,15 @@
+Set up
+ $ . $TESTDIR/setup.sh
+
+#Test pbalign with bam in bam out
+ $ Q=/pbi/dept/secondary/siv/testdata/pbalign-unittest/data/all4mer-ccs/tiny_ccs.bam
+ $ T=/pbi/dept/secondary/siv/testdata/pbalign-unittest/data/all4mer-ccs/all4mer_v2_30_1.fasta
+ $ O=$OUTDIR/tiny_ccs.bam
+ $ pbalign $Q $T $O >/dev/null
+
+#Call samtools index to check whether out.bam is sorted or not
+ $ samtools index $O $TMP1.bai && ls $TMP1.bai >/dev/null && echo $?
+ 0
+
+ $ samtools view $O |wc -l
+ 8
diff --git a/tests/cram/dataset_tiny.t b/tests/cram/dataset_tiny.t
new file mode 100644
index 0000000..be6f70b
--- /dev/null
+++ b/tests/cram/dataset_tiny.t
@@ -0,0 +1,38 @@
+Set up
+ $ . $TESTDIR/setup.sh
+
+#Test pbalign with dataset in and out
+ $ D=$TESTDATASETS/lambda/2372215/0007_tiny/Analysis_Results/m150404_101626_42267_c100807920800000001823174110291514_s1_p0.all.subreadset.xml
+ $ T=$REFDIR/lambda/reference.dataset.xml
+ $ O=$OUTDIR/tiny_bam.bam
+ $ rm -f $O
+ $ pbalign $D $T $O --algorithmOptions=" --holeNumbers 1-1000,30000-30500,60000-60600,100000-100500" >/dev/null
+
+Try feeding an aligned bam back in...
+ $ RA=$OUTDIR/tiny_bam_realigned.bam
+ $ pbalign $O $T $RA >/dev/null
+
+Call samtools index to check whether out.bam is sorted or not and coverage is sufficient and basic mapped stats
+ $ samtools index $O $TMP1.bai && ls $TMP1.bai >/dev/null && echo $?
+ 0
+
+Sum of depth > 197740
+ $ samtools depth $O | awk '{sum+=$3} END {if (sum >= 197740) print "TRUE"; else print "FALSE"}'
+ TRUE
+
+ $ samtools flagstat $O
+ 248 + 0 in total (QC-passed reads + QC-failed reads)
+ 0 + 0 secondary
+ 0 + 0 supplementary
+ 0 + 0 duplicates
+ 248 + 0 mapped * (glob)
+ 0 + 0 paired in sequencing
+ 0 + 0 read1
+ 0 + 0 read2
+ 0 + 0 properly paired * (glob)
+ 0 + 0 with itself and mate mapped
+ 0 + 0 singletons * (glob)
+ 0 + 0 with mate mapped to a different chr
+ 0 + 0 with mate mapped to a different chr (mapQ>=5)
+
+
diff --git a/tests/cram/filtercriteria_hitpolicy.t b/tests/cram/filtercriteria_hitpolicy.t
new file mode 100644
index 0000000..208f739
--- /dev/null
+++ b/tests/cram/filtercriteria_hitpolicy.t
@@ -0,0 +1,78 @@
+Set up
+ $ . $TESTDIR/setup.sh
+
+##Test --maxDivergence --minAnchorSize --minAccuracy
+# $ Q=$DATDIR/lambda_query.fasta
+# $ T="/pbi/dept/secondary/siv/references/lambda/sequence/lambda.fasta"
+#
+# $ NAME=lambda5
+# $ SAMOUT=$OUTDIR/$NAME.sam
+# $ M4OUT=$OUTDIR/$NAME.m4
+# $ M4STDOUT=$STDSIVDIR/$NAME.m4
+#
+# $ rm -f $SAMOUT $M4OUT
+# $ pbalign --maxDivergence 40 --minAnchorSize 20 --minAccuracy 80 $Q $T $SAMOUT 2>/dev/null
+# $ $SAMTOM4 $SAMOUT $T $TMP1 && sort $TMP1 > $M4OUT && rm $TMP1
+# $ diff $M4OUT $M4STDOUT
+#
+##Test whether pbalign interprets minAccuracy and maxDivergence correctly.
+# $ rm -f $SAMOUT $M4OUT
+# $ pbalign --maxDivergence 0.4 --minAnchorSize 20 --minAccuracy 0.8 $Q $T $SAMOUT 2>/dev/null
+# $ $SAMTOM4 $SAMOUT $T $TMP1 && sort $TMP1 > $M4OUT && rm $TMP1
+# $ diff $M4OUT $M4STDOUT
+#
+##Test --hitPolicy random
+# $ NAME=lambda_hitPolicy_random
+# $ SAMOUT=$OUTDIR/$NAME.sam
+# $ M4OUT=$OUTDIR/$NAME.m4
+# $ M4STDOUT=$STDSIVDIR/$NAME.m4
+#
+# $ rm -f $SAMOUT $M4OUT
+# $ pbalign --hitPolicy random --seed 1 $Q $T $SAMOUT 2>/dev/null
+# $ $SAMTOM4 $SAMOUT $T $TMP1 && sort $TMP1 > $M4OUT && rm $TMP1
+# $ diff $M4OUT $M4STDOUT
+#
+##Test --hitPolicy all
+# $ NAME=lambda_hitPolicy_all
+# $ SAMOUT=$OUTDIR/$NAME.sam
+# $ M4OUT=$OUTDIR/$NAME.m4
+# $ M4STDOUT=$STDSIVDIR/$NAME.m4
+#
+# $ rm -f $SAMOUT $M4OUT
+# $ pbalign --hitPolicy all $Q $T $SAMOUT 2>/dev/null
+# $ $SAMTOM4 $SAMOUT $T $TMP1 && sort $TMP1 > $M4OUT && rm $TMP1
+# $ diff $M4OUT $M4STDOUT
+#
+#
+##Test --hitPolicy randombest
+# $ NAME=lambda_hitPolicy_randombest
+# $ SAMOUT=$OUTDIR/$NAME.sam
+# $ M4OUT=$OUTDIR/$NAME.m4
+# $ M4STDOUT=$STDSIVDIR/$NAME.m4
+#
+# $ rm -f $SAMOUT $M4OUT
+# $ pbalign --hitPolicy randombest --seed 1 $Q $T $SAMOUT 2>/dev/null
+# $ $SAMTOM4 $SAMOUT $T $TMP1 && sort $TMP1 > $M4OUT && rm $TMP1
+# $ diff $M4OUT $M4STDOUT
+
+#Test --hitPolicy allbest
+ $ Q=$DATDIR/example_read.fasta
+ $ T=$DATDIR/example_ref.fasta
+ $ SAMOUT=$OUTDIR/hitPolicy_allbest.sam
+
+ $ rm -f $SAMOUT
+ $ pbalign --hitPolicy allbest $Q $T $SAMOUT >/dev/null
+
+
+
+#Test pbalign with -hitPolicy leftmost
+ $ Q=$DATDIR/test_leftmost_query.fasta
+ $ T=$DATDIR/test_leftmost_target.fasta
+ $ O=$OUTDIR/test_leftmost_out.sam
+ $ pbalign $Q $T $O --hitPolicy leftmost >/dev/null
+ $ echo $?
+ 0
+ $ grep -v '@' $O | cut -f 4
+ 1
+
+
diff --git a/tests/cram/path_with_space.t b/tests/cram/path_with_space.t
new file mode 100644
index 0000000..e2a6a9d
--- /dev/null
+++ b/tests/cram/path_with_space.t
@@ -0,0 +1,9 @@
+Set up
+ $ . $TESTDIR/setup.sh
+
+#Test pbalign with space in file names.
+ $ FA=$DATDIR/dir\ with\ spaces/reads.fasta
+ $ pbalign "$FA" "$FA" $OUTDIR/with_space.sam >/dev/null
+ $ echo $?
+ 0
+
diff --git a/tests/cram/setup.sh b/tests/cram/setup.sh
new file mode 100755
index 0000000..1b3e12c
--- /dev/null
+++ b/tests/cram/setup.sh
@@ -0,0 +1,21 @@
+#!/usr/bin/bash
+
+#Note:
+#Program name has been changed from `pbalign.py` in version 0.1.0
+#to `pbalign` in 0.2.0, pseudo namespace pbtools has been removed also.
+
+CURDIR=$TESTDIR
+DATDIR=$CURDIR/../data
+OUTDIR=$CURDIR/../out
+STDDIR=$CURDIR/../stdout
+SIVDIR=/pbi/dept/secondary/siv/testdata/pbalign-unittest
+DATSIVDIR=$SIVDIR/data/
+STDSIVDIR=$SIVDIR/stdout/
+
+TESTDATASETS=/pbi/dept/secondary/siv/testdata/SA3-DS
+REFDIR=/pbi/dept/secondary/siv/references
+
+TMP1=$$.tmp.out
+TMP2=$$.tmp.stdout
+SAMTOM4=samtom4
+CMPH5TOOLS="/mnt/secondary/Smrtanalysis/smrtcmds/bin/smrtwrap cmph5tools.py "
diff --git a/tests/cram/test_pbalign.t b/tests/cram/test_pbalign.t
deleted file mode 100644
index 9e3b91c..0000000
--- a/tests/cram/test_pbalign.t
+++ /dev/null
@@ -1,276 +0,0 @@
-Note:
-Program name has been changed from `pbalign.py` in version 0.1.0
-to `pbalign` in 0.2.0, pseudo namespace pbtools has been removed also.
-
-Test pbalign
- $ CURDIR=$TESTDIR
- $ DATDIR=$CURDIR/../data
- $ OUTDIR=$CURDIR/../out
- $ STDDIR=$CURDIR/../stdout
-
-#Test pbalign with all combinations of input & output formats
-#input, reference and output formats are: fasta, fasta, and sam/cmp.h5
- $ READFILE=$DATDIR/lambda_query.fasta
- $ REFFILE="/mnt/secondary-siv/references/lambda/sequence/lambda.fasta"
-
- $ SAMOUT=$OUTDIR/lambda.sam
- $ CMPOUT=$OUTDIR/lambda.cmp.h5
-
- $ rm -f $SAMOUT $CMPOUT
- $ pbalign $READFILE $REFFILE $SAMOUT
- $ tail -n+6 $SAMOUT | cut -f 1-11 | sort | md5sum
- ea31763bc847a6c75d3ddb5fb6036489 -
-
- $ pbalign $READFILE $REFFILE $CMPOUT
- $ h5dump -g /ref000001 $CMPOUT > tmpfile
- $ sed -n '2,11p' tmpfile
- GROUP "/ref000001" {
- GROUP "m120619_015854_42161_c100392070070000001523040811021231_s1_p0" {
- DATASET "AlnArray" {
- DATATYPE H5T_STD_U8LE
- DATASPACE SIMPLE { ( 48428 ) / ( H5S_UNLIMITED ) }
- DATA {
- (0): 34, 136, 17, 17, 34, 17, 136, 136, 136, 17, 136, 34, 136, 68,
- (14): 128, 34, 17, 136, 34, 17, 136, 17, 2, 34, 136, 136, 34, 34,
- (28): 68, 17, 68, 34, 17, 136, 136, 136, 17, 136, 136, 1, 17, 68,
- (42): 34, 17, 136, 136, 136, 34, 68, 34, 136, 17, 136, 17, 17, 68,
- $ rm tmpfile
-
-
-
-#input, reference and output formats are: fasta, folder and sam/cmp.h5
- $ READFILE=$DATDIR/lambda_query.fasta
- $ REFFILE=/mnt/secondary-siv/references/lambda/
-
- $ SAMOUT=$OUTDIR/lambda2.sam
- $ CMPOUT=$OUTDIR/lambda2.cmp.h5
-
- $ rm -f $SAMOUT $CMPOUT
- $ pbalign $READFILE $REFFILE $SAMOUT
- $ tail -n+6 $SAMOUT | cut -f 1-11 | sort | md5sum
- ea31763bc847a6c75d3ddb5fb6036489 -
-
- $ pbalign $READFILE $REFFILE $CMPOUT
- $ h5dump -g /ref000001 $CMPOUT > tmpfile
- $ sed -n '2,11p' tmpfile
- GROUP "/ref000001" {
- GROUP "m120619_015854_42161_c100392070070000001523040811021231_s1_p0" {
- DATASET "AlnArray" {
- DATATYPE H5T_STD_U8LE
- DATASPACE SIMPLE { ( 48428 ) / ( H5S_UNLIMITED ) }
- DATA {
- (0): 34, 136, 17, 17, 34, 17, 136, 136, 136, 17, 136, 34, 136, 68,
- (14): 128, 34, 17, 136, 34, 17, 136, 17, 2, 34, 136, 136, 34, 34,
- (28): 68, 17, 68, 34, 17, 136, 136, 136, 17, 136, 136, 1, 17, 68,
- (42): 34, 17, 136, 136, 136, 34, 68, 34, 136, 17, 136, 17, 17, 68,
- $ rm -f tmpfile
-
-
-#input, reference and output formats are: fofn, fasta and sam/cmp.h5
- $ READFILE=$DATDIR/lambda_bax.fofn
- $ REFFILE="/mnt/secondary-siv/references/lambda/sequence/lambda.fasta"
-
- $ SAMOUT=$OUTDIR/lambda3.sam
- $ CMPOUT=$OUTDIR/lambda3.cmp.h5
-
- $ rm -f $SAMOUT $CMPOUT
- $ pbalign $READFILE $REFFILE $SAMOUT
- $ tail -n+6 $SAMOUT | cut -f 1-11 | sort | md5sum
- e5c29fba1efbbfbe164fa2797408dbf6 -
-
- $ pbalign $READFILE $REFFILE $CMPOUT
- $ h5dump -g /ref000001 $CMPOUT > tmpfile
- $ sed -n '2,11p' tmpfile
- GROUP "/ref000001" {
- GROUP "m130220_114643_42129_c100471902550000001823071906131347_s1_p0" {
- DATASET "AlnArray" {
- DATATYPE H5T_STD_U8LE
- DATASPACE SIMPLE { ( 79904 ) / ( H5S_UNLIMITED ) }
- DATA {
- (0): 136, 34, 136, 34, 136, 68, 34, 68, 68, 68, 17, 68, 136, 68,
- (14): 136, 2, 34, 68, 68, 68, 17, 17, 136, 17, 17, 136, 136, 1, 17,
- (29): 17, 17, 34, 68, 17, 136, 68, 2, 17, 34, 17, 34, 17, 68, 68,
- (44): 68, 8, 136, 136, 17, 68, 34, 68, 34, 68, 136, 17, 2, 17, 130,
- $ rm -f tmpfile
-
-
-#input, reference and output formats are: fofn, folder and sam/cmp.h5
- $ READFILE=$DATDIR/lambda_bax.fofn
- $ REFFILE=/mnt/secondary-siv/references/lambda/
-
- $ SAMOUT=$OUTDIR/lambda4.sam
- $ CMPOUT=$OUTDIR/lambda4.cmp.h5
-
- $ rm -f $SAMOUT $CMPOUT
- $ pbalign $READFILE $REFFILE $SAMOUT
- $ tail -n+6 $SAMOUT | cut -f 1-11 | sort | md5sum
- e5c29fba1efbbfbe164fa2797408dbf6 -
-
- $ pbalign $READFILE $REFFILE $CMPOUT
- $ h5dump -g /ref000001 $CMPOUT > tmpfile
- $ sed -n '2,11p' tmpfile
- GROUP "/ref000001" {
- GROUP "m130220_114643_42129_c100471902550000001823071906131347_s1_p0" {
- DATASET "AlnArray" {
- DATATYPE H5T_STD_U8LE
- DATASPACE SIMPLE { ( 79904 ) / ( H5S_UNLIMITED ) }
- DATA {
- (0): 136, 34, 136, 34, 136, 68, 34, 68, 68, 68, 17, 68, 136, 68,
- (14): 136, 2, 34, 68, 68, 68, 17, 17, 136, 17, 17, 136, 136, 1, 17,
- (29): 17, 17, 34, 68, 17, 136, 68, 2, 17, 34, 17, 34, 17, 68, 68,
- (44): 68, 8, 136, 136, 17, 68, 34, 68, 34, 68, 136, 17, 2, 17, 130,
- $ rm tmpfile
-
-
-#Test --maxDivergence --minAnchorSize --minAccuracy
- $ READFILE=$DATDIR/lambda_query.fasta
- $ REFFILE="/mnt/secondary-siv/references/lambda/sequence/lambda.fasta"
-
- $ SAMOUT=$OUTDIR/lambda5.sam
-
- $ rm -f $SAMOUT
- $ pbalign --maxDivergence 40 --minAnchorSize 20 --minAccuracy 80 $READFILE $REFFILE $SAMOUT
- $ tail -n+6 $SAMOUT | cut -f 1-11 | sort | md5sum
- 29f8897b8ee6d4b7fff126d374edb306 -
-
-#Test whether pbalign interprets minAccuracy and maxDivergence correclty.
- $ rm -f $SAMOUT
- $ pbalign --maxDivergence 0.4 --minAnchorSize 20 --minAccuracy 0.8 $READFILE $REFFILE $SAMOUT
- $ tail -n+6 $SAMOUT | cut -f 1-11 | sort | md5sum
- 29f8897b8ee6d4b7fff126d374edb306 -
-
-#Test --hitPolicy random
- $ SAMOUT=$OUTDIR/lambda_hitPolicy_random.sam
-
- $ rm -f $SAMOUT
- $ pbalign --hitPolicy random --seed 1 $READFILE $REFFILE $SAMOUT
- $ tail -n+6 $SAMOUT | cut -f 1-11 | sort | md5sum
- ea31763bc847a6c75d3ddb5fb6036489 -
-
-#Test --hitPolicy all
- $ SAMOUT=$OUTDIR/lambda_hitPolicy_all.sam
-
- $ rm -f $SAMOUT
- $ pbalign --hitPolicy all $READFILE $REFFILE $SAMOUT
- $ tail -n+6 $SAMOUT | cut -f 1-11 | sort | md5sum
- 2022614eb99fe3288c332aadcfefe739 -
-
-
-#Test --hitPolicy randombest
- $ SAMOUT=$OUTDIR/lambda_hitPolicy_randombest.sam
-
- $ rm -f $SAMOUT
- $ pbalign --hitPolicy randombest --seed 1 $READFILE $REFFILE $SAMOUT
- $ tail -n+6 $SAMOUT | cut -f 1-11 | sort | md5sum
- ea31763bc847a6c75d3ddb5fb6036489 -
-
-#Test --scoreFunction
- $ SAMOUT=$OUTDIR/lambda_scoreFunction_editdist.sam
-
- $ rm -f $SAMOUT
- $ pbalign $READFILE $REFFILE $SAMOUT --scoreFunction editdist
- $ tail -n+6 $SAMOUT | cut -f 1-11 | sort | md5sum
- ea31763bc847a6c75d3ddb5fb6036489 -
-
-
-#Test --hitPolicy allbest
- $ READFILE=$DATDIR/example_read.fasta
- $ REFFILE=$DATDIR/example_ref.fasta
- $ SAMOUT=$OUTDIR/hitPolicy_allbest.sam
-
- $ rm -f $SAMOUT
- $ pbalign --hitPolicy allbest $READFILE $REFFILE $SAMOUT
- $ tail -n+8 $SAMOUT | cut -f 1-11 | sort | md5sum
- 6e68a0902f282c25526e14e5516e663b -
-
-#Test --useccs=useccsdenovo, whether attribute /ReadType is 'CCS'
- $ READFILE=$DATDIR/lambda_bax.fofn
- $ REFFILE="/mnt/secondary-siv/references/lambda/sequence/lambda.fasta"
- $ CMPOUT=$OUTDIR/lambda_denovo.cmp.h5
-
- $ rm -f $CMPOUT
- $ pbalign $READFILE $REFFILE $CMPOUT --useccs=useccsdenovo --algorithmOptions=" -holeNumbers 0-100"
- $ h5dump -a /ReadType $CMPOUT | grep "CCS"
- (0): "CCS"
-
-#Test --forQuiver can not be used together with --useccs
- $ pbalign $READFILE $REFFILE $CMPOUT --useccs=useccsdenovo --algorithmOptions=" -holeNumbers 0-100" --forQuiver 1>/dev/null 2>/dev/null || echo 'fail as expected'
- fail as expected
-
-
-#Test whether pbalign can produce sam output for non-PacBio reads
-# $ READFILE=$DATDIR/notSMRT.fasta
-# $ REFFILE="/mnt/secondary-siv/references/lambda/sequence/lambda.fasta"
-# $ SAMOUT=$OUTDIR/notSMRT.sam
-#
-# $ rm -f $SAMOUT $CMPOUT
-# $ pbalign $READFILE $REFFILE $SAMOUT
-
-
-# Test whether (ccs.h5) produces
-# identical results as (bas.h5 and --useccs=useccsdenovo).
- $ READFILE=$DATDIR/test_ccs.fofn
- $ REFFILE=/mnt/secondary-siv/references/ecoli_k12_MG1655/sequence/ecoli_k12_MG1655.fasta
- $ CCS_CMPOUT=$OUTDIR/test_ccs.cmp.h5
-
- $ rm -f $CCS_CMPOUT
- $ pbalign $READFILE $REFFILE $CCS_CMPOUT
-
- $ READFILE=$DATDIR/test_bas.fofn
- $ BAS_CMPOUT=$OUTDIR/test_bas.cmp.h5
-
- $ rm -f $BAS_CMPOUT
- $ pbalign $READFILE $REFFILE $BAS_CMPOUT --useccs=useccsdenovo
-
- $ h5diff $CCS_CMPOUT $BAS_CMPOUT /AlnGroup /AlnGroup
- $ h5diff $CCS_CMPOUT $BAS_CMPOUT /AlnInfo /AlnInfo
- $ h5diff $CCS_CMPOUT $BAS_CMPOUT /MovieInfo /MovieInfo
- $ h5diff $CCS_CMPOUT $BAS_CMPOUT /RefInfo /RefInfo
- $ h5diff $CCS_CMPOUT $BAS_CMPOUT /ref000001 /ref000001
-
-
-#Test pbalign with -filterAdapterOnly
- $ READFILE=$DATDIR/test_filterAdapterOnly.fofn
- $ REFDIR=/mnt/secondary-siv/testdata/BlasrTestData/pbalign/data/references/H1_6_Scal_6x/
- $ OUTPUT=$OUTDIR/test_filterAdapterOnly.sam
- $ rm -f $OUTPUT
- $ pbalign $READFILE $REFDIR $OUTPUT --filterAdapterOnly --algorithmOptions=" -holeNumbers 10817,14760" --seed=1
- $ tail -n+6 $OUTPUT | cut -f 1-4
-
-# Test pbalign with --pulseFile
-# This is an experimental option which goes only with gmap,
-# it enables users to bypass the pls2fasta step and use their own fasta
-# file instead, while keep the ability of generating cmp.h5 files with pulses
-# (i.e., --forQuiver). Eventually, we need to support --algorithm=blasr.
- $ OUTFILE=$OUTDIR/test_pulseFile.cmp.h5
- $ REFPATH=/mnt/secondary-siv/references/Ecoli_K12_DH10B/
- $ REFFILE=/mnt/secondary-siv/references/Ecoli_K12_DH10B/sequence/Ecoli_K12_DH10B.fasta
- $ pbalign $DATDIR/test_pulseFile.fasta $REFPATH $OUTFILE --pulseFile $DATDIR/test_pulseFile.fofn --forQuiver --algorithm gmap --byread
- $ echo $?
- 0
-
- $ OUTFILE=$OUTDIR/test_pulseFile.cmp.h5
- $ REFPATH=/mnt/secondary-siv/references/Ecoli_K12_DH10B/
- $ REFFILE=/mnt/secondary-siv/references/Ecoli_K12_DH10B/sequence/Ecoli_K12_DH10B.fasta
- $ rm -f $OUTFILE
- $ pbalign $DATDIR/test_pulseFile.fasta $REFPATH $OUTFILE --pulseFile $DATDIR/test_pulseFile.fofn --forQuiver --algorithm blasr --byread
- $ echo $?
- 0
-
-#Test pbalign with space in file names.
- $ FA=$DATDIR/dir\ with\ spaces/reads.fasta
- $ pbalign "$FA" "$FA" $OUTDIR/with_space.sam
- $ echo $?
- 0
-
-#Test pbalign with -hitPolicy leftmost
- $ Q=$DATDIR/test_leftmost_query.fasta
- $ T=$DATDIR/test_leftmost_target.fasta
- $ O=$OUTDIR/test_leftmost_out.sam
- $ pbalign $Q $T $O --hitPolicy leftmost
- $ echo $?
- 0
- $ tail -n+6 $O | cut -f 4
- 1
-
-
diff --git a/tests/cram/xml.t b/tests/cram/xml.t
new file mode 100644
index 0000000..8839141
--- /dev/null
+++ b/tests/cram/xml.t
@@ -0,0 +1,38 @@
+Set up
+ $ . $TESTDIR/setup.sh
+
+#Test pbalign with xml in bam out
+ $ Q=$DATDIR/subreads_dataset1.xml
+ $ T=$DATDIR/reference_lambda.xml
+ $ O=$OUTDIR/xml_in_bam_out.bam
+ $ rm -f $O
+ $ pbalign $Q $T $O --algorithmOptions=" --holeNumbers 1-2000" >/dev/null 2>/dev/null
+ $ echo $?
+ 0
+
+#Test pbalign with xml in xml out
+ $ Q=$DATDIR/subreads_dataset1.xml
+ $ T=$DATDIR/reference_lambda.xml
+ $ O=$OUTDIR/xml_in_xml_out.xml
+ $ rm -f $O
+ $ pbalign $Q $T $O --algorithmOptions=" --holeNumbers 1-2000" >/dev/null 2>/dev/null
+ $ echo $?
+ 0
+
+ $ ls $OUTDIR/xml_in_xml_out.bam.bai && echo $? # bam index exists
+ *.bai (glob)
+ 0
+
+ $ ls $OUTDIR/xml_in_xml_out.bam.pbi && echo $? # pbi index exists
+ *.pbi (glob)
+ 0
+
+#Test pbalign with up-to-date xml inputs
+ $ Q=$DATDIR/subreads_dataset2.xml
+ $ T=$DATSIVDIR/ecoli.fasta
+ $ O=$OUTDIR/xml_in_bam_out_2.bam
+ $ rm -f $O
+ $ pbalign $Q $T $O --algorithmOptions=" --holeNumbers 1-2000" >/dev/null 2>/dev/null
+ $ echo $?
+ 0
+
diff --git a/tests/cram_h5/ccs.t b/tests/cram_h5/ccs.t
new file mode 100644
index 0000000..7665c7a
--- /dev/null
+++ b/tests/cram_h5/ccs.t
@@ -0,0 +1,51 @@
+Set up
+ $ . $TESTDIR/setup.sh
+
+#Test --useccs=useccsdenovo, whether attribute /ReadType is 'CCS'
+ $ Q=$DATDIR/lambda_bax.fofn
+ $ T="/pbi/dept/secondary/siv/references/lambda/sequence/lambda.fasta"
+ $ CMPOUT=$OUTDIR/lambda_denovo.cmp.h5
+
+ $ rm -f $CMPOUT
+ $ pbalign $Q $T $CMPOUT --useccs=useccsdenovo --algorithmOptions=" -holeNumbers 0-100" 2>/dev/null
+ $ h5dump -a /ReadType $CMPOUT | grep "CCS"
+ (0): "CCS"
+
+#Test --forQuiver can not be used together with --useccs
+ $ pbalign $Q $T $CMPOUT --useccs=useccsdenovo --algorithmOptions=" -holeNumbers 0-100" --forQuiver 1>/dev/null 2>/dev/null || echo 'fail as expected'
+ fail as expected
+
+
+#Test whether pbalign can produce sam output for non-PacBio reads
+# $ Q=$DATDIR/notSMRT.fasta
+# $ T="/pbi/dept/secondary/siv/references/lambda/sequence/lambda.fasta"
+# $ SAMOUT=$OUTDIR/notSMRT.sam
+#
+# $ rm -f $SAMOUT $CMPOUT
+# $ pbalign $Q $T $SAMOUT 2>/dev/null
+
+
+# Test whether (ccs.h5) produces
+# identical results as (bas.h5 and --useccs=useccsdenovo).
+ $ Q=$DATDIR/test_ccs.fofn
+ $ T=/pbi/dept/secondary/siv/references/ecoli_k12_MG1655/sequence/ecoli_k12_MG1655.fasta
+ $ CCS_CMPOUT=$OUTDIR/test_ccs.cmp.h5
+
+ $ rm -f $CCS_CMPOUT
+ $ pbalign $Q $T $CCS_CMPOUT 2>/dev/null
+
+ $ Q=$DATDIR/test_bas.fofn
+ $ BAS_CMPOUT=$OUTDIR/test_bas.cmp.h5
+
+ $ rm -f $BAS_CMPOUT
+ $ pbalign $Q $T $BAS_CMPOUT --useccs=useccsdenovo 2>/dev/null
+ $ /mnt/secondary/Smrtanalysis/smrtcmds/bin/smrtwrap cmph5tools.py sort $BAS_CMPOUT --deep --inPlace
+ $ /mnt/secondary/Smrtanalysis/smrtcmds/bin/smrtwrap cmph5tools.py sort $CCS_CMPOUT --deep --inPlace
+
+ $ h5diff $CCS_CMPOUT $BAS_CMPOUT /AlnGroup /AlnGroup
+ $ h5diff $CCS_CMPOUT $BAS_CMPOUT /AlnInfo /AlnInfo
+ $ h5diff $CCS_CMPOUT $BAS_CMPOUT /MovieInfo /MovieInfo
+ $ h5diff $CCS_CMPOUT $BAS_CMPOUT /RefInfo /RefInfo
+ $ h5diff $CCS_CMPOUT $BAS_CMPOUT /ref000001 /ref000001
+
+
diff --git a/tests/cram_h5/filter_adapter_only.t b/tests/cram_h5/filter_adapter_only.t
new file mode 100644
index 0000000..7e87cf6
--- /dev/null
+++ b/tests/cram_h5/filter_adapter_only.t
@@ -0,0 +1,11 @@
+Set up
+ $ . $TESTDIR/setup.sh
+
+#Test pbalign with -filterAdapterOnly
+ $ Q=$DATDIR/test_filterAdapterOnly.fofn
+ $ T=/pbi/dept/secondary/siv/testdata/pbalign-unittest/data/references/H1_6_Scal_6x/
+ $ O=$OUTDIR/test_filterAdapterOnly.sam
+ $ rm -f $O
+ $ pbalign $Q $T $O --filterAdapterOnly --algorithmOptions=" -holeNumbers 10817,14760" --seed=1 2>/dev/null
+ $ grep -v '@' $O | cut -f 1-4
+
diff --git a/tests/cram_h5/inputs_outputs.t b/tests/cram_h5/inputs_outputs.t
new file mode 100644
index 0000000..a2eb155
--- /dev/null
+++ b/tests/cram_h5/inputs_outputs.t
@@ -0,0 +1,130 @@
+Set up
+ $ . $TESTDIR/setup.sh
+
+#Test pbalign with all combinations of input & output formats
+#input, reference and output formats are: fasta, fasta, and sam/cmp.h5
+ $ Q=$DATDIR/lambda_query.fasta
+ $ T="/pbi/dept/secondary/siv/references/lambda/sequence/lambda.fasta"
+
+ $ NAME=lambda
+ $ SAMOUT=$OUTDIR/$NAME.sam
+ $ CMPOUT=$OUTDIR/$NAME.cmp.h5
+ $ M4OUT=$OUTDIR/$NAME.m4
+ $ M4STDOUT=$STDSIVDIR/$NAME.m4
+
+ $ rm -f $SAMOUT $CMPOUT $M4OUT
+ $ pbalign $Q $T $SAMOUT 2>/dev/null
+ $ $SAMTOM4 $SAMOUT $T $TMP1 && sort $TMP1 > $M4OUT && rm $TMP1
+ $ diff $M4OUT $M4STDOUT
+
+ $ pbalign $Q $T $CMPOUT 2>/dev/null
+ $ $CMPH5TOOLS sort $CMPOUT --deep --inPlace
+ $ h5dump -g /ref000001 $CMPOUT > tmpfile
+ $ sed -n '2,11p' tmpfile
+ GROUP "/ref000001" {
+ GROUP "rg1-0" {
+ DATASET "AlnArray" {
+ DATATYPE H5T_STD_U8LE
+ DATASPACE SIMPLE { ( 48428 ) / ( H5S_UNLIMITED ) }
+ DATA {
+ (0): 34, 136, 68, 128, 136, 136, 136, 34, 136, 34, 34, 17, 68, 34,
+ (14): 68, 34, 17, 68, 34, 17, 34, 34, 68, 136, 16, 17, 17, 136, 136,
+ (29): 17, 34, 136, 68, 32, 136, 68, 17, 68, 34, 34, 17, 136, 34, 17,
+ (44): 136, 68, 17, 34, 68, 34, 34, 68, 17, 136, 68, 68, 17, 68, 34,
+ $ rm tmpfile
+
+#input, reference and output formats are: fasta, folder and sam/cmp.h5
+ $ Q=$DATDIR/lambda_query.fasta
+ $ T=/pbi/dept/secondary/siv/references/lambda/
+
+ $ NAME=lambda2
+ $ SAMOUT=$OUTDIR/$NAME.sam
+ $ CMPOUT=$OUTDIR/$NAME.cmp.h5
+ $ M4OUT=$OUTDIR/$NAME.m4
+ $ M4STDOUT=$STDSIVDIR/$NAME.m4
+
+ $ rm -f $SAMOUT $CMPOUT $M4OUT
+ $ pbalign $Q $T $SAMOUT 2>/dev/null
+ $ $SAMTOM4 $SAMOUT $T/sequence/lambda.fasta $TMP1 && sort $TMP1 > $M4OUT && rm $TMP1
+ $ diff $M4OUT $M4STDOUT
+
+ $ pbalign $Q $T $CMPOUT 2>/dev/null
+ $ $CMPH5TOOLS sort $CMPOUT --deep --inPlace
+ $ h5dump -g /ref000001 $CMPOUT > tmpfile
+ $ sed -n '2,11p' tmpfile
+ GROUP "/ref000001" {
+ GROUP "rg1-0" {
+ DATASET "AlnArray" {
+ DATATYPE H5T_STD_U8LE
+ DATASPACE SIMPLE { ( 48428 ) / ( H5S_UNLIMITED ) }
+ DATA {
+ (0): 34, 136, 68, 128, 136, 136, 136, 34, 136, 34, 34, 17, 68, 34,
+ (14): 68, 34, 17, 68, 34, 17, 34, 34, 68, 136, 16, 17, 17, 136, 136,
+ (29): 17, 34, 136, 68, 32, 136, 68, 17, 68, 34, 34, 17, 136, 34, 17,
+ (44): 136, 68, 17, 34, 68, 34, 34, 68, 17, 136, 68, 68, 17, 68, 34,
+ $ rm tmpfile
+
+
+#input, reference and output formats are: fofn, fasta and sam/cmp.h5
+ $ Q=$DATDIR/lambda_bax.fofn
+ $ T="/pbi/dept/secondary/siv/references/lambda/sequence/lambda.fasta"
+
+ $ NAME=lambda3
+ $ SAMOUT=$OUTDIR/$NAME.sam
+ $ CMPOUT=$OUTDIR/$NAME.cmp.h5
+ $ M4OUT=$OUTDIR/$NAME.m4
+ $ M4STDOUT=$STDSIVDIR/$NAME.m4
+
+ $ rm -f $SAMOUT $CMPOUT $M4OUT
+ $ pbalign $Q $T $SAMOUT 2>/dev/null
+ $ $SAMTOM4 $SAMOUT $T $TMP1 && sort $TMP1 > $M4OUT && rm $TMP1
+ $ diff $M4OUT $M4STDOUT
+
+ $ pbalign $Q $T $CMPOUT 2>/dev/null
+ $ $CMPH5TOOLS sort $CMPOUT --deep --inPlace
+ $ h5dump -g /ref000001 $CMPOUT > tmpfile
+ $ sed -n '2,11p' tmpfile
+ GROUP "/ref000001" {
+ GROUP "rg1-0" {
+ DATASET "AlnArray" {
+ DATATYPE H5T_STD_U8LE
+ DATASPACE SIMPLE { ( 79904 ) / ( H5S_UNLIMITED ) }
+ DATA {
+ (0): 68, 68, 68, 34, 68, 68, 34, 68, 17, 34, 34, 136, 32, 34, 68,
+ (15): 34, 68, 68, 68, 136, 136, 136, 136, 34, 68, 34, 128, 136, 17,
+ (29): 136, 136, 136, 17, 136, 68, 17, 17, 17, 17, 128, 136, 136,
+ (42): 136, 136, 34, 34, 68, 68, 128, 136, 136, 136, 16, 17, 17, 64,
+
+ $ rm tmpfile
+
+#input, reference and output formats are: fofn, folder and sam/cmp.h5
+ $ Q=$DATDIR/lambda_bax.fofn
+ $ T=/pbi/dept/secondary/siv/references/lambda/
+
+ $ NAME=lambda4
+ $ SAMOUT=$OUTDIR/$NAME.sam
+ $ CMPOUT=$OUTDIR/$NAME.cmp.h5
+ $ M4OUT=$OUTDIR/$NAME.m4
+ $ M4STDOUT=$STDSIVDIR/$NAME.m4
+
+ $ rm -f $SAMOUT $CMPOUT $M4OUT
+ $ pbalign $Q $T $SAMOUT 2>/dev/null
+ $ $SAMTOM4 $SAMOUT $T/sequence/lambda.fasta $TMP1 && sort $TMP1 > $M4OUT && rm $TMP1
+ $ diff $M4OUT $M4STDOUT
+
+ $ pbalign $Q $T $CMPOUT 2>/dev/null
+ $ $CMPH5TOOLS sort $CMPOUT --deep --inPlace
+ $ h5dump -g /ref000001 $CMPOUT > tmpfile
+ $ sed -n '2,11p' tmpfile
+ GROUP "/ref000001" {
+ GROUP "rg1-0" {
+ DATASET "AlnArray" {
+ DATATYPE H5T_STD_U8LE
+ DATASPACE SIMPLE { ( 79904 ) / ( H5S_UNLIMITED ) }
+ DATA {
+ (0): 68, 68, 68, 34, 68, 68, 34, 68, 17, 34, 34, 136, 32, 34, 68,
+ (15): 34, 68, 68, 68, 136, 136, 136, 136, 34, 68, 34, 128, 136, 17,
+ (29): 136, 136, 136, 17, 136, 68, 17, 17, 17, 17, 128, 136, 136,
+ (42): 136, 136, 34, 34, 68, 68, 128, 136, 136, 136, 16, 17, 17, 64,
+
+ $ rm tmpfile
diff --git a/tests/cram_h5/pulsefile.t b/tests/cram_h5/pulsefile.t
new file mode 100644
index 0000000..976fcb7
--- /dev/null
+++ b/tests/cram_h5/pulsefile.t
@@ -0,0 +1,23 @@
+Set up
+ $ . $TESTDIR/setup.sh
+
+# Test pbalign with --pulseFile
+# This is an experimental option which goes only with gmap,
+# it enables users to bypass the pls2fasta step and use their own fasta
+# file instead, while keeping the ability to generate cmp.h5 files with pulses
+# (i.e., --forQuiver).
+ $ O=$OUTDIR/test_pulseFile.cmp.h5
+ $ T=/pbi/dept/secondary/siv/references/Ecoli_K12_DH10B/
+ $ T=/pbi/dept/secondary/siv/references/Ecoli_K12_DH10B/sequence/Ecoli_K12_DH10B.fasta
+ $ pbalign $DATDIR/test_pulseFile.fasta $T $O --pulseFile $DATDIR/test_pulseFile.fofn --forQuiver --algorithm gmap --byread 2>/dev/null
+ $ echo $?
+ 0
+
+ $ O=$OUTDIR/test_pulseFile.cmp.h5
+ $ T=/pbi/dept/secondary/siv/references/Ecoli_K12_DH10B/
+ $ T=/pbi/dept/secondary/siv/references/Ecoli_K12_DH10B/sequence/Ecoli_K12_DH10B.fasta
+ $ rm -f $O
+ $ pbalign $DATDIR/test_pulseFile.fasta $T $O --pulseFile $DATDIR/test_pulseFile.fofn --forQuiver --algorithm blasr --byread 2>/dev/null
+ $ echo $?
+ 0
+
diff --git a/tests/cram_h5/setup.sh b/tests/cram_h5/setup.sh
new file mode 100755
index 0000000..391b51f
--- /dev/null
+++ b/tests/cram_h5/setup.sh
@@ -0,0 +1,22 @@
+#!/usr/bin/bash
+
+#Note:
+#Program name has been changed from `pbalign.py` in version 0.1.0
+#to `pbalign` in 0.2.0, pseudo namespace pbtools has been removed also.
+
+CURDIR=$TESTDIR
+DATDIR=$CURDIR/../data
+OUTDIR=$CURDIR/../out
+STDDIR=$CURDIR/../stdout
+SIVDIR=/pbi/dept/secondary/siv/testdata/pbalign-unittest
+DATSIVDIR=$SIVDIR/data/
+STDSIVDIR=$SIVDIR/stdout/
+
+TESTDATASETS=/pbi/dept/secondary/siv/testdata/SA3-RS
+REFDIR=/pbi/dept/secondary/siv/references
+
+TMP1=$$.tmp.out
+TMP2=$$.tmp.stdout
+SAMTOM4=samtom4
+CMPH5TOOLS="/mnt/secondary/Smrtanalysis/smrtcmds/bin/smrtwrap cmph5tools.py "
+
diff --git a/tests/cram/test_mask_aligned_reads.t b/tests/cram_h5/test_mask_aligned_reads.t
similarity index 96%
rename from tests/cram/test_mask_aligned_reads.t
rename to tests/cram_h5/test_mask_aligned_reads.t
index 552a7bc..7311758 100644
--- a/tests/cram/test_mask_aligned_reads.t
+++ b/tests/cram_h5/test_mask_aligned_reads.t
@@ -1,6 +1,6 @@
Test mask_aligned_reads.py
$ CURDIR=$TESTDIR
- $ DATDIR=/mnt/secondary-siv/testdata/BlasrTestData/pbalign/data/test_mask_aligned_reads
+ $ DATDIR=/pbi/dept/secondary/siv/testdata/pbalign-unittest/data/test_mask_aligned_reads
$ OUTDIR=$CURDIR/../out
$ STDDIR=$DATDIR/../../stdout
diff --git a/tests/data/1.config b/tests/data/1.config
index 8d43a60..ab02ab6 100644
--- a/tests/data/1.config
+++ b/tests/data/1.config
@@ -14,10 +14,9 @@
--maxHits = 20
--minAnchorSize = 1
--minLength = 100
---algorithmOptions = "-noSplitSubreads -maxMatch 30 -nCandidates 30"
+--algorithmOptions = "--noSplitSubreads --maxMatch 30 --nCandidates 30"
# SamFilter's filtering criteria and hit policy.
---scoreFunction = blasr
--hitPolicy = randombest
--maxDivergence = 40
diff --git a/tests/data/ecoli_lp.fofn b/tests/data/ecoli_lp.fofn
index 06c29ac..cb6e86b 100644
--- a/tests/data/ecoli_lp.fofn
+++ b/tests/data/ecoli_lp.fofn
@@ -1,2 +1,2 @@
-/home/UNIXHOME/yli/yliWorkspace/private/yli/data/testLoadPulses/m121215_065521_richard_c100425710150000001823055001121371_s1_p0.pls.h5
-/home/UNIXHOME/yli/yliWorkspace/private/yli/data/testLoadPulses/m121215_065521_richard_c100425710150000001823055001121371_s2_p0.pls.h5
+/pbi/dept/secondary/siv/testdata/pbalign-unittest/data/testLoadPulses/Analysis_Results/m121215_065521_richard_c100425710150000001823055001121371_s1_p0.pls.h5
+/pbi/dept/secondary/siv/testdata/pbalign-unittest/data/testLoadPulses/Analysis_Results/m121215_065521_richard_c100425710150000001823055001121371_s2_p0.pls.h5
diff --git a/tests/data/lambda_bax.fofn b/tests/data/lambda_bax.fofn
index 80b3bd2..22aecb6 100644
--- a/tests/data/lambda_bax.fofn
+++ b/tests/data/lambda_bax.fofn
@@ -1 +1 @@
-/home/UNIXHOME/yli/yliWorkspace/private/yli/data/testLoadPulses/m130302_011223_pd1_c000000092559900001500000112311511_s1_p0.2.bax.h5
+/pbi/dept/secondary/siv/testdata/pbalign-unittest/data/testLoadPulses/Analysis_Results/m130302_011223_pd1_c000000092559900001500000112311511_s1_p0.2.bax.h5
diff --git a/tests/data/reference_lambda.xml b/tests/data/reference_lambda.xml
new file mode 100644
index 0000000..288a913
--- /dev/null
+++ b/tests/data/reference_lambda.xml
@@ -0,0 +1,14 @@
+<?xml version="1.0" ?>
+<ReferenceSet CreatedAt="2015-07-16T10:48:02" MetaType="PacBio.DataSet.ReferenceSet" Name="" Tags="" UniqueId="1078368a-d5c5-433b-82b9-38ae124259c5" Version="2.3.0" xmlns="http://pacificbiosciences.com/PacBioDataModel.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://pacificbiosciences.com/PacBioDataModel.xsd">
+ <ExternalResources>
+ <ExternalResource MetaType="PacBio.ReferenceFile.ReferenceFastaFile" ResourceId="/pbi/dept/secondary/siv/references/lambdaNEB/sequence/lambdaNEB.fasta">
+ <FileIndices>
+ <FileIndex MetaType="PacBio.Index.SamIndex" ResourceId="/pbi/dept/secondary/siv/references/lambdaNEB/sequence/lambdaNEB.fasta.fai"/>
+ </FileIndices>
+ </ExternalResource>
+ </ExternalResources>
+ <DataSetMetadata>
+ <NumRecords>1</NumRecords>
+ <TotalLength>48502</TotalLength>
+ </DataSetMetadata>
+</ReferenceSet>
diff --git a/tests/data/subreads_dataset1.xml b/tests/data/subreads_dataset1.xml
new file mode 100644
index 0000000..82a9b6c
--- /dev/null
+++ b/tests/data/subreads_dataset1.xml
@@ -0,0 +1,21 @@
+<?xml version="1.0" ?>
+<AlignmentSet CreatedAt="2015-07-16T11:35:50" MetaType="PacBio.DataSet.AlignmentSet" Name="" Tags="" UniqueId="2e8eaedb-0928-2310-bffd-6d3b36062e53" Version="2.3.0" xmlns="http://pacificbiosciences.com/PacBioDataModel.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://pacificbiosciences.com/PacBioDataModel.xsd">
+ <ExternalResources>
+ <ExternalResource MetaType="PacBio.SubreadFile.SubreadBamFile" ResourceId="/pbi/dept/secondary/siv/testdata/BlasrTestData/ctest/data/test_bam/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.1.subreads.bam">
+ <FileIndices>
+ <FileIndex MetaType="PacBio.Index.PacBioIndex" ResourceId="/pbi/dept/secondary/siv/testdata/BlasrTestData/ctest/data/test_bam/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.1.subreads.bam.pbi"/>
+ <FileIndex MetaType="PacBio.Index.BamIndex" ResourceId="/pbi/dept/secondary/siv/testdata/BlasrTestData/ctest/data/test_bam/m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.1.subreads.bam.bai"/>
+ </FileIndices>
+ </ExternalResource>
+ <ExternalResource MetaType="PacBio.SubreadFile.SubreadBamFile" ResourceId="/pbi/dept/secondary/siv/testdata/BlasrTestData/ctest/data/test_bam/m130406_011850_42141_c100513442550000001823074308221310_s1_p0.1.subreads.bam">
+ <FileIndices>
+ <FileIndex MetaType="PacBio.Index.PacBioIndex" ResourceId="/pbi/dept/secondary/siv/testdata/BlasrTestData/ctest/data/test_bam/m130406_011850_42141_c100513442550000001823074308221310_s1_p0.1.subreads.bam.pbi"/>
+ <FileIndex MetaType="PacBio.Index.BamIndex" ResourceId="/pbi/dept/secondary/siv/testdata/BlasrTestData/ctest/data/test_bam/m130406_011850_42141_c100513442550000001823074308221310_s1_p0.1.subreads.bam.bai"/>
+ </FileIndices>
+ </ExternalResource>
+ </ExternalResources>
+ <DataSetMetadata>
+ <NumRecords>-1</NumRecords>
+ <TotalLength>-1</TotalLength>
+ </DataSetMetadata>
+</AlignmentSet>
diff --git a/tests/data/subreads_dataset2.xml b/tests/data/subreads_dataset2.xml
new file mode 100644
index 0000000..931dbef
--- /dev/null
+++ b/tests/data/subreads_dataset2.xml
@@ -0,0 +1,10 @@
+<?xml version="1.0" ?>
+<SubreadSet CreatedAt="2015-07-16T11:39:29" MetaType="PacBio.DataSet.SubreadSet" Name="" Tags="" UniqueId="a263117b-9f93-6ae5-9c11-7ac144df4ae5" Version="2.3.0" xmlns="http://pacificbiosciences.com/PacBioDataModel.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://pacificbiosciences.com/PacBioDataModel.xsd">
+ <ExternalResources>
+ <ExternalResource MetaType="PacBio.SubreadFile.SubreadBamFile" ResourceId="/pbi/dept/secondary/siv/testdata/pbalign-unittest/data/test_bam/m130406_011850_42141_c100513442550000001823074308221310_s1_p0.1.subreads.bam"/>
+ </ExternalResources>
+ <DataSetMetadata>
+ <NumRecords>-1</NumRecords>
+ <TotalLength>-1</TotalLength>
+ </DataSetMetadata>
+</SubreadSet>
diff --git a/tests/data/test_bas.fofn b/tests/data/test_bas.fofn
index dd962ae..99b60ff 100644
--- a/tests/data/test_bas.fofn
+++ b/tests/data/test_bas.fofn
@@ -1 +1 @@
-/mnt/secondary-siv/testdata/BlasrTestData/pbalign/data/ccs_P4-C2/m130328_211423_ethan_c100499512550000001823070408081371_s1_p0.bas.h5
+/pbi/dept/secondary/siv/testdata/pbalign-unittest/data/ccs_P4-C2/m130328_211423_ethan_c100499512550000001823070408081371_s1_p0.bas.h5
diff --git a/tests/data/test_ccs.fofn b/tests/data/test_ccs.fofn
index 1352ec4..a744657 100644
--- a/tests/data/test_ccs.fofn
+++ b/tests/data/test_ccs.fofn
@@ -1 +1 @@
-/mnt/secondary-siv/testdata/BlasrTestData/pbalign/data/new.ccs.h5
+/pbi/dept/secondary/siv/testdata/pbalign-unittest/data/new.ccs.h5
diff --git a/tests/data/test_filterAdapterOnly.fofn b/tests/data/test_filterAdapterOnly.fofn
index 62697c8..826ce4e 100755
--- a/tests/data/test_filterAdapterOnly.fofn
+++ b/tests/data/test_filterAdapterOnly.fofn
@@ -1 +1 @@
-/mnt/data3/vol57/2820011/0006/Analysis_Results/m130302_124313_42130_c100502672550000001523078308081365_s1_p0.bas.h5
+/pbi/dept/secondary/siv/testdata/LIMS/2820011/0006/Analysis_Results/m130302_124313_42130_c100502672550000001523078308081365_s1_p0.bas.h5
diff --git a/tests/data/test_leftmost_query.fasta b/tests/data/test_leftmost_query.fasta
index 520bad1..e507cbd 100644
--- a/tests/data/test_leftmost_query.fasta
+++ b/tests/data/test_leftmost_query.fasta
@@ -1,4 +1,4 @@
->ref000001|gi|49175990|ref|NC_000913.2| Escherichia coli str. K-12 substr. MG1655 chromosome, complete genome
+>fakemovie/0/0_2940
AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTC
TGATAGCAGCTTCTGAACTGGTTACCTGCCGTGAGTAAATTAAAATTTTATTGACTTAGG
TCACTAAATACTTTAACCAATATAGGCATAGCGCACAGACAGATAAAAATTACAGAGTAC
diff --git a/tests/data/test_pulseFile.fofn b/tests/data/test_pulseFile.fofn
index 7f4e3da..4adee68 100644
--- a/tests/data/test_pulseFile.fofn
+++ b/tests/data/test_pulseFile.fofn
@@ -1,4 +1,4 @@
-/mnt/data3/vol57/2650250/0001/Analysis_Results/m121004_000921_42130_c100440700060000001523060402151341_s1_p0.bas.h5
-/mnt/data3/vol53/2450496/0001/Analysis_Results/m130406_011850_42141_c100513442550000001823074308221310_s1_p0.1.bax.h5
-/mnt/data3/vol53/2450496/0001/Analysis_Results/m130406_011850_42141_c100513442550000001823074308221310_s1_p0.2.bax.h5
-/mnt/data3/vol53/2450496/0001/Analysis_Results/m130406_011850_42141_c100513442550000001823074308221310_s1_p0.3.bax.h5
+/pbi/dept/secondary/siv/testdata/LIMS/2650250/0001/Analysis_Results/m121004_000921_42130_c100440700060000001523060402151341_s1_p0.bas.h5
+/pbi/dept/secondary/siv/testdata/LIMS/2450496/0001/Analysis_Results/m130406_011850_42141_c100513442550000001823074308221310_s1_p0.1.bax.h5
+/pbi/dept/secondary/siv/testdata/LIMS/2450496/0001/Analysis_Results/m130406_011850_42141_c100513442550000001823074308221310_s1_p0.2.bax.h5
+/pbi/dept/secondary/siv/testdata/LIMS/2450496/0001/Analysis_Results/m130406_011850_42141_c100513442550000001823074308221310_s1_p0.3.bax.h5
diff --git a/tests/out/readme b/tests/out/readme
index 8606a09..26ecd90 100644
--- a/tests/out/readme
+++ b/tests/out/readme
@@ -1,2 +1 @@
-# This directory saves output files generated by pbalign's
-# unit tests and cram tests.
+FIXME this directory needs to be retired
diff --git a/tests/unit/nose.cfg b/tests/unit/nose.cfg
new file mode 100644
index 0000000..5ed666c
--- /dev/null
+++ b/tests/unit/nose.cfg
@@ -0,0 +1,17 @@
+# pbalign unittest config, copied from pbreports.
+
+[DEFAULT]
+
+[log]
+level=debug
+file=pbalign-unittest.log
+
+[data]
+dataDir=/pbi/dept/secondary/siv/testdata/pbalign-unittest/data/
+stdDir=/pbi/dept/secondary/siv/testdata/pbalign-unittest/stdout/
+
+#This is for generating documentation of report attribute ids
+[doc]
+jinja-templates=doc/report-metrics
+html-dir=doc/_build/html
+
diff --git a/tests/unit/test_fileutil.py b/tests/unit/test_fileutil.py
index d6d89ff..cee5432 100755
--- a/tests/unit/test_fileutil.py
+++ b/tests/unit/test_fileutil.py
@@ -1,18 +1,31 @@
"""Test pbalign.util/fileutil.py"""
+import tempfile
import unittest
+import filecmp
+import shutil
from os import path
+
+from pbcore.io import DataSet
+
from pbalign.utils.fileutil import getFileFormat, \
isValidInputFormat, isValidOutputFormat, getFilesFromFOFN, \
checkInputFile, checkOutputFile, checkReferencePath, \
real_upath, real_ppath, isExist
-import filecmp
+
+from test_setpath import ROOT_DIR, DATA_DIR
class Test_fileutil(unittest.TestCase):
"""Test pbalign.util/fileutil.py"""
def setUp(self):
- self.rootDir = path.dirname(path.dirname(path.abspath(__file__)))
+ self.rootDir = ROOT_DIR
+ self.outDir = tempfile.mkdtemp()
+ self.dataDir = path.join(DATA_DIR,
+ "testLoadPulses/Analysis_Results/")
+
+ def tearDown(self):
+ shutil.rmtree(self.outDir)
def test_isValidInputFormat(self):
"""Test isValidInputFormat()."""
@@ -42,40 +55,43 @@ class Test_fileutil(unittest.TestCase):
def test_getFilesFromFOFN(self):
"""Test getFilesFromFOFN()."""
- fofnFN = "{0}/data/ecoli_lp.fofn".format(self.rootDir)
- fns = ["/home/UNIXHOME/yli/yliWorkspace/private/yli/data" + \
- "/testLoadPulses/m121215_065521_richard_c10042571" + \
- "0150000001823055001121371_s1_p0.pls.h5",
- "/home/UNIXHOME/yli/yliWorkspace/private/yli/data" + \
- "/testLoadPulses/m121215_065521_richard_c10042571" + \
- "0150000001823055001121371_s2_p0.pls.h5"]
+ fofnFN = path.join(self.rootDir, "data/ecoli_lp.fofn")
+ fns = [self.dataDir +
+ "m121215_065521_richard_c100425710150000001823055001121371_s1_p0.pls.h5",
+ self.dataDir +
+ "m121215_065521_richard_c100425710150000001823055001121371_s2_p0.pls.h5"]
self.assertEqual(fns, getFilesFromFOFN(fofnFN))
def test_checkInputFile(self):
"""Test checkInputFile()."""
- fastaFN = "{0}/data/ecoli.fasta".format(self.rootDir)
- plsFN = "/home/UNIXHOME/yli/yliWorkspace/private/yli/" + \
- "data/testLoadPulses/m121215_065521_richard_" + \
- "c100425710150000001823055001121371_s1_p0.pls.h5"
+ fastaFN = path.join(self.rootDir, "data/ecoli.fasta")
+ plsFN = self.dataDir + \
+ "m121215_065521_richard_c100425710150000001823055001121371_s1_p0.pls.h5"
self.assertTrue(filecmp.cmp(fastaFN, checkInputFile(fastaFN)))
self.assertTrue(filecmp.cmp(plsFN, checkInputFile(plsFN)))
- fofnFN = "{0}/data/ecoli_lp.fofn".format(self.rootDir)
+ fofnFN = path.join(self.rootDir, "data/ecoli_lp.fofn")
self.assertTrue(filecmp.cmp(fofnFN, checkInputFile(fofnFN)))
+ xmlFN = path.join(self.rootDir, "data/subreads_dataset1.xml")
+ ret = checkInputFile(xmlFN)
+ self.assertTrue(ret.endswith('.xml'))
+ fs = DataSet(ret).toExternalFiles()
+ self.assertTrue(fs[0].endswith("m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.1.subreads.bam"))
+ self.assertTrue(fs[1].endswith("m130406_011850_42141_c100513442550000001823074308221310_s1_p0.1.subreads.bam"))
+
def test_checkOutputFile(self):
"""Test checkOutputFile()."""
- samFN = "{0}/out/lambda_out.sam".format(self.rootDir)
- cmpFN = "{0}/out/lambda_out.cmp.h5".format(self.rootDir)
+ samFN = path.join(self.outDir, "lambda_out.sam")
+ cmpFN = path.join(self.outDir, "lambda_out.cmp.h5")
self.assertTrue(filecmp.cmp(samFN, checkOutputFile(samFN)))
self.assertTrue(filecmp.cmp(cmpFN, checkOutputFile(cmpFN)))
def test_checkReferencePath(self):
"""Test checkReferencePath()."""
- refDir = "/mnt/secondary/Smrtanalysis/opt/smrtanalysis/common/" + \
- "references"
+ refDir = "/pbi/dept/secondary/siv/references/"
refPath = path.join(refDir, "lambda")
refPath, refFastaOut, refSaOut, isWithinRepository, annotation = \
checkReferencePath(refPath)
@@ -97,22 +113,27 @@ class Test_fileutil(unittest.TestCase):
"sequence/lambda.fasta.sa")))
self.assertTrue(isWithinRepository)
-
fastaFN = "{0}/data/ecoli.fasta".format(self.rootDir)
+
refpath, refFastaOut, refSaOut, isWithinRepository, annotation = \
checkReferencePath(fastaFN)
self.assertTrue(filecmp.cmp(refpath, refFastaOut))
self.assertIsNone(refSaOut)
self.assertFalse(isWithinRepository)
- refPathWithAnnotation = "/mnt/secondary-siv/" + \
- "testdata/BlasrTestData/pbalign/data/references/H1_6_Scal_6x/"
+ refPathWithAnnotation = path.join(DATA_DIR, "references/H1_6_Scal_6x/")
_refPath, _refFaOut, _refSaOut, _isWithinRepository, annotation = \
checkReferencePath(refPathWithAnnotation)
self.assertEqual(path.abspath(annotation),
path.abspath(path.join(refPathWithAnnotation,
"annotations/H1_6_Scal_6x_adapters.gff")))
+ xmlFN = path.join(self.rootDir, "data/reference_lambda.xml")
+ refpath, refFastaOut, refSaOut, isWithinRepository, annotation = \
+ checkReferencePath(xmlFN)
+ self.assertTrue(filecmp.cmp(refFastaOut,
+ "/pbi/dept/secondary/siv/testdata/pbalign-unittest/data/lambda_ref.fasta"))
+
def test_isExist(self):
"""Test isExist(ff)."""
self.assertFalse(isExist(None))
diff --git a/tests/unit/test_options.py b/tests/unit/test_options.py
index 6b5c8e5..4136212 100755
--- a/tests/unit/test_options.py
+++ b/tests/unit/test_options.py
@@ -1,26 +1,46 @@
-from pbalign.options import *
+
+import tempfile
from argparse import *
from os import path
+import os
import filecmp
import unittest
-rootDir = path.dirname(path.dirname(path.abspath(__file__)))
-configFile = path.join(rootDir, "data/2.config")
-configFile3 = path.join(rootDir, "data/3.config")
+from pbalign.options import *
+
+
+def get_argument_parser():
+ return get_contract_parser().arg_parser.parser
def writeConfigFile(configFile, configOptions):
"""Write configs to a file."""
with open (configFile, 'w') as f:
f.write("\n".join(configOptions))
+def parseOptions(args):
+ return get_argument_parser().parse_args(args)
+
class Test_Options(unittest.TestCase):
+ @classmethod
+ def setUpClass(cls):
+ cls.infiles = ["tmp_readfile.bam", "tmp_reffile.fasta"]
+ for file_name in cls.infiles:
+ open(file_name, "w").write("\n")
+ cls.configFile = tempfile.NamedTemporaryFile(suffix=".config").name
+ cls.configFile3 = tempfile.NamedTemporaryFile(suffix=".config").name
+
+ @classmethod
+ def tearDownClass(cls):
+ for file_name in cls.infiles:
+ os.remove(file_name)
+
def test_importConfigOptions(self):
"""Test importConfigOptions()."""
configOptions = ("--minAccuracy = 40",
"--maxHits = 20")
- writeConfigFile(configFile, configOptions)
- options = Namespace(configFile=configFile,
+ writeConfigFile(self.configFile, configOptions)
+ options = Namespace(configFile=self.configFile,
minAccuracy=10,
maxHits=12)
@@ -29,51 +49,48 @@ class Test_Options(unittest.TestCase):
self.assertEqual(int(newOptions.maxHits), 20)
self.assertEqual(int(newOptions.minAccuracy), 40)
- def test_ConstructOptionParser(self):
- """Test constructOptionParser()."""
- ret = constructOptionParser()
- self.assertEqual(type(ret), argparse.ArgumentParser)
+ def test_get_argument_parser(self):
+ """Test get_argument_parser()."""
+ ret = get_argument_parser()
+ self.assertTrue(isinstance(ret, argparse.ArgumentParser))
def test_parseOptions(self):
- """Test parseOptions()."""
+ """Test parseOptions with a config file."""
configOptions = (
"--maxHits = 20",
"--minAnchorSize = 15",
"--minLength = 100",
- "--algorithmOptions = '-noSplitSubreads " + \
- "-maxMatch 30 -nCandidates 30'",
+ "--algorithmOptions = '--noSplitSubreads " + \
+ "--maxMatch 30 --nCandidates 30'",
"# Some comments",
- "--scoreFunction = blasr",
+ #"--scoreFunction = blasr",
"--hitPolicy = random",
"--maxDivergence = 40",
"--debug")
- writeConfigFile(configFile, configOptions)
-
- def test_parseOptions_with_config(self):
- """Test parseOptions with a config file."""
- # With the above config file
- argumentList = ['--configFile', configFile,
- '--maxHits', '30',
- '--minAccuracy', '50',
- 'readfile', 'reffile', 'outfile']
- parser, options, infoMsg = parseOptions(argumentList)
-
- self.assertTrue(filecmp.cmp(options.configFile, configFile))
+ writeConfigFile(self.configFile, configOptions)
+ argumentList = [
+ '--configFile', self.configFile,
+ '--maxHits', '30',
+ '--minAccuracy', '50',
+ ] + self.infiles + ['outfile']
+ options = parseOptions(argumentList)
+
+ self.assertTrue(filecmp.cmp(options.configFile, self.configFile))
self.assertEqual(int(options.maxHits), 30)
self.assertEqual(int(options.minAccuracy), 50)
self.assertEqual("".join(options.algorithmOptions),
- "-noSplitSubreads -maxMatch 30 -nCandidates 30")
- self.assertEqual(options.scoreFunction, "blasr")
+ "--noSplitSubreads --maxMatch 30 --nCandidates 30")
+ #self.assertEqual(options.scoreFunction, "blasr")
self.assertEqual(options.hitPolicy, "random")
self.assertEqual(int(options.maxDivergence), 40)
def test_parseOptions_without_config(self):
"""Test parseOptions without any config file."""
argumentList = ['--maxHits=30',
- '--minAccuracy=50',
- 'readfile', 'reffile', 'outfile']
- parser, options,infoMsg = parseOptions(argumentList)
+ '--minAccuracy=50'
+ ] + self.infiles + ['outfile']
+ options = parseOptions(argumentList)
self.assertIsNone(options.configFile)
self.assertEqual(int(options.maxHits), 30)
@@ -83,15 +100,15 @@ class Test_Options(unittest.TestCase):
def test_parseOptions_multi_algorithmOptions(self):
"""Test parseOptions with multiple algorithmOptions."""
- algo1 = " -holeNumbers 1"
- algo2 = " -nCandidate 25"
- algo3 = " ' -bestn 11 '"
- argumentList = ['--algorithmOptions=%s' % algo1,
- '--algorithmOptions=%s' % algo2,
- 'readfile', 'reffile', 'outfile']
-
+ algo1 = " --holeNumbers 1"
+ algo2 = " --nCandidate 25"
+ algo3 = " ' --bestn 11 '"
+ argumentList = [
+ '--algorithmOptions=%s' % algo1,
+ '--algorithmOptions=%s' % algo2,
+ ] + self.infiles + ["outfile"]
print argumentList
- parser, options, infoMsg = parseOptions(argumentList)
+ options = parseOptions(argumentList)
# Both algo1 and algo2 should be in algorithmOptions.
print options.algorithmOptions
#self.assertTrue(algo1 in options.algorithmOptions)
@@ -99,11 +116,11 @@ class Test_Options(unittest.TestCase):
# Construct a config file.
configOptions = ("--algorithmOptions = \"%s\"" % algo3)
- writeConfigFile(configFile3, [configOptions])
+ writeConfigFile(self.configFile3, [configOptions])
- argumentList.append("--configFile={0}".format(configFile3))
+ argumentList.append("--configFile={0}".format(self.configFile3))
print argumentList
- parser, options, infoMsg = parseOptions(argumentList)
+ options = parseOptions(argumentList)
# Make sure algo3 have been overwritten.
print options.algorithmOptions
self.assertTrue(algo1 in options.algorithmOptions)
@@ -114,16 +131,14 @@ class Test_Options(unittest.TestCase):
"""Test parseOptions without specifying maxHits and minAccuracy."""
# Test if maxHits and minAccuracy are not set,
# whether both options.minAnchorSize and maxHits are None
- argumentList = ["--minAccuracy", "50",
- 'readfile', 'reffile', 'outfile']
- parser, options,infoMsg = parseOptions(argumentList)
+ argumentList = ["--minAccuracy", "50"] + self.infiles + ["outfile.bam"]
+ options = parseOptions(argumentList)
self.assertIsNone(options.minAnchorSize)
self.assertIsNone(options.maxHits)
def test_importDefaultOptions(self):
"""Test importDefaultOptions"""
- options = Namespace(configFile=configFile,
- minAccuracy=10,
+ options = Namespace(minAccuracy=10,
maxHits=12)
defaultOptions = {"minAccuracy":30, "maxHits":14}
newOptions, infoMsg = importDefaultOptions(options, defaultOptions)
diff --git a/tests/unit/test_pbalign.py b/tests/unit/test_pbalign.py
index 369ae79..dca8cfc 100755
--- a/tests/unit/test_pbalign.py
+++ b/tests/unit/test_pbalign.py
@@ -1,40 +1,47 @@
+
import unittest
+import tempfile
+import shutil
from os import path
+
from pbalign.pbalignrunner import PBAlignRunner
+from test_setpath import ROOT_DIR, DATA_DIR
+
class Test_PBAlignRunner(unittest.TestCase):
def setUp(self):
- self.rootDir = path.dirname(path.dirname(path.abspath(__file__)))
- self.queryFile = path.join(self.rootDir, "data/lambda_query.fasta")
- self.referenceFile = "/mnt/secondary/Smrtanalysis/opt/" + \
- "smrtanalysis/common/references/" + \
- "lambda/sequence/lambda.fasta"
+ self.rootDir = ROOT_DIR
+ self.queryFile = path.join(self.rootDir, "data/subreads_dataset1.xml")
+ self.referenceFile = path.join(self.rootDir, "data/reference_lambda.xml")
self.configFile = path.join(self.rootDir, "data/1.config")
- self.samOut = path.join(self.rootDir, "out/lambda_out.sam")
- self.cmph5Out = path.join(self.rootDir, "out/lambda_out.cmp.h5")
+ self.OUT_DIR = tempfile.mkdtemp()
+ self.bamOut = path.join(self.OUT_DIR, "lambda_out.bam")
+
+ def tearDown(self):
+ shutil.rmtree(self.OUT_DIR)
def test_init(self):
"""Test PBAlignRunner.__init__()."""
argumentList = ['--minAccuracy', '70', '--maxDivergence', '30',
self.queryFile, self.referenceFile,
- self.samOut]
+ self.bamOut]
pbobj = PBAlignRunner(argumentList = argumentList)
self.assertEqual(pbobj.start(), 0)
def test_init_with_algorithmOptions(self):
"""Test PBAlignRunner.__init__() with --algorithmOptions."""
- argumentList = ['--algorithmOptions', '-minMatch 10 -useccsall',
+ argumentList = ['--algorithmOptions', '--minMatch 10 --useccsall',
self.queryFile, self.referenceFile,
- self.cmph5Out]
+ self.bamOut]
pbobj = PBAlignRunner(argumentList = argumentList)
self.assertEqual(pbobj.start(), 0)
def test_init_with_config_algorithmOptions(self):
"""Test PBAlignRunner.__init__() with a config file and --algorithmOptions."""
- argumentList = ['--algorithmOptions', '-maxMatch 20 -nCandidates 30',
+ argumentList = ['--algorithmOptions', '--maxMatch 20 --nCandidates 30',
'--configFile', self.configFile,
self.queryFile, self.referenceFile,
- self.cmph5Out]
+ self.bamOut]
pbobj = PBAlignRunner(argumentList = argumentList)
self.assertEqual(pbobj.start(), 0)
@@ -42,13 +49,13 @@ class Test_PBAlignRunner(unittest.TestCase):
def test_init_expect_conflicting_options(self):
"""Test PBAlignRunner.__init__() with a config file and --algorithmOptions
and expect a ValueError for conflicting options."""
- argumentList = ['--algorithmOptions', '-minMatch 10 -useccsall',
+ argumentList = ['--algorithmOptions', '--minMatch 10 --useccsall',
'--configFile', self.configFile,
self.queryFile, self.referenceFile,
- self.cmph5Out]
+ self.bamOut]
pbobj = PBAlignRunner(argumentList = argumentList)
with self.assertRaises(ValueError) as cm:
- # Expect a ValueError since -minMatch and --minAnchorSize conflicts.
+ # Expect a ValueError since --minMatch and --minAnchorSize conflicts.
pbobj.start()
diff --git a/tests/unit/test_pbalignfiles.py b/tests/unit/test_pbalignfiles.py
index 30a241b..34c999b 100755
--- a/tests/unit/test_pbalignfiles.py
+++ b/tests/unit/test_pbalignfiles.py
@@ -1,18 +1,27 @@
+
import unittest
+import tempfile
import filecmp
+import shutil
from os import path
+
from pbalign.pbalignfiles import PBAlignFiles
+from test_setpath import ROOT_DIR
+
class Test_PbAlignFiles_Ecoli(unittest.TestCase):
def setUp(self):
- self.rootDir = path.dirname(path.dirname(path.abspath(__file__)))
+ self.rootDir = ROOT_DIR
self.inputFileName = path.join(self.rootDir, "data/ecoli.fasta")
- self.referencePath = "/mnt/secondary/Smrtanalysis/opt/" + \
- "smrtanalysis/common/references/ecoli_K12_MG1655/"
+ self.referencePath = "/pbi/dept/secondary/siv/references/ecoli_k12_MG1655/"
self.targetFileName = path.join(self.referencePath,
- "sequence/ecoli_K12_MG1655.fasta")
+ "sequence/ecoli_k12_MG1655.fasta")
self.sawriterFileName = self.targetFileName + ".sa"
- self.outputFileName = path.join(self.rootDir, "out/tmp.sam")
+ self.OUT_DIR = tempfile.mkdtemp()
+ self.outputFileName = path.join(self.OUT_DIR, "tmp.sam")
+
+ def tearDown(self):
+ shutil.rmtree(self.OUT_DIR)
def test_init(self):
"""Test PBAlignFiles.__init__() with a reference repository."""
@@ -30,12 +39,14 @@ class Test_PbAlignFiles_Ecoli(unittest.TestCase):
class Test_PbAlignFiles(unittest.TestCase):
def setUp(self):
- self.rootDir = path.dirname(path.dirname(path.abspath(__file__)))
+ self.rootDir = ROOT_DIR
self.inputFileName = path.join(self.rootDir, "data/lambda_bax.fofn")
- self.referenceFile = "/mnt/secondary/Smrtanalysis/opt/" + \
- "smrtanalysis/common/references/" + \
- "lambda/sequence/lambda.fasta"
- self.outputFileName = path.join(self.rootDir, "out/tmp.sam")
+ self.referenceFile = "/pbi/dept/secondary/siv/references/lambda/sequence/lambda.fasta"
+ self.OUT_DIR = tempfile.mkdtemp()
+ self.outputFileName = path.join(self.OUT_DIR, "tmp.sam")
+
+ def tearDown(self):
+ shutil.rmtree(self.OUT_DIR)
def test_init(self):
"""Test PBAlignFiles.__init__()."""
diff --git a/tests/unit/test_referenceInfo.py b/tests/unit/test_referenceInfo.py
index c6f7fc2..9712ef8 100755
--- a/tests/unit/test_referenceInfo.py
+++ b/tests/unit/test_referenceInfo.py
@@ -5,8 +5,7 @@ class Test_ReferenceInfo(unittest.TestCase):
def test_init(self):
"""Test ReferenceInfo.__init__() with a valid reference.info.xml."""
- rootDir = "/mnt/secondary/Smrtanalysis/opt/" + \
- "smrtanalysis/common/references/lambda/"
+ rootDir = "/pbi/dept/secondary/siv/references/lambda/"
r = ReferenceInfo(rootDir + "reference.info.xml")
self.assertEqual(r.refFastaFile, rootDir + "sequence/lambda.fasta")
self.assertEqual(r.refSawriterFile, rootDir + "sequence/lambda.fasta.sa")
diff --git a/tests/unit/test_setpath.py b/tests/unit/test_setpath.py
new file mode 100755
index 0000000..fd71f17
--- /dev/null
+++ b/tests/unit/test_setpath.py
@@ -0,0 +1,25 @@
+#!/usr/bin/python
+from os import path
+import ConfigParser
+
+"""Define test data path for pbalign."""
+
+THIS_DIR = path.dirname(path.abspath(__file__))
+ROOT_DIR = path.dirname(THIS_DIR)
+NOSE_CFG = path.join(THIS_DIR, "nose.cfg")
+
+def _get_data_std_dir():
+ """Get the data directory which contains all the unittests files.
+ """
+ nosecfg = ConfigParser.SafeConfigParser()
+ nosecfg.readfp(open(NOSE_CFG), 'r')
+ if nosecfg.has_section('data'):
+ data_dir = path.abspath(nosecfg.get('data', 'dataDir'))
+ std_dir = path.abspath(nosecfg.get('data', 'stdDir'))
+ return data_dir, std_dir
+ else:
+ msg = "Unable to find section [DATA] option [dataDir]" + \
+ "and [stdDir] in config file {f}.".format(f=NOSE_CFG)
+ raise KeyError(msg)
+
+DATA_DIR, STD_DIR = _get_data_std_dir()
diff --git a/tests/unit/test_tool_contract.py b/tests/unit/test_tool_contract.py
new file mode 100755
index 0000000..0a41d2a
--- /dev/null
+++ b/tests/unit/test_tool_contract.py
@@ -0,0 +1,91 @@
+
+import subprocess
+import tempfile
+import unittest
+import os.path
+import sys
+
+import pbcommand.testkit
+from pbcore.io import AlignmentSet, ConsensusAlignmentSet, openDataSet
+
+import pbtestdata
+
+DATA_DIR = "/pbi/dept/secondary/siv/testdata/SA3-DS"
+DATA2 = "/pbi/dept/secondary/siv/testdata/pbalign-unittest2/data"
+REF_DIR = "/pbi/dept/secondary/siv/references"
+
+
+class TestPbalign(pbcommand.testkit.PbTestApp):
+ DRIVER_BASE = "pbalign "
+ REQUIRES_PBCORE = True
+ INPUT_FILES = [
+ pbtestdata.get_file("subreads-xml"),
+ pbtestdata.get_file("lambdaNEB")
+ ]
+ TASK_OPTIONS = {
+ "pbalign.task_options.algorithm_options": "--holeNumbers 1-1000,30000-30500,60000-60600,100000-100500",
+ }
+
+ def run_after(self, rtc, output_dir):
+ ds_out = openDataSet(rtc.task.output_files[0])
+ self.assertTrue(isinstance(ds_out, AlignmentSet),
+ type(ds_out).__name__)
+
+
+class TestPbalignCCS(pbcommand.testkit.PbTestApp):
+ DRIVER_BASE = "python -m pbalign.ccs"
+ INPUT_FILES = [
+ pbtestdata.get_file("rsii-ccs"),
+ pbtestdata.get_file("lambdaNEB")
+ ]
+
+ def run_after(self, rtc, output_dir):
+ ds_out = openDataSet(rtc.task.output_files[0])
+ self.assertTrue(isinstance(ds_out, ConsensusAlignmentSet),
+ type(ds_out).__name__)
+
+
+HAVE_BAMTOOLS = False
+try:
+ with tempfile.TemporaryFile() as O, \
+ tempfile.TemporaryFile() as E:
+ assert subprocess.call(["bamtools", "--help"], stdout=O, stderr=E) == 0
+except Exception as e:
+ sys.stderr.write(str(e)+"\n")
+ sys.stderr.write("bamtools missing, skipping test\n")
+else:
+ HAVE_BAMTOOLS = True
+
+
+ at unittest.skipUnless(HAVE_BAMTOOLS, "bamtools not installed")
+class TestConsolidateBam(pbcommand.testkit.PbTestApp):
+ DRIVER_BASE = "python -m pbalign.tasks.consolidate_alignments"
+ INPUT_FILES = [pbtestdata.get_file("aligned-ds-2")]
+ TASK_OPTIONS = {
+ "pbalign.task_options.consolidate_aligned_bam": True,
+ }
+
+ def run_after(self, rtc, output_dir):
+ with AlignmentSet(rtc.task.output_files[0]) as f:
+ f.assertIndexed()
+ self.assertEqual(len(f.toExternalFiles()), 1)
+ # test for bug 33778
+ qnames = set()
+ for rec in f:
+ qnames.add(rec.qName)
+ self.assertEqual(len(qnames), len(f))
+
+
+ at unittest.skipUnless(HAVE_BAMTOOLS, "bamtools not installed")
+class TestConsolidateBamDisabled(TestConsolidateBam):
+ TASK_OPTIONS = {
+ "pbalign.task_options.consolidate_aligned_bam": False,
+ }
+
+ def run_after(self, rtc, output_dir):
+ with AlignmentSet(rtc.task.output_files[0]) as f:
+ self.assertEqual(len(f.toExternalFiles()), 2)
+
+
+if __name__ == "__main__":
+ unittest.main()
diff --git a/tests/unit/test_unrolled.py b/tests/unit/test_unrolled.py
new file mode 100755
index 0000000..dbfdc94
--- /dev/null
+++ b/tests/unit/test_unrolled.py
@@ -0,0 +1,87 @@
+
+"""
+Test unrolled alignment support for SubreadSet input. The output should be
+identical to the same command using pseudo-polymerase reads reconstructed by
+bam2bam as input.
+"""
+
+import subprocess
+import tempfile
+import unittest
+import os.path as op
+
+from pbcore.io import AlignmentSet
+import pbcommand.testkit
+
+DATA = "/pbi/dept/secondary/siv/testdata/pbalign-unittest/data/unrolled"
+REFERENCE = "/pbi/analysis/smrtportal/internal/userdata/references/All4mer_V2_11_V2_13_V2_15_V2_44_circular_72x_l50256/sequence/All4mer_V2_11_V2_13_V2_15_V2_44_circular_72x_l50256.fasta"
+BASE_ARGS = [
+ "pbalign",
+ "--nproc", "8",
+ "--hitPolicy=leftmost",
+ "--algorithmOptions", "--bestn 1 --forwardOnly --fastMaxInterval --maxAnchorsPerPosition 30000 --ignoreHQRegions --minPctIdentity 60",
+]
+
+skip_unless_files_present = unittest.skipUnless(
+ op.isdir(DATA) and op.isfile(REFERENCE),
+ "missing {d} or {r}".format(r=REFERENCE, d=DATA))
+
+def _verify_alignment(self, aln_file):
+ with AlignmentSet(aln_file) as ds:
+ self.assertEqual(len(ds), 1)
+ bam = ds.resourceReaders()[0]
+ qlen = bam[0].qEnd - bam[0].qStart
+ alen = bam[0].aEnd - bam[0].aStart
+ # length of unrolled polymerase read (unaligned)
+ self.assertEqual(qlen, 22395)
+ self.assertTrue(alen > 21000,
+ "alignment length is only {l}".format(l=alen))
+
+
+ at skip_unless_files_present
+class TestUnrolledBAM(unittest.TestCase):
+
+ def _run_args(self, args, aln_file):
+ self.assertEqual(subprocess.call(args), 0)
+ _verify_alignment(self, aln_file)
+
+ def test_polymerase_bam(self):
+ aln_file = tempfile.NamedTemporaryFile(suffix=".polymerase.bam").name
+ args = BASE_ARGS + [
+ op.join(DATA, "m54006_151021_185942.polymerase.bam"),
+ REFERENCE,
+ aln_file,
+ ]
+ self._run_args(args, aln_file)
+
+ def test_subreadset(self):
+ aln_file = tempfile.NamedTemporaryFile(suffix=".unrolled.bam").name
+ args = BASE_ARGS + [
+ "--noSplitSubreads",
+ op.join(DATA, "m54006_151021_185942.subreadset.xml"),
+ REFERENCE,
+ aln_file,
+ ]
+ self._run_args(args, aln_file)
+
+
+ at skip_unless_files_present
+class TestToolContract(pbcommand.testkit.PbTestApp):
+ DRIVER_BASE = "pbalign"
+ INPUT_FILES = [
+ op.join(DATA, "m54006_151021_185942.subreadset.xml"),
+ REFERENCE
+ ]
+ TASK_OPTIONS = {
+ "pbalign.task_options.no_split_subreads": True,
+ "pbalign.task_options.hit_policy": "leftmost",
+ "pbalign.task_options.concordant": False,
+ "pbalign.task_options.algorithm_options": "--bestn 1 --forwardOnly --fastMaxInterval --maxAnchorsPerPosition 30000 --ignoreHQRegions --minPctIdentity 60"
+ }
+
+ def run_after(self, rtc, output_dir):
+ _verify_alignment(self, rtc.task.output_files[0])
+
+
+if __name__ == "__main__":
+ unittest.main()
diff --git a/tests/unit/test_xmlout.py b/tests/unit/test_xmlout.py
new file mode 100755
index 0000000..25c81be
--- /dev/null
+++ b/tests/unit/test_xmlout.py
@@ -0,0 +1,38 @@
+
+import unittest
+import tempfile
+import shutil
+from os import path
+
+from pbcore.io import AlignmentSet, ReferenceSet
+
+from pbalign.pbalignrunner import PBAlignRunner
+
+from test_setpath import ROOT_DIR
+
+class Test_PBAlignRunner(unittest.TestCase):
+ def setUp(self):
+ self.rootDir = ROOT_DIR
+ self.queryFile = path.join(self.rootDir, "data/subreads_dataset1.xml")
+ self.referenceFile = path.join(self.rootDir, "data/reference_lambda.xml")
+ self.configFile = path.join(self.rootDir, "data/1.config")
+ self.OUT_DIR = tempfile.mkdtemp()
+ self.bamOut = path.join(self.OUT_DIR, "lambda_out.bam")
+ self.xmlOut = path.join(self.OUT_DIR, "lambda_out.xml")
+
+ def tearDown(self):
+ shutil.rmtree(self.OUT_DIR)
+
+ def test_init_xml(self):
+ """Test PBAlignRunner.__init__() to XML."""
+ argumentList = ['--minAccuracy', '70', '--maxDivergence', '30',
+ self.queryFile, self.referenceFile,
+ self.xmlOut]
+ pbobj = PBAlignRunner(argumentList=argumentList)
+ self.assertEqual(pbobj.start(), 0)
+ aln = AlignmentSet(self.xmlOut)
+ self.assertEqual(aln.externalResources[0].reference,
+ ReferenceSet(self.referenceFile).toExternalFiles()[0])
+
+if __name__ == "__main__":
+ unittest.main()
diff --git a/tests/unit_h5/nose.cfg b/tests/unit_h5/nose.cfg
new file mode 100644
index 0000000..5ed666c
--- /dev/null
+++ b/tests/unit_h5/nose.cfg
@@ -0,0 +1,17 @@
+# pbalign unittest config, copied from pbreports.
+
+[DEFAULT]
+
+[log]
+level=debug
+file=pbalign-unittest.log
+
+[data]
+dataDir=/pbi/dept/secondary/siv/testdata/pbalign-unittest/data/
+stdDir=/pbi/dept/secondary/siv/testdata/pbalign-unittest/stdout/
+
+#This is for generating documentation of report attribute ids
+[doc]
+jinja-templates=doc/report-metrics
+html-dir=doc/_build/html
+
diff --git a/tests/unit/test_filterservice.py b/tests/unit_h5/test_filterservice.py
similarity index 85%
rename from tests/unit/test_filterservice.py
rename to tests/unit_h5/test_filterservice.py
index 02aaca4..c754281 100755
--- a/tests/unit/test_filterservice.py
+++ b/tests/unit_h5/test_filterservice.py
@@ -2,6 +2,7 @@
from pbalign.filterservice import FilterService
from os import path
import unittest
+from test_setpath import ROOT_DIR, DATA_DIR, OUT_DIR, STD_DIR
class Opt(object):
"""The Option class."""
@@ -18,14 +19,12 @@ class Opt(object):
class Test_FilterService(unittest.TestCase):
"""Test pbalign.filterservice."""
def setUp(self):
- self.testDir = path.dirname(path.dirname(path.abspath(__file__)))
+ self.testDir = ROOT_DIR
self.alignedSam = path.join(self.testDir,
"data/lambda.sam")
- self.targetFileName = "/mnt/secondary/Smrtanalysis/opt/" + \
- "smrtanalysis/common/references/" + \
- "lambda/sequence/lambda.fasta"
- self.filteredSam = path.join(self.testDir,
- "out/lambda_filtered.sam")
+ self.targetFileName = "/pbi/dept/secondary/siv/references/lambda/sequence/lambda.fasta"
+ self.filteredSam = path.join(OUT_DIR,
+ "lambda_filtered.sam")
def test_init(self):
"""Test FilterService.__init__()."""
diff --git a/tests/unit/test_forquiverservice.py b/tests/unit_h5/test_forquiverservice.py
similarity index 72%
rename from tests/unit/test_forquiverservice.py
rename to tests/unit_h5/test_forquiverservice.py
index ccaf9d2..ac407eb 100755
--- a/tests/unit/test_forquiverservice.py
+++ b/tests/unit_h5/test_forquiverservice.py
@@ -5,6 +5,7 @@ from shutil import copyfile
from pbalign.forquiverservice.forquiver import ForQuiverService
from pbalign.pbalignfiles import PBAlignFiles
from tempfile import mkstemp
+from test_setpath import DATA_DIR
class Opt(object):
"""Simulate PBAlign options."""
@@ -18,17 +19,13 @@ class Opt(object):
class Test_ForQuiverService(unittest.TestCase):
"""Test pbalign.forquiverservice.forquiver."""
def setUp(self):
- self.rootDir = "/mnt/secondary-siv/" + \
- "testdata/BlasrTestData/pbalign"
- self.inCmpFile = path.join(self.rootDir, "data/testforquiver.cmp.h5")
- #self.outCmpFile = path.join(self.rootDir, "out/testforquiver.cmp.h5")
+ self.inCmpFile = path.join(DATA_DIR, "testforquiver.cmp.h5")
self.outCmpFile = mkstemp(suffix=".cmp.h5")[1]
copyfile(self.inCmpFile, self.outCmpFile)
- self.basFile = path.join(self.rootDir, "data/lambda_bax.fofn")
+ self.basFile = path.join(DATA_DIR, "lambda_bax.fofn")
- refpath = "/mnt/secondary/Smrtanalysis/opt/" + \
- "smrtanalysis/common/references/lambda/"
+ refpath = "/pbi/dept/secondary/siv/references/lambda/"
self.fileNames = PBAlignFiles()
self.fileNames.SetInOutFiles(self.basFile, refpath,
diff --git a/tests/unit/test_gmap.py b/tests/unit_h5/test_gmap.py
similarity index 90%
rename from tests/unit/test_gmap.py
rename to tests/unit_h5/test_gmap.py
index 68c5013..4155b5c 100755
--- a/tests/unit/test_gmap.py
+++ b/tests/unit_h5/test_gmap.py
@@ -5,20 +5,20 @@ from pbalign.alignservice.gmap import GMAPService
from pbalign.pbalignfiles import PBAlignFiles
from pbalign.options import parseOptions
import argparse
+from test_setpath import ROOT_DIR, OUT_DIR
class Test_GMAPService(unittest.TestCase):
"""Test pbalign.alignservices.gmap."""
def setUp(self):
"""Set up test data."""
- self.rootDir = path.dirname(path.dirname(path.abspath(__file__)))
- self.outDir = path.join(self.rootDir, "out/")
+ self.rootDir = ROOT_DIR
+ self.outDir = OUT_DIR
self.outSam = path.join(self.outDir, "test_gmap_01.sam")
self.dataDir = path.join(self.rootDir, "data/")
self.queryFofn = path.join(self.dataDir, "ecoli_lp.fofn")
self.refFa = path.join(self.dataDir, "ecoli.fasta")
- self.repoPath = "/mnt/secondary/Smrtanalysis/opt/smrtanalysis/" + \
- "common/references/ecoli/"
+ self.repoPath = "/pbi/dept/secondary/siv/references/ecoli/"
def test_gmapCreateDB_case1(self):
"""Test _gmapCreateDB(refFile, isWithinRepository, tempRootDir).
diff --git a/tests/unit/test_loadpulsesservice.py b/tests/unit_h5/test_loadpulsesservice.py
similarity index 73%
rename from tests/unit/test_loadpulsesservice.py
rename to tests/unit_h5/test_loadpulsesservice.py
index 6a094ba..a78ee8c 100755
--- a/tests/unit/test_loadpulsesservice.py
+++ b/tests/unit_h5/test_loadpulsesservice.py
@@ -5,19 +5,17 @@ from shutil import copyfile
from pbalign.forquiverservice.loadpulses import LoadPulsesService
from argparse import Namespace
from tempfile import mkstemp
+from test_setpath import ROOT_DIR, DATA_DIR
class Test_LoadPulsesService(unittest.TestCase):
"""Test pbalign.forquiverservice.loadpulses."""
def setUp(self):
"""Set up tests."""
- self.rootDir = "/mnt/secondary-siv/" + \
- "testdata/BlasrTestData/pbalign"
- self.inCmpFile = path.join(self.rootDir, "data/testloadpulses.cmp.h5")
- #self.outCmpFile = path.join(self.rootDir, "out/testloadpulses.cmp.h5")
+ self.inCmpFile = path.join(DATA_DIR, "testloadpulses.cmp.h5")
self.outCmpFile = mkstemp(suffix=".cmp.h5")[1]
- self.basFile = path.join(self.rootDir, "data/lambda_bax.fofn")
+ self.basFile = path.join(ROOT_DIR, "data/lambda_bax.fofn")
copyfile(self.inCmpFile, self.outCmpFile)
self.options = Namespace(metrics="DeletionQV", byread=False)
self.obj = LoadPulsesService(self.basFile,
diff --git a/tests/unit/test_pbalign.py b/tests/unit_h5/test_pbalign.py
similarity index 84%
copy from tests/unit/test_pbalign.py
copy to tests/unit_h5/test_pbalign.py
index 369ae79..eef6262 100755
--- a/tests/unit/test_pbalign.py
+++ b/tests/unit_h5/test_pbalign.py
@@ -1,17 +1,16 @@
import unittest
from os import path
from pbalign.pbalignrunner import PBAlignRunner
+from test_setpath import ROOT_DIR, DATA_DIR, OUT_DIR
class Test_PBAlignRunner(unittest.TestCase):
def setUp(self):
- self.rootDir = path.dirname(path.dirname(path.abspath(__file__)))
+ self.rootDir = ROOT_DIR
self.queryFile = path.join(self.rootDir, "data/lambda_query.fasta")
- self.referenceFile = "/mnt/secondary/Smrtanalysis/opt/" + \
- "smrtanalysis/common/references/" + \
- "lambda/sequence/lambda.fasta"
+ self.referenceFile = "/pbi/dept/secondary/siv/references/lambda/sequence/lambda.fasta"
self.configFile = path.join(self.rootDir, "data/1.config")
- self.samOut = path.join(self.rootDir, "out/lambda_out.sam")
- self.cmph5Out = path.join(self.rootDir, "out/lambda_out.cmp.h5")
+ self.samOut = path.join(OUT_DIR, "lambda_out.sam")
+ self.cmph5Out = path.join(OUT_DIR, "lambda_out.cmp.h5")
def test_init(self):
"""Test PBAlignRunner.__init__()."""
diff --git a/tests/unit/test_repackservice.py b/tests/unit_h5/test_repackservice.py
similarity index 72%
rename from tests/unit/test_repackservice.py
rename to tests/unit_h5/test_repackservice.py
index 19d9555..0050c8f 100755
--- a/tests/unit/test_repackservice.py
+++ b/tests/unit_h5/test_repackservice.py
@@ -4,20 +4,17 @@ from os import path, remove
from shutil import copyfile
from pbalign.forquiverservice.repack import RepackService
from tempfile import mkstemp
+from test_setpath import DATA_DIR
class Test_RepackService(unittest.TestCase):
"""Test pbalign.forquiverservice.repack."""
def setUp(self):
"""Set up the tests."""
- self.rootDir = "/mnt/secondary-siv/" + \
- "testdata/BlasrTestData/pbalign"
- self.inCmpFile = path.join(self.rootDir, "data/testrepack.cmp.h5")
- #self.outCmpFile = path.join(self.rootDir, "out/testrepack.cmp.h5")
+ self.inCmpFile = path.join(DATA_DIR, "testrepack.cmp.h5")
self.outCmpFile = mkstemp(suffix=".cmp.h5")[1]
self.tmpCmpFile = self.outCmpFile + ".tmp"
- #self.tmpCmpFile = path.join(self.rootDir, "out/testrepack.cmp.h5.tmp")
copyfile(self.inCmpFile, self.outCmpFile)
self.options = {}
self.obj = RepackService(self.outCmpFile, self.tmpCmpFile)
diff --git a/tests/unit/test_rgnh5io.py b/tests/unit_h5/test_rgnh5io.py
similarity index 92%
rename from tests/unit/test_rgnh5io.py
rename to tests/unit_h5/test_rgnh5io.py
index 6083bb2..c09208f 100755
--- a/tests/unit/test_rgnh5io.py
+++ b/tests/unit_h5/test_rgnh5io.py
@@ -6,6 +6,7 @@ from pbalign.utils.RgnH5IO import Region, RegionTable, RgnH5Reader, \
RgnH5Writer, addStrListAttr
from os import path
import h5py
+from test_setpath import ROOT_DIR, DATA_DIR, OUT_DIR
class Test_RgnH5IO(unittest.TestCase):
@@ -13,11 +14,10 @@ class Test_RgnH5IO(unittest.TestCase):
def setUp(self):
"""Set up test data."""
- self.rootDir = path.dirname(path.dirname(path.abspath(__file__)))
- self.inRgnFN = "/mnt/secondary-siv/testdata/" + \
- "BlasrTestData/pbalign/data/test_rgnh5io.rgn.h5"
- self.outRgnFN = path.join(self.rootDir, "out/test_rgnh5io_out.rgn.h5")
- self.outTmpFN = path.join(self.rootDir, "out/test_rgnh5io_tmp.h5")
+ self.rootDir = ROOT_DIR
+ self.inRgnFN = path.join(DATA_DIR, "test_rgnh5io.rgn.h5")
+ self.outRgnFN = path.join(OUT_DIR, "test_rgnh5io_out.rgn.h5")
+ self.outTmpFN = path.join(OUT_DIR, "test_rgnh5io_tmp.h5")
self.movieName = "m130427_152935_42178_" + \
"c100518602550000001823079209281316_s1_p0"
diff --git a/tests/unit_h5/test_setpath.py b/tests/unit_h5/test_setpath.py
new file mode 100755
index 0000000..c7c9e67
--- /dev/null
+++ b/tests/unit_h5/test_setpath.py
@@ -0,0 +1,27 @@
+#!/usr/bin/python
+from os import path
+import ConfigParser
+
+"""Define test data path for pbalign."""
+
+THIS_DIR = path.dirname(path.abspath(__file__))
+ROOT_DIR = path.dirname(THIS_DIR)
+NOSE_CFG = path.join(THIS_DIR, "nose.cfg")
+
+def _get_data_std_dir():
+ """Get the data directory which contains all the unittests files.
+ """
+ nosecfg = ConfigParser.SafeConfigParser()
+ nosecfg.readfp(open(NOSE_CFG), 'r')
+ if nosecfg.has_section('data'):
+ data_dir = path.abspath(nosecfg.get('data', 'dataDir'))
+ std_dir = path.abspath(nosecfg.get('data', 'stdDir'))
+ return data_dir, std_dir
+ else:
+ msg = "Unable to find section [DATA] option [dataDir]" + \
+ "and [stdDir] in config file {f}.".format(f=NOSE_CFG)
+ raise KeyError(msg)
+
+OUT_DIR = path.join(ROOT_DIR, "out")
+DATA_DIR, STD_DIR = _get_data_std_dir()
+
diff --git a/tests/unit/test_sortservice.py b/tests/unit_h5/test_sortservice.py
similarity index 84%
rename from tests/unit/test_sortservice.py
rename to tests/unit_h5/test_sortservice.py
index dad10a4..0eb27c3 100755
--- a/tests/unit/test_sortservice.py
+++ b/tests/unit_h5/test_sortservice.py
@@ -4,6 +4,7 @@ from os import path, remove
from shutil import copyfile
from pbalign.forquiverservice.sort import SortService
from tempfile import mkstemp
+from test_setpath import DATA_DIR
class Opt(object):
@@ -16,9 +17,7 @@ class Test_SortService(unittest.TestCase):
"""Test pbalign.forquiverservice.sort."""
def setUp(self):
"""Set up tests."""
- self.rootDir = "/mnt/secondary-siv/" + \
- "testdata/BlasrTestData/pbalign"
- self.inCmpFile = path.join(self.rootDir, "data/testsort.cmp.h5")
+ self.inCmpFile = path.join(DATA_DIR, "testsort.cmp.h5")
self.outCmpFile = mkstemp(suffix=".cmp.h5")[1]
copyfile(self.inCmpFile, self.outCmpFile)
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/pbalign.git
More information about the debian-med-commit
mailing list