[med-svn] [python-pbcore] 01/12: Imported Upstream version 1.2.9+dfsg
Afif Elghraoui
afif at moszumanska.debian.org
Tue Apr 12 06:31:33 UTC 2016
This is an automated email from the git hooks/post-receive script.
afif pushed a commit to branch master
in repository python-pbcore.
commit 60c0774d1ab9947ee237aa4d355ac5c5fb54f1a9
Author: Afif Elghraoui <afif at ghraoui.name>
Date: Mon Apr 11 21:09:23 2016 -0700
Imported Upstream version 1.2.9+dfsg
---
CHANGELOG.org | 14 +-
bin/dataset.py | 64 +-
doc/pbcore.io.rst | 24 +-
pbcore/__init__.py | 2 +-
pbcore/chemistry/resources/mapping.xml | 8 +-
pbcore/data/datasets/__init__.py | 6 +-
pbcore/data/datasets/empty_lambda.aligned.bam | Bin 0 -> 700 bytes
pbcore/data/datasets/empty_lambda.aligned.bam.bai | Bin 0 -> 24 bytes
pbcore/data/datasets/empty_lambda.aligned.bam.pbi | Bin 0 -> 67 bytes
...69412550000001823090301191423_s1_p0.ccs.bam.pbi | Bin 0 -> 297 bytes
pbcore/io/BasH5IO.py | 145 +--
pbcore/io/FastaIO.py | 20 +-
pbcore/io/align/BamAlignment.py | 45 +-
pbcore/io/align/BamIO.py | 45 +-
pbcore/io/align/CmpH5IO.py | 99 +-
pbcore/io/align/_BamSupport.py | 7 +-
pbcore/io/dataset/DataSetIO.py | 1272 +++++++++++++-------
pbcore/io/dataset/DataSetMembers.py | 338 ++++--
pbcore/io/dataset/DataSetReader.py | 49 +-
pbcore/io/dataset/DataSetWriter.py | 10 +
pbcore/io/dataset/EntryPoints.py | 246 ----
pbcore/io/dataset/utils.py | 20 +-
pbcore/io/rangeQueries.py | 4 +-
pbcore/model/__init__.py | 2 +
pbcore/model/baseRegions.py | 192 +++
requirements-dev.txt | 1 +
requirements.txt | 2 +-
tests/test_pbcore_io_BasH5Reader.py | 3 +-
tests/test_pbcore_io_unaligned_bam.py | 2 +-
tests/test_pbdataset.py | 1139 ++++++++++++++++--
tests/test_pbdataset_subtypes.py | 319 ++++-
31 files changed, 2810 insertions(+), 1268 deletions(-)
diff --git a/CHANGELOG.org b/CHANGELOG.org
index 4eaeb5a..bf187aa 100644
--- a/CHANGELOG.org
+++ b/CHANGELOG.org
@@ -1,4 +1,16 @@
-* Current progress
+* Version 1.2.9
+ - Rename pulseFeature accessors to "baseFeature...", since they
+ reflect features accompanying basecalls as opposed to pulse calls;
+ add "pulseFeature..." accessors that actually get the pulse
+ features (from an internal-mode BAM)
+ - Improve FastaWriter performance with IndexedFastaRecords
+ - Improve DataSet chunking, filtration, consolidation
+
+* Version 1.2.8
+ - Dataset improvements
+ - Dataset CLI moved to pbcoretools
+ - Support for context flag filtering
+ - Misc fixes
* Version 1.2.7...
- Improved IPython tab completion
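The chunking improvements land in DataSet.split() (see the DataSetIO.py hunks
below). A minimal sketch of ZMW-based splitting, assuming an indexed
AlignmentSet; the path and chunk counts are placeholders::

    from pbcore.io import AlignmentSet

    dset = AlignmentSet("mapped.alignmentset.xml")
    # split by ZMW ranges into at most 8 chunks of roughly 5000 records each
    chunks = dset.split(zmws=True, chunks=8, targetSize=5000)
    for chunk in chunks:
        print(len(chunk))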
diff --git a/bin/dataset.py b/bin/dataset.py
index e5fcef8..ebf100f 100755
--- a/bin/dataset.py
+++ b/bin/dataset.py
@@ -1,67 +1,13 @@
#!/usr/bin/env python
import sys
-import argparse
-import logging
-import time
-from pbcore.io.dataset import EntryPoints
-from pbcore import __VERSION__
-
-
-LOG_FORMAT = "%(asctime)s [%(levelname)s] %(message)s"
-
-def _setup_logging():
- log = logging.getLogger()
- logging.Formatter.converter = time.gmtime
- if not log.handlers:
- logging.basicConfig(level=logging.WARN, format=LOG_FORMAT)
- return log
-
-def get_subparsers():
- sps = [('create', EntryPoints.create_options),
- ('filter', EntryPoints.filter_options),
- ('merge', EntryPoints.merge_options),
- ('split', EntryPoints.split_options),
- ('validate', EntryPoints.validate_options),
- ('loadstats', EntryPoints.loadStatsXml_options),
- ('summarize', EntryPoints.summarize_options),
- ('consolidate', EntryPoints.consolidate_options)]
- return sps
-
-def add_subparsers(parser, sps):
- subparsers = parser.add_subparsers(
- title='DataSet sub-commands', dest='subparser_name',
- help="Type {command} -h for a command's options")
- for command_name, func in sps:
- subparser = subparsers.add_parser(command_name)
- subparser = func(subparser)
- return parser
-
-def get_parser():
- description = 'Run dataset.py by specifying a command.'
- parser = argparse.ArgumentParser(version=__VERSION__,
- description=description)
- parser.add_argument("--debug", default=False, action='store_true',
- help="Turn on debug level logging")
- parser.add_argument("--strict", default=False, action='store_true',
- help="Turn on strict tests, raise all errors")
- parser.add_argument("--skipCounts", default=False, action='store_true',
- help="Turn on strict tests, raise all errors")
- subparser_list = get_subparsers()
- parser = add_subparsers(parser, subparser_list)
- return parser
def main(argv=sys.argv):
- """Main point of Entry"""
- log = _setup_logging()
- log.info("Starting {f} version {v} dataset manipulator".format(
- f=__file__, v=__VERSION__))
- parser = get_parser()
- args = parser.parse_args()
- if args.debug:
- log.setLevel(logging.DEBUG)
- return args.func(args)
- #return main_runner_default(argv[1:], get_parser(), log)
+ """This file, its use in setup.py should be removed after 1.2.8 is cut"""
+ print ("WARNING: dataset.py is no longer part of pbcore. It has moved to "
+ "pbcoretools (https://github.com/PacificBiosciences/pbcoretools) "
+ "and is now just 'dataset'")
+ exit(1)
if __name__ == '__main__':
sys.exit(main())
diff --git a/doc/pbcore.io.rst b/doc/pbcore.io.rst
index 34a9744..4d481ee 100644
--- a/doc/pbcore.io.rst
+++ b/doc/pbcore.io.rst
@@ -12,9 +12,28 @@ The classes within ``pbcore.io`` adhere to a few conventions, in order
to provide a uniform API:
- Each data file type is thought of as a container of a `Record`
- type; all `Reader` classes support streaming access, and
- `CmpH5Reader` and `BasH5Reader` additionally provide random-access
+ type; all `Reader` classes support streaming access by iterating on the
+ reader object, and
+ `CmpH5Reader`, `BasH5Reader` and `IndexedBarReader` additionally
+ provide random-access
to alignments/reads.
+
+ For example::
+
+ from pbcore.io import *
+ with IndexedBamReader(filename) as f:
+ for r in f:
+ process(r)
+
+ To make scripts a bit more user friendly, a progress bar can be
+ easily added using the `tqdm` third-party package::
+
+ from pbcore.io import *
+ from tqdm import tqdm
+ with IndexedBamReader(filename) as f:
+ for r in tqdm(f):
+ process(r)
+
- The constructor argument needed to instantiate `Reader` and
`Writer` objects can be either a filename (which can be suffixed
@@ -31,7 +50,6 @@ to provide a uniform API:
the `CmpH5Reader` object will be automatically closed after the
block within the "with" statement is executed.
-
BAM/cmp.h5 compatibility: quick start
-------------------------------------
diff --git a/pbcore/__init__.py b/pbcore/__init__.py
index 5edce9f..bfe805f 100644
--- a/pbcore/__init__.py
+++ b/pbcore/__init__.py
@@ -28,4 +28,4 @@
# POSSIBILITY OF SUCH DAMAGE.
#################################################################################
-__VERSION__ = "1.2.7"
+__VERSION__ = "1.2.9"
diff --git a/pbcore/chemistry/resources/mapping.xml b/pbcore/chemistry/resources/mapping.xml
index 2f625d8..c40b10a 100644
--- a/pbcore/chemistry/resources/mapping.xml
+++ b/pbcore/chemistry/resources/mapping.xml
@@ -176,9 +176,15 @@
<SoftwareVersion>2.3</SoftwareVersion>
</Mapping>
<Mapping>
- <SequencingChemistry>S/P1-C1</SequencingChemistry>
+ <SequencingChemistry>S/P1-C1/beta</SequencingChemistry>
<BindingKit>100-619-300</BindingKit>
<SequencingKit>100-620-000</SequencingKit>
<SoftwareVersion>3.0</SoftwareVersion>
</Mapping>
+ <Mapping>
+ <SequencingChemistry>S/P1-C1</SequencingChemistry>
+ <BindingKit>100-619-300</BindingKit>
+ <SequencingKit>100-620-000</SequencingKit>
+ <SoftwareVersion>3.1</SoftwareVersion>
+ </Mapping>
</MappingTable>
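These mappings are what pbcore.chemistry resolves from a
(BindingKit, SequencingKit, SoftwareVersion) triple. A sketch, assuming the
abbreviated version strings shown in the table above::

    from pbcore.chemistry import decodeTriple

    # the same kit pair now decodes differently by basecaller version
    print(decodeTriple("100-619-300", "100-620-000", "3.0"))  # S/P1-C1/beta
    print(decodeTriple("100-619-300", "100-620-000", "3.1"))  # S/P1-C1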
diff --git a/pbcore/data/datasets/__init__.py b/pbcore/data/datasets/__init__.py
index 2d4378a..b3d17c8 100755
--- a/pbcore/data/datasets/__init__.py
+++ b/pbcore/data/datasets/__init__.py
@@ -27,7 +27,8 @@ XML_FILES = ["alignment.dataset.xml", #0
]
BAM_FILES = ["pbalchemysim0.pbalign.bam",
"pbalchemysim1.pbalign.bam",
- os.path.join('..', 'bam_mapping.bam')]
+ os.path.join('..', 'bam_mapping.bam'),
+ "empty_lambda.aligned.bam",]
STATS_FILES = [
"m150430_142051_Mon_p1_b25.sts.xml",
"m150616_053259_ethan_c100710482550000001823136404221563_s1_p0.sts.xml"]
@@ -46,6 +47,9 @@ def getXmlWithStats():
def getBam(no=0):
return _getAbsPath(BAM_FILES[no])
+def getEmptyAlignedBam():
+ return _getAbsPath(BAM_FILES[3])
+
def getStats(no=0):
return _getAbsPath(STATS_FILES[no])
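The new empty_lambda fixture is reachable through getEmptyAlignedBam(); a
sketch of how a test might use it::

    import pbcore.data.datasets as data
    from pbcore.io import IndexedBamReader

    # an aligned BAM with zero records, shipped with .bai and .pbi indices
    with IndexedBamReader(data.getEmptyAlignedBam()) as bam:
        assert list(bam) == []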
diff --git a/pbcore/data/datasets/empty_lambda.aligned.bam b/pbcore/data/datasets/empty_lambda.aligned.bam
new file mode 100644
index 0000000..edd878c
Binary files /dev/null and b/pbcore/data/datasets/empty_lambda.aligned.bam differ
diff --git a/pbcore/data/datasets/empty_lambda.aligned.bam.bai b/pbcore/data/datasets/empty_lambda.aligned.bam.bai
new file mode 100644
index 0000000..13df110
Binary files /dev/null and b/pbcore/data/datasets/empty_lambda.aligned.bam.bai differ
diff --git a/pbcore/data/datasets/empty_lambda.aligned.bam.pbi b/pbcore/data/datasets/empty_lambda.aligned.bam.pbi
new file mode 100644
index 0000000..e398d79
Binary files /dev/null and b/pbcore/data/datasets/empty_lambda.aligned.bam.pbi differ
diff --git a/pbcore/data/m130727_114215_42211_c100569412550000001823090301191423_s1_p0.ccs.bam.pbi b/pbcore/data/m130727_114215_42211_c100569412550000001823090301191423_s1_p0.ccs.bam.pbi
new file mode 100644
index 0000000..1d37ca5
Binary files /dev/null and b/pbcore/data/m130727_114215_42211_c100569412550000001823090301191423_s1_p0.ccs.bam.pbi differ
diff --git a/pbcore/io/BasH5IO.py b/pbcore/io/BasH5IO.py
index 86e50a3..9d3055d 100644
--- a/pbcore/io/BasH5IO.py
+++ b/pbcore/io/BasH5IO.py
@@ -44,30 +44,13 @@ from pbcore.io.FofnIO import readFofn
from pbcore.chemistry import (decodeTriple,
tripleFromMetadataXML,
ChemistryLookupError)
+from pbcore.model import ExtraBaseRegionsMixin, HQ_REGION
from ._utils import arrayFromDataset, CommonEqualityMixin
-def intersectRanges(r1, r2):
- b1, e1 = r1
- b2, e2 = r2
- b, e = max(b1, b2), min(e1, e2)
- return (b, e) if (b < e) else None
-
-def rangeLength(r):
- b, e = r
- return e - b
-
-def removeNones(lst):
- return filter(lambda x: x!=None, lst)
-
# ZMW hole Types
SEQUENCING_ZMW = 0
-# Region types
-ADAPTER_REGION = 0
-INSERT_REGION = 1
-HQ_REGION = 2
-
# This seems to be the magic incantation to get a RecArray that can be
# indexed to yield a record that can then be accessed using dot
# notation.
@@ -85,7 +68,7 @@ def _makeQvAccessor(featureName):
return self.qv(featureName)
return f
-class Zmw(CommonEqualityMixin):
+class Zmw(CommonEqualityMixin, ExtraBaseRegionsMixin):
"""
A Zmw represents all data from a ZMW (zero-mode waveguide) hole
within a bas.h5 movie file. Accessor methods provide convenient
@@ -111,65 +94,6 @@ class Zmw(CommonEqualityMixin):
return toRecArray(REGION_TABLE_DTYPE,
[ (self.holeNumber, HQ_REGION, 0, 0, 0) ])
- #
- # The following "region" calls return one or more intervals ((int, int)).
- # - The default implementations perform clipping to the hqRegion.
- # - The "unclipped" implementations entail no clipping
- #
- @property
- def adapterRegionsNoQC(self):
- """
- Get adapter regions as intervals, without clipping to the HQ
- region. Don't use this unless you know what you're doing.
- """
- return [ (region.regionStart, region.regionEnd)
- for region in self.regionTable
- if region.regionType == ADAPTER_REGION ]
-
- @property
- def adapterRegions(self):
- """
- Get adapter regions as intervals, performing clipping to the HQ region
- """
- hqRegion = self.hqRegion
- return removeNones([ intersectRanges(hqRegion, region)
- for region in self.adapterRegionsNoQC ])
-
- @property
- def insertRegionsNoQC(self):
- """
- Get insert regions as intervals, without clipping to the HQ
- region. Don't use this unless you know what you're doing.
- """
- return [ (region.regionStart, region.regionEnd)
- for region in self.regionTable
- if region.regionType == INSERT_REGION ]
-
- @property
- def insertRegions(self):
- """
- Get insert regions as intervals, clipped to the HQ region
- """
- hqRegion = self.hqRegion
- return removeNones([ intersectRanges(hqRegion, region)
- for region in self.insertRegionsNoQC ])
- @property
- def hqRegion(self):
- """
- Return the HQ region interval.
-
- The HQ region is an interval of basecalls where the basecaller has
- inferred that a single sequencing reaction is taking place.
- Secondary analysis should only use subreads within the HQ
- region. Methods in this class, with the exception of the
- "NoQC" methods, return data appropriately clipped/filtered to
- the HQ region.
- """
- rt = self.regionTable
- hqRows = rt[rt.regionType == HQ_REGION]
- assert len(hqRows) == 1
- hqRow = hqRows[0]
- return hqRow.regionStart, hqRow.regionEnd
@property
def readScore(self):
@@ -242,7 +166,6 @@ class Zmw(CommonEqualityMixin):
readEnd = hqEnd if readEnd is None else readEnd
return ZmwRead(self.baxH5, self.holeNumber, readStart, readEnd)
-
def readNoQC(self, readStart=None, readEnd=None):
"""
Given no arguments, returns the entire polymerase read, *not
@@ -255,71 +178,12 @@ class Zmw(CommonEqualityMixin):
as we make no guarantees about what happens outside of the
HQ region.
"""
- if not self.baxH5.hasRawBasecalls:
- raise ValueError, "No raw reads in this file"
polymeraseBegin = 0
polymeraseEnd = self.numEvents
readStart = polymeraseBegin if readStart is None else readStart
readEnd = polymeraseEnd if readEnd is None else readEnd
return ZmwRead(self.baxH5, self.holeNumber, readStart, readEnd)
- @property
- def subreadsNoQC(self):
- """
- Get the subreads, including data beyond the bounds of the HQ region.
-
- .. warning::
-
- It is not recommended that production code use this method
- as we make no guarantees about what happens outside of the
- HQ region.
- """
- if not self.baxH5.hasRawBasecalls:
- raise ValueError, "No raw reads in this file"
- return [ self.read(readStart, readEnd)
- for (readStart, readEnd) in self.insertRegionsNoQC ]
-
- @property
- def subreads(self):
- """
- Get the subreads as a list of ZmwRead objects. Restricts focus,
- and clips to, the HQ region. This method can be used by
- production code.
- """
- if not self.baxH5.hasRawBasecalls:
- raise ValueError, "No raw reads in this file"
- return [ self.read(readStart, readEnd)
- for (readStart, readEnd) in self.insertRegions ]
-
-
- @property
- def adapters(self):
- """
- Get the adapter hits as a list of ZmwRead objects. Restricts
- focus, and clips to, the HQ region. This method can be used
- by production code.
- """
- if not self.baxH5.hasRawBasecalls:
- raise ValueError, "No raw reads in this file"
- return [ self.read(readStart, readEnd)
- for (readStart, readEnd) in self.adapterRegions ]
-
- @property
- def adaptersNoQC(self):
- """
- Get the adapters, including data beyond the bounds of the HQ
- region.
-
- .. warning::
-
- It is not recommended that production code use this method
- as we make no guarantees about what happens outside of the
- HQ region.
- """
- if not self.baxH5.hasRawBasecalls:
- raise ValueError, "No raw reads in this file"
- return [ self.read(readStart, readEnd)
- for (readStart, readEnd) in self.adapterRegionsNoQC ]
@property
def ccsRead(self):
@@ -429,8 +293,9 @@ class CCSZmwRead(ZmwRead):
def _makeOffsetsDataStructure(h5Group):
numEvent = h5Group["ZMW/NumEvent"].value
holeNumber = h5Group["ZMW/HoleNumber"].value
- endOffset = np.cumsum(numEvent)
- beginOffset = np.hstack(([0], endOffset[0:-1]))
+ endOffset = np.cumsum(numEvent, dtype=np.uint32)
+ singleZero = np.array([0], dtype=np.uint32)
+ beginOffset = np.hstack((singleZero, endOffset[0:-1]))
offsets = zip(beginOffset, endOffset)
return dict(zip(holeNumber, offsets))
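The removed helpers and region properties now live in pbcore.model.baseRegions
and reach Zmw through the ExtraBaseRegionsMixin imported above. The behaviour
they preserve is plain interval intersection against the HQ region; a
standalone sketch of that logic (mirroring the removed intersectRanges)::

    def intersectRanges(r1, r2):
        (b1, e1), (b2, e2) = r1, r2
        b, e = max(b1, b2), min(e1, e2)
        return (b, e) if b < e else None

    hqRegion = (50, 400)
    insertRegionsNoQC = [(0, 120), (150, 300), (380, 500)]
    insertRegions = [r for r in (intersectRanges(hqRegion, nr)
                                 for nr in insertRegionsNoQC)
                     if r is not None]
    # -> [(50, 120), (150, 300), (380, 400)]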
diff --git a/pbcore/io/FastaIO.py b/pbcore/io/FastaIO.py
index 79867fc..6735636 100644
--- a/pbcore/io/FastaIO.py
+++ b/pbcore/io/FastaIO.py
@@ -252,12 +252,9 @@ class FastaWriter(WriterBase):
raise ValueError
if len(args) == 1:
record = args[0]
- if isinstance(record, IndexedFastaRecord):
- record = FastaRecord(record.header, record.sequence)
- assert isinstance(record, FastaRecord)
else:
header, sequence = args
- record = FastaRecord(header, sequence)
+ record = FastaRecord(header, str(sequence))
self.file.write(str(record))
self.file.write("\n")
@@ -350,7 +347,14 @@ class MmappedFastaSequence(Sequence):
return (isinstance(other, MmappedFastaSequence) and
self[:] == other[:])
+ def __str__(self):
+ return str(self[:])
+
+
class IndexedFastaRecord(object):
+
+ COLUMNS = 60
+
def __init__(self, view, faiRecord):
self.view = view
self.faiRecord = faiRecord
@@ -391,6 +395,14 @@ class IndexedFastaRecord(object):
self.header == other.header and
self.sequence == other.sequence)
+ def __str__(self):
+ """
+ Output a string representation of this FASTA record, observing
+ standard conventions about sequence wrapping.
+ """
+ return (">%s\n" % self.header) + \
+ wrap(self.sequence, self.COLUMNS)
+
class IndexedFastaReader(ReaderBase, Sequence):
"""
Random-access FASTA file reader.
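With __str__ defined on IndexedFastaRecord (wrapped at the same 60 columns as
FastaRecord), FastaWriter can hand such records straight to file.write()
without first copying the sequence into a FastaRecord, which is where the
changelog's FastaWriter speedup comes from. A sketch with placeholder paths::

    from pbcore.io import IndexedFastaReader, FastaWriter

    # re-emit a FASTA file; records stream through as IndexedFastaRecord
    # objects and are serialized by their own __str__
    with IndexedFastaReader("in.fasta") as reader, \
         FastaWriter("out.fasta") as writer:
        for record in reader:
            writer.writeRecord(record)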
diff --git a/pbcore/io/align/BamAlignment.py b/pbcore/io/align/BamAlignment.py
index 2bdba22..8428d0f 100644
--- a/pbcore/io/align/BamAlignment.py
+++ b/pbcore/io/align/BamAlignment.py
@@ -58,12 +58,11 @@ def _unrollCigar(cigar, exciseSoftClips=False):
else:
return ops
-def _makePulseFeatureAccessor(featureName):
+def _makeBaseFeatureAccessor(featureName):
def f(self, aligned=True, orientation="native"):
- return self.pulseFeature(featureName, aligned, orientation)
+ return self.baseFeature(featureName, aligned, orientation)
return f
-
def requiresReference(method):
@wraps(method)
def f(bamAln, *args, **kwargs):
@@ -260,11 +259,27 @@ class BamAlignment(AlignmentRecordMixin):
return self.readGroupInfo.ReadType
@property
+ def scrapType(self):
+ if self.readType != "SCRAP":
+ raise ValueError, "scrapType not meaningful for non-scrap reads"
+ else:
+ return self.peer.opt("sc")
+
+ @property
def sequencingChemistry(self):
return self.readGroupInfo.SequencingChemistry
@property
def hqRegionSnr(self):
+ """
+ Return the per-channel SNR averaged over the HQ region.
+
+ .. note::
+
+ This capability was not available in `cmp.h5` files, so
+ use of this property can result in code that won't work on
+ legacy data.
+ """
return self.peer.opt("sn")
@property
@@ -431,9 +446,9 @@ class BamAlignment(AlignmentRecordMixin):
return pos[ucOriented != BAM_CINS]
- def pulseFeature(self, featureName, aligned=True, orientation="native"):
+ def baseFeature(self, featureName, aligned=True, orientation="native"):
"""
- Retrieve the pulse feature as indicated.
+ Retrieve the base feature as indicated.
- `aligned` : whether gaps should be inserted to reflect the alignment
- `orientation`: "native" or "genomic"
@@ -450,10 +465,10 @@ class BamAlignment(AlignmentRecordMixin):
# 0. Get the "concrete" feature name. (Example: Ipd could be
# Ipd:Frames or Ipd:CodecV1)
- concreteFeatureName = self.bam._featureNameMappings[self.qId][featureName]
+ concreteFeatureName = self.bam._baseFeatureNameMappings[self.qId][featureName]
# 1. Extract in native orientation
- tag, kind_, dtype_ = PULSE_FEATURE_TAGS[concreteFeatureName]
+ tag, kind_, dtype_ = BASE_FEATURE_TAGS[concreteFeatureName]
data_ = self.peer.opt(tag)
if isinstance(data_, str):
@@ -518,14 +533,14 @@ class BamAlignment(AlignmentRecordMixin):
alnData[~gapMask] = data
return alnData
- IPD = _makePulseFeatureAccessor("Ipd")
- PulseWidth = _makePulseFeatureAccessor("PulseWidth")
- #QualityValue = _makePulseFeatureAccessor("QualityValue")
- InsertionQV = _makePulseFeatureAccessor("InsertionQV")
- DeletionQV = _makePulseFeatureAccessor("DeletionQV")
- DeletionTag = _makePulseFeatureAccessor("DeletionTag")
- MergeQV = _makePulseFeatureAccessor("MergeQV")
- SubstitutionQV = _makePulseFeatureAccessor("SubstitutionQV")
+ IPD = _makeBaseFeatureAccessor("Ipd")
+ PulseWidth = _makeBaseFeatureAccessor("PulseWidth")
+ #QualityValue = _makeBaseFeatureAccessor("QualityValue")
+ InsertionQV = _makeBaseFeatureAccessor("InsertionQV")
+ DeletionQV = _makeBaseFeatureAccessor("DeletionQV")
+ DeletionTag = _makeBaseFeatureAccessor("DeletionTag")
+ MergeQV = _makeBaseFeatureAccessor("MergeQV")
+ SubstitutionQV = _makeBaseFeatureAccessor("SubstitutionQV")
def read(self, aligned=True, orientation="native"):
if not (orientation == "native" or orientation == "genomic"):
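A sketch of the renamed accessors at the alignment level; the path is a
placeholder and the features must actually be present in the BAM::

    from pbcore.io import IndexedBamReader

    with IndexedBamReader("mapped.bam") as bam:
        for aln in bam:
            ipd = aln.baseFeature("Ipd", aligned=False)
            # equivalently, through the bound accessor:
            ipd = aln.IPD(aligned=False)
            snr = aln.hqRegionSnr  # BAM-only; not available from cmp.h5 data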
diff --git a/pbcore/io/align/BamIO.py b/pbcore/io/align/BamIO.py
index f731b14..a9e7fa7 100644
--- a/pbcore/io/align/BamIO.py
+++ b/pbcore/io/align/BamIO.py
@@ -98,7 +98,10 @@ class _BamReaderBase(ReaderBase):
def _loadReadGroupInfo(self):
rgs = self.peer.header["RG"]
readGroupTable_ = []
- self._featureNameMappings = {} # RGID -> ("abstract feature name" -> actual feature name)
+
+ # RGID -> ("abstract feature name" -> actual feature name)
+ self._baseFeatureNameMappings = {}
+ self._pulseFeatureNameMappings = {}
for rg in rgs:
rgID = rgAsInt(rg["ID"])
@@ -110,9 +113,7 @@ class _BamReaderBase(ReaderBase):
triple = ds["BINDINGKIT"], ds["SEQUENCINGKIT"], basecallerVersion
rgChem = decodeTriple(*triple)
rgReadType = ds["READTYPE"]
- # TODO(dalexander): need FRAMERATEHZ in RG::DS!
- #rgFrameRate = ds["FRAMERATEHZ"]
- rgFrameRate = 75.0
+ rgFrameRate = ds["FRAMERATEHZ"]
readGroupTable_.append((rgID, rgName, rgReadType, rgChem, rgFrameRate))
# Look for the features manifest entries within the DS tag,
@@ -120,10 +121,14 @@ class _BamReaderBase(ReaderBase):
# "Ipd" to "Ipd:Frames"
# (This is a bit messy. Can we separate the manifest from
# the rest of the DS content?)
- featureNameMapping = { key.split(":")[0] : key
- for key in ds.keys()
- if key in PULSE_FEATURE_TAGS }
- self._featureNameMappings[rgID] = featureNameMapping
+ baseFeatureNameMapping = { key.split(":")[0] : key
+ for key in ds.keys()
+ if key in BASE_FEATURE_TAGS }
+ pulseFeatureNameMapping = { key.split(":")[0] : key
+ for key in ds.keys()
+ if key in PULSE_FEATURE_TAGS }
+ self._baseFeatureNameMappings[rgID] = baseFeatureNameMapping
+ self._pulseFeatureNameMappings[rgID] = pulseFeatureNameMapping
self._readGroupTable = np.rec.fromrecords(
readGroupTable_,
@@ -138,10 +143,12 @@ class _BamReaderBase(ReaderBase):
self._readGroupDict = { rg.ID : rg
for rg in self._readGroupTable }
- # The pulse features "available" to clients of this file are the intersection
- # of pulse features available from each read group.
+ # The base/pulse features "available" to clients of this file are the intersection
+ # of features available from each read group.
+ self._baseFeaturesAvailable = set.intersection(
+ *[set(mapping.keys()) for mapping in self._baseFeatureNameMappings.values()])
self._pulseFeaturesAvailable = set.intersection(
- *[set(mapping.keys()) for mapping in self._featureNameMappings.values()])
+ *[set(mapping.keys()) for mapping in self._pulseFeatureNameMappings.values()])
def _loadProgramInfo(self):
pgRecords = [ (pg["ID"], pg.get("VN", None), pg.get("CL", None))
@@ -288,12 +295,25 @@ class _BamReaderBase(ReaderBase):
self.peer.seek(offset)
return BamAlignment(self, next(self.peer))
+ def hasBaseFeature(self, featureName):
+ return featureName in self._baseFeaturesAvailable
+
+ def baseFeaturesAvailable(self):
+ return self._baseFeaturesAvailable
+
def hasPulseFeature(self, featureName):
return featureName in self._pulseFeaturesAvailable
def pulseFeaturesAvailable(self):
return self._pulseFeaturesAvailable
+ def hasPulseFeatures(self):
+ """
+ Is this BAM file a product of running analysis with the
+ PacBio-internal analysis mode enabled?
+ """
+ return self.hasPulseFeature("PulseCall")
+
@property
def barcode(self):
raise Unimplemented()
@@ -369,7 +389,8 @@ class IndexedBamReader(_BamReaderBase, IndexedAlignmentReaderMixin):
if exists(pbiFname):
self.pbi = PacBioBamIndex(pbiFname)
else:
- raise IOError, "IndexedBamReader requires bam.pbi index file"
+ raise IOError("IndexedBamReader requires bam.pbi index file "+
+ "to read {f}".format(f=fname))
else:
self.pbi = sharedIndex
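At the reader level the split is mirrored by hasBaseFeature() /
baseFeaturesAvailable() for basecall features and hasPulseFeature() /
pulseFeaturesAvailable() for internal-mode features, with hasPulseFeatures()
as the shorthand test. A sketch with a placeholder path::

    from pbcore.io import IndexedBamReader

    with IndexedBamReader("movie.subreads.bam") as bam:
        print(bam.baseFeaturesAvailable())   # set of abstract feature names
        if bam.hasPulseFeatures():
            # internal-mode BAM: PulseCall and friends are present
            print(bam.pulseFeaturesAvailable())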
diff --git a/pbcore/io/align/CmpH5IO.py b/pbcore/io/align/CmpH5IO.py
index 61eb81a..aa5e83b 100755
--- a/pbcore/io/align/CmpH5IO.py
+++ b/pbcore/io/align/CmpH5IO.py
@@ -187,9 +187,9 @@ OFFSET_TABLE_DTYPE = [ ("ID", np.uint32),
("EndRow", np.uint32) ]
-def _makePulseFeatureAccessor(featureName):
+def _makeBaseFeatureAccessor(featureName):
def f(self, aligned=True, orientation="native"):
- return self.pulseFeature(featureName, aligned, orientation)
+ return self.baseFeature(featureName, aligned, orientation)
return f
class CmpH5Alignment(AlignmentRecordMixin):
@@ -546,9 +546,9 @@ class CmpH5Alignment(AlignmentRecordMixin):
else:
return self.rStart + np.hstack([0, np.cumsum(readNonGapMask[:-1])])
- def pulseFeature(self, featureName, aligned=True, orientation="native"):
+ def baseFeature(self, featureName, aligned=True, orientation="native"):
"""
- Access a pulse feature by name.
+ Access a base feature by name.
"""
pulseDataset = self._alignmentGroup[featureName]
pulseArray = arrayFromDataset(pulseDataset, self.Offset_begin, self.Offset_end)
@@ -561,14 +561,14 @@ class CmpH5Alignment(AlignmentRecordMixin):
else:
return ungappedPulseArray(alignedPulseArray)
- IPD = _makePulseFeatureAccessor("IPD")
- PulseWidth = _makePulseFeatureAccessor("PulseWidth")
- QualityValue = _makePulseFeatureAccessor("QualityValue")
- InsertionQV = _makePulseFeatureAccessor("InsertionQV")
- DeletionQV = _makePulseFeatureAccessor("DeletionQV")
- DeletionTag = _makePulseFeatureAccessor("DeletionTag")
- MergeQV = _makePulseFeatureAccessor("MergeQV")
- SubstitutionQV = _makePulseFeatureAccessor("SubstitutionQV")
+ IPD = _makeBaseFeatureAccessor("IPD")
+ PulseWidth = _makeBaseFeatureAccessor("PulseWidth")
+ QualityValue = _makeBaseFeatureAccessor("QualityValue")
+ InsertionQV = _makeBaseFeatureAccessor("InsertionQV")
+ DeletionQV = _makeBaseFeatureAccessor("DeletionQV")
+ DeletionTag = _makeBaseFeatureAccessor("DeletionTag")
+ MergeQV = _makeBaseFeatureAccessor("MergeQV")
+ SubstitutionQV = _makeBaseFeatureAccessor("SubstitutionQV")
def __getattr__(self, key):
return self.cmpH5.alignmentIndex[self.rowNumber][key]
@@ -755,8 +755,8 @@ class CmpH5Reader(ReaderBase, IndexedAlignmentReaderMixin):
# dataset_name -> dataset }. This way we avoid B-tree
# scanning in basic data access.
self._alignmentGroupById = {}
- for (alnGroupId, alnGroupPath) in zip(self.file["/AlnGroup/ID"],
- self.file["/AlnGroup/Path"]):
+ for (alnGroupId, alnGroupPath) in zip(self.file["/AlnGroup/ID"][:],
+ self.file["/AlnGroup/Path"][:]):
alnGroup = self.file[alnGroupPath]
self._alignmentGroupById[alnGroupId] = dict(alnGroup.items())
@@ -807,18 +807,18 @@ class CmpH5Reader(ReaderBase, IndexedAlignmentReaderMixin):
def _loadReferenceInfo(self):
_referenceGroupTbl = np.rec.fromrecords(
- zip(self.file["/RefGroup/ID"],
- self.file["/RefGroup/RefInfoID"],
+ zip(self.file["/RefGroup/ID"][:],
+ self.file["/RefGroup/RefInfoID"][:],
[path[1:] for path in self.file["/RefGroup/Path"]]),
dtype=[("ID" , int),
("RefInfoID", int),
("Name" , object)])
_referenceInfoTbl = np.rec.fromrecords(
- zip(self.file["/RefInfo/ID"],
- self.file["/RefInfo/FullName"],
- self.file["/RefInfo/Length"],
- self.file["/RefInfo/MD5"]) ,
+ zip(self.file["/RefInfo/ID"][:],
+ self.file["/RefInfo/FullName"][:],
+ self.file["/RefInfo/Length"][:],
+ self.file["/RefInfo/MD5"][:]) ,
dtype=[("RefInfoID", int),
("FullName" , object),
("Length" , int),
@@ -838,26 +838,37 @@ class CmpH5Reader(ReaderBase, IndexedAlignmentReaderMixin):
jointype="inner")
self._referenceDict = {}
self._readLocatorByKey = {}
- for record in self._referenceInfoTable:
- if record.ID != -1:
- assert record.ID != record.Name
+
+ # For cmp.h5 files with large numbers of references, accessing
+ # the recarray fields in the inner loop was terribly slow.
+ # This makes things faster, though the code is less
+ # straightforward. (One of the tradeoffs we have to make
+ # without a compiler to help us...)
+ recordID = self._referenceInfoTable.ID
+ recordName = self._referenceInfoTable.Name
+ recordFullName = self._referenceInfoTable.FullName
+ recordMD5 = self._referenceInfoTable.MD5
+
+ for i, record in enumerate(self._referenceInfoTable):
+ if recordID[i] != -1:
+ assert recordID[i] != record.Name
shortName = splitFastaHeader(record.FullName)[0]
- if (shortName in self._referenceDict or
- record.ID in self._referenceDict or
- record.Name in self._referenceDict or
- record.FullName in self._referenceDict or
- record.MD5 in self._referenceDict):
+ if (shortName in self._referenceDict or
+ recordID[i] in self._referenceDict or
+ recordName[i] in self._referenceDict or
+ recordFullName[i] in self._referenceDict or
+ recordMD5[i] in self._referenceDict):
raise ValueError, "Duplicate reference contig sequence or identifier"
else:
- self._referenceDict[shortName] = record
- self._referenceDict[record.ID] = record
- self._referenceDict[record.Name] = record
- self._referenceDict[record.FullName] = record
- self._referenceDict[record.MD5] = record
+ self._referenceDict[shortName] = record
+ self._referenceDict[recordID[i]] = record
+ self._referenceDict[recordName[i]] = record
+ self._referenceDict[recordFullName[i]] = record
+ self._referenceDict[recordMD5[i]] = record
if self.isSorted:
- readLocator = makeReadLocator(self, record.ID)
- self._readLocatorByKey[record.ID] = readLocator
+ readLocator = makeReadLocator(self, recordID[i])
+ self._readLocatorByKey[recordID[i]] = readLocator
self._readLocatorByKey[shortName] = readLocator
def _loadMiscInfo(self):
@@ -1169,7 +1180,7 @@ class CmpH5Reader(ReaderBase, IndexedAlignmentReaderMixin):
else:
return self[rowNumbers]
- def hasPulseFeature(self, featureName):
+ def hasBaseFeature(self, featureName):
"""
Are the datasets for pulse feature `featureName` loaded in
this file? Specifically, is it loaded for all movies within
@@ -1177,30 +1188,30 @@ class CmpH5Reader(ReaderBase, IndexedAlignmentReaderMixin):
.. doctest::
- >>> c.hasPulseFeature("InsertionQV")
+ >>> c.hasBaseFeature("InsertionQV")
True
- >>> c.hasPulseFeature("MergeQV")
+ >>> c.hasBaseFeature("MergeQV")
False
"""
return all(featureName in alnGroup.keys()
for alnGroup in self._alignmentGroupById.values())
- def pulseFeaturesAvailable(self):
+ def baseFeaturesAvailable(self):
"""
What pulse features are available in this cmp.h5 file?
.. doctest::
- >>> c.pulseFeaturesAvailable()
+ >>> c.baseFeaturesAvailable()
[u'QualityValue', u'IPD', u'PulseWidth', u'InsertionQV', u'DeletionQV']
"""
- pulseFeaturesByMovie = [ alnGroup.keys()
+ baseFeaturesByMovie = [ alnGroup.keys()
for alnGroup in self._alignmentGroupById.values() ]
- pulseFeaturesAvailableAsSet = set.intersection(*map(set, pulseFeaturesByMovie))
- pulseFeaturesAvailableAsSet.discard("AlnArray")
- return list(pulseFeaturesAvailableAsSet)
+ baseFeaturesAvailableAsSet = set.intersection(*map(set, baseFeaturesByMovie))
+ baseFeaturesAvailableAsSet.discard("AlnArray")
+ return list(baseFeaturesAvailableAsSet)
@property
def barcode(self):
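The reference-table rework is a pure speed fix: reading a field from a recarray
row inside the loop (record.ID) is far slower than hoisting each column out
once, and the added [:] slices likewise pull each HDF5 dataset into memory in
one read rather than element by element. A toy illustration of the recarray
pattern (not pbcore API)::

    import numpy as np

    tbl = np.rec.fromrecords([(1, "lambda_NEB3011"), (2, "ecoliK12")],
                             names=["ID", "Name"])

    # per-row field lookup inside the loop (slow for very large tables):
    ids = [record.ID for record in tbl]

    # the pattern adopted above: read each column once, index plain arrays:
    recordID = tbl.ID
    ids = [recordID[i] for i in range(len(tbl))]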
diff --git a/pbcore/io/align/_BamSupport.py b/pbcore/io/align/_BamSupport.py
index 4f8f92a..fd12f62 100644
--- a/pbcore/io/align/_BamSupport.py
+++ b/pbcore/io/align/_BamSupport.py
@@ -38,7 +38,7 @@ class ReferenceMismatch(Exception): pass
class IncompatibleFile(Exception): pass
-PULSE_FEATURE_TAGS = { "InsertionQV" : ("iq", "qv", np.uint8),
+BASE_FEATURE_TAGS = { "InsertionQV" : ("iq", "qv", np.uint8),
"DeletionQV" : ("dq", "qv", np.uint8),
"DeletionTag" : ("dt", "base", np.int8 ),
"SubstitutionQV" : ("sq", "qv", np.uint8),
@@ -48,6 +48,11 @@ PULSE_FEATURE_TAGS = { "InsertionQV" : ("iq", "qv", np.uint8),
"PulseWidth:Frames" : ("pw", "frames", np.uint16),
"PulseWidth:CodecV1" : ("pw", "codecV1", np.uint8) }
+PULSE_FEATURE_TAGS = { "PulseCall" : ("pc", "pulse", np.uint8),
+ "StartFrame" : ("sf", "frames32", np.uint32),
+ "PkMid" : ("pm", "photons", np.uint16),
+ "PkMean" : ("pa", "photons", np.uint16) }
+
ASCII_COMPLEMENT_MAP = { ord("A") : ord("T"),
ord("T") : ord("A"),
ord("C") : ord("G"),
diff --git a/pbcore/io/dataset/DataSetIO.py b/pbcore/io/dataset/DataSetIO.py
index d80191b..a5edaef 100755
--- a/pbcore/io/dataset/DataSetIO.py
+++ b/pbcore/io/dataset/DataSetIO.py
@@ -2,21 +2,22 @@
Classes representing DataSets of various types.
"""
-from collections import defaultdict, Counter
import hashlib
import datetime
-import math
import copy
-import os
+import os, sys
+import re
import errno
import logging
import itertools
import xml.dom.minidom
import tempfile
-from functools import wraps
import numpy as np
from urlparse import urlparse
+from functools import wraps, partial
+from collections import defaultdict, Counter
from pbcore.util.Process import backticks
+from pbcore.chemistry.chemistry import ChemistryLookupError
from pbcore.io.align.PacBioBamIndex import PBI_FLAGS_BARCODE
from pbcore.io.FastaIO import splitFastaHeader, FastaWriter
from pbcore.io.FastqIO import FastqReader, FastqWriter, qvsFromAscii
@@ -55,6 +56,13 @@ def filtered(generator):
yield read
return wrapper
+def _getTimeStampedName(mType):
+ """Generate a timestamped name using the given metatype 'mType' and the
+ current UTC time"""
+ mType = mType.lower()
+ mType = '_'.join(mType.split('.'))
+ time = datetime.datetime.utcnow().strftime("%y%m%d_%H%M%S%f")[:-3]
+ return "{m}-{t}".format(m=mType, t=time)
def _toDsId(name):
"""Translate a class name into a MetaType/ID"""
@@ -100,15 +108,16 @@ def isDataSet(xmlfile):
try:
_typeDataSet(xmlfile)
return True
- except:
+ except Exception:
return False
def openDataSet(*files, **kwargs):
"""Factory function for DataSet types as suggested by the first file"""
- if not isDataSet(files[0]):
+ try:
+ tbrType = _typeDataSet(files[0])
+ return tbrType(*files, **kwargs)
+ except Exception:
raise TypeError("openDataSet requires that the first file is a DS")
- tbrType = _typeDataSet(files[0])
- return tbrType(*files, **kwargs)
def openDataFile(*files, **kwargs):
"""Factory function for DataSet types determined by the first data file"""
@@ -158,12 +167,59 @@ def _stackRecArrays(recArrays):
tbr = tbr.view(np.recarray)
return tbr
+def _uniqueRecords(recArray):
+ """Remove duplicate records"""
+ unique = set()
+ unique_i = []
+ for i, rec in enumerate(recArray):
+ rect = tuple(rec)
+ if rect not in unique:
+ unique.add(rect)
+ unique_i.append(i)
+ return recArray[unique_i]
+
+def _fieldsView(recArray, fields):
+ vdtype = np.dtype({fi:recArray.dtype.fields[fi] for fi in fields})
+ return np.ndarray(recArray.shape, vdtype, recArray, 0, recArray.strides)
+
+def _renameField(recArray, current, new):
+ ind = recArray.dtype.names.index(current)
+ names = list(recArray.dtype.names)
+ names[ind] = new
+ recArray.dtype.names = names
+
def _flatten(lol, times=1):
""" This wont do well with mixed nesting"""
for _ in range(times):
lol = np.concatenate(lol)
return lol
+def divideKeys(keys, chunks):
+ if chunks < 1:
+ return []
+ if chunks > len(keys):
+ chunks = len(keys)
+ chunksize = len(keys)/chunks
+ key_chunks = [keys[(i * chunksize):((i + 1) * chunksize)] for i in
+ range(chunks-1)]
+ key_chunks.append(keys[((chunks - 1) * chunksize):])
+ return key_chunks
+
+def splitKeys(keys, chunks):
+ if chunks < 1:
+ return []
+ if chunks > len(keys):
+ chunks = len(keys)
+ chunksize = len(keys)/chunks
+ key_chunks = [(keys[i * chunksize], keys[(i + 1) * chunksize - 1]) for i in
+ range(chunks-1)]
+ key_chunks.append((keys[(chunks - 1) * chunksize], keys[-1]))
+ return key_chunks
+
+def keysToRanges(keys):
+ key_ranges = [[min(k), max(k)] for k in keys]
+ return key_ranges
+
class DataSetMetaTypes(object):
"""
This mirrors the PacBioSecondaryDataModel.xsd definitions and be used
@@ -199,6 +255,32 @@ def _fileType(fname):
ftype = ftype.strip('.')
return ftype
+def _fileExists(fname):
+ """Assert that a file exists with a useful failure mode"""
+ if not isinstance(fname, str):
+ fname = fname.resourceId
+ if not os.path.isfile(fname):
+ raise InvalidDataSetIOError("Resource {f} not found".format(f=fname))
+ return True
+
+def checkAndResolve(fname, possibleRelStart=None):
+ """Try and skip resolveLocation if possible"""
+ tbr = fname
+ if not fname.startswith(os.path.sep):
+ log.debug('Unable to assume path is already absolute')
+ tbr = resolveLocation(fname, possibleRelStart)
+ return tbr
+
+def _pathChanger(osPathFunc, metaTypeFunc, resource):
+ """Apply these two functions to the resource or ResourceId"""
+ resId = resource.resourceId
+ currentPath = urlparse(resId)
+ if currentPath.scheme == 'file' or not currentPath.scheme:
+ currentPath = currentPath.path
+ currentPath = osPathFunc(currentPath)
+ resource.resourceId = currentPath
+ metaTypeFunc(resource)
+
class DataSet(object):
"""The record containing the DataSet information, with possible type
@@ -248,7 +330,8 @@ class DataSet(object):
>>> ds1.numExternalResources
2
>>> # Constructors should be used directly
- >>> SubreadSet(data.getSubreadSet()) # doctest:+ELLIPSIS
+ >>> SubreadSet(data.getSubreadSet(),
+ ... skipMissing=True) # doctest:+ELLIPSIS
<SubreadSet...
>>> # Even with untyped inputs
>>> AlignmentSet(data.getBam()) # doctest:+ELLIPSIS
@@ -266,7 +349,9 @@ class DataSet(object):
True
"""
+ files = [str(fn) for fn in files]
self._strict = kwargs.get('strict', False)
+ skipMissing = kwargs.get('skipMissing', False)
self._skipCounts = kwargs.get('skipCounts', False)
_induceIndices = kwargs.get('generateIndices', False)
@@ -288,9 +373,10 @@ class DataSet(object):
self.fileNames = files
# parse files
- log.debug('Containers specified')
populateDataSet(self, files)
- log.debug('Done populating')
+
+ if not skipMissing:
+ self._modResources(_fileExists)
# DataSet base class shouldn't really be used. It is ok-ish for just
# basic xml mainpulation. May start warning at some point, but
@@ -311,8 +397,8 @@ class DataSet(object):
else:
dsType = self.objMetadata.setdefault("MetaType",
_toDsId('DataSet'))
- if not self.objMetadata.get("TimeStampedName", ""):
- self.objMetadata["TimeStampedName"] = self._getTimeStampedName(
+ if not "TimeStampedName" in self.objMetadata:
+ self.objMetadata["TimeStampedName"] = _getTimeStampedName(
self.objMetadata["MetaType"])
self.objMetadata.setdefault("Name",
self.objMetadata["TimeStampedName"])
@@ -351,8 +437,12 @@ class DataSet(object):
self._referenceInfoTable = None
self._referenceDict = {}
self._indexMap = None
- self._stackedReferenceInfoTable = False
+ self._referenceInfoTableIsStacked = None
+ self._readGroupTableIsRemapped = False
self._index = None
+ # only to be used against incorrect counts from the XML, not for
+ # internal accounting:
+ self._countsUpdated = False
# update counts
if files:
@@ -360,12 +450,17 @@ class DataSet(object):
self.updateCounts()
elif self.totalLength <= 0 or self.numRecords <= 0:
self.updateCounts()
+ elif len(files) > 1:
+ self.updateCounts()
# generate indices if requested and needed
if _induceIndices:
self.induceIndices()
def induceIndices(self):
+ """Generate indices for ExternalResources.
+
+ Not compatible with DataSet base type"""
raise NotImplementedError()
def __repr__(self):
@@ -428,7 +523,7 @@ class DataSet(object):
# Block on filters?
if (not firstIn and
not self.filters.testCompatibility(other.filters)):
- log.warning("Filter incompatibility has blocked the addition "
+ log.warning("Filter incompatibility has blocked the merging "
"of two datasets")
return None
else:
@@ -526,21 +621,34 @@ class DataSet(object):
"closed properly.")
else:
raise
+ del reader
self._openReaders = []
def __exit__(self, *exec_info):
self.close()
def __len__(self):
- if self.numRecords == -1:
+ """Return the number of records in this DataSet"""
+ if self.numRecords <= 0:
if self._filters:
- log.warn("Base class DataSet length cannot be calculate when "
- "filters present")
+ if isinstance(self.datasetType, tuple):
+ log.debug("Base class DataSet length cannot be calculated "
+ "when filters are present")
+ else:
+ self.updateCounts()
else:
- count = 0
- for reader in self.resourceReaders():
- count += len(reader)
- self.numRecords = count
+ try:
+ # a little cheaper:
+ count = 0
+ for reader in self.resourceReaders():
+ count += len(reader)
+ self.numRecords = count
+ except UnavailableFeature:
+ # UnavailableFeature: no .bai
+ self.updateCounts()
+ elif not self._countsUpdated:
+ # isn't that expensive, avoids crashes due to incorrect numRecords:
+ self.updateCounts()
return self.numRecords
def newUuid(self, setter=True):
@@ -619,8 +727,7 @@ class DataSet(object):
... ds1.subdatasets for ds2d in
... ds2.subdatasets])
>>> # But types are maintained:
- >>> # TODO: turn strict back on once sim sets are indexable
- >>> ds1 = SubreadSet(data.getXml(no=10), strict=False)
+ >>> ds1 = SubreadSet(data.getXml(no=10), strict=True)
>>> ds1.metadata # doctest:+ELLIPSIS
<SubreadSetMetadata...
>>> ds2 = ds1.copy()
@@ -650,8 +757,6 @@ class DataSet(object):
"{t}".format(s=type(self).__name__,
t=asType))
tbr.merge(self)
- # update the metatypes: (TODO: abstract out 'iterate over all
- # resources and modify the element that contains them')
tbr.makePathsAbsolute()
return tbr
result = copy.deepcopy(self)
@@ -738,19 +843,25 @@ class DataSet(object):
True
>>> # unmerge:
>>> ds1, ds2 = sorted(
- ... dss.split(2, ignoreSubDatasets=False), key=lambda x: x.totalLength, reverse=True)
+ ... dss.split(2, ignoreSubDatasets=False),
+ ... key=lambda x: x.totalLength, reverse=True)
>>> ds1.totalLength == ds1tl
True
>>> ds2.totalLength == ds2tl
True
"""
+ # File must have pbi index to be splittable:
+ if len(self) == 0:
+ return [self.copy()]
if contigs:
return self._split_contigs(chunks, maxChunks, breakContigs,
targetSize=targetSize,
byRecords=byRecords,
updateCounts=updateCounts)
elif zmws:
+ if chunks == 0:
+ chunks = maxChunks
return self._split_zmws(chunks, targetSize=targetSize)
elif barcodes:
return self._split_barcodes(chunks)
@@ -863,9 +974,9 @@ class DataSet(object):
>>> import tempfile, os
>>> outdir = tempfile.mkdtemp(suffix="dataset-doctest")
>>> outfile = os.path.join(outdir, 'tempfile.xml')
- >>> ds1 = DataSet(data.getXml())
+ >>> ds1 = DataSet(data.getXml(), skipMissing=True)
>>> ds1.write(outfile, validate=False)
- >>> ds2 = DataSet(outfile)
+ >>> ds2 = DataSet(outfile, skipMissing=True)
>>> ds1 == ds2
True
"""
@@ -879,8 +990,14 @@ class DataSet(object):
if pretty:
xml_string = xml.dom.minidom.parseString(xml_string).toprettyxml(
encoding="UTF-8")
+
+ # not useful yet as a list, but nice to keep the options open:
+ validation_errors = []
if validate:
- validateString(xml_string, relTo=os.path.dirname(outFile))
+ try:
+ validateString(xml_string, relTo=os.path.dirname(outFile))
+ except Exception as e:
+ validation_errors.append(e)
fileName = urlparse(outFile).path.strip()
if self._strict and not isinstance(self.datasetType, tuple):
if not fileName.endswith(_dsIdToSuffix(self.datasetType)):
@@ -892,6 +1009,10 @@ class DataSet(object):
with open(fileName, 'w') as outFile:
outFile.writelines(xml_string)
+ for e in validation_errors:
+ log.error("Invalid file produced: {f}".format(f=fileName))
+ raise e
+
def loadStats(self, filename):
"""Load pipeline statistics from a <moviename>.sts.xml file. The subset
of these data that are defined in the DataSet XSD become available
@@ -917,8 +1038,8 @@ class DataSet(object):
[3152, 1802, 798, 0]
"""
- if isinstance(filename, str):
- statsMetadata = parseStats(filename)
+ if isinstance(filename, basestring):
+ statsMetadata = parseStats(str(filename))
else:
statsMetadata = filename
if self.metadata.summaryStats:
@@ -953,16 +1074,10 @@ class DataSet(object):
return filters
def _filterType(self):
+ """A key that maps to a set of filtration keywords specific to this
+ DataSet's ExternalResource type"""
raise NotImplementedError()
- def checkAndResolve(self, fname, possibleRelStart=None):
- """Try and skip resolveLocation if possible"""
- if fname.startswith(os.path.sep):
- return fname
- else:
- log.debug('Unable to assume path is already absolute')
- return resolveLocation(fname, possibleRelStart)
-
def makePathsAbsolute(self, curStart="."):
"""As part of the validation process, make all ResourceIds absolute
URIs rather than relative paths. Generally not called by API users.
@@ -972,7 +1087,7 @@ class DataSet(object):
"""
log.debug("Making paths absolute")
self._changePaths(
- lambda x, s=curStart: self.checkAndResolve(x, s))
+ lambda x, s=curStart: checkAndResolve(x, s))
def makePathsRelative(self, outDir=False):
"""Make things easier for writing test cases: make all
@@ -990,6 +1105,8 @@ class DataSet(object):
self._changePaths(os.path.relpath)
def _modResources(self, func):
+ """Execute some function 'func' on each external resource in the
+ dataset and each subdataset"""
# check all ExternalResources
stack = list(self.externalResources)
while stack:
@@ -1000,6 +1117,7 @@ class DataSet(object):
func(item)
try:
stack.extend(list(item.indices))
+ stack.extend(list(item.externalResources))
except AttributeError:
# Some things just don't have indices
pass
@@ -1016,34 +1134,13 @@ class DataSet(object):
:osPathFunc: A function for modifying paths (e.g. os.path.abspath)
:checkMetaType: Update the metatype of externalResources if needed
"""
- # check all ExternalResources
- stack = list(self.externalResources)
- while stack:
- item = stack.pop()
- resId = item.resourceId
- if not resId:
- continue
- currentPath = urlparse(resId)
- if currentPath.scheme == 'file' or not currentPath.scheme:
- currentPath = currentPath.path
- currentPath = osPathFunc(currentPath)
- item.resourceId = currentPath
- if checkMetaType:
- self._updateMetaType(item)
- try:
- stack.extend(list(item.indices))
- stack.extend(list(item.externalResources))
- except AttributeError:
- # Some things just don't have indices or subresources
- pass
-
- # check all DataSetMetadata
-
- # check all SubDatasets
- for dataset in self.subdatasets:
- dataset._changePaths(osPathFunc)
+ metaTypeFunc = self._updateMetaType if checkMetaType else lambda x: x
+ resFunc = partial(_pathChanger, osPathFunc, metaTypeFunc)
+ self._modResources(resFunc)
def _populateMetaTypes(self):
+ """Add metatypes to those ExternalResources that currently are
+ without"""
self._modResources(self._updateMetaType)
def _updateMetaType(self, resource):
@@ -1054,15 +1151,9 @@ class DataSet(object):
resource.metaType = self._metaTypeMapping().get(file_type, "")
if not resource.timeStampedName:
mtype = resource.metaType
- tsName = self._getTimeStampedName(mtype)
+ tsName = _getTimeStampedName(mtype)
resource.timeStampedName = tsName
- def _getTimeStampedName(self, mType):
- mType = mType.lower()
- mType = '_'.join(mType.split('.'))
- time = datetime.datetime.utcnow().strftime("%y%m%d_%H%M%S%f")[:-3]
- return "{m}-{t}".format(m=mType, t=time)
-
@staticmethod
def _metaTypeMapping():
"""The available mappings between file extension and MetaType (informed
@@ -1071,6 +1162,8 @@ class DataSet(object):
return {}
def copyFiles(self, outdir):
+ """Copy all of the top level ExternalResources to an output
+ directory 'outdir'"""
backticks('cp {i} {o}'.format(i=' '.join(self.toExternalFiles()),
o=outdir))
@@ -1078,6 +1171,7 @@ class DataSet(object):
"""Disable read filtering for this object"""
self.reFilter()
self.noFiltering = True
+ # a dummy filter:
self._cachedFilters = [lambda x: True]
def enableFilters(self):
@@ -1095,9 +1189,9 @@ class DataSet(object):
Doctest:
>>> import pbcore.data.datasets as data
- >>> from pbcore.io import DataSet
+ >>> from pbcore.io import SubreadSet
>>> from pbcore.io.dataset.DataSetMembers import Filters
- >>> ds1 = DataSet()
+ >>> ds1 = SubreadSet()
>>> filt = Filters()
>>> filt.addRequirement(rq=[('>', '0.85')])
>>> ds1.addFilters(filt)
@@ -1120,12 +1214,25 @@ class DataSet(object):
:newMetadata: The object metadata of a DataSet being considered for
merging
"""
- if self.objMetadata.get('Version'):
- if newMetadata.get('Version') > self.objMetadata.get('Version'):
+ # If there is no objMetadata, this is a new dataset being populated
+ if self.objMetadata:
+ # if there isn't a Version in each, that will fail eventually
+ if 'Version' in self.objMetadata and 'Version' in newMetadata:
+ if newMetadata['Version'] == self.objMetadata['Version']:
+ return True
+ # We'll make an exception for now: major version number passes
+ elif (newMetadata['Version'].split('.')[0] ==
+ self.objMetadata['Version'].split('.')[0]):
+ log.warn("Future warning: merging datasets that don't "
+ "share a version number will fail.")
+ return True
raise ValueError("Wrong dataset version for merging {v1} vs "
"{v2}".format(
v1=newMetadata.get('Version'),
v2=self.objMetadata.get('Version')))
+ log.warn("Future warning: merging will require Version "
+ "numbers for both DataSets")
+ return True
def addMetadata(self, newMetadata, **kwargs):
@@ -1187,8 +1294,16 @@ class DataSet(object):
self.metadata.addMetadata(key, value)
def updateCounts(self):
+ """Update the TotalLength and NumRecords for this DataSet.
+
+ Not compatible with the base DataSet class, which has no ability to
+ touch ExternalResources. -1 is used as a sentinel value for failed size
+ determination. It should never be written out to XML in regular use.
+
+ """
self.metadata.totalLength = -1
self.metadata.numRecords = -1
+ self._countsUpdated = True
def addExternalResources(self, newExtResources, updateCount=True):
"""Add additional ExternalResource objects, ensuring no duplicate
@@ -1256,9 +1371,12 @@ class DataSet(object):
self.subdatasets.append(otherDataSet)
def _openFiles(self):
+ """Open the top level ExternalResources"""
raise RuntimeError("Not defined for this type of DataSet")
def resourceReaders(self):
+ """Return a list of open pbcore Reader objects for the
+ top level ExternalResources in this DataSet"""
raise RuntimeError("Not defined for this type of DataSet")
@property
@@ -1284,9 +1402,18 @@ class DataSet(object):
yield record
def __iter__(self):
- """ Iterate over the records. (sorted for AlignmentSet objects)"""
- for record in self.records:
- yield record
+ """Iterate over the records.
+
+ The order of yielded reads is determined by the order of the
+ ExternalResources and record order within each file"""
+ if self.isIndexed:
+ # this uses the index to respect filters
+ for i in xrange(len(self)):
+ yield self[i]
+ else:
+ # this uses post-filtering to respect filters
+ for record in self.records:
+ yield record
@property
def subSetNames(self):
@@ -1296,8 +1423,10 @@ class DataSet(object):
subNames.extend(sds.name)
return subNames
- def readsInSubDatasets(self, subNames=[]):
+ def readsInSubDatasets(self, subNames=None):
"""To be used in conjunction with self.subSetNames"""
+ if subNames is None:
+ subNames = []
if self.subdatasets:
for sds in self.subdatasets:
if subNames and sds.name not in subNames:
@@ -1372,7 +1501,8 @@ class DataSet(object):
Doctest:
>>> from pbcore.io import DataSet
- >>> DataSet("bam1.bam", "bam2.bam", strict=False).toFofn(uri=False)
+ >>> DataSet("bam1.bam", "bam2.bam", strict=False,
+ ... skipMissing=True).toFofn(uri=False)
['bam1.bam', 'bam2.bam']
"""
lines = [er.resourceId for er in self.externalResources]
@@ -1397,6 +1527,7 @@ class DataSet(object):
@property
def _castableDataSetTypes(self):
+ """Tuple of DataSet types to which this DataSet can be cast"""
if isinstance(self.datasetType, tuple):
return self.datasetType
else:
@@ -1444,7 +1575,7 @@ class DataSet(object):
@property
def filters(self):
"""Limit setting to ensure cache hygiene and filter compatibility"""
- self._filters.registerCallback(lambda x=self: x.reFilter())
+ self._filters.registerCallback(self._wipeCaches)
return self._filters
@filters.setter
@@ -1453,7 +1584,9 @@ class DataSet(object):
if value is None:
self._filters = Filters()
else:
+ value.clearCallbacks()
self._filters = value
+ self.reFilter()
def reFilter(self, light=True):
"""
@@ -1468,6 +1601,15 @@ class DataSet(object):
self.metadata.numRecords = -1
if self.metadata.summaryStats:
self.metadata.removeChildren('SummaryStats')
+ self.updateCounts()
+
+ def _wipeCaches(self):
+ self.reFilter(False)
+
+ @property
+ def createdAt(self):
+ """Return the DataSet CreatedAt timestamp"""
+ return self.objMetadata.get('CreatedAt')
@property
def numRecords(self):
@@ -1505,6 +1647,16 @@ class DataSet(object):
self.objMetadata['Name'] = value
@property
+ def description(self):
+ """The description of this DataSet"""
+ return self.objMetadata.get('Description', '')
+
+ @description.setter
+ def description(self, value):
+ """The description of this DataSet"""
+ self.objMetadata['Description'] = value
+
+ @property
def numExternalResources(self):
"""The number of ExternalResources in this DataSet"""
return len(self.externalResources)
@@ -1524,24 +1676,24 @@ class DataSet(object):
raise ResourceMismatchError(responses)
return responses[0]
- def hasPulseFeature(self, featureName):
+ def hasBaseFeature(self, featureName):
responses = self._pollResources(
- lambda x: x.hasPulseFeature(featureName))
+ lambda x: x.hasBaseFeature(featureName))
return self._unifyResponses(responses)
- def pulseFeaturesAvailable(self):
- responses = self._pollResources(lambda x: x.pulseFeaturesAvailable())
+ def baseFeaturesAvailable(self):
+ responses = self._pollResources(lambda x: x.baseFeaturesAvailable())
return self._unifyResponses(responses)
@property
def sequencingChemistry(self):
- key = 'sequencingChemistry'
- responses = self._pollResources(lambda x: getattr(x, key))
+ responses = self._pollResources(lambda x: x.sequencingChemistry)
return list(_flatten(responses))
@property
def isEmpty(self):
- return self._checkIdentical('isEmpty')
+ responses = self._pollResources(lambda x: getattr(x, 'isEmpty'))
+ return all(responses)
@property
def readType(self):
@@ -1583,27 +1735,19 @@ class DataSet(object):
def _indexRecords(self):
raise NotImplementedError()
+ def isIndexed(self):
+ raise NotImplementedError()
+
def assertIndexed(self):
raise NotImplementedError()
def _assertIndexed(self, acceptableTypes):
if not self._openReaders:
- try:
- tmp = self._strict
- self._strict = True
- self._openFiles()
- except Exception:
- # Catch everything to recover the strictness status, then raise
- # whatever error was found.
- self._strict = tmp
- raise
- finally:
- self._strict = tmp
- else:
- for fname, reader in zip(self.toExternalFiles(),
- self.resourceReaders()):
- if not isinstance(reader, acceptableTypes):
- raise IOError(errno.EIO, "File not indexed", fname)
+ self._openFiles()
+ for fname, reader in zip(self.toExternalFiles(),
+ self.resourceReaders()):
+ if not isinstance(reader, acceptableTypes):
+ raise IOError(errno.EIO, "File not indexed", fname)
return True
def __getitem__(self, index):
@@ -1632,7 +1776,7 @@ class DataSet(object):
class InvalidDataSetIOError(Exception):
- """The base class for all DataSetIO related custom exceptions (hopefully)
+ """The base class for all DataSetIO related custom exceptions
"""
@@ -1680,11 +1824,16 @@ class ReadSet(DataSet):
return True
return False
- def assertBarcoded(self):
- """Test whether all resources are barcoded files"""
+ @property
+ def isBarcoded(self):
+ """Determine whether all resources are barcoded files"""
res = self._pollResources(
lambda x: x.index.pbiFlags & PBI_FLAGS_BARCODE)
- if not self._unifyResponses(res):
+ return self._unifyResponses(res)
+
+ def assertBarcoded(self):
+ """Test whether all resources are barcoded files"""
+ if not self.isBarcoded:
raise RuntimeError("File not barcoded")
def _openFiles(self):
@@ -1695,17 +1844,22 @@ class ReadSet(DataSet):
log.debug("Closing old readers...")
self.close()
log.debug("Opening ReadSet resources")
+ sharedRefs = {}
+ infotables = []
+ infodicts = []
for extRes in self.externalResources:
refFile = extRes.reference
if refFile:
- log.debug("Using reference: {r}".format(r=refFile))
+ if not refFile in sharedRefs:
+ log.debug("Using reference: {r}".format(r=refFile))
+ sharedRefs[refFile] = IndexedFastaReader(refFile)
location = urlparse(extRes.resourceId).path
resource = None
try:
if extRes.resourceId.endswith('bam'):
- resource = IndexedBamReader(
- location,
- referenceFastaFname=refFile)
+ resource = IndexedBamReader(location)
+ if refFile:
+ resource.referenceFasta = sharedRefs[refFile]
else:
resource = CmpH5Reader(location)
except (IOError, ValueError):
@@ -1713,23 +1867,43 @@ class ReadSet(DataSet):
log.warn("pbi file missing for {f}, operating with "
"reduced speed and functionality".format(
f=location))
- resource = BamReader(
- location, referenceFastaFname=refFile)
+ resource = BamReader(location)
+ if refFile:
+ resource.referenceFasta = sharedRefs[refFile]
else:
raise
+ # Consolidate referenceDicts
+ # This gets huge when there are ~90k references. If you have ~28
+ # chunks, each with 28 BamReaders, each with 100MB referenceDicts,
+ # you end up storing tens of gigs of just these (often identical)
+ # dicts
+ if not len(infotables):
+ infotables.append(resource._referenceInfoTable)
+ infodicts.append(resource._referenceDict)
+ else:
+ for ri, rd in zip(infotables, infodicts):
+ if np.array_equal(resource._referenceInfoTable, ri):
+ del resource._referenceInfoTable
+ del resource._referenceDict
+ resource._referenceInfoTable = ri
+ resource._referenceDict = rd
+ break
+ infotables.append(resource._referenceInfoTable)
+ infodicts.append(resource._referenceDict)
self._openReaders.append(resource)
try:
if resource.isEmpty:
- log.warn("{f} contains no reads!".format(
+ log.debug("{f} contains no reads!".format(
f=extRes.resourceId))
except UnavailableFeature: # isEmpty requires bai
if not list(itertools.islice(resource, 1)):
- log.warn("{f} contains no reads!".format(
+ log.debug("{f} contains no reads!".format(
f=extRes.resourceId))
if len(self._openReaders) == 0 and len(self.toExternalFiles()) != 0:
raise IOError("No files were openable")
log.debug("Done opening resources")
+
def _filterType(self):
return 'bam'
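
The referenceDict consolidation in the hunk above amounts to keeping one canonical copy of each distinct table and pointing every reader whose table is equal at that single object. A minimal standalone sketch of the pattern; FakeReader and share_identical_tables are illustrative names, not pbcore API:

    import numpy as np

    class FakeReader(object):
        """Stand-in for a reader carrying a (possibly huge) reference table."""
        def __init__(self, table):
            self._referenceInfoTable = table

    def share_identical_tables(readers):
        canonical = []
        for reader in readers:
            for table in canonical:
                if np.array_equal(reader._referenceInfoTable, table):
                    # identical content: share the one object, drop the copy
                    reader._referenceInfoTable = table
                    break
            else:
                canonical.append(reader._referenceInfoTable)

    table = np.arange(5)
    readers = [FakeReader(table.copy()) for _ in range(3)]
    share_identical_tables(readers)
    assert readers[0]._referenceInfoTable is readers[1]._referenceInfoTable
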
@@ -1742,7 +1916,7 @@ class ReadSet(DataSet):
return self._unifyResponses(res)
except ResourceMismatchError:
if not self._strict:
- log.error("Resources inconsistently indexed")
+ log.warn("Resources inconsistently indexed")
return False
else:
raise
@@ -1762,7 +1936,7 @@ class ReadSet(DataSet):
try:
self.assertBarcoded()
except RuntimeError:
- log.warn("No barcodes found in BAM file, skipping split")
+ log.info("No barcodes found in BAM file, skipping split")
return [self.copy()]
barcodes = defaultdict(int)
for bcTuple in itertools.izip(self.index.bcForward,
@@ -1789,6 +1963,7 @@ class ReadSet(DataSet):
log.debug("Done chunking")
log.debug("Modifying filters or resources")
for result, chunk in zip(results, chunks):
+ result.filters.removeRequirement('bc')
result.filters.addRequirement(
bc=[('=', list(c[0])) for c in chunk])
@@ -1806,83 +1981,121 @@ class ReadSet(DataSet):
return results
def _split_zmws(self, chunks, targetSize=None):
- files_to_movies = defaultdict(list)
- n_bam = 0
- for bam in self.resourceReaders():
- n_bam += 1
- if len(bam.readGroupTable) > 1:
- raise RuntimeError("Multiple read groups in single .bam")
- if chunks < n_bam:
- return self.split(chunks=chunks)
- target_nchunks = self.numRecords/targetSize
- # turn on to provide more reasonable nchunks on small datasets:
- #chunks = min(chunks, target_nchunks)
- n_chunks_per_bam = max(1, int(math.floor(float(chunks) / n_bam)))
- if n_chunks_per_bam < 2:
- log.warn("%d ZMW chunks requested but there are %d files" %
- (chunks, n_bam))
- n_chunks = n_bam * n_chunks_per_bam
- active_holenumbers = 0
- for reader in self.resourceReaders():
- active_holenumbers += len(set(reader.holeNumber))
- n_chunks = min(active_holenumbers, n_chunks)
+        """Hole numbers must be unique within each movie"""
+
+ if chunks == 1:
+ return [self.copy()]
+ # make sure we can pull out the movie name:
+ rgIdMovieNameMap = {rg[0]: rg[1] for rg in self.readGroupTable}
+
+ # find atoms:
+ active_holenumbers = self.index
+ n_chunks = min(len(active_holenumbers), chunks)
+
+ # if we have a target size and can have two or more chunks:
+ if (not targetSize is None and len(active_holenumbers) > 1 and
+ chunks > 1):
+ n_chunks = min(n_chunks, len(active_holenumbers)/targetSize)
+ # we want at least two if we can swing it:
+ n_chunks = max(n_chunks, 2)
+
+ # make sure there aren't too few atoms
if n_chunks != chunks:
log.info("Adjusted number of chunks to %d" % n_chunks)
- log.debug("Making copies")
- results = [self.copy() for _ in range(n_chunks)]
- j_chunk = 0
- for bam in self.resourceReaders():
- rg = bam.readGroupTable[0]
- n_reads = len(bam.holeNumber)
- chunk_size = int(math.ceil(float(n_reads / n_chunks_per_bam)))
- i_read = 0
- for i_chunk in range(n_chunks_per_bam):
- if i_read >= n_reads:
- break
- result = results[j_chunk]
- j_chunk += 1
- zmw_start = bam.holeNumber[i_read]
- if i_chunk == n_chunks_per_bam - 1:
- zmw_end = bam.holeNumber.max()
- else:
- i_read += chunk_size
- zmw_end = bam.holeNumber[min(i_read, n_reads-1)]
- while i_read < n_reads-1:
- if zmw_end == bam.holeNumber[i_read+1]:
- i_read += 1
- zmw_end = bam.holeNumber[i_read]
- else:
- break
- result.filters.addRequirement(
- movie=[('=', rg.MovieName)],
- zm=[('<', zmw_end+1)])
- result.filters.addRequirement(
- zm=[('>', zmw_start-1)])
- i_read += 1
- # UniqueId was regenerated when the ExternalResource list was
- # whole, therefore we need to regenerate it again here
- log.debug("Generating new UUID")
- for result in results:
- result.reFilter()
- result.newUuid()
- result.updateCounts()
+ # sort atoms and group into chunks:
+ active_holenumbers.sort(order=['qId', 'holeNumber'])
+ view = _fieldsView(self.index, ['qId', 'holeNumber'])
+ keys = np.unique(view)
+ ranges = splitKeys(keys, n_chunks)
+
+        # The above ranges can include hidden, unrepresented movie names that
+ # are sandwiched between those in the range. In order to capture those,
+ # we need to find the indices of the range bounds, then pull out the
+ # chunks.
+ hn_chunks = []
+ for zmw_range in ranges:
+ if zmw_range[0][0] == zmw_range[1][0]:
+ hn_chunks.append(active_holenumbers[
+ (active_holenumbers['qId'] == zmw_range[0][0]) &
+ (active_holenumbers['holeNumber'] >= zmw_range[0][1]) &
+ (active_holenumbers['holeNumber'] <= zmw_range[1][1])])
+ else:
+ start = np.flatnonzero(
+ (active_holenumbers['qId'] == zmw_range[0][0]) &
+ (active_holenumbers['holeNumber'] == zmw_range[0][1]))[0]
+ end = np.flatnonzero(
+ (active_holenumbers['qId'] == zmw_range[1][0]) &
+ (active_holenumbers['holeNumber'] == zmw_range[1][1]))[-1]
+ hn_chunks.append(active_holenumbers[start:(end + 1)])
+
+ results = []
+ log.debug("Making copies")
+ tmp_results = [self.copy() for _ in range(n_chunks)]
+
+ # add filters
+ for chunk, res in zip(hn_chunks, tmp_results):
+ # check if multiple movies:
+ if chunk[0]['qId'] == chunk[-1]['qId']:
+ movieName = rgIdMovieNameMap[chunk[0]['qId']]
+ zmwStart = chunk[0]['holeNumber']
+ zmwEnd = chunk[-1]['holeNumber']
+ res._filters.clearCallbacks()
+ res._filters.addRequirement(
+ movie=[('=', movieName)],
+ zm=[('<', zmwEnd+1)])
+ res._filters.addRequirement(
+ zm=[('>', zmwStart-1)])
+ else:
+ movieNames = []
+ zmwStarts = []
+ zmwEnds = []
+ for mov in np.unique(chunk['qId']):
+ movieNames.append(rgIdMovieNameMap[mov])
+ inds = np.flatnonzero(chunk['qId'] == mov)
+ zmwStarts.append(chunk[inds[0]]['holeNumber'])
+ zmwEnds.append(chunk[inds[-1]]['holeNumber'])
+ res._filters.clearCallbacks()
+ res._filters.addRequirement(
+ movie=[('=', mn) for mn in movieNames],
+ zm=[('<', ze + 1) for ze in zmwEnds])
+ res._filters.mapRequirement(
+ zm=[('>', zs - 1) for zs in zmwStarts])
+ res.numRecords = len(chunk)
+ res.totalLength = sum(chunk['qEnd'] - chunk['qStart'])
+ res.newUuid()
+ results.append(res)
+
+ # we changed the sort order above, so this is dirty:
+ self._index = None
+ self._indexMap = None
# Update the basic metadata for the new DataSets from external
# resources, or at least mark as dirty
# TODO
return results
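
The chunking above sorts the index on (qId, holeNumber), cuts the unique keys into contiguous ranges, and expresses each range as movie/zm filter requirements. A rough sketch of the key-splitting step alone; split_keys is a hypothetical stand-in for pbcore's splitKeys helper and may not match its real signature:

    def split_keys(keys, n_chunks):
        """Cut sorted keys into n contiguous (first, last) ranges."""
        keys = sorted(keys)
        n_chunks = min(n_chunks, len(keys))
        bounds = [len(keys) * i // n_chunks for i in range(n_chunks + 1)]
        return [(keys[bounds[i]], keys[bounds[i + 1] - 1])
                for i in range(n_chunks)]

    # (movieName, holeNumber) pairs, unique within each movie:
    keys = [("m1", hn) for hn in range(10)] + [("m2", hn) for hn in range(5)]
    for (m0, z0), (m1, z1) in split_keys(keys, 3):
        # each range becomes filter requirements roughly like:
        #   movie == m0..m1 AND z0 - 1 < zm < z1 + 1
        print("%s/%d -> %s/%d" % (m0, z0, m1, z1))
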
+
@property
def readGroupTable(self):
"""Combine the readGroupTables of each external resource"""
responses = self._pollResources(lambda x: x.readGroupTable)
if len(responses) > 1:
- tbr = reduce(np.append, responses)
+ # append the read groups, but eliminate duplicates.
+ tbr = _uniqueRecords(reduce(np.append, responses))
+ # reassign qIds if dupes:
+ if len(set(tbr['ID'])) < len(tbr):
+ self._readGroupTableIsRemapped = True
+ tbr['ID'] = range(len(tbr))
return tbr
else:
return responses[0]
+ @property
+ def movieIds(self):
+ """A dict of movieName: movieId for the joined readGroupTable"""
+ return {rg.MovieName: rg.ID for rg in self.readGroupTable}
+
def assertIndexed(self):
self._assertIndexed((IndexedBamReader, CmpH5Reader))
@@ -1892,6 +2105,33 @@ class ReadSet(DataSet):
res = self._pollResources(lambda x: isinstance(x, CmpH5Reader))
return self._unifyResponses(res)
+ def _fixQIds(self, indices, resourceReader):
+ qId_acc = lambda x: x.MovieID
+ if not self.isCmpH5:
+ qId_acc = lambda x: x.qId
+
+ rr = resourceReader
+ try:
+ # this would populate the _readGroupTableIsRemapped member, but
+ # for whatever reason a lot of cmp.h5's are broken
+ _ = self.readGroupTable
+ except ChemistryLookupError:
+ # this should be an error, but that would mess up Quiver cram
+ # tests. If anyone tries to access the readGroupTable in a
+ # dataset it will still fail, at least
+ log.info("Chemistry information could not be found in "
+ "cmp.h5, cannot fix the readGroupTable or "
+ "MovieID field.")
+ if self._readGroupTableIsRemapped:
+ log.debug("Must correct index qId's")
+ qIdMap = dict(zip(rr.readGroupTable.ID,
+ rr.readGroupTable.MovieName))
+ nameMap = self.movieIds
+ for qId in qIdMap.keys():
+ qId_acc(indices)[qId_acc(indices) == qId] = nameMap[
+ qIdMap[qId]]
+
+
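
The qId fix above is a vectorized relabeling of one integer column of the index. The same operation in isolation, with made-up IDs (the remapped IDs are chosen here so they cannot collide with the originals):

    import numpy as np

    # per-file read group IDs as they appear in one resource's .pbi index:
    qId = np.array([0, 0, 1, 1, 0])
    # per-file ID -> movie name, and movie name -> dataset-wide ID:
    qIdToMovie = {0: "m_movie_A", 1: "m_movie_B"}
    movieToNewId = {"m_movie_A": 7, "m_movie_B": 8}

    for old in sorted(qIdToMovie):
        # boolean-mask assignment rewrites every matching row at once
        qId[qId == old] = movieToNewId[qIdToMovie[old]]

    print(qId)  # [7 7 8 8 7]
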
def _indexRecords(self):
"""Returns index recarray summarizing all of the records in all of
the resources that conform to those filters addressing parameters
@@ -1899,13 +2139,17 @@ class ReadSet(DataSet):
"""
recArrays = []
- self._indexMap = []
+ _indexMap = []
for rrNum, rr in enumerate(self.resourceReaders()):
indices = rr.index
+ if len(indices) == 0:
+ continue
+
+ self._fixQIds(indices, rr)
if not self._filters or self.noFiltering:
recArrays.append(indices._tbl)
- self._indexMap.extend([(rrNum, i) for i in
+ _indexMap.extend([(rrNum, i) for i in
range(len(indices._tbl))])
else:
# Filtration will be necessary:
@@ -1914,14 +2158,17 @@ class ReadSet(DataSet):
nameMap = {name: n
for n, name in enumerate(
rr.referenceInfoTable['Name'])}
-
passes = self._filters.filterIndexRecords(indices._tbl,
- nameMap)
+ nameMap,
+ self.movieIds)
newInds = indices._tbl[passes]
recArrays.append(newInds)
- self._indexMap.extend([(rrNum, i) for i in
- np.nonzero(passes)[0]])
- self._indexMap = np.array(self._indexMap)
+ _indexMap.extend([(rrNum, i) for i in
+ np.flatnonzero(passes)])
+ self._indexMap = np.array(_indexMap, dtype=[('reader', 'uint64'),
+ ('index', 'uint64')])
+ if recArrays == []:
+ return recArrays
return _stackRecArrays(recArrays)
def resourceReaders(self):
@@ -1944,7 +2191,9 @@ class ReadSet(DataSet):
"""
count = len(self.index)
- length = sum(self.index.qEnd - self.index.qStart)
+ length = 0
+ if count:
+ length = sum(self.index.qEnd - self.index.qStart)
return count, length
def _resourceSizes(self):
@@ -2005,20 +2254,14 @@ class ReadSet(DataSet):
log.debug("Replacing resources")
self.externalResources = ExternalResources()
self.addExternalResources([dataFile])
+ # reset the indexmap especially, as it is out of date:
+ self._index = None
+ self._indexMap = None
+ self._openReaders = []
self._populateMetaTypes()
- def __len__(self):
- if self.numRecords == -1:
- if self._filters:
- self.updateCounts()
- else:
- count = 0
- for reader in self.resourceReaders():
- count += len(reader)
- self.numRecords = count
- return self.numRecords
-
def updateCounts(self):
+ self._countsUpdated = True
if self._skipCounts:
log.debug("SkipCounts is true, skipping updateCounts()")
self.metadata.totalLength = -1
@@ -2044,13 +2287,12 @@ class HdfSubreadSet(ReadSet):
datasetType = DataSetMetaTypes.HDF_SUBREAD
def __init__(self, *files, **kwargs):
- log.debug("Opening HdfSubreadSet")
super(HdfSubreadSet, self).__init__(*files, **kwargs)
# The metatype for this dataset type is inconsistent, plaster over it
# here:
self.objMetadata["MetaType"] = "PacBio.DataSet.HdfSubreadSet"
- self.objMetadata["TimeStampedName"] = self._getTimeStampedName(
+ self.objMetadata["TimeStampedName"] = _getTimeStampedName(
self.objMetadata["MetaType"])
def induceIndices(self):
@@ -2058,7 +2300,7 @@ class HdfSubreadSet(ReadSet):
@property
def isIndexed(self):
- return True
+ return False
def _openFiles(self):
"""Open the files (assert they exist, assert they are of the proper
@@ -2089,6 +2331,7 @@ class HdfSubreadSet(ReadSet):
def updateCounts(self):
"""Overriding here so we don't have to assertIndexed"""
+ self._countsUpdated = True
if self._skipCounts:
log.debug("SkipCounts is true, skipping updateCounts()")
self.metadata.totalLength = -1
@@ -2124,8 +2367,8 @@ class SubreadSet(ReadSet):
>>> from pbcore.io import SubreadSet
>>> from pbcore.io.dataset.DataSetMembers import ExternalResources
>>> import pbcore.data.datasets as data
- >>> ds1 = SubreadSet(data.getXml(no=5))
- >>> ds2 = SubreadSet(data.getXml(no=5))
+ >>> ds1 = SubreadSet(data.getXml(no=5), skipMissing=True)
+ >>> ds2 = SubreadSet(data.getXml(no=5), skipMissing=True)
>>> # So they don't conflict:
>>> ds2.externalResources = ExternalResources()
>>> ds1 # doctest:+ELLIPSIS
@@ -2143,7 +2386,7 @@ class SubreadSet(ReadSet):
>>> ds3 = ds1 + ds2
>>> len(ds3.metadata.collections)
2
- >>> ds4 = SubreadSet(data.getSubreadSet())
+ >>> ds4 = SubreadSet(data.getSubreadSet(), skipMissing=True)
>>> ds4 # doctest:+ELLIPSIS
<SubreadSet...
>>> ds4._metadata # doctest:+ELLIPSIS
@@ -2155,7 +2398,6 @@ class SubreadSet(ReadSet):
datasetType = DataSetMetaTypes.SUBREAD
def __init__(self, *files, **kwargs):
- log.debug("Opening SubreadSet")
super(SubreadSet, self).__init__(*files, **kwargs)
@staticmethod
@@ -2184,11 +2426,37 @@ class AlignmentSet(ReadSet):
:strict=False: see base class
:skipCounts=False: see base class
"""
- log.debug("Opening AlignmentSet with {f}".format(f=files))
super(AlignmentSet, self).__init__(*files, **kwargs)
fname = kwargs.get('referenceFastaFname', None)
if fname:
self.addReference(fname)
+ self.__referenceIdMap = None
+
+ @property
+ @filtered
+ def records(self):
+ """A generator of (usually) BamAlignment objects for the
+ records in one or more Bam files pointed to by the
+ ExternalResources in this DataSet.
+
+ Yields:
+ A BamAlignment object
+
+ Doctest:
+ >>> import pbcore.data.datasets as data
+ >>> from pbcore.io import AlignmentSet
+ >>> ds = AlignmentSet(data.getBam())
+ >>> for record in ds.records:
+ ... print 'hn: %i' % record.holeNumber # doctest:+ELLIPSIS
+ hn: ...
+ """
+ if self.isIndexed:
+ for i in range(len(self.index)):
+ yield self[i]
+ else:
+ for resource in self.resourceReaders():
+ for record in resource:
+ yield record
def consolidate(self, *args, **kwargs):
if self.isCmpH5:
@@ -2226,6 +2494,21 @@ class AlignmentSet(ReadSet):
res.reference = reference[0]
self._openFiles()
+ def _fixTIds(self, indices, rr, correctIds=True):
+ tId_acc = lambda x: x.RefGroupID
+ rName = lambda x: x['FullName']
+ if not self.isCmpH5:
+ tId_acc = lambda x: x.tId
+ rName = lambda x: x['Name']
+ if correctIds and self._stackedReferenceInfoTable:
+ log.debug("Must correct index tId's")
+ tIdMap = dict(zip(rr.referenceInfoTable['ID'],
+ rName(rr.referenceInfoTable)))
+ nameMap = self.refIds
+ for tId in tIdMap.keys():
+ tId_acc(indices)[tId_acc(indices) == tId] = nameMap[
+ tIdMap[tId]]
+
def _indexRecords(self, correctIds=True):
"""Returns index records summarizing all of the records in all of
the resources that conform to those filters addressing parameters
@@ -2234,47 +2517,57 @@ class AlignmentSet(ReadSet):
"""
recArrays = []
log.debug("Processing resource pbis")
- self._indexMap = []
+ _indexMap = []
for rrNum, rr in enumerate(self.resourceReaders()):
indices = rr.index
- tId = lambda x: x.tId
+            # pbi files lack e.g. mapping cols when the bam is empty; ignore
+ if len(indices) == 0:
+ continue
+ # TODO(mdsmith)(2016-01-19) rename the fields instead of branching:
+ #if self.isCmpH5:
+ # _renameField(indices, 'MovieID', 'qId')
+ # _renameField(indices, 'RefGroupID', 'tId')
if not self.isCmpH5:
indices = indices._tbl
- tId = lambda x: x.RefGroupID
- if correctIds and self._stackedReferenceInfoTable:
- log.debug("Must correct index tId's")
- tIdMap = {n: name
- for n, name in enumerate(
- rr.referenceInfoTable['Name'])}
- nameMap = self.refIds
+ # Correct tId field
+ self._fixTIds(indices, rr, correctIds)
+ # Correct qId field
+ self._fixQIds(indices, rr)
+
+ # filter
if not self._filters or self.noFiltering:
- if correctIds and self._stackedReferenceInfoTable:
- for i in range(len(indices)):
- tId(indices)[i] = nameMap[
- tIdMap[tId(indices)[i]]]
recArrays.append(indices)
- self._indexMap.extend([(rrNum, i) for i in
- range(len(indices))])
+ _indexMap.extend([(rrNum, i) for i in
+ range(len(indices))])
else:
- # Filtration will be necessary:
- nameMap = {name: n
- for n, name in enumerate(
- rr.referenceInfoTable['Name'])}
-
- passes = self._filters.filterIndexRecords(indices,
- nameMap)
- if correctIds and self._stackedReferenceInfoTable:
- for i in range(len(indices)):
- tId(indices)[i] = nameMap[
- tIdMap[tId(indices)[i]]]
+ passes = self._filters.filterIndexRecords(indices, self.refIds,
+ self.movieIds)
newInds = indices[passes]
recArrays.append(newInds)
- self._indexMap.extend([(rrNum, i) for i in
- np.nonzero(passes)[0]])
- self._indexMap = np.array(self._indexMap)
- return _stackRecArrays(recArrays)
+ _indexMap.extend([(rrNum, i) for i in
+ np.flatnonzero(passes)])
+ self._indexMap = np.array(_indexMap, dtype=[('reader', 'uint64'),
+ ('index', 'uint64')])
+ if recArrays == []:
+ return recArrays
+ tbr = _stackRecArrays(recArrays)
+
+        # sort if cmp.h5 so we can rectify StartRow/EndRow; maybe someday bam too
+ if self.isCmpH5:
+ sort_order = np.argsort(tbr, order=('RefGroupID', 'tStart',
+ 'tEnd',))
+ tbr = tbr[sort_order]
+ self._indexMap = self._indexMap[sort_order]
+ for ref in self.referenceInfoTable:
+ hits = np.flatnonzero(tbr.RefGroupID == ref.ID)
+ if len(hits) != 0:
+ ref.StartRow = hits[0]
+ ref.EndRow = hits[-1]
+ # and fix the naming scheme while we're at it
+ ref.Name = self._cleanCmpName(ref.FullName)
+ return tbr
def resourceReaders(self, refName=False):
"""A generator of Indexed*Reader objects for the ExternalResources
@@ -2315,10 +2608,7 @@ class AlignmentSet(ReadSet):
@property
def refNames(self):
"""A list of reference names (id)."""
- if self.isCmpH5:
- return [self._cleanCmpName(name) for _, name in
- self.refInfo('FullName')]
- return sorted([name for _, name in self.refInfo('Name')])
+ return np.sort(self.referenceInfoTable["Name"])
def _indexReadsInReference(self, refName):
# This can probably be deprecated for all but the official reads in
@@ -2326,7 +2616,7 @@ class AlignmentSet(ReadSet):
refName = self.guaranteeName(refName)
desiredTid = self.refIds[refName]
- tIds = self.index.tId
+ tIds = self.tId
passes = tIds == desiredTid
return self.index[passes]
@@ -2336,34 +2626,12 @@ class AlignmentSet(ReadSet):
sizes.append((len(rr.index), sum(rr.index.aEnd - rr.index.aStart)))
return sizes
- def _countMappedReads(self):
- """It is too slow for large datasets to use _indexReadsInReference"""
- counts = {rId: 0 for _, rId in self.refIds.items()}
- for ind in self.index:
- counts[ind["tId"]] += 1
- tbr = {}
- idMap = {rId: name for name, rId in self.refIds.items()}
- for key, value in counts.iteritems():
- tbr[idMap[key]] = value
- return tbr
-
- def _getMappedReads(self):
- """It is too slow for large datasets to use _indexReadsInReference for
- each reference"""
- counts = {rId: 0 for _, rId in self.refIds.items()}
- for ind in self.index:
- counts[ind["tId"]].append(ind)
- tbr = {}
- idMap = {rId: name for name, rId in self.refIds.items()}
- for key, value in counts.iteritems():
- tbr[idMap[key]] = value
- return tbr
-
@property
def refWindows(self):
"""Going to be tricky unless the filters are really focused on
windowing the reference. Much nesting or duplication and the correct
results are really not guaranteed"""
+ log.debug("Fetching reference windows...")
windowTuples = []
nameIDs = self.refInfo('Name')
refLens = None
@@ -2396,6 +2664,7 @@ class AlignmentSet(ReadSet):
refLens = self.refLengths
refLen = refLens[name]
windowTuples.append((refId, 0, refLen))
+ log.debug("Done fetching reference windows")
return sorted(windowTuples)
def countRecords(self, rname=None, winStart=None, winEnd=None):
@@ -2429,30 +2698,15 @@ class AlignmentSet(ReadSet):
"""
- if isinstance(refName, np.int64):
- refName = str(refName)
- if refName.isdigit():
- if (not refName in self.refNames
- and not refName in self.fullRefNames):
- try:
- refName = self._idToRname(int(refName))
- except AttributeError:
- raise StopIteration
-
- # I would love to use readsInRange(refName, None, None), but
- # IndexedBamReader breaks this (works for regular BamReader).
- # So I have to do a little hacking...
- refLen = 0
- for resource in self.resourceReaders():
- if (refName in resource.referenceInfoTable['Name'] or
- refName in resource.referenceInfoTable['FullName']):
- refLen = resource.referenceInfo(refName).Length
- break
- if refLen:
- for read in self.readsInRange(refName, 0, refLen):
- yield read
+ try:
+ refName = self.guaranteeName(refName)
+ refLen = self.refLengths[refName]
+ except (KeyError, AttributeError):
+ raise StopIteration
+ for read in self.readsInRange(refName, 0, refLen):
+ yield read
- def _intervalContour(self, rname):
+ def intervalContour(self, rname, tStart=0, tEnd=None):
"""Take a set of index records and build a pileup of intervals, or
"contour" describing coverage over the contig
@@ -2468,15 +2722,27 @@ class AlignmentSet(ReadSet):
"""
log.debug("Generating coverage summary")
index = self._indexReadsInReference(rname)
- # indexing issue. Doesn't really matter (just for split). Shifted:
- coverage = [0] * (self.refLengths[rname] + 1)
+ if tEnd is None:
+ tEnd = self.refLengths[rname]
+ coverage = [0] * (tEnd - tStart)
starts = sorted(index.tStart)
for i in starts:
- coverage[i] += 1
+ # ends are exclusive
+ if i >= tEnd:
+ continue
+ if i >= tStart:
+ coverage[i - tStart] += 1
+ else:
+ coverage[0] += 1
del starts
ends = sorted(index.tEnd)
for i in ends:
- coverage[i] -= 1
+ # ends are exclusive
+ if i <= tStart:
+ continue
+ # ends are exclusive
+ if i < tEnd:
+ coverage[i - tStart] -= 1
del ends
curCov = 0
for i, delta in enumerate(coverage):
@@ -2484,7 +2750,7 @@ class AlignmentSet(ReadSet):
coverage[i] = curCov
return coverage
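
The contour above is the usual delta-encoding trick: +1 at every read start, -1 at every (exclusive) read end, then a running sum yields per-base coverage, which splitContour can later cut at even coverage fractions. A tiny self-contained version of the same idea, assuming plain (tStart, tEnd) pairs instead of a pbi index:

    def interval_contour(intervals, tStart, tEnd):
        coverage = [0] * (tEnd - tStart)
        for s, e in intervals:
            if s < tEnd and e > tStart:            # read overlaps the window
                coverage[max(s - tStart, 0)] += 1  # clamp starts left of window
                if e < tEnd:
                    coverage[e - tStart] -= 1      # ends are exclusive
        running = 0
        for i, delta in enumerate(coverage):
            running += delta
            coverage[i] = running
        return coverage

    print(interval_contour([(0, 4), (2, 6), (5, 9)], 0, 10))
    # [1, 1, 2, 2, 1, 2, 1, 1, 1, 0]
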
- def _splitContour(self, contour, splits):
+ def splitContour(self, contour, splits):
"""Take a contour and a number of splits, return the location of each
coverage mediated split with the first at 0"""
log.debug("Splitting coverage summary")
@@ -2511,8 +2777,8 @@ class AlignmentSet(ReadSet):
rnames[atom[0]].append(atom)
for rname, rAtoms in rnames.iteritems():
if len(rAtoms) > 1:
- contour = self._intervalContour(rname)
- splits = self._splitContour(contour, len(rAtoms))
+ contour = self.intervalContour(rname)
+ splits = self.splitContour(contour, len(rAtoms))
ends = splits[1:] + [self.refLengths[rname]]
for start, end in zip(splits, ends):
newAtom = (rname, start, end)
@@ -2545,37 +2811,24 @@ class AlignmentSet(ReadSet):
log.debug("{i} references found".format(i=len(refNames)))
log.debug("Finding contigs")
- # FIXME: this mess:
- if len(refNames) < 100 and len(refNames) > 1:
- if byRecords:
- log.debug("Counting records...")
- atoms = [(rn, 0, 0, self.countRecords(rn))
- for rn in refNames
- if len(self._indexReadsInReference(rn)) != 0]
- else:
- atoms = [(rn, 0, refLens[rn]) for rn in refNames if
- len(self._indexReadsInReference(rn)) != 0]
- else:
- log.debug("Skipping records for each reference check")
- atoms = [(rn, 0, refLens[rn]) for rn in refNames]
- if byRecords:
- log.debug("Counting records...")
- # This is getting out of hand, but the number of references
- # determines the best read counting algorithm:
- if len(refNames) < 100:
- atoms = [(rn, 0, refLens[rn], self.countRecords(rn))
- for rn in refNames]
- else:
- counts = self._countMappedReads()
- atoms = [(rn, 0, refLens[rn], counts[rn])
- for rn in refNames]
- log.debug("{i} contigs found".format(i=len(atoms)))
-
if byRecords:
+ log.debug("Counting records...")
+ atoms = [(rn, 0, 0, count)
+ for rn, count in zip(refNames, map(self.countRecords,
+ refNames))
+ if count != 0]
balanceKey = lambda x: self.countRecords(*x)
else:
- # The window length is used for balancing
+            # with 100 or more references, the reads are probably distributed
+            # fairly evenly on average, and checking the per-reference
+            # counts would be prohibitively expensive
+ if len(refNames) < 100:
+ atoms = [(rn, 0, refLens[rn]) for rn in refNames if
+ self.countRecords(rn) != 0]
+ else:
+ atoms = [(rn, 0, refLens[rn]) for rn in refNames]
balanceKey = lambda x: x[2] - x[1]
+ log.debug("{i} contigs found".format(i=len(atoms)))
# By providing maxChunks and not chunks, this combination will set
# chunks down to < len(atoms) < maxChunks
@@ -2675,13 +2928,16 @@ class AlignmentSet(ReadSet):
log.debug("Done chunking")
log.debug("Modifying filters or resources")
for result, chunk in zip(results, chunks):
+ # we don't want to updateCounts or anything right now, so we'll
+ # block that functionality:
+ result._filters.clearCallbacks()
if atoms[0][2]:
- result.filters.addRequirement(
+ result._filters.addRequirement(
rname=[('=', c[0]) for c in chunk],
tStart=[('<', c[2]) for c in chunk],
tEnd=[('>', c[1]) for c in chunk])
else:
- result.filters.addRequirement(
+ result._filters.addRequirement(
rname=[('=', c[0]) for c in chunk])
# UniqueId was regenerated when the ExternalResource list was
@@ -2691,10 +2947,23 @@ class AlignmentSet(ReadSet):
# here:
for result in results:
result.newUuid()
- if updateCounts:
+ # If there are so many filters that it will be really expensive, we
+ # will use an approximation for the number of records and bases.
+ # This is probably not too far off, if there are that many chunks
+ # to distribute. We'll still round to indicate that it is an
+            # approximation.
+ if len(result._filters) > 100:
+ meanNum = self.numRecords/len(chunks)
+ result.numRecords = long(round(meanNum,
+ (-1 * len(str(meanNum))) + 3))
+ meanLen = self.totalLength/len(chunks)
+ result.totalLength = long(round(meanLen,
+ (-1 * len(str(meanLen))) + 3))
+ elif updateCounts:
result._openReaders = self._openReaders
passes = result._filters.filterIndexRecords(self.index,
- self.refIds)
+ self.refIds,
+ self.movieIds)
result._index = self.index[passes]
result.updateCounts()
del result._index
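
The rounding used for the approximate counts above keeps roughly three significant figures, so the numbers read as estimates rather than exact tallies. The arithmetic in isolation (approx3 is just an illustrative name):

    def approx3(mean):
        # a negative ndigits rounds to tens/hundreds/..., leaving ~3 significant figures
        return int(round(mean, -1 * len(str(int(mean))) + 3))

    print(approx3(123456))  # 123000
    print(approx3(4567))    # 4570
    print(approx3(87))      # 87 (already fewer than 3 digits)
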
@@ -2987,6 +3256,18 @@ class AlignmentSet(ReadSet):
yield read
@property
+ def tId(self):
+ if self.isCmpH5:
+ return self.index.RefGroupID
+ return self.index.tId
+
+ @property
+ def mapQV(self):
+ if self.isCmpH5:
+ return self.index.MapQV
+ return self.index.mapQV
+
+ @property
def isSorted(self):
return self._checkIdentical('isSorted')
@@ -3012,20 +3293,22 @@ class AlignmentSet(ReadSet):
"""
count = len(self.index)
- if self.isCmpH5:
- length = sum(self.index.rEnd - self.index.rStart)
- else:
- try:
- length = sum(self.index.aEnd - self.index.aStart)
- except AttributeError:
- # If the bam is empty or the file is not actually aligned, this
- # field wont be populated
- if self.isMapped:
- log.warn(".pbi mapping columns missing from mapped bam, "
- "bam may be empty")
- else:
- log.warn("File not actually mapped!")
- length = 0
+ length = 0
+ if count:
+ if self.isCmpH5:
+ length = sum(self.index.rEnd - self.index.rStart)
+ else:
+ try:
+ length = sum(self.index.aEnd - self.index.aStart)
+ except AttributeError:
+ # If the bam is empty or the file is not actually aligned,
+                        # this field won't be populated
+ if self.isMapped:
+ log.debug(".pbi mapping columns missing from mapped "
+ "bam, bam may be empty")
+ else:
+ log.warn("File not actually mapped!")
+ length = 0
return count, length
@property
@@ -3046,16 +3329,23 @@ class AlignmentSet(ReadSet):
def referenceInfo(self, refName):
"""Select a row from the DataSet.referenceInfoTable using the reference
- name as a unique key"""
- # TODO: upgrade to use _referenceDict if needed
- if not self.isCmpH5:
- for row in self.referenceInfoTable:
- if row.Name == refName:
- return row
+ name as a unique key (or ID, if you really have to)"""
+
+ # Convert it to a name if you have to:
+ if not isinstance(refName, str):
+ refName = str(refName)
+ if refName.isdigit():
+ if not refName in self.refNames:
+ refName = self._idToRname(int(refName))
+
+ tbr = self.referenceInfoTable[
+ self.referenceInfoTable['Name'] == refName]
+ if len(tbr) > 1:
+ log.info("Duplicate reference names detected")
+ elif len(tbr) == 1:
+ return tbr[0]
else:
- for row in self.referenceInfoTable:
- if row.FullName.startswith(refName):
- return row
+ log.debug("No reference found by that Name or ID")
@property
def referenceInfoTable(self):
@@ -3064,15 +3354,28 @@ class AlignmentSet(ReadSet):
is preferred). Record.Names are remapped for cmp.h5 files to be
consistent with bam files.
+        .. note:: Reference names are assumed to be unique
+
"""
if self._referenceInfoTable is None:
- # This isn't really right for cmp.h5 files (rowStart, rowEnd, for
- # instance). Use the resource readers directly instead.
- responses = self._pollResources(lambda x: x.referenceInfoTable)
+ self._referenceInfoTableIsStacked = False
+ responses = []
+
+ # allow for merge here:
+ for res in self._pollResources(lambda x: x.referenceInfoTable):
+ if not res is None:
+ if self.isCmpH5:
+ for rec in res:
+ rec.StartRow = 0
+ rec.EndRow = 0
+ responses.append(res)
table = []
if len(responses) > 1:
- assert not self.isCmpH5 # see above
+ # perhaps this can be removed someday so that cmpH5 references
+ # are 0-indexed even if only one exists
try:
+ # this works even for cmp's, because we overwrite the
+ # highly variable fields above
table = self._unifyResponses(
responses,
eqFunc=np.array_equal)
@@ -3081,18 +3384,19 @@ class AlignmentSet(ReadSet):
table = np.unique(table)
for i, rec in enumerate(table):
rec.ID = i
- self._stackedReferenceInfoTable = True
- else:
+ rec.RefInfoID = i
+ self._referenceInfoTableIsStacked = True
+ elif len(responses) == 1:
table = responses[0]
- if self.isCmpH5:
- for rec in table:
- rec.Name = self._cleanCmpName(rec.FullName)
+ else:
+ raise InvalidDataSetIOError("No reference tables found, "
+ "are these input files aligned?")
+ if self.isCmpH5:
+ for rec in table:
+ rec.Name = self._cleanCmpName(rec.FullName)
log.debug("Filtering reference entries")
if not self.noFiltering and self._filters:
- passes = []
- for i, reference in enumerate(table):
- if self._filters.testParam('rname', reference.Name):
- passes.append(i)
+ passes = self._filters.testField('rname', table['Name'])
table = table[passes]
self._referenceInfoTable = table
#TODO: Turn on when needed (expensive)
@@ -3101,26 +3405,21 @@ class AlignmentSet(ReadSet):
return self._referenceInfoTable
@property
+ def _stackedReferenceInfoTable(self):
+ if self._referenceInfoTableIsStacked is None:
+ _ = self.referenceInfoTable
+ return self._referenceInfoTableIsStacked
+
+ @property
def _referenceIdMap(self):
"""Map the dataset shifted refIds to the [resource, refId] they came
from.
"""
- # This isn't really possible for cmp.h5 files (rowStart, rowEnd, for
- # instance). Use the resource readers directly instead.
- responses = self._pollResources(lambda x: x.referenceInfoTable)
- if len(responses) > 1:
- assert not self.isCmpH5 # see above
- tbr = reduce(np.append, responses)
- tbrMeta = []
- for i, res in enumerate(responses):
- for j in res['ID']:
- tbrMeta.append([i, j])
- _, indices = np.unique(tbr, return_index=True)
- tbrMeta = list(tbrMeta[i] for i in indices)
- return {i: meta for i, meta in enumerate(tbrMeta)}
- else:
- return {i: [0, i] for i in responses[0]['ID']}
+ if self.__referenceIdMap is None:
+ self.__referenceIdMap = {ref.ID: ref.Name
+ for ref in self.referenceInfoTable}
+ return self.__referenceIdMap
def _cleanCmpName(self, name):
return splitFastaHeader(name)[0]
@@ -3147,24 +3446,12 @@ class AlignmentSet(ReadSet):
return [name for _, name in self.refInfo('FullName')]
def refInfo(self, key):
- """The reference names present in the referenceInfoTable of the
- ExtResources.
-
- Args:
- :key: a key for the referenceInfoTable of each resource
- Returns:
- A list of tuples of refrence name, key_result pairs
+ """Return a column in the referenceInfoTable, tupled with the reference
+ name. TODO(mdsmith)(2016-01-27): pick a better name for this method...
"""
- #log.debug("Sampling references")
- names = self.referenceInfoTable['Name']
- infos = self.referenceInfoTable[key]
-
- #log.debug("Removing duplicate reference entries")
- sampled = zip(names, infos)
- sampled = set(sampled)
-
- return sampled
+ return zip(self.referenceInfoTable['Name'],
+ self.referenceInfoTable[key])
def _idToRname(self, rId):
"""Map the DataSet.referenceInfoTable.ID to the superior unique
@@ -3177,17 +3464,7 @@ class AlignmentSet(ReadSet):
The referenceInfoTable.Name corresponding to rId
"""
- resNo, rId = self._referenceIdMap[rId]
- if self.isCmpH5:
- rId -= 1
- if self.isCmpH5:
- # This is what CmpIO recognizes as the 'shortname'
- refName = self.resourceReaders()[
- resNo].referenceInfoTable[rId]['FullName']
- else:
- refName = self.resourceReaders()[
- resNo].referenceInfoTable[rId]['Name']
- return refName
+ return self._referenceIdMap[rId]
@staticmethod
def _metaTypeMapping():
@@ -3207,7 +3484,8 @@ class ConsensusReadSet(ReadSet):
Doctest:
>>> import pbcore.data.datasets as data
>>> from pbcore.io import ConsensusReadSet
- >>> ds2 = ConsensusReadSet(data.getXml(2), strict=False)
+ >>> ds2 = ConsensusReadSet(data.getXml(2), strict=False,
+ ... skipMissing=True)
>>> ds2 # doctest:+ELLIPSIS
<ConsensusReadSet...
>>> ds2._metadata # doctest:+ELLIPSIS
@@ -3247,11 +3525,29 @@ class ContigSet(DataSet):
datasetType = DataSetMetaTypes.CONTIG
def __init__(self, *files, **kwargs):
- log.debug("Opening ContigSet")
- super(ContigSet, self).__init__(*files, **kwargs)
- self._metadata = ContigSetMetadata(self._metadata)
- self._updateMetadata()
self._fastq = False
+ super(ContigSet, self).__init__(*files, **kwargs)
+        # weaken by permitting failure to allow BarcodeSet to have its own
+ # Metadata type
+ try:
+ self._metadata = ContigSetMetadata(self._metadata)
+ self._updateMetadata()
+ except TypeError:
+ pass
+
+ def split(self, nchunks):
+ log.debug("Getting and dividing contig id's")
+ keys = self.index.id
+ chunks = divideKeys(keys, nchunks)
+ log.debug("Creating copies of the dataset")
+ results = [self.copy() for _ in range(nchunks)]
+ log.debug("Applying filters and updating counts")
+ for chunk, res in zip(chunks, results):
+ res._filters.addRequirement(id=[('=', n) for n in chunk])
+ res.newUuid()
+ log.debug("Updating new res counts:")
+ res.updateCounts()
+ return results
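
ContigSet.split above leans on dividing the contig ids into nchunks groups and turning each group into id filters on a copy of the dataset. A rough stand-in for the divideKeys helper; the real signature and edge-case handling may differ:

    def divide_keys(keys, n_chunks):
        """Deal keys into n_chunks roughly equal groups, preserving order."""
        keys = list(keys)
        n_chunks = max(1, min(n_chunks, len(keys)))
        bounds = [len(keys) * i // n_chunks for i in range(n_chunks + 1)]
        return [keys[bounds[i]:bounds[i + 1]] for i in range(n_chunks)]

    chunks = divide_keys(["ctg1", "ctg2", "ctg3", "ctg4", "ctg5"], 2)
    # each group becomes a filter like: id == ctg1 OR id == ctg2
    print(chunks)  # [['ctg1', 'ctg2'], ['ctg3', 'ctg4', 'ctg5']]
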
def consolidate(self, outfn=None, numFiles=1, useTmp=False):
"""Consolidation should be implemented for window text in names and
@@ -3286,6 +3582,8 @@ class ContigSet(DataSet):
# consolidate multiple files into one
if len(self.toExternalFiles()) > 1:
writeTemp = True
+ if self._filters and not self.noFiltering:
+ writeTemp = True
writeMatches = {}
writeComments = {}
writeQualities = {}
@@ -3330,28 +3628,40 @@ class ContigSet(DataSet):
if not outfn:
log.debug("Writing to a temp directory as no path given")
outdir = tempfile.mkdtemp(suffix="consolidated-contigset")
- outfn = os.path.join(outdir,
- 'consolidated_contigs.fasta')
+ if self._fastq:
+ outfn = os.path.join(outdir,
+ 'consolidated_contigs.fastq')
+ else:
+ outfn = os.path.join(outdir,
+ 'consolidated_contigs.fasta')
with self._writer(outfn) as outfile:
log.debug("Writing new resource {o}".format(o=outfn))
for name, seq in writeMatches.items():
+ name_key = name
if writeComments[name]:
name = ' '.join([name, writeComments[name]])
if self._fastq:
- outfile.writeRecord(name, seq,
- qvsFromAscii(writeQualities[name]))
- continue
- outfile.writeRecord(name, seq)
+ outfile.writeRecord(
+ name, seq, qvsFromAscii(writeQualities[name_key]))
+ else:
+ outfile.writeRecord(name, seq)
if not self._fastq:
_indexFasta(outfn)
# replace resources
log.debug("Replacing resources")
self.externalResources = ExternalResources()
self.addExternalResources([outfn])
+ self._index = None
+ self._indexMap = None
+ self._openReaders = []
+ self._populateMetaTypes()
+ self.updateCounts()
+
# replace contig info
- log.debug("Replacing metadata")
- self._metadata.contigs = []
- self._populateContigMetadata()
+ if not self._fastq:
+ log.debug("Replacing metadata")
+ self._metadata.contigs = []
+ self._populateContigMetadata()
self._populateMetaTypes()
@property
@@ -3364,7 +3674,7 @@ class ContigSet(DataSet):
"""Chunking and quivering adds suffixes to contig names, after the
normal ID and window. This complicates our dewindowing and
consolidation, so we'll remove them for now"""
- observedSuffixes = ['|quiver', '|plurality', '|arrow']
+ observedSuffixes = ['|quiver', '|plurality', '|arrow', '|poa']
for suff in observedSuffixes:
if name.endswith(suff):
log.debug("Suffix found: {s}".format(s=suff))
@@ -3392,6 +3702,8 @@ class ContigSet(DataSet):
"""Chunking and quivering appends a window to the contig ID, which
allows us to consolidate the contig chunks."""
name, _ = self._popSuffix(name)
+ if re.search("_\d+_\d+$", name) is None:
+ return None
possibilities = name.split('_')[-2:]
for pos in possibilities:
if not pos.isdigit():
@@ -3413,20 +3725,22 @@ class ContigSet(DataSet):
self._metadata.addContig(contig)
def updateCounts(self):
+ self._countsUpdated = True
if self._skipCounts:
if not self.metadata.totalLength:
self.metadata.totalLength = -1
if not self.metadata.numRecords:
self.metadata.numRecords = -1
return
- try:
- log.debug('Updating counts')
+ if not self.isIndexed:
+ log.info("Cannot updateCounts without an index file")
self.metadata.totalLength = 0
self.metadata.numRecords = 0
- for res in self.resourceReaders():
- self.metadata.numRecords += len(res)
- for index in res.fai:
- self.metadata.totalLength += index.length
+ return
+ try:
+ log.debug('Updating counts')
+ self.metadata.totalLength = sum(self.index.length)
+ self.metadata.numRecords = len(self.index)
except (IOError, UnavailableFeature, TypeError):
# IOError for missing files
# UnavailableFeature for files without companion files
@@ -3477,16 +3791,26 @@ class ContigSet(DataSet):
type before accessing any file)
"""
if self._openReaders:
- log.debug("Closing old readers...")
+ log.debug("Closing old {t} readers".format(
+ t=self.__class__.__name__))
self.close()
- log.debug("Opening resources")
+ log.debug("Opening {t} resources".format(
+ t=self.__class__.__name__))
for extRes in self.externalResources:
- location = urlparse(extRes.resourceId).path
- if location.endswith("fastq"):
- self._fastq = True
- self._openReaders.append(FastqReader(location))
- continue
- resource = None
+ resource = self._openFile(urlparse(extRes.resourceId).path)
+ if resource is not None:
+ self._openReaders.append(resource)
+ if len(self._openReaders) == 0 and len(self.toExternalFiles()) != 0:
+ raise IOError("No files were openable")
+ log.debug("Done opening {t} resources".format(
+ t=self.__class__.__name__))
+
+ def _openFile(self, location):
+ resource = None
+ if location.endswith("fastq"):
+ self._fastq = True
+ resource = FastqReader(location)
+ else:
try:
resource = IndexedFastaReader(location)
except IOError:
@@ -3498,18 +3822,14 @@ class ContigSet(DataSet):
else:
raise
except ValueError:
- log.warn("Fasta file is empty")
+ log.debug("Fasta file is empty")
            # this seems to work for an empty fasta, interesting:
resource = FastaReader(location)
# we know this file is empty
self._skipCounts = True
self.metadata.totalLength = 0
self.metadata.numRecords = 0
- if resource is not None:
- self._openReaders.append(resource)
- if len(self._openReaders) == 0 and len(self.toExternalFiles()) != 0:
- raise IOError("No files were openable")
- log.debug("Done opening resources")
+ return resource
def resourceReaders(self, refName=None):
"""A generator of fastaReader objects for the ExternalResources in this
@@ -3521,7 +3841,12 @@ class ContigSet(DataSet):
"""
if refName:
log.error("Specifying a contig name not yet implemented")
- self._openFiles()
+ if not self._openReaders:
+ self._openFiles()
+ else:
+ for reader in self._openReaders:
+ if isinstance(reader, (FastaReader, FastqReader)):
+ reader.file = open(reader.filename, "r")
return self._openReaders
@property
@@ -3546,11 +3871,12 @@ class ContigSet(DataSet):
return contig
def assertIndexed(self):
- # I would prefer to use _assertIndexed, but want to pass up the
- # original error
- if not self.isIndexed:
- self._strict = True
- self._openFiles()
+ try:
+ self._assertIndexed(IndexedFastaReader)
+ except IOError:
+ raise IOError("Companion FASTA index (.fai) file not found or "
+ "malformatted! Use 'samtools faidx' to generate "
+ "FASTA index.")
return True
@property
@@ -3561,13 +3887,13 @@ class ContigSet(DataSet):
return self._unifyResponses(res)
except ResourceMismatchError:
if not self._strict:
- log.error("Not all resource are equally indexed.")
+                log.info("Not all resources are equally indexed.")
return False
else:
raise
except IndexError:
if not self._strict:
- log.error("No resource readers!")
+ log.info("No resource readers!")
return False
else:
raise InvalidDataSetIOError("No openable resources!")
@@ -3576,6 +3902,8 @@ class ContigSet(DataSet):
def contigNames(self):
"""The names assigned to the External Resources, or contigs if no name
assigned."""
+        # TODO(mdsmith)(2016-03-10) Make this faster by using (optionally) the
+ # index file
names = []
for contig in self.contigs:
if (self.noFiltering
@@ -3602,9 +3930,12 @@ class ContigSet(DataSet):
"""
recArrays = []
- self._indexMap = []
+ _indexMap = []
for rrNum, rr in enumerate(self.resourceReaders()):
indices = rr.fai
+ if len(indices) == 0:
+ continue
+ # have to manually specify the dtype to make it work like a pbi
indices = np.rec.fromrecords(
indices,
dtype=[('id', 'O'), ('comment', 'O'), ('header', 'O'),
@@ -3613,19 +3944,27 @@ class ContigSet(DataSet):
if not self._filters or self.noFiltering:
recArrays.append(indices)
- self._indexMap.extend([(rrNum, i) for i in
- range(len(indices))])
+ _indexMap.extend([(rrNum, i) for i in
+ range(len(indices))])
else:
# Filtration will be necessary:
# dummy map, the id is the name in fasta space
nameMap = {name: name for name in indices.id}
- passes = self._filters.filterIndexRecords(indices, nameMap)
+ passes = self._filters.filterIndexRecords(indices, nameMap, {},
+ readType='fasta')
newInds = indices[passes]
recArrays.append(newInds)
- self._indexMap.extend([(rrNum, i) for i in
- np.nonzero(passes)[0]])
- self._indexMap = np.array(self._indexMap)
+ _indexMap.extend([(rrNum, i) for i in
+ np.flatnonzero(passes)])
+ self._indexMap = np.array(_indexMap, dtype=[('reader', 'uint64'),
+ ('index', 'uint64')])
+ if len(recArrays) == 0:
+ recArrays = [np.array(
+ [],
+ dtype=[('id', 'O'), ('comment', 'O'), ('header', 'O'),
+ ('length', '<i8'), ('offset', '<i8'),
+ ('lineWidth', '<i8'), ('stride', '<i8')])]
return _stackRecArrays(recArrays)
def _filterType(self):
@@ -3638,7 +3977,6 @@ class ReferenceSet(ContigSet):
datasetType = DataSetMetaTypes.REFERENCE
def __init__(self, *files, **kwargs):
- log.debug("Opening ReferenceSet with {f}".format(f=files))
super(ReferenceSet, self).__init__(*files, **kwargs)
@property
@@ -3659,15 +3997,15 @@ class ReferenceSet(ContigSet):
}
-class BarcodeSet(DataSet):
+class BarcodeSet(ContigSet):
"""DataSet type specific to Barcodes"""
datasetType = DataSetMetaTypes.BARCODE
def __init__(self, *files, **kwargs):
- log.debug("Opening BarcodeSet")
super(BarcodeSet, self).__init__(*files, **kwargs)
- self._metadata = BarcodeSetMetadata(self._metadata)
+ self._metadata = BarcodeSetMetadata(self._metadata.record)
+ self._updateMetadata()
def addMetadata(self, newMetadata, **kwargs):
"""Add metadata specific to this subtype, while leaning on the
@@ -3690,6 +4028,24 @@ class BarcodeSet(DataSet):
# Pull subtype specific values where important
# -> No type specific merging necessary, for now
+ @property
+ def metadata(self):
+ if not isinstance(self._metadata, BarcodeSetMetadata):
+ self._metadata = BarcodeSetMetadata(self._metadata)
+ return self._metadata
+
+ @metadata.setter
+ def metadata(self, value):
+ if not isinstance(value, BarcodeSetMetadata):
+ value = BarcodeSetMetadata(value)
+ self._metadata = value
+
+ def _updateMetadata(self):
+ # update barcode specific metadata:
+ if not self._metadata.barcodeConstruction:
+ self._metadata.barcodeConstruction = ''
+
+
@staticmethod
def _metaTypeMapping():
return {'fasta':'PacBio.BarcodeFile.BarcodeFastaFile',
diff --git a/pbcore/io/dataset/DataSetMembers.py b/pbcore/io/dataset/DataSetMembers.py
index 6e3837c..05303a7 100755
--- a/pbcore/io/dataset/DataSetMembers.py
+++ b/pbcore/io/dataset/DataSetMembers.py
@@ -65,6 +65,11 @@ from pbcore.io.dataset.DataSetWriter import namespaces
log = logging.getLogger(__name__)
+DISTLIST = ["ProdDist", "ReadTypeDist", "ReadLenDist", "ReadQualDist",
+ "MedianInsertDist", "InsertReadQualDist",
+ "InsertReadLenDist", "ControlReadQualDist",
+ "ControlReadLenDist"]
+
def newUuid(record):
# At some point the uuid may need to be a digest
#newId = str(hashlib.md5(str(record)).hexdigest())
@@ -83,6 +88,47 @@ def getTime():
def getTimeStampedName(mType):
return "{m}-{t}".format(m=mType, t=getTime())
+OPMAP = {'==': OP.eq,
+ '=': OP.eq,
+ 'eq': OP.eq,
+ '!=': OP.ne,
+ 'ne': OP.ne,
+ '>=': OP.ge,
+ '>=': OP.ge,
+ 'gte': OP.ge,
+ '<=': OP.le,
+ '<=': OP.le,
+ 'lte': OP.le,
+ '>': OP.gt,
+ '>': OP.gt,
+ 'gt': OP.gt,
+ '<': OP.lt,
+ '<': OP.lt,
+ 'lt': OP.lt,
+ '&': lambda x, y: OP.and_(x, y).view(np.bool_),
+ '~': lambda x, y: np.logical_not(OP.and_(x, y).view(np.bool_)),
+ }
+
+def mapOp(op):
+ return OPMAP[op]
+
+class PbiFlags(object):
+ NO_LOCAL_CONTEXT = 0
+ ADAPTER_BEFORE = 1
+ ADAPTER_AFTER = 2
+ BARCODE_BEFORE = 4
+ BARCODE_AFTER = 8
+ FORWARD_PASS = 16
+ REVERSE_PASS = 32
+
+ @classmethod
+ def flagMap(cls, flag):
+ if flag.isdigit():
+ return int(flag)
+ return reduce(OP.or_,
+ (getattr(cls, fl.strip()) for fl in flag.split('|')))
+
+
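
PbiFlags.flagMap lets a context ('cx') filter value be given either as a plain integer or as a '|'-joined list of the flag names above. A self-contained sketch of the same lookup-and-OR reduction, using a local dict in place of the class attributes:

    import operator as OP
    from functools import reduce  # available on Python 2.6+, required on Python 3

    FLAGS = {'NO_LOCAL_CONTEXT': 0, 'ADAPTER_BEFORE': 1, 'ADAPTER_AFTER': 2,
             'BARCODE_BEFORE': 4, 'BARCODE_AFTER': 8,
             'FORWARD_PASS': 16, 'REVERSE_PASS': 32}

    def flag_map(flag):
        # plain integers pass straight through
        if flag.isdigit():
            return int(flag)
        # otherwise OR together the named flags
        return reduce(OP.or_, (FLAGS[fl.strip()] for fl in flag.split('|')))

    print(flag_map('ADAPTER_BEFORE | ADAPTER_AFTER'))  # 3
    print(flag_map('16'))                              # 16
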
class RecordWrapper(object):
"""The base functionality of a metadata element.
@@ -292,10 +338,14 @@ class RecordWrapper(object):
def removeChildren(self, tag):
keepers = []
+ removed = []
for child in self.record['children']:
if child['tag'] != tag:
keepers.append(child)
+ else:
+ removed.append(child)
self.record['children'] = keepers
+ return removed
def pruneChildrenTo(self, whitelist):
newChildren = []
@@ -350,6 +400,9 @@ class Filters(RecordWrapper):
if func not in self._callbacks:
self._callbacks.append(func)
+ def clearCallbacks(self):
+ self._callbacks = []
+
def _runCallbacks(self):
for func in self._callbacks:
func()
@@ -412,31 +465,23 @@ class Filters(RecordWrapper):
for i, filt in enumerate(self):
for req in filt:
if req.name == param:
- if not self.opMap(oper)(testType(req.value),
+ if not mapOp(oper)(testType(req.value),
testType(value)):
options[i] = False
return any(options)
- def opMap(self, op):
- ops = {'==': OP.eq,
- '=': OP.eq,
- 'eq': OP.eq,
- '!=': OP.ne,
- 'ne': OP.ne,
- '>=': OP.ge,
- '>=': OP.ge,
- 'gte': OP.ge,
- '<=': OP.le,
- '<=': OP.le,
- 'lte': OP.le,
- '>': OP.gt,
- '>': OP.gt,
- 'gt': OP.gt,
- '<': OP.lt,
- '<': OP.lt,
- 'lt': OP.lt,
- }
- return ops[op]
+ def testField(self, param, values, testType=str, oper='='):
+ passes = np.zeros(len(values), dtype=np.bool_)
+ tested = False
+ for i, filt in enumerate(self):
+ for req in filt:
+ if req.name == param:
+ tested = True
+ passes |= mapOp(oper)(testType(req.value),
+ values)
+ if not tested:
+ return np.ones(len(values), dtype=np.bool_)
+ return passes
@property
def _bamAccMap(self):
@@ -457,11 +502,13 @@ class Filters(RecordWrapper):
'readstart': (lambda x: int(x.aStart)),
'tstart': (lambda x: int(x.tStart)),
'tend': (lambda x: int(x.tEnd)),
+ 'n_subreads': (lambda x: len(np.flatnonzero(
+ x.reader.holeNumber ==
+ x.HoleNumber))),
}
- def _pbiAccMap(self, tIdMap):
- return {'rname': (lambda x, m=tIdMap: m[x.tId]),
- 'length': (lambda x: int(x.aEnd)-int(x.aStart)),
+ def _pbiAccMap(self):
+ return {'length': (lambda x: int(x.aEnd)-int(x.aStart)),
'qname': (lambda x: x.qId),
'zm': (lambda x: int(x.holeNumber)),
'pos': (lambda x: int(x.tStart)),
@@ -470,29 +517,40 @@ class Filters(RecordWrapper):
'tend': (lambda x: int(x.tEnd)),
}
- def _pbiMappedVecAccMap(self, tIdMap):
- plus = {'rname': (lambda x, m=tIdMap: m[x.tId]),
+ def _pbiMappedVecAccMap(self):
+ plus = {'rname': (lambda x: x.tId),
'length': (lambda x: x.aEnd - x.aStart),
'pos': (lambda x: x.tStart),
'readstart': (lambda x: x.aStart),
'tstart': (lambda x: x.tStart),
'tend': (lambda x: x.tEnd),
+ 'accuracy': (
+ lambda x: (np.ones(len(x.nMM), dtype='f4') -
+ (x.nMM + x.nIns + x.nDel).astype(np.float32)/
+ (x.aEnd - x.aStart + x.tEnd - x.tStart -
+ x.nM - x.nMM)))
}
- base = self._pbiVecAccMap(tIdMap)
+ base = self._pbiVecAccMap()
base.update(plus)
return base
- def _pbiVecAccMap(self, tIdMap):
+ def _pbiVecAccMap(self):
return {'length': (lambda x: x.qEnd - x.qStart),
'qstart': (lambda x: x.qStart),
'qend': (lambda x: x.qEnd),
'qname': (lambda x: x.qId),
+ 'movie': (lambda x: x.qId),
'zm': (lambda x: x.holeNumber),
'rq': (lambda x: x.readQual),
'bcf': (lambda x: x.bcForward),
'bcr': (lambda x: x.bcForward),
'bcq': (lambda x: x.bcQual),
- 'bc': (lambda x: np.array(zip(x.bcForward, x.bcReverse))),
+ 'bq': (lambda x: x.bcQual),
+ 'bc': (lambda x: x['bcForward', 'bcReverse']),
+ 'cx': (lambda x: x.contextFlag),
+ 'n_subreads': (lambda x: np.array(
+ [len(np.flatnonzero(x.holeNumber == hn))
+ for hn in x.holeNumber])),
}
@property
@@ -503,13 +561,19 @@ class Filters(RecordWrapper):
'movie': str,
'zm': int,
'bc': str,
+ 'bcr': int,
+ 'bcf': int,
+ 'bcq': int,
+ 'bq': int,
'qs': int,
- 'rq': float,
+ 'rq': np.float32,
'pos': int,
'tstart': int,
'tend': int,
- 'accuracy': float,
+ 'accuracy': np.float32,
'readstart': int,
+ 'cx': PbiFlags.flagMap,
+ 'n_subreads': int,
}
def tests(self, readType="bam", tIdMap=None):
@@ -522,13 +586,13 @@ class Filters(RecordWrapper):
typeMap = self._bamTypeMap
elif readType.lower() == "fasta":
accMap = {'id': (lambda x: x.id),
- 'length': (lambda x: int(x.length)),
+ 'length': (lambda x: int(len(x))),
}
typeMap = {'id': str,
'length': int,
}
elif readType.lower() == "pbi":
- accMap = self._pbiAccMap(tIdMap)
+ accMap = self._pbiAccMap()
typeMap = self._bamTypeMap
else:
raise TypeError("Read type not properly specified")
@@ -538,58 +602,72 @@ class Filters(RecordWrapper):
for req in filt:
param = req.name
value = typeMap[param](req.value)
- operator = self.opMap(req.operator)
+ operator = mapOp(req.operator)
reqTests.append(P(filter_read, accMap[param], operator, value))
tests.append(
lambda x, reqTests=reqTests: all([f(x) for f in reqTests]))
return tests
- def filterIndexRecords(self, indexRecords, nameMap):
- typeMap = self._bamTypeMap
- accMap = self._pbiVecAccMap({})
- if 'aEnd' in indexRecords.dtype.names:
- accMap = self._pbiMappedVecAccMap({})
- accMap['rname'] = lambda x: x.tId
- filterLastResult = None
+ def filterIndexRecords(self, indexRecords, nameMap, movieMap,
+ readType='bam'):
+ if readType == 'bam':
+ typeMap = self._bamTypeMap
+ accMap = self._pbiVecAccMap()
+ if 'tStart' in indexRecords.dtype.names:
+ accMap = self._pbiMappedVecAccMap()
+ if 'RefGroupID' in indexRecords.dtype.names:
+ accMap['rname'] = (lambda x: x.RefGroupID)
+ if 'MovieID' in indexRecords.dtype.names:
+ # TODO(mdsmith)(2016-01-29) remove these once the fields are
+ # renamed:
+ accMap['movie'] = (lambda x: x.MovieID)
+ accMap['qname'] = (lambda x: x.MovieID)
+ elif readType == 'fasta':
+ accMap = {'id': (lambda x: x.id),
+ 'length': (lambda x: x.length.astype(int)),
+ }
+ typeMap = {'id': str,
+ 'length': int,
+ }
+
+ filterLastResult = np.zeros(len(indexRecords), dtype=np.bool_)
for filt in self:
- lastResult = None
+ lastResult = np.ones(len(indexRecords), dtype=np.bool_)
for req in filt:
param = req.name
if param in accMap.keys():
value = typeMap[param](req.value)
if param == 'rname':
value = nameMap[value]
+ if param == 'movie':
+ value = movieMap[value]
if param == 'bc':
# convert string to list:
values = ast.literal_eval(value)
param = 'bcf'
value = values[0]
- operator = self.opMap(req.operator)
+ operator = mapOp(req.operator)
reqResultsForRecords = operator(
accMap[param](indexRecords), value)
param = 'bcr'
value = values[1]
- operator = self.opMap(req.operator)
+ operator = mapOp(req.operator)
reqResultsForRecords &= operator(
accMap[param](indexRecords), value)
else:
- operator = self.opMap(req.operator)
+ operator = mapOp(req.operator)
reqResultsForRecords = operator(
accMap[param](indexRecords), value)
- if lastResult is None:
- lastResult = reqResultsForRecords
- else:
- lastResult = np.logical_and(lastResult,
- reqResultsForRecords)
- del reqResultsForRecords
- if filterLastResult is None:
- filterLastResult = lastResult
- else:
- filterLastResult = np.logical_or(filterLastResult, lastResult)
- del lastResult
+ lastResult &= reqResultsForRecords
+ del reqResultsForRecords
+ else:
+ log.warn("Filter not recognized: {f}".format(f=param))
+ filterLastResult |= lastResult
+ del lastResult
return filterLastResult
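
filterIndexRecords above evaluates each Filter as an AND over its requirements and ORs the resulting per-Filter masks, all as vectorized boolean arrays over the index. The mask arithmetic in isolation, with made-up columns and requirements:

    import numpy as np

    # two "filters", each a list of (column, operator, value) requirements:
    filters = [[('rq', np.greater_equal, 0.8), ('zm', np.less, 100)],
               [('zm', np.greater_equal, 500)]]
    index = np.array([(0.9, 10), (0.5, 20), (0.99, 600)],
                     dtype=[('rq', 'f4'), ('zm', 'i4')])

    result = np.zeros(len(index), dtype=np.bool_)
    for filt in filters:
        mask = np.ones(len(index), dtype=np.bool_)
        for col, op, value in filt:
            mask &= op(index[col], value)  # requirements AND within a filter
        result |= mask                     # filters OR with each other
    print(result)  # [ True False  True]
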
def fromString(self, filterString):
+ # TODO(mdsmith)(2016-02-09) finish this
filtDict = {}
self._runCallbacks()
@@ -604,6 +682,8 @@ class Filters(RecordWrapper):
name: The name of the requirement, e.g. 'rq'
options: A list of (operator, value) tuples, e.g. ('>', '0.85')
"""
+ if not kwargs:
+ return
# if there are already filters, you must copy the filters for each new
# option and add one set of requirements to each option:
if self.submetadata:
@@ -622,14 +702,58 @@ class Filters(RecordWrapper):
for i, (oper, val) in enumerate(options):
newFilts[i].addRequirement(name, oper, val)
self.extend(newFilts)
+ #log.debug("Current filters: {s}".format(s=str(self)))
self._runCallbacks()
+ def addFilter(self, **kwargs):
+ """Use this to add filters. Members of the list will be considered
+        """Use this to add a single new filter. All requirements given in
+        kwargs are ANDed together into one Filter; use multiple calls to
+        add multiple (ORed) filters.
+ Args:
+ name: The name of the requirement, e.g. 'rq'
+ options: A list of (operator, value) tuples, e.g. ('>', '0.85')
+ """
+ if not kwargs:
+ return
+ newFilt = Filter()
+ for name, options in kwargs.items():
+ for oper, val in options:
+ newFilt.addRequirement(name, oper, val)
+ self.append(newFilt)
+ log.debug("Current filters: {s}".format(s=str(self)))
+ self._runCallbacks()
+
+ def removeFilter(self, index):
+ self.pop(index)
+ log.debug("Current filters: {s}".format(s=str(self)))
+ self._runCallbacks()
+
+ def mapRequirement(self, **kwargs):
+        """Add requirements to each of the existing filters, mapped one
+ to one"""
+ # Check that all lists of values are the same length:
+ values = kwargs.values()
+ if len(values) > 1:
+ for v in values[1:]:
+ assert len(v) == len(values[0])
+
+ # Check that this length is equal to the current number of filters:
+ assert len(kwargs.values()[0]) == len(list(self))
+
+ for req, opvals in kwargs.items():
+ for filt, opval in zip(self, opvals):
+ filt.addRequirement(req, opval[0], opval[1])
+
def removeRequirement(self, req):
log.debug("Removing requirement {r}".format(r=req))
+ to_remove = []
for i, filt in enumerate(self):
empty = filt.removeRequirement(req)
if empty:
- self.pop(i)
+ to_remove.append(i)
+ for i in sorted(to_remove, reverse=True):
+ self.pop(i)
self._runCallbacks()
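
Collecting the indices first and popping them back-to-front, as removeRequirement now does, avoids the index shifting that popping during iteration would cause. In miniature:

    items = ['a', 'b', 'c', 'd', 'e']
    to_remove = [1, 3]               # indices recorded during iteration
    for i in sorted(to_remove, reverse=True):
        items.pop(i)                 # popping from the back keeps earlier indices valid
    print(items)                     # ['a', 'c', 'e']
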
@@ -664,9 +788,12 @@ class Filter(RecordWrapper):
self.plist.append(param)
def removeRequirement(self, req):
+ to_remove = []
for i, param in enumerate(self):
if param.name == req:
- self.pop(i)
+ to_remove.append(i)
+ for i in sorted(to_remove, reverse=True):
+ self.pop(i)
if len(self.plist):
return False
else:
@@ -909,6 +1036,14 @@ class ExternalResource(RecordWrapper):
return index.resourceId
@property
+ def sts(self):
+ return self._getSubResByMetaType('PacBio.SubreadFile.ChipStatsFile')
+
+ @sts.setter
+ def sts(self, value):
+ self._setSubResByMetaType('PacBio.SubreadFile.ChipStatsFile', value)
+
+ @property
def scraps(self):
return self._getSubResByMetaType('PacBio.SubreadFile.ScrapsBamFile')
@@ -917,6 +1052,14 @@ class ExternalResource(RecordWrapper):
self._setSubResByMetaType('PacBio.SubreadFile.ScrapsBamFile', value)
@property
+ def barcodes(self):
+ return self._getSubResByMetaType("PacBio.DataSet.BarcodeSet")
+
+ @barcodes.setter
+ def barcodes(self, value):
+ self._setSubResByMetaType("PacBio.DataSet.BarcodeSet", value)
+
+ @property
def reference(self):
return self._getSubResByMetaType(
'PacBio.ReferenceFile.ReferenceFastaFile')
@@ -979,7 +1122,7 @@ class ExternalResource(RecordWrapper):
fileIndices = FileIndices(fileIndices[0])
else:
fileIndices = FileIndices()
- for index in indices:
+ for index in list(indices):
temp = FileIndex()
temp.resourceId = index
fileIndices.append(temp)
@@ -1093,6 +1236,12 @@ class DataSetMetadata(RecordWrapper):
except ValueError:
return None
+ @summaryStats.setter
+ def summaryStats(self, value):
+ self.removeChildren('SummaryStats')
+ if value:
+ self.append(value)
+
@property
def provenance(self):
try:
@@ -1119,8 +1268,14 @@ class SubreadSetMetadata(DataSetMetadata):
def merge(self, other):
super(self.__class__, self).merge(other)
- self.collections.merge(other.collections)
- self.bioSamples.merge(other.bioSamples)
+ if other.collections and not self.collections:
+ self.append(other.collections)
+ else:
+ self.collections.merge(other.collections)
+ if other.bioSamples and not self.bioSamples:
+ self.append(other.bioSamples)
+ else:
+ self.bioSamples.merge(other.bioSamples)
@property
def collections(self):
@@ -1129,6 +1284,12 @@ class SubreadSetMetadata(DataSetMetadata):
return CollectionsMetadata(self.getV(tag='Collections',
container='children'))
+ @collections.setter
+ def collections(self, value):
+ self.removeChildren('Collections')
+ if value:
+ self.append(value)
+
@property
def bioSamples(self):
"""Return a list of wrappers around BioSamples elements of the Metadata
@@ -1220,7 +1381,14 @@ class BarcodeSetMetadata(DataSetMetadata):
@property
def barcodeConstruction(self):
- return self.getMemberV('BarcodeConstruction')
+ try:
+ return self.getMemberV('BarcodeConstruction')
+ except ValueError:
+ return None
+
+ @barcodeConstruction.setter
+ def barcodeConstruction(self, value):
+ self.setMemberV('BarcodeConstruction', value)
class ContigsMetadata(RecordWrapper):
@@ -1395,20 +1563,14 @@ class StatsMetadata(RecordWrapper):
self.prodDist.bins[1]
+ other.prodDist.bins[1])
self.numSequencingZmws += other.numSequencingZmws
- toHandle = [
- (self.prodDist, other.prodDist),
- (self.readTypeDist, other.readTypeDist),
- (self.readLenDist, other.readLenDist),
- (self.readQualDist, other.readQualDist),
- (self.medianInsertDist, other.medianInsertDist),
- (self.insertReadQualDist, other.insertReadQualDist),
- (self.insertReadLenDist, other.insertReadLenDist),
- (self.controlReadQualDist, other.controlReadQualDist),
- (self.controlReadLenDist, other.controlReadLenDist),
- ]
- for selfDist, otherDist in toHandle:
+ for dist in DISTLIST:
+ selfDist = getattr(self, dist[0].lower() + dist[1:])
+ otherDist = getattr(other, dist[0].lower() + dist[1:])
try:
selfDist.merge(otherDist)
+ except ZeroBinWidthError as e:
+ removed = self.removeChildren(dist)
+ self.append(otherDist)
except BinMismatchError:
self.append(otherDist)
except ValueError:
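The loop above replaces the hand-written list of distribution pairs with DISTLIST, deriving each accessor from the XML element name by lower-casing its first character; judging from the whitelist removed from parseStats below, DISTLIST holds names such as 'ProdDist' and 'ReadLenDist'. A small sketch of that name mapping (the list contents here are illustrative, copied from the old whitelist):

    DISTLIST = ['ProdDist', 'ReadTypeDist', 'ReadLenDist', 'ReadQualDist',
                'MedianInsertDist', 'InsertReadQualDist', 'InsertReadLenDist',
                'ControlReadQualDist', 'ControlReadLenDist']

    def accessor_name(element_name):
        # 'ReadLenDist' -> 'readLenDist', matching the property names
        return element_name[0].lower() + element_name[1:]

    for name in DISTLIST:
        print('{0} -> {1}'.format(name, accessor_name(name)))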
@@ -1519,6 +1681,10 @@ def _staggeredZip(binWidth, start1, start2, bins1, bins2):
class ContinuousDistribution(RecordWrapper):
def merge(self, other):
+ if other.binWidth == 0:
+ return
+ if self.binWidth == 0:
+ raise ZeroBinWidthError(self.binWidth, other.binWidth)
if self.binWidth != other.binWidth:
raise BinWidthMismatchError(self.binWidth, other.binWidth)
if (self.minBinValue % self.binWidth
@@ -1533,6 +1699,10 @@ class ContinuousDistribution(RecordWrapper):
def numBins(self):
return int(self.getMemberV('NumBins'))
+ @numBins.setter
+ def numBins(self, value):
+ self.setMemberV('NumBins', str(value))
+
@property
def sampleSize(self):
return int(self.getMemberV('SampleSize'))
@@ -1581,6 +1751,10 @@ class ContinuousDistribution(RecordWrapper):
def maxBinValue(self):
return float(self.getMemberV('MaxBinValue'))
+ @maxBinValue.setter
+ def maxBinValue(self, value):
+ self.setMemberV('MaxBinValue', str(value))
+
@property
def description(self):
return self.getMemberV('MetricDescription')
@@ -1608,6 +1782,16 @@ class ContinuousDistribution(RecordWrapper):
return [self.minBinValue + i * self.binWidth for i in
range(len(self.bins))]
+class ZeroBinWidthError(Exception):
+
+ def __init__(self, width1, width2):
+ self.width1 = width1
+ self.width2 = width2
+
+ def __str__(self):
+ return "Zero bin width: {w1}, {w2}".format(w1=self.width1,
+ w2=self.width2)
+
class BinMismatchError(Exception):
pass
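ZeroBinWidthError ties together the two merge changes above: merging in a distribution whose binWidth is 0 is a no-op, a zero binWidth on the receiving side raises ZeroBinWidthError (which StatsMetadata.merge catches and answers by swapping in the other distribution), and unequal non-zero widths still raise BinWidthMismatchError. A compact sketch of that decision table, with stand-in exception classes:

    class ZeroBinWidthError(Exception):
        pass

    class BinWidthMismatchError(Exception):
        pass

    def merge_rule(self_width, other_width):
        """Which action a distribution merge takes for a pair of bin widths."""
        if other_width == 0:
            return 'skip'      # nothing to fold in
        if self_width == 0:
            # caller replaces the empty distribution with the incoming one
            raise ZeroBinWidthError(self_width, other_width)
        if self_width != other_width:
            raise BinWidthMismatchError(self_width, other_width)
        return 'merge bins'

    assert merge_rule(10, 0) == 'skip'
    assert merge_rule(10, 10) == 'merge bins'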
@@ -1748,7 +1932,7 @@ class PrimaryMetadata(RecordWrapper):
>>> import os, tempfile
>>> from pbcore.io import SubreadSet
>>> import pbcore.data.datasets as data
- >>> ds1 = SubreadSet(data.getXml(5))
+ >>> ds1 = SubreadSet(data.getXml(5), skipMissing=True)
>>> ds1.metadata.collections[0].primary.outputOptions.resultsFolder
'Analysis_Results'
>>> ds1.metadata.collections[0].primary.outputOptions.resultsFolder = (
@@ -1758,7 +1942,7 @@ class PrimaryMetadata(RecordWrapper):
>>> outdir = tempfile.mkdtemp(suffix="dataset-doctest")
>>> outXml = 'xml:' + os.path.join(outdir, 'tempfile.xml')
>>> ds1.write(outXml, validate=False)
- >>> ds2 = SubreadSet(outXml)
+ >>> ds2 = SubreadSet(outXml, skipMissing=True)
>>> ds2.metadata.collections[0].primary.outputOptions.resultsFolder
'BetterAnalysis_Results'
"""
@@ -1810,7 +1994,7 @@ class BioSamplesMetadata(RecordWrapper):
Doctest:
>>> from pbcore.io import SubreadSet
>>> import pbcore.data.datasets as data
- >>> ds = SubreadSet(data.getSubreadSet())
+ >>> ds = SubreadSet(data.getSubreadSet(), skipMissing=True)
>>> ds.metadata.bioSamples[0].name
'consectetur purus'
>>> for bs in ds.metadata.bioSamples:
diff --git a/pbcore/io/dataset/DataSetReader.py b/pbcore/io/dataset/DataSetReader.py
index e2a8a58..c33ddb1 100755
--- a/pbcore/io/dataset/DataSetReader.py
+++ b/pbcore/io/dataset/DataSetReader.py
@@ -9,7 +9,7 @@ from pbcore.io.dataset.DataSetMembers import (ExternalResource,
ExternalResources,
DataSetMetadata,
Filters, AutomationParameters,
- StatsMetadata)
+ StatsMetadata, DISTLIST)
from pbcore.io.dataset.DataSetWriter import _eleFromDictList
log = logging.getLogger(__name__)
@@ -58,6 +58,14 @@ def _addXmlFile(dset, path):
root = tree.getroot()
tmp = _parseXml(type(dset), root)
tmp.makePathsAbsolute(curStart=os.path.dirname(path))
+ if not tmp.metadata.summaryStats:
+ for extres in tmp.externalResources:
+ if extres.sts:
+ try:
+ tmp.loadStats(extres.sts)
+ except IOError as e:
+ log.info("Sts.xml file {f} "
+ "unopenable".format(f=extres.sts))
# copyOnMerge must be false, you're merging in a tmp and maintaining dset
dset.merge(tmp, copyOnMerge=False)
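The hunk above makes the reader opportunistic about chip statistics: when the parsed DataSet carries no summaryStats, it tries the sts.xml referenced by each external resource and merely logs resources that cannot be opened. Restated as a standalone helper for clarity (the dataset attribute names are the ones used in the hunk; the helper itself is only an illustration):

    import logging

    log = logging.getLogger(__name__)

    def load_missing_stats(dset):
        """Best-effort sts.xml loading; unreadable files are logged, not fatal."""
        if dset.metadata.summaryStats:
            return
        for extres in dset.externalResources:
            if extres.sts:
                try:
                    dset.loadStats(extres.sts)
                except IOError:
                    log.info("Sts.xml file %s unopenable", extres.sts)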
@@ -122,12 +130,6 @@ def wrapNewResource(path):
def _parseXml(dsetType, element):
"""Parse an XML DataSet tag, or the root tag of a DataSet XML file (they
should be equivalent)
-
- Doctest:
- >>> import pbcore.data.datasets as data
- >>> ds = _openXmlFile(data.getXml(no=8).split(':')[-1])
- >>> type(ds).__name__
- 'SubreadSet'
"""
result = dsetType()
result.objMetadata = element.attrib
@@ -154,6 +156,8 @@ def _parseXmls(dsetType, element):
return result
def _updateStats(element):
+ """This is ugly, hackish and prone to failure. Delete as soon as pre 3.0.16
+ sts.xml files can go unsupported"""
namer = functools.partial(
_namespaceTag,
"http://pacificbiosciences.com/PipelineStats/PipeStats.xsd")
@@ -174,28 +178,28 @@ def _updateStats(element):
]
for tag in binCounts:
if element.findall(tag):
- log.error("Outdated stats XML received")
+ log.info("Outdated stats XML received")
finds = element.findall(tag)
parent = tag.split('Dist')[0] + 'Dist'
parent = element.find(parent)
for sub_ele in finds:
parent.remove(sub_ele)
- binCounts = ET.Element(namer('BinCounts'))
+ bce = ET.Element(namer('BinCounts'))
for sub_ele in finds:
- binCounts.append(sub_ele)
- parent.append(binCounts)
+ bce.append(sub_ele)
+ parent.append(bce)
for tag in binLabels:
if element.findall(tag):
- log.error("Outdated stats XML received")
+ log.info("Outdated stats XML received")
finds = element.findall(tag)
parent = tag.split('Dist')[0] + 'Dist'
parent = element.find(parent)
for sub_ele in finds:
parent.remove(sub_ele)
- binCounts = ET.Element(namer('BinLabels'))
+ bce = ET.Element(namer('BinLabels'))
for sub_ele in finds:
- binCounts.append(sub_ele)
- parent.append(binCounts)
+ bce.append(sub_ele)
+ parent.append(bce)
return element
def _namespaceTag(xmlns, tagName):
@@ -233,21 +237,11 @@ def _parseXmlFilters(element):
"""Pull filters from XML file, put them in a list of dictionaries, where
each dictionary contains a representation of a Filter tag: key, value pairs
with parameter names and value expressions.
-
- Doctest:
- >>> import xml.etree.ElementTree as ET
- >>> import pbcore.data.datasets as data
- >>> tree = ET.parse(data.getXml(no=8).split(':')[1])
- >>> root = tree.getroot()
- >>> filters = root[1]
- >>> str(_parseXmlFilters(filters))
- '( rq > 0.75 ) OR ( qname == 100/0/0_100 )'
"""
return Filters(_eleToDictList(element))
def parseStats(filename):
url = urlparse(filename)
- fileType = url.scheme
fileLocation = url.path.strip()
if url.netloc:
fileLocation = url.netloc
@@ -257,10 +251,7 @@ def parseStats(filename):
stats = StatsMetadata(_eleToDictList(root))
stats.record['tag'] = 'SummaryStats'
whitelist = ['ShortInsertFraction', 'AdapterDimerFraction',
- 'MedianInsertDist', 'ProdDist', 'ReadTypeDist',
- 'ReadLenDist', 'ReadQualDist', 'InsertReadQualDist',
- 'InsertReadLenDist', 'ControlReadQualDist',
- 'ControlReadLenDist', 'NumSequencingZmws']
+ 'NumSequencingZmws'] + DISTLIST
stats.pruneChildrenTo(whitelist)
return stats
diff --git a/pbcore/io/dataset/DataSetWriter.py b/pbcore/io/dataset/DataSetWriter.py
index 1bdaff2..e8989d4 100755
--- a/pbcore/io/dataset/DataSetWriter.py
+++ b/pbcore/io/dataset/DataSetWriter.py
@@ -2,6 +2,9 @@
import copy, time
import xml.etree.ElementTree as ET
+import logging
+
+log = logging.getLogger(__name__)
# TODO add in other namespaces
XMLNS = 'http://pacificbiosciences.com/PacBioDatasets.xsd'
@@ -218,7 +221,14 @@ def _addDataSetMetadataElement(dataSet, root):
root: The root ElementTree object. Extended here using SubElement
"""
if dataSet.metadata:
+ # hide the stats:
+ stats = None
+ if dataSet.metadata.summaryStats:
+ stats = dataSet.metadata.summaryStats
+ dataSet.metadata.summaryStats = None
root.append(_eleFromDictList(dataSet.metadata.record))
+ if stats:
+ dataSet.metadata.summaryStats = stats
# Metadata order matters....
#tl = dsmd.find('TotalLength')
#tl = dsmd.remove(tl)
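The writer change above keeps SummaryStats out of the serialized XML by detaching it from the metadata record, writing, and then re-attaching it. A sketch of the same detach/serialize/restore pattern, with the restore in a try/finally so a failure while building the element cannot leave the in-memory metadata stripped of its stats (the hunk restores unconditionally after the append; the try/finally is only a suggested variant):

    def append_metadata(dataSet, root, element_builder):
        """element_builder stands in for _eleFromDictList."""
        stats = None
        if dataSet.metadata.summaryStats:
            stats = dataSet.metadata.summaryStats
            dataSet.metadata.summaryStats = None   # hide the stats
        try:
            root.append(element_builder(dataSet.metadata.record))
        finally:
            if stats:
                dataSet.metadata.summaryStats = stats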
diff --git a/pbcore/io/dataset/EntryPoints.py b/pbcore/io/dataset/EntryPoints.py
deleted file mode 100755
index c56495e..0000000
--- a/pbcore/io/dataset/EntryPoints.py
+++ /dev/null
@@ -1,246 +0,0 @@
-#/usr/bin/env python
-
-import os
-import argparse
-from collections import defaultdict
-from pbcore.io import DataSet, ContigSet, openDataSet
-from pbcore.io.dataset.DataSetMembers import Filters
-from pbcore.io.dataset.DataSetValidator import validateFile
-import logging
-
-log = logging.getLogger(__name__)
-
-def summarizeXml(args):
- dset = openDataSet(args.infile, strict=args.strict)
- for fname in dset.toExternalFiles():
- print fname
- print "Number of records: {r}".format(r=dset.numRecords)
- print "Total number of bases: {r}".format(r=dset.totalLength)
-
-def summarize_options(parser):
- parser.description = "Summarize a DataSet XML file"
- parser.add_argument("infile", type=str,
- help="The xml file to summarize")
- parser.set_defaults(func=summarizeXml)
-
-def createXml(args):
- dsTypes = DataSet.castableTypes()
- dset = dsTypes[args.dsType](*args.infile, strict=args.strict,
- skipCounts=args.skipCounts,
- generateIndices=args.generateIndices)
- if args.dsName != '':
- dset.name = args.dsName
- if args.generateIndices:
- # we generated the indices with the last open, lets capture them with
- # this one:
- dset = dsTypes[args.dsType](*args.infile, strict=args.strict,
- skipCounts=args.skipCounts)
- log.debug("Dataset created")
- dset.write(args.outfile, validate=args.novalidate, modPaths=True,
- relPaths=args.relative)
- log.debug("Dataset written")
-
-
-def create_options(parser):
- parser.description = ('Create an XML file from a fofn or bam. Possible '
- 'types: SubreadSet, AlignmentSet, ReferenceSet, '
- 'HdfSubreadSet, BarcodeSet, ConsensusAlignmentSet, '
- 'ConsensusReadSet, ContigSet')
- parser.add_argument("outfile", type=str, help="The XML to create")
- #parser.add_argument("infile", type=validate_file, nargs='+',
- parser.add_argument("infile", type=str, nargs='+',
- help="The fofn or BAM file(s) to make into an XML")
- parser.add_argument("--type", type=str, default='DataSet',
- dest='dsType', help="The type of XML to create")
- parser.add_argument("--name", type=str, default='',
- dest='dsName', help="The name of the new DataSet")
- parser.add_argument("--generateIndices", action='store_true',
- default=False, help="The type of XML to create")
- parser.add_argument("--novalidate", action='store_false', default=True,
- help=("Don't validate the resulting XML, don't touch "
- "paths"))
- parser.add_argument("--relative", action='store_true', default=False,
- help=("Make the included paths relative instead of "
- "absolute (not compatible with --novalidate)"))
- parser.set_defaults(func=createXml)
-
-def filterXml(args):
- if args.infile.endswith('xml'):
- dataSet = openDataSet(args.infile, strict=args.strict)
- filters = defaultdict(list)
- separators = ['<=', '>=', '!=', '==', '>', '<', '=']
- for filt in args.filters:
- for sep in separators:
- if sep in filt:
- param, condition = filt.split(sep)
- condition = (sep, condition)
- filters[param].append(condition)
- break
- dataSet.filters.addRequirement(**filters)
- dataSet.updateCounts()
- log.info("{i} filters added".format(i=len(filters)))
- dataSet.write(args.outfile)
- else:
- raise IOError("No files found/found to be compatible")
-
-def filter_options(parser):
- pbiFilterOptions = set(Filters()._pbiMappedVecAccMap({}).keys())
- bamFilterOptions = set(Filters()._bamAccMap.keys())
- parser.description = ('Add filters to an XML file. Suggested fields: '
- '{f}. More expensive fields: {b}'.format(
- f=sorted(list(pbiFilterOptions)),
- b=sorted(list(bamFilterOptions - pbiFilterOptions))))
- #parser.add_argument("infile", type=validate_file,
- parser.add_argument("infile", type=str,
- help="The xml file to filter")
- parser.add_argument("outfile", type=str, help="The resulting xml file")
- parser.add_argument("filters", type=str, nargs='+',
- help="The values and thresholds to filter (e.g. "
- "'rq>0.85')")
- parser.set_defaults(func=filterXml)
-
-def splitXml(args):
- log.debug("Starting split")
- dataSet = openDataSet(args.infile, strict=args.strict)
- chunks = len(args.outfiles)
- if args.chunks:
- chunks = args.chunks
- dss = dataSet.split(chunks=chunks,
- ignoreSubDatasets=(not args.subdatasets),
- contigs=args.contigs,
- maxChunks=args.maxChunks,
- breakContigs=args.breakContigs,
- targetSize=args.targetSize,
- zmws=args.zmws,
- barcodes=args.barcodes,
- byRecords=(not args.byRefLength),
- updateCounts=(not args.noCounts))
- log.debug("Splitting into {i} chunks".format(i=len(dss)))
- infix = 'chunk{i}'
- if args.contigs:
- infix += 'contigs'
- if not args.outfiles:
- if not args.outdir:
- args.outfiles = ['.'.join(args.infile.split('.')[:-1] +
- [infix.format(i=chNum), 'xml'])
- for chNum in range(len(dss))]
- else:
- args.outfiles = ['.'.join(args.infile.split('.')[:-1] +
- [infix.format(i=chNum), 'xml'])
- for chNum in range(len(dss))]
- args.outfiles = [os.path.join(args.outdir,
- os.path.basename(outfn))
- for outfn in args.outfiles]
- num = len(dss)
- end = ''
- if num > 5:
- num = 5
- end = '...'
- log.debug("Emitting {f} {e}".format(
- f=', '.join(args.outfiles[:num]),
- e=end))
- log.debug("Finished splitting, now writing")
- for out_fn, dset in zip(args.outfiles, dss):
- dset.write(out_fn)
- log.debug("Done writing files")
-
-def split_options(parser):
- parser.description = "Split the dataset"
- parser.add_argument("infile", type=str,
- help="The xml file to split")
- parser.add_argument("--contigs", default=False, action='store_true',
- help="Split on contigs")
- parser.add_argument("--barcodes", default=False, action='store_true',
- help="Split on barcodes")
- parser.add_argument("--zmws", default=False, action='store_true',
- help="Split on zmws")
- parser.add_argument("--byRefLength", default=False, action='store_true',
- help="Split contigs by contig length")
- parser.add_argument("--noCounts", default=False, action='store_true',
- help="Update dataset counts after split")
- parser.add_argument("--chunks", default=0, type=int,
- help="Split contigs into <chunks> total windows")
- parser.add_argument("--maxChunks", default=0, type=int,
- help="Split contig list into at most <chunks> groups")
- parser.add_argument("--targetSize", default=5000, type=int,
- help="Target number of records per chunk")
- parser.add_argument("--breakContigs", default=False, action='store_true',
- help="Break contigs to get closer to maxCounts")
- parser.add_argument("--subdatasets", default=False, action='store_true',
- help="Split on subdatasets")
- parser.add_argument("--outdir", default=False, type=str,
- help="Specify an output directory")
- parser.add_argument("outfiles", nargs=argparse.REMAINDER,
- type=str, help="The resulting xml files")
- parser.set_defaults(func=splitXml)
-
-def mergeXml(args):
- dss = []
- for infn in args.infiles:
- dss.append(openDataSet(infn, strict=args.strict))
- reduce(lambda ds1, ds2: ds1 + ds2, dss).write(args.outfile)
-
-def merge_options(parser):
- parser.description = 'Combine XML (and BAM) files'
- parser.add_argument("outfile", type=str,
- help="The resulting XML file")
- #parser.add_argument("infiles", type=validate_file, nargs='+',
- parser.add_argument("infiles", type=str, nargs='+',
- help="The XML files to merge")
- parser.set_defaults(func=mergeXml)
-
-def loadStatsXml(args):
- dset = openDataSet(args.infile, strict=args.strict)
- dset.loadStats(args.statsfile)
- if args.outfile:
- dset.write(args.outfile, validate=False)
- else:
- dset.write(args.infile, validate=False)
-
-def loadStatsXml_options(parser):
- parser.description = 'Load an sts.xml file into a DataSet XML file'
- parser.add_argument("infile", type=str,
- help="The XML file to modify")
- parser.add_argument("statsfile", type=str,
- help="The .sts.xml file to load")
- parser.add_argument("--outfile", type=str, default=None,
- help="The XML file to output")
- parser.set_defaults(func=loadStatsXml)
-
-def validateXml(args):
- validateFile(args.infile, args.skipFiles)
- print("{f} is valid DataSet XML with valid ResourceId "
- "references".format(f=args.infile))
-
-def validate_options(parser):
- parser.description = 'Validate XML and ResourceId files'
- parser.add_argument("infile", type=str,
- help="The XML file to validate")
- parser.add_argument("--skipFiles",
- default=False, action='store_true',
- help="Skip validating external resources")
- parser.set_defaults(func=validateXml)
-
-def consolidateXml(args):
- """Combine BAMs and apply the filters described in the XML file, producing
- one consolidated XML"""
- dset = openDataSet(args.infile)
- dset.consolidate(args.datafile, numFiles=args.numFiles, useTmp=(not
- args.noTmp))
- dset.write(args.xmlfile)
-
-def consolidate_options(parser):
- parser.description = 'Consolidate the XML files'
- #parser.add_argument("infile", type=validate_file,
- parser.add_argument("--numFiles", type=int, default=1,
- help="The number of data files to produce (1)")
- parser.add_argument("--noTmp", default=False, action='store_true',
- help="Don't copy to a tmp location to ensure local"
- " disk use")
- parser.add_argument("infile", type=str,
- help="The XML file to consolidate")
- parser.add_argument("datafile", type=str,
- help="The resulting data file")
- parser.add_argument("xmlfile", type=str,
- help="The resulting XML file")
- parser.set_defaults(func=consolidateXml)
diff --git a/pbcore/io/dataset/utils.py b/pbcore/io/dataset/utils.py
index a99690f..6881555 100755
--- a/pbcore/io/dataset/utils.py
+++ b/pbcore/io/dataset/utils.py
@@ -15,26 +15,28 @@ def consolidateBams(inFiles, outFile, filterDset=None, useTmp=True):
"""Take a list of infiles, an outFile to produce, and optionally a dataset
(filters) to provide the definition and content of filtrations."""
# check space available
+ final_free_space = disk_free(os.path.split(outFile)[0])
+ projected_size = sum(file_size(infn) for infn in inFiles)
+ log.debug("Projected size: {p}".format(p=projected_size))
+ log.debug("In place free space: {f}".format(f=final_free_space))
+ # give it a 5% buffer
+ if final_free_space < (projected_size * 1.05):
+ raise RuntimeError("No space available to consolidate")
if useTmp:
tmpout = tempfile.mkdtemp(suffix="consolidation-filtration")
- local_free_space = disk_free(os.path.split(outFile)[0])
tmp_free_space = disk_free(tmpout)
- projected_size = sum(file_size(infn) for infn in inFiles)
- log.debug("Projected size: {p}".format(p=projected_size))
- log.debug("In place free space: {f}".format(f=local_free_space))
- log.debug("Tmp free space: {f}".format(f=tmp_free_space))
- if tmp_free_space > projected_size:
+ log.debug("Tmp free space (need ~2x): {f}".format(f=tmp_free_space))
+ # need 2x for tmp in and out, plus 10% buffer
+ if tmp_free_space > (projected_size * 2.1):
log.debug("Using tmp storage: " + tmpout)
tmpInFiles = _tmpFiles(inFiles, tmpout)
origOutFile = outFile
origInFiles = inFiles[:]
inFiles = tmpInFiles
outFile = os.path.join(tmpout, "outfile.bam")
- elif local_free_space > projected_size:
+ else:
log.debug("Using in place storage")
useTmp = False
- else:
- raise RuntimeError("No space available to consolidate")
if filterDset and filterDset.filters:
finalOutfile = outFile
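The consolidation path now checks disk space up front rather than only when choosing tmp storage: the final destination must hold roughly 105% of the projected output, and tmp staging is used only when the tmp filesystem can hold about twice the projected size (input copies plus the consolidated output, with a 10% buffer). The gating logic, reduced to a pure function:

    def choose_workspace(projected_size, final_free, tmp_free, use_tmp=True):
        # always require ~5% headroom at the final destination
        if final_free < projected_size * 1.05:
            raise RuntimeError("No space available to consolidate")
        # tmp staging needs ~2x the data plus a 10% buffer
        if use_tmp and tmp_free > projected_size * 2.1:
            return 'tmp'
        return 'in place'

    # e.g. a 10 GB consolidation with 12 GB free locally and 15 GB in /tmp
    # falls back to in-place work, since 15 GB < 2.1 * 10 GB:
    assert choose_workspace(10e9, 12e9, 15e9) == 'in place'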
diff --git a/pbcore/io/rangeQueries.py b/pbcore/io/rangeQueries.py
index 0832696..814ecd3 100644
--- a/pbcore/io/rangeQueries.py
+++ b/pbcore/io/rangeQueries.py
@@ -128,8 +128,8 @@ def makeReadLocator(cmpH5, refSeq):
quickly.
"""
if not cmpH5.isSorted: raise Exception, "CmpH5 is not sorted"
- offsets = cmpH5.file["/RefGroup/OffsetTable"].value
- offStart, offEnd = offsets[offsets[:,0] == refSeq, 1:3].ravel()
+ refInfo = cmpH5.referenceInfo(refSeq)
+ offStart, offEnd = refInfo.StartRow, refInfo.EndRow
if (offEnd - offStart > 0):
refAlignIdx = cmpH5.alignmentIndex[offStart:offEnd, ]
diff --git a/pbcore/model/__init__.py b/pbcore/model/__init__.py
index 6861c47..b6fb7ab 100644
--- a/pbcore/model/__init__.py
+++ b/pbcore/model/__init__.py
@@ -27,3 +27,5 @@
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#################################################################################
+
+from baseRegions import *
diff --git a/pbcore/model/baseRegions.py b/pbcore/model/baseRegions.py
new file mode 100644
index 0000000..61dac5a
--- /dev/null
+++ b/pbcore/model/baseRegions.py
@@ -0,0 +1,192 @@
+#################################################################################
+# Copyright (c) 2011-2015, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#################################################################################
+
+__all__ = [ "BaseRegionsMixin",
+ "ExtraBaseRegionsMixin",
+ "ADAPTER_REGION",
+ "INSERT_REGION",
+ "HQ_REGION" ]
+
+# Region types
+ADAPTER_REGION = 0
+INSERT_REGION = 1
+HQ_REGION = 2
+
+
+# Interval arithmetic
+def intersectRanges(r1, r2):
+ b1, e1 = r1
+ b2, e2 = r2
+ b, e = max(b1, b2), min(e1, e2)
+ return (b, e) if (b < e) else None
+
+def removeNones(lst):
+ return filter(lambda x: x!=None, lst)
+
+
+class BaseRegionsMixin(object):
+ """
+ Mixin class for "ZMW" client classes providing access to base
+ regions and reads sliced to those regions.
+
+ Requires the client subclass to provide 'regionsTable' and 'read'
+ (subrange slicing) methods with the correct semantics.
+
+ The "...Region[s]" calls here return one or more intervals ((int, int));
+
+ *All accessors in this mixin class implicitly clip regions to the
+ HQ region.*
+ """
+ @property
+ def hqRegion(self):
+ """
+ Return the HQ region interval.
+
+ The HQ region is an interval of basecalls where the basecaller has
+ inferred that a single sequencing reaction is taking place.
+ Secondary analysis should only use subreads within the HQ
+ region. Methods in this class, with the exception of the
+ "NoQC" methods, return data appropriately clipped/filtered to
+ the HQ region.
+ """
+ rt = self.regionTable
+ hqRows = rt[rt.regionType == HQ_REGION]
+ assert len(hqRows) == 1
+ hqRow = hqRows[0]
+ return hqRow.regionStart, hqRow.regionEnd
+
+ def _unclippedInsertRegions(self):
+ return [ (region.regionStart, region.regionEnd)
+ for region in self.regionTable
+ if region.regionType == INSERT_REGION ]
+
+ @property
+ def insertRegions(self):
+ """
+ Get insert regions as intervals, clipped to the HQ region
+ """
+ hqRegion = self.hqRegion
+ return removeNones([ intersectRanges(hqRegion, region)
+ for region in self._unclippedInsertRegions() ])
+
+ @property
+ def subreads(self):
+ """
+ Get the subreads as a list of ZmwRead objects. Restricts focus,
+ and clips to, the HQ region. This method can be used by
+ production code.
+ """
+ return [ self.read(readStart, readEnd)
+ for (readStart, readEnd) in self.insertRegions ]
+
+ def _unclippedAdapterRegions(self):
+ return [ (region.regionStart, region.regionEnd)
+ for region in self.regionTable
+ if region.regionType == ADAPTER_REGION ]
+
+ @property
+ def adapterRegions(self):
+ """
+        Get adapter regions as intervals, clipped to the HQ region
+ """
+ hqRegion = self.hqRegion
+ return removeNones([ intersectRanges(hqRegion, region)
+ for region in self._unclippedAdapterRegions() ])
+
+
+ @property
+ def adapters(self):
+ """
+ Get the adapter hits as a list of ZmwRead objects. Restricts
+ focus, and clips to, the HQ region. This method can be used
+ by production code.
+ """
+ return [ self.read(readStart, readEnd)
+ for (readStart, readEnd) in self.adapterRegions ]
+
+
+
+class ExtraBaseRegionsMixin(BaseRegionsMixin):
+ """
+ Mixin class for a "ZMW" class providing the basic region
+ accessors, and additional region accessors that do not clip to the
+ HQ region.
+
+ These accessors only make sense for bas.h5 ZMW objects, where we
+ have access to region information extending beyond the HQ region.
+ In the BAM world, "regions" are all inherently clipped to HQ by
+ design of PPA.
+
+ Requires 'readNoQC' accessor
+ """
+ @property
+ def adapterRegionsNoQC(self):
+ """
+ Get adapter regions as intervals, without clipping to the HQ
+ region. Don't use this unless you know what you're doing.
+ """
+ return self._unclippedAdapterRegions()
+
+ @property
+ def adaptersNoQC(self):
+ """
+ Get the adapters, including data beyond the bounds of the HQ
+ region.
+
+ .. warning::
+
+ It is not recommended that production code use this method
+ as we make no guarantees about what happens outside of the
+ HQ region.
+ """
+ return [ self.read(readStart, readEnd)
+ for (readStart, readEnd) in self.adapterRegionsNoQC ]
+
+ @property
+ def insertRegionsNoQC(self):
+ """
+ Get insert regions as intervals, without clipping to the HQ
+ region. Don't use this unless you know what you're doing.
+ """
+ return self._unclippedInsertRegions()
+
+ @property
+ def subreadsNoQC(self):
+ """
+ Get the subreads, including data beyond the bounds of the HQ region.
+
+ .. warning::
+
+ It is not recommended that production code use this method
+ as we make no guarantees about what happens outside of the
+ HQ region.
+ """
+ return [ self.read(readStart, readEnd)
+ for (readStart, readEnd) in self.insertRegionsNoQC ]
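The new baseRegions module clips every region accessor to the HQ interval via intersectRanges/removeNones. The same clipping with plain tuples, to show what the insertRegions/adapterRegions properties return (the raw region values here are made up):

    def intersect(r1, r2):
        b, e = max(r1[0], r2[0]), min(r1[1], r2[1])
        return (b, e) if b < e else None

    hq = (100, 900)
    raw_inserts = [(0, 150), (200, 400), (950, 1200)]
    clipped = [iv for iv in (intersect(hq, r) for r in raw_inserts)
               if iv is not None]
    print(clipped)    # [(100, 150), (200, 400)]; the last region lies outside HQ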
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 0af0380..266b3e6 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -1,2 +1,3 @@
sphinx
nose
+pyxb
diff --git a/requirements.txt b/requirements.txt
index 22e02eb..357619b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
cython
numpy >= 1.7.1
h5py >= 2.0.1
-pysam >= 0.8.4
+pysam == 0.8.4
diff --git a/tests/test_pbcore_io_BasH5Reader.py b/tests/test_pbcore_io_BasH5Reader.py
index 43cdfeb..caee1e2 100644
--- a/tests/test_pbcore_io_BasH5Reader.py
+++ b/tests/test_pbcore_io_BasH5Reader.py
@@ -484,9 +484,8 @@ class TestBasH5Reader_CCS(ReadIteratorTests):
nose.tools.assert_equal(len(reader[zmw].insertRegions), 0)
nose.tools.assert_equal(len(reader[zmw].adapterRegions), 0)
+ nose.tools.assert_equal(len(reader[zmw].subreads), 0)
- with nose.tools.assert_raises(ValueError):
- reader[zmw].subreads
with nose.tools.assert_raises(ValueError):
reader[zmw].read()
diff --git a/tests/test_pbcore_io_unaligned_bam.py b/tests/test_pbcore_io_unaligned_bam.py
index 5e2f8ae..3d33ae3 100644
--- a/tests/test_pbcore_io_unaligned_bam.py
+++ b/tests/test_pbcore_io_unaligned_bam.py
@@ -69,7 +69,7 @@ class TestUnalignedBam(object):
def testIpd(self):
"""Check that 'Ipd' feature is recognized correctly."""
- pfa = self.bam.pulseFeaturesAvailable()
+ pfa = self.bam.baseFeaturesAvailable()
EQ(pfa, frozenset(['Ipd', 'DeletionTag', 'MergeQV', 'SubstitutionQV',
'InsertionQV', 'DeletionQV']))
ipd = self.bamRead0.IPD(aligned=False, orientation="native")
diff --git a/tests/test_pbdataset.py b/tests/test_pbdataset.py
index f571fb7..13ea083 100644
--- a/tests/test_pbdataset.py
+++ b/tests/test_pbdataset.py
@@ -8,6 +8,7 @@ import tempfile
import numpy as np
import unittest
import shutil
+from random import shuffle
from unittest.case import SkipTest
from pbcore.io import PacBioBamIndex, IndexedBamReader
@@ -15,10 +16,10 @@ from pbcore.io import openIndexedAlignmentFile
from pbcore.io.dataset.utils import BamtoolsVersion
from pbcore.io import (DataSet, SubreadSet, ReferenceSet, AlignmentSet,
openDataSet, DataSetMetaTypes, HdfSubreadSet,
- ConsensusReadSet, ConsensusAlignmentSet)
-from pbcore.io.dataset.DataSetIO import _dsIdToSuffix
+ ConsensusReadSet, ConsensusAlignmentSet, openDataFile,
+ divideKeys, keysToRanges)
+from pbcore.io.dataset.DataSetIO import _dsIdToSuffix, InvalidDataSetIOError
from pbcore.io.dataset.DataSetMembers import ExternalResource, Filters
-from pbcore.io.dataset.DataSetWriter import toXml
from pbcore.io.dataset.DataSetValidator import validateFile
from pbcore.util.Process import backticks
import pbcore.data.datasets as data
@@ -27,11 +28,6 @@ import pbcore.data as upstreamdata
log = logging.getLogger(__name__)
def _check_constools():
- cmd = "dataset.py"
- o, r, m = backticks(cmd)
- if r != 2:
- return False
-
if not BamtoolsVersion().good:
log.warn("Bamtools not found or out of date")
return False
@@ -85,7 +81,8 @@ class TestDataSet(unittest.TestCase):
ds1 = DataSet(data.getFofn())
self.assertEquals(ds1.numExternalResources, 2)
# New! Use the correct constructor:
- self.assertEquals(type(SubreadSet(data.getSubreadSet())).__name__,
+ self.assertEquals(type(SubreadSet(data.getSubreadSet(),
+ skipMissing=True)).__name__,
'SubreadSet')
# Even with untyped inputs
self.assertTrue(str(SubreadSet(data.getBam())).startswith(
@@ -113,45 +110,6 @@ class TestDataSet(unittest.TestCase):
ds.externalResources[-1].indices[0].resourceId ==
"IdontExist.bam.pbi")
- @unittest.skipIf(not _check_constools(),
- "bamtools or pbindex not found, skipping")
- def test_split_cli(self):
- outdir = tempfile.mkdtemp(suffix="dataset-unittest")
- cmd = "dataset.py split --outdir {o} --contigs --chunks 2 {d}".format(
- o=outdir,
- d=data.getXml(8))
- log.debug(cmd)
- o, r, m = backticks(cmd)
- self.assertEqual(r, 0)
- self.assertTrue(os.path.exists(
- os.path.join(outdir, os.path.basename(data.getXml(15)))))
- self.assertTrue(os.path.exists(
- os.path.join(outdir, os.path.basename(data.getXml(16)))))
-
- @unittest.skipIf(not _check_constools(),
- "bamtools or pbindex not found, skipping")
- def test_filter_cli(self):
- outdir = tempfile.mkdtemp(suffix="dataset-unittest")
- outfn = os.path.join(outdir, "filtered8.xml")
- log.debug(outfn)
- cmd = "dataset.py filter {i} {o} {f}".format(
- i=data.getXml(8),
- o=outfn,
- f="rname=E.faecalis.1")
- log.debug(cmd)
- o, r, m = backticks(cmd)
- if r != 0:
- log.debug(m)
- self.assertEqual(r, 0)
- self.assertTrue(os.path.exists(outfn))
- aln = AlignmentSet(data.getXml(8))
- aln.filters.addRequirement(rname=[('=', 'E.faecalis.1')])
- aln.updateCounts()
- dset = AlignmentSet(outfn)
- self.assertEqual(str(aln.filters), str(dset.filters))
- self.assertEqual(aln.totalLength, dset.totalLength)
- self.assertEqual(aln.numRecords, dset.numRecords)
-
def test_merge_subdatasets(self):
# from data file
ds1 = AlignmentSet(data.getBam(0))
@@ -191,45 +149,6 @@ class TestDataSet(unittest.TestCase):
self.assertEqual(merged.subdatasets[1].toExternalFiles(),
AlignmentSet(data.getXml(11)).toExternalFiles())
- @unittest.skipIf(not _check_constools(),
- "bamtools or pbindex not found, skipping")
- def test_create_cli(self):
- log.debug("Absolute")
- outdir = tempfile.mkdtemp(suffix="dataset-unittest")
- cmd = "dataset.py create --type AlignmentSet {o} {i1} {i2}".format(
- o=os.path.join(outdir, 'pbalchemysim.alignmentset.xml'),
- i1=data.getXml(8), i2=data.getXml(11))
- log.debug(cmd)
- o, r, m = backticks(cmd)
- self.assertEqual(r, 0)
- self.assertTrue(os.path.exists(
- os.path.join(outdir, os.path.basename(data.getXml(12)))))
-
- log.debug("Relative")
- cmd = ("dataset.py create --relative --type AlignmentSet "
- "{o} {i1} {i2}".format(
- o=os.path.join(outdir, 'pbalchemysim.alignmentset.xml'),
- i1=data.getXml(8),
- i2=data.getXml(11)))
- log.debug(cmd)
- o, r, m = backticks(cmd)
- self.assertEqual(r, 0)
- self.assertTrue(os.path.exists(
- os.path.join(outdir, os.path.basename(data.getXml(12)))))
-
- log.debug("Emtpy ContigSet")
- emptyfastafile = tempfile.NamedTemporaryFile(suffix=".fasta")
- emptyfasta = emptyfastafile.name
- emptycontigset = os.path.join(outdir, 'empty.contigset.xml')
- cmd = ("dataset.py create --relative --type ContigSet "
- "{o} {i}".format(
- o=emptycontigset,
- i=emptyfasta))
- log.debug(cmd)
- o, r, m = backticks(cmd)
- self.assertEqual(r, 0)
- self.assertTrue(os.path.exists(emptycontigset))
-
def test_empty_metatype(self):
inBam = data.getBam()
d = DataSet(inBam)
@@ -255,6 +174,24 @@ class TestDataSet(unittest.TestCase):
dset.updateCounts()
dset.index
self.assertEqual(len(dset.resourceReaders()), 1)
+ self.assertEqual(
+ len(dset.split(zmws=True, maxChunks=12)),
+ 1)
+
+ # empty and full:
+ full_bam = SubreadSet(data.getXml(10)).toExternalFiles()[0]
+ dset = SubreadSet(upstreamdata.getEmptyBam(), full_bam)
+ self.assertEqual(dset.numRecords, 92)
+ self.assertEqual(dset.totalLength, 124093)
+ self.assertEqual(len(list(dset)), 92)
+ dset.updateCounts()
+ self.assertNotEqual(list(dset.index), [])
+ self.assertEqual(len(dset.resourceReaders()), 2)
+ # there are 9 reads in this set, < the minimum chunk size
+ self.assertEqual(
+ len(dset.split(zmws=True, maxChunks=12)),
+ 2)
+
dset = AlignmentSet(upstreamdata.getEmptyBam())
self.assertEqual(dset.numRecords, 0)
@@ -263,6 +200,23 @@ class TestDataSet(unittest.TestCase):
dset.updateCounts()
dset.index
self.assertEqual(len(dset.resourceReaders()), 1)
+ # there is a minimum chunk size here:
+ self.assertEqual(
+ len(dset.split(contigs=True, maxChunks=12, breakContigs=True)),
+ 1)
+
+ # empty and full:
+ dset = AlignmentSet(upstreamdata.getEmptyBam(), data.getBam())
+ self.assertEqual(dset.numRecords, 92)
+ self.assertEqual(dset.totalLength, 123588)
+ self.assertEqual(len(list(dset)), 92)
+ dset.updateCounts()
+ self.assertNotEqual(list(dset.index), [])
+ self.assertEqual(len(dset.resourceReaders()), 2)
+ # there are 9 reads in this set, < the minimum chunk size
+ self.assertEqual(
+ len(dset.split(zmws=True, maxChunks=12)),
+ 2)
dset = ConsensusReadSet(upstreamdata.getEmptyBam())
self.assertEqual(dset.numRecords, 0)
@@ -291,6 +245,25 @@ class TestDataSet(unittest.TestCase):
self.assertEqual(len(list(dset)), 0)
dset.updateCounts()
self.assertEqual(len(dset.resourceReaders()), 1)
+ self.assertEqual(
+ len(dset.split(zmws=True, maxChunks=12)),
+ 1)
+
+ # empty and full:
+ full_bam = SubreadSet(data.getXml(10)).toExternalFiles()[0]
+ dset = SubreadSet(outpath, full_bam)
+ self.assertEqual(len(dset.resourceReaders()), 2)
+ dset.updateCounts()
+ # without a pbi, updating counts is broken
+ self.assertEqual(dset.numRecords, 0)
+ self.assertEqual(dset.totalLength, 0)
+ self.assertEqual(len(list(dset)), 92)
+ with self.assertRaises(IOError):
+ self.assertNotEqual(list(dset.index), [])
+ self.assertEqual(
+ len(dset.split(zmws=True, maxChunks=12)),
+ 1)
+
dset = AlignmentSet(outpath)
self.assertEqual(dset.numRecords, 0)
@@ -298,6 +271,23 @@ class TestDataSet(unittest.TestCase):
self.assertEqual(len(list(dset)), 0)
dset.updateCounts()
self.assertEqual(len(dset.resourceReaders()), 1)
+ self.assertEqual(
+ len(dset.split(contigs=True, maxChunks=12, breakContigs=True)),
+ 1)
+
+ # empty and full:
+ dset = AlignmentSet(outpath, data.getBam())
+ # without a pbi, updating counts is broken
+ self.assertEqual(dset.numRecords, 0)
+ self.assertEqual(dset.totalLength, 0)
+ self.assertEqual(len(list(dset)), 92)
+ dset.updateCounts()
+ with self.assertRaises(IOError):
+ self.assertNotEqual(list(dset.index), [])
+ self.assertEqual(len(dset.resourceReaders()), 2)
+ self.assertEqual(
+ len(dset.split(zmws=True, maxChunks=12)),
+ 1)
dset = ConsensusReadSet(outpath)
self.assertEqual(dset.numRecords, 0)
@@ -320,6 +310,45 @@ class TestDataSet(unittest.TestCase):
dset.updateCounts()
self.assertEqual(len(dset.resourceReaders()), 1)
+ def test_read_ranges(self):
+ # This models the old and new ways by which Genomic Consensus generates
+ # lists of paired tStarts and tEnds.
+
+ full_bam = upstreamdata.getBamAndCmpH5()[0]
+ empty_bam = data.getEmptyAlignedBam()
+ file_lists = [[full_bam, empty_bam],
+ [empty_bam, full_bam]]
+ refId_list = ['lambda_NEB3011', 0]
+ minMapQV = 30
+
+ for file_list in file_lists:
+ for refId in refId_list:
+ alnFile = AlignmentSet(*file_list)
+
+ # old GC:
+ # TODO(mdsmith)(2016-01-27): Remove assertRaises when tId
+ # accession is fixed for empty aligned bam+pbi files
+ with self.assertRaises(AttributeError):
+ for reader in alnFile.resourceReaders(refId):
+ reader.index.tId
+ refLen = reader.referenceInfo(refId).Length
+ rows = reader.readsInRange(refId, 0, refLen,
+ justIndices=True)
+
+ # new GC (just testing that it doesn't raise exceptions):
+ rows = alnFile.index[
+ ((alnFile.tId == alnFile.referenceInfo(refId).ID) &
+ (alnFile.mapQV >= minMapQV))]
+
+ unsorted_tStart = rows.tStart
+ unsorted_tEnd = rows.tEnd
+
+ # Sort (expected by CoveredIntervals)
+ sort_order = np.lexsort((unsorted_tEnd, unsorted_tStart))
+ tStart = unsorted_tStart[sort_order].tolist()
+ tEnd = unsorted_tEnd[sort_order].tolist()
+
+
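The lexsort call in the test above orders the paired arrays the way CoveredIntervals expects: np.lexsort treats its last key as the primary one, so rows come out sorted by tStart with ties broken by tEnd. A quick standalone check:

    import numpy as np

    tStart = np.array([50, 10, 10, 30])
    tEnd = np.array([60, 40, 20, 90])
    order = np.lexsort((tEnd, tStart))   # primary key: tStart, secondary: tEnd
    print(tStart[order].tolist())        # [10, 10, 30, 50]
    print(tEnd[order].tolist())          # [20, 40, 90, 60]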
def test_loading_reference(self):
log.info('Opening Reference')
@@ -354,6 +383,11 @@ class TestDataSet(unittest.TestCase):
ds = openDataSet(infn, strict=False)
self.assertEqual(type(ds), exp)
+ def test_openDataSet_unicode(self):
+        # Test that a unicode filename can be opened
+ fn = data.getXml(8)
+ aln = openDataSet(unicode(fn))
+
def test_type_checking(self):
bam = data.getBam()
fasta = ReferenceSet(data.getXml(9)).toExternalFiles()[0]
@@ -490,14 +524,19 @@ class TestDataSet(unittest.TestCase):
self.assertEqual(expLen, accLen)
self.assertEqual(expNum, accNum)
- # TODO: turn this back on once a bam is found with a different
- # referenceInfoTable
- @SkipTest
+ @unittest.skipIf(not _internal_data(),
+ "Internal data not available")
def test_referenceInfoTableMerging(self):
log.info("Testing refIds, etc. after merging")
- ds = DataSet(data.getXml(17))
- also_lambda = ds.toExternalFiles()[0]
- aln = AlignmentSet(data.getBam(0), data.getBam(0), also_lambda)
+ bam1 = ("/pbi/dept/secondary/siv/testdata/SA3-RS/ecoli/"
+ "2590953/0001/Alignment_Results/"
+ "m140913_005018_42139_c100713652400000001823152"
+ "404301534_s1_p0.1.aligned.bam")
+ bam2 = ("/pbi/dept/secondary/siv/testdata/SA3-RS/ecoli/"
+ "2590953/0001/Alignment_Results/"
+ "m140913_005018_42139_c100713652400000001823152"
+ "404301534_s1_p0.3.aligned.bam")
+ aln = AlignmentSet(bam1, bam2)
readers = aln.resourceReaders()
ids = sorted([i for _, i in aln.refInfo('ID')])
@@ -570,24 +609,121 @@ class TestDataSet(unittest.TestCase):
self.assertTrue(ds1.totalLength == ds1tl)
self.assertTrue(ds2.totalLength == ds2tl)
+ def test_split_zmws(self):
+ N_RECORDS = 117
+ test_file = upstreamdata.getUnalignedBam()
+ ds1 = openDataFile(test_file)
+ self.assertEqual(len([r for r in ds1]), N_RECORDS)
+ self.assertEqual(len(ds1), N_RECORDS)
+ dss = ds1.split(chunks=1, zmws=True)
+ self.assertEqual(len(dss), 1)
+ self.assertEqual(sum([len([r for r in ds_]) for ds_ in dss]),
+ N_RECORDS)
+ self.assertEqual(sum([len(ds_) for ds_ in dss]),
+ N_RECORDS)
+
+ # We have a lower limit on the number of zmws, now
+ dss = ds1.split(chunks=12, zmws=True)
+ self.assertEqual(len(dss), 2)
+ self.assertEqual(sum([len([r for r in ds_]) for ds_ in dss]),
+ N_RECORDS)
+ self.assertEqual(sum([len(ds_) for ds_ in dss]),
+ N_RECORDS)
+ self.assertEqual(
+ dss[0].zmwRanges,
+ [('m140905_042212_sidney_c100564852550000001823085912221377_s1_X0',
+ 1650, 32328)])
+ self.assertEqual(
+ dss[-1].zmwRanges,
+ [('m140905_042212_sidney_c100564852550000001823085912221377_s1_X0',
+ 32560, 54396)])
+ ranges = sorted([c.zmwRanges[0][1:] for c in dss])
+ interspans = []
+ last = None
+ for rg in ranges:
+ if not last is None:
+ interspans.append((last, rg[0]))
+ self.assertFalse(last == rg[0])
+ last = rg[1]
+ for rg in interspans:
+ self.assertEqual(len(np.nonzero(np.logical_and(
+ ds1.index.holeNumber < rg[1],
+ ds1.index.holeNumber > rg[0]))[0]), 0)
+
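The interspan loop above verifies that the ZMW chunks are contiguous and non-overlapping: no hole number in the original set may fall strictly between the end of one chunk's range and the start of the next. A rough sketch of carving sorted, unique hole numbers into contiguous ranges; the real divideKeys/keysToRanges helpers imported at the top of this module may balance chunks by record count, so their boundaries need not match this even split:

    import numpy as np

    def zmw_ranges(hole_numbers, chunks):
        keys = np.unique(hole_numbers)          # sorted, deduplicated ZMWs
        pieces = np.array_split(keys, chunks)
        return [(int(p[0]), int(p[-1])) for p in pieces if len(p)]

    holes = np.array([1650, 1700, 5000, 9000, 32328, 32560, 40000, 54396])
    print(zmw_ranges(holes, 2))   # [(1650, 9000), (32328, 54396)]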
@unittest.skipUnless(os.path.isdir("/pbi/dept/secondary/siv/testdata"),
"Missing testadata directory")
- def test_split_zmws(self):
- test_file = ("/pbi/dept/secondary/siv/testdata/SA3-DS/lambda/2372215/"
- "0007_micro/Analysis_Results/m150404_101626_42267_c"
- "100807920800000001823174110291514_s1_p0.all."
- "subreadset.xml")
- ds1 = openDataSet(test_file)
- self.assertEqual(len([r for r in ds1]), 1220)
+ def test_large_split_zmws(self):
+ N_RECORDS = 959539
+ test_file = ("/pbi/dept/secondary/siv/testdata/SA3-DS/lambda/"
+ "2372215/0007/Analysis_Results/m150404_101626_42"
+ "267_c100807920800000001823174110291514_s1_p0.al"
+ "l.subreadset.xml")
+ ds1 = openDataFile(test_file)
+ self.assertEqual(len(ds1), N_RECORDS)
+ dss = ds1.split(chunks=1, zmws=True)
+ self.assertEqual(len(dss), 1)
+ self.assertEqual(sum([len(ds_) for ds_ in dss]),
+ N_RECORDS)
+ dss = ds1.split(chunks=12, zmws=True)
+ self.assertEqual(len(dss), 12)
+ self.assertEqual(sum([len(ds_) for ds_ in dss]),
+ N_RECORDS)
+ self.assertEqual(
+ dss[0].zmwRanges,
+ [('m150404_101626_42267_c100807920800000001823174110291514_s1_p0',
+ 7, 14007)])
+ self.assertEqual(
+ dss[-1].zmwRanges,
+ [('m150404_101626_42267_c100807920800000001823174110291514_s1_p0',
+ 149876, 163475)])
+ ranges = sorted([c.zmwRanges[0][1:] for c in dss])
+ interspans = []
+ last = None
+ for rg in ranges:
+ if not last is None:
+ interspans.append((last, rg[0]))
+ self.assertFalse(last == rg[0])
+ last = rg[1]
+ for rg in interspans:
+ self.assertEqual(len(np.nonzero(np.logical_and(
+ ds1.index.holeNumber < rg[1],
+ ds1.index.holeNumber > rg[0]))[0]), 0)
+
+
+ @unittest.skipUnless(os.path.isdir("/pbi/dept/secondary/siv/testdata"),
+                         "Missing testdata directory")
+ def test_multi_movie_split_zmws(self):
+ N_RECORDS = 1745161
+ test_file_1 = ("/pbi/dept/secondary/siv/testdata/SA3-DS/lambda/"
+ "2372215/0007/Analysis_Results/m150404_101626_42"
+ "267_c100807920800000001823174110291514_s1_p0.al"
+ "l.subreadset.xml")
+ test_file_2 = ("/pbi/dept/secondary/siv/testdata/SA3-DS/lambda/"
+ "2590980/0008/Analysis_Results/m141115_075238_et"
+ "han_c100699872550000001823139203261572_s1_p0.al"
+ "l.subreadset.xml")
+ ds1 = SubreadSet(test_file_1, test_file_2)
+ # used to get total:
+ #self.assertEqual(sum(1 for _ in ds1), N_RECORDS)
+ self.assertEqual(len(ds1), N_RECORDS)
dss = ds1.split(chunks=1, zmws=True)
- self.assertEqual(sum([len([r for r in ds_]) for ds_ in dss]), 1220)
- dss = ds1.split(chunks=9, zmws=True)
- self.assertEqual(sum([len([r for r in ds_]) for ds_ in dss]), 1220)
+ self.assertEqual(len(dss), 1)
+ self.assertEqual(sum([len(ds_) for ds_ in dss]),
+ N_RECORDS)
+
+ dss = ds1.split(chunks=12, zmws=True)
+ self.assertEqual(len(dss), 12)
+ self.assertEqual(sum([len(ds_) for ds_ in dss]),
+ N_RECORDS)
self.assertEqual(
dss[0].zmwRanges,
[('m150404_101626_42267_c100807920800000001823174110291514_s1_p0',
- 55, 1815)])
- #55, 2865)])
+ 7, 22098)])
+ self.assertEqual(
+ dss[-1].zmwRanges,
+ [('m141115_075238_ethan_c100699872550000001823139203261572_s1_p0',
+ 127814, 163468)])
+
@unittest.skipUnless(os.path.isdir("/pbi/dept/secondary/siv/testdata"),
"Missing testadata directory")
@@ -662,6 +798,38 @@ class TestDataSet(unittest.TestCase):
self.assertTrue(str(ds2.filters).startswith(
'( rname = E.faecalis'))
+ def test_setFilters(self):
+ ds1 = DataSet()
+ filt = Filters()
+ filt.addRequirement(rq=[('>', '0.85')])
+ ds1.addFilters(filt)
+ self.assertEquals(str(ds1.filters), '( rq > 0.85 )')
+ # Or added from a source XML
+ ds2 = DataSet()
+ ds2.filters = ds1.filters
+ self.assertEquals(str(ds2.filters), '( rq > 0.85 )')
+
+
+ def test_add_double_bound_filters(self):
+ ds1 = AlignmentSet(data.getXml(8))
+ ds1.filters.addRequirement(rq=[('>', '0.85'),
+ ('<', '0.99')])
+ self.assertEquals(str(ds1.filters), '( rq > 0.85 ) OR ( rq < 0.99 )')
+
+ ds1 = AlignmentSet(data.getXml(8))
+ self.assertEquals(str(ds1.filters), '')
+ ds1.filters.addFilter(rq=[('>', '0.85'),
+ ('<', '0.99')])
+ self.assertEquals(str(ds1.filters), '( rq > 0.85 AND rq < 0.99 )')
+
+ ds1.filters.addFilter(length=[('>', '1000')])
+ self.assertEquals(str(ds1.filters),
+ '( rq > 0.85 AND rq < 0.99 ) OR ( length > 1000 )')
+
+ ds1.filters.removeFilter(0)
+ self.assertEquals(str(ds1.filters),
+ '( length > 1000 )')
+
def test_addMetadata(self):
ds = DataSet()
ds.addMetadata(None, Name='LongReadsRock')
@@ -712,7 +880,7 @@ class TestDataSet(unittest.TestCase):
def test_toFofn(self):
self.assertEquals(DataSet("bam1.bam", "bam2.bam",
- strict=False).toFofn(),
+ strict=False, skipMissing=True).toFofn(),
['bam1.bam', 'bam2.bam'])
realDS = DataSet(data.getXml(8))
files = realDS.toFofn()
@@ -725,12 +893,14 @@ class TestDataSet(unittest.TestCase):
self.assertFalse(os.path.isabs(files[0]))
def test_toExternalFiles(self):
- bogusDS = DataSet("bam1.bam", "bam2.bam", strict=False)
+ bogusDS = DataSet("bam1.bam", "bam2.bam", strict=False,
+ skipMissing=True)
self.assertEqual(['bam1.bam', 'bam2.bam'],
bogusDS.externalResources.resourceIds)
- self.assertEquals(DataSet("bam1.bam", "bam2.bam",
- strict=False).toExternalFiles(),
- ['bam1.bam', 'bam2.bam'])
+ self.assertEquals(
+ DataSet("bam1.bam", "bam2.bam", strict=False,
+ skipMissing=True).toExternalFiles(),
+ ['bam1.bam', 'bam2.bam'])
realDS = DataSet(data.getXml(8))
files = realDS.toExternalFiles()
self.assertEqual(len(files), 1)
@@ -886,6 +1056,12 @@ class TestDataSet(unittest.TestCase):
for r1, r2 in itertools.izip(reads1, reads2):
self.assertEqual(r1, r2)
+ def test_n_subreads_filter(self):
+ ds2 = AlignmentSet(data.getXml(8))
+ ds2.filters.addRequirement(n_subreads=[('>', '4')])
+ self.assertEqual(len(list(ds2.records)), 87)
+ self.assertEqual(len(ds2), 87)
+
def test_filter(self):
ds2 = AlignmentSet(data.getXml(8))
ds2.filters.addRequirement(rname=[('=', 'E.faecalis.1')])
@@ -895,6 +1071,111 @@ class TestDataSet(unittest.TestCase):
ds2.enableFilters()
self.assertEqual(len(list(ds2.records)), 20)
+ def test_context_filters(self):
+ ss = SubreadSet(upstreamdata.getUnalignedBam())
+ self.assertEqual(set(ss.index.contextFlag), {0, 1, 2, 3})
+ self.assertEqual(
+ [len(np.flatnonzero(ss.index.contextFlag == cx))
+ for cx in sorted(set(ss.index.contextFlag))],
+ [15, 33, 32, 37])
+ self.assertEqual(len(ss.index), 117)
+
+ # no adapters/barcodes
+ ss.filters.addRequirement(cx=[('=', 0)])
+ self.assertEqual(len(ss.index), 15)
+ ss.filters.removeRequirement('cx')
+ self.assertEqual(len(ss.index), 117)
+
+ # no adapters/barcodes
+ ss.filters.addRequirement(cx=[('=', 'NO_LOCAL_CONTEXT')])
+ self.assertEqual(len(ss.index), 15)
+ ss.filters.removeRequirement('cx')
+ self.assertEqual(len(ss.index), 117)
+
+ # some adapters/barcodes
+ ss.filters.addRequirement(cx=[('!=', 0)])
+ self.assertEqual(len(ss.index), 102)
+ ss.filters.removeRequirement('cx')
+ self.assertEqual(len(ss.index), 117)
+
+ # adapter before
+ ss.filters.addRequirement(cx=[('&', 1)])
+ self.assertEqual(len(ss.index), 70)
+ ss.filters.removeRequirement('cx')
+ self.assertEqual(len(ss.index), 117)
+
+ # adapter before
+ ss.filters.addRequirement(cx=[('&', 'ADAPTER_BEFORE')])
+ self.assertEqual(len(ss.index), 70)
+ ss.filters.removeRequirement('cx')
+ self.assertEqual(len(ss.index), 117)
+
+ # adapter after
+ ss.filters.addRequirement(cx=[('&', 2)])
+ self.assertEqual(len(ss.index), 69)
+ ss.filters.removeRequirement('cx')
+ self.assertEqual(len(ss.index), 117)
+
+ # adapter before or after
+ ss.filters.addRequirement(cx=[('&', 3)])
+ self.assertEqual(len(ss.index), 102)
+ ss.filters.removeRequirement('cx')
+ self.assertEqual(len(ss.index), 117)
+
+ # adapter before or after
+ ss.filters.addRequirement(cx=[('&', 'ADAPTER_BEFORE | ADAPTER_AFTER')])
+ self.assertEqual(len(ss.index), 102)
+ ss.filters.removeRequirement('cx')
+ self.assertEqual(len(ss.index), 117)
+
+ # adapter before or after but not both
+ ss.filters.addRequirement(cx=[('!=', 0)])
+ ss.filters.addRequirement(cx=[('~', 1),
+ ('~', 2)])
+ self.assertEqual(len(ss.index), 65)
+ ss.filters.removeRequirement('cx')
+ self.assertEqual(len(ss.index), 117)
+
+ # adapter before or after
+ ss.filters.addRequirement(cx=[('&', 1),
+ ('&', 2)])
+ self.assertEqual(len(ss.index), 102)
+ ss.filters.removeRequirement('cx')
+ self.assertEqual(len(ss.index), 117)
+
+ # adapter before and after
+ ss.filters.addRequirement(cx=[('&', 1)])
+ ss.filters.addRequirement(cx=[('&', 2)])
+ self.assertEqual(len(ss.index), 37)
+ ss.filters.removeRequirement('cx')
+ self.assertEqual(len(ss.index), 117)
+
+ # adapter before but not after
+ ss.filters.addRequirement(cx=[('&', 1)])
+ ss.filters.addRequirement(cx=[('~', 2)])
+ self.assertEqual(len(ss.index), 33)
+ ss.filters.removeRequirement('cx')
+ self.assertEqual(len(ss.index), 117)
+
+ # no adapter before
+ ss.filters.addRequirement(cx=[('~', 1)])
+ self.assertEqual(len(ss.index), 47)
+ ss.filters.removeRequirement('cx')
+ self.assertEqual(len(ss.index), 117)
+
+ # no adapter before or after
+ ss.filters.addRequirement(cx=[('~', 1)])
+ ss.filters.addRequirement(cx=[('~', 2)])
+ self.assertEqual(len(ss.index), 15)
+ ss.filters.removeRequirement('cx')
+ self.assertEqual(len(ss.index), 117)
+
+ # no adapter before or after
+ ss.filters.addRequirement(cx=[('~', 3)])
+ self.assertEqual(len(ss.index), 15)
+ ss.filters.removeRequirement('cx')
+ self.assertEqual(len(ss.index), 117)
+
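The cx cases above exercise the new context-flag comparisons. Judging by the counts the test expects, '=' and '!=' compare the whole flag value, '&' keeps records with the given bits set, and '~' keeps records with the given bits clear, with names like ADAPTER_BEFORE resolving to bit 1 and ADAPTER_AFTER to bit 2. The equivalent numpy masks over a toy flag array:

    import numpy as np

    cx = np.array([0, 1, 2, 3, 3, 1])

    no_context = cx == 0            # '='  0: no adapters/barcodes at all
    any_context = cx != 0           # '!=' 0: some local context present
    before = (cx & 1) != 0          # '&'  1: ADAPTER_BEFORE bit set
    not_after = (cx & 2) == 0       # '~'  2: ADAPTER_AFTER bit clear

    # "adapter before but not after", as in the test:
    print(np.flatnonzero(before & not_after).tolist())   # [1, 5]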
@SkipTest
def test_split_by_contigs_presplit(self):
# Consumes too much memory for Jenkins
@@ -1195,6 +1476,76 @@ class TestDataSet(unittest.TestCase):
self.assertEqual(ds.refWindows, [('E.faecalis.2', 0, 99),
('E.faecalis.2', 100, 299)])
+ def test_intervalContour(self):
+ ds = AlignmentSet(data.getBam(0))
+ coverage = ds.intervalContour('E.faecalis.1')
+ ds.filters.addRequirement(rname=[('=', 'E.faecalis.1')])
+ # regular totalLength uses aEnd/aStart, which includes insertions
+ totalTargetLength = sum(ds.index.tEnd - ds.index.tStart)
+ self.assertEqual(totalTargetLength, sum(coverage))
+
+ # partial interval
+ ds = AlignmentSet(data.getBam(0))
+ coverage = ds.intervalContour('E.faecalis.1', tStart=100, tEnd=500)
+ ds.filters.addRequirement(rname=[('=', 'E.faecalis.1')],
+ tStart=[('<', '500')],
+ tEnd=[('>', '100')])
+ # regular totalLength uses aEnd/aStart, which includes insertions
+ ends = ds.index.tEnd
+ post = ends > 500
+ ends[post] = 500
+ starts = ds.index.tStart
+ pre = starts < 100
+ starts[pre] = 100
+ totalTargetLength = sum(ends - starts)
+ self.assertEqual(totalTargetLength, sum(coverage))
+
+ # test a second reference in this set
+ ds.filters.removeRequirement('rname')
+ coverage = ds.intervalContour('E.faecalis.2')
+ ds.filters.addRequirement(rname=[('=', 'E.faecalis.2')])
+ totalTargetLength = sum(ds.index.tEnd - ds.index.tStart)
+ self.assertEqual(totalTargetLength, sum(coverage))
+
+ # partial interval
+ ds = AlignmentSet(data.getBam(0))
+ coverage = ds.intervalContour('E.faecalis.2', tStart=100, tEnd=500)
+ ds.filters.addRequirement(rname=[('=', 'E.faecalis.2')],
+ tStart=[('<', '500')],
+ tEnd=[('>', '100')])
+ # regular totalLength uses aEnd/aStart, which includes insertions
+ ends = ds.index.tEnd
+ post = ends > 500
+ ends[post] = 500
+ starts = ds.index.tStart
+ pre = starts < 100
+ starts[pre] = 100
+ totalTargetLength = sum(ends - starts)
+ self.assertEqual(totalTargetLength, sum(coverage))
+
+
+ # test a cmp.h5 alignmentset
+ ds = AlignmentSet(upstreamdata.getBamAndCmpH5()[1])
+ coverage = ds.intervalContour('lambda_NEB3011')
+ totalTargetLength = sum(ds.index.tEnd - ds.index.tStart)
+ self.assertEqual(totalTargetLength, sum(coverage))
+
+ # partial interval
+ ds = AlignmentSet(upstreamdata.getBamAndCmpH5()[1])
+ coverage = ds.intervalContour('lambda_NEB3011', tStart=100, tEnd=500)
+ ds.filters.addRequirement(rname=[('=', 'lambda_NEB3011')],
+ tStart=[('<', '500')],
+ tEnd=[('>', '100')])
+ # regular totalLength uses aEnd/aStart, which includes insertions
+ starts = ds.index.tStart
+ ends = ds.index.tEnd
+ post = ends > 500
+ ends[post] = 500
+ pre = starts < 100
+ starts[pre] = 100
+ totalTargetLength = sum(ends - starts)
+ self.assertEqual(totalTargetLength, sum(coverage))
+
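The intervalContour checks above rely on the per-base coverage summing to the clipped target lengths. One common way to produce such a contour is a difference array over the (tStart, tEnd) pairs; this is only an illustration of the invariant being tested, not necessarily how intervalContour is implemented:

    import numpy as np

    def coverage_contour(tStart, tEnd, refLen):
        diff = np.zeros(refLen + 1, dtype=int)
        np.add.at(diff, tStart, 1)    # coverage steps up at each alignment start
        np.add.at(diff, tEnd, -1)     # and steps down at each alignment end
        return np.cumsum(diff)[:-1]

    tStart = np.array([0, 2, 2])
    tEnd = np.array([5, 4, 9])
    cov = coverage_contour(tStart, tEnd, 10)
    assert cov.sum() == (tEnd - tStart).sum()   # the identity the test asserts
    print(cov.tolist())                         # [1, 1, 3, 3, 2, 1, 1, 1, 1, 0]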
def test_refLengths(self):
ds = AlignmentSet(data.getBam(0))
@@ -1420,6 +1771,10 @@ class TestDataSet(unittest.TestCase):
self.assertEqual(len(readers[2].readGroupTable), 1)
self.assertEqual(len(aln.readGroupTable), 3)
+ def test_missing_file(self):
+ with self.assertRaises(InvalidDataSetIOError):
+ aln = AlignmentSet("NOPE")
+
def test_repr(self):
ds = DataSet(data.getBam())
rep = str(ds)
@@ -1428,6 +1783,133 @@ class TestDataSet(unittest.TestCase):
rep))
self.assertTrue(re.search('pbalchemysim0.pbalign.bam', rep))
+ def test_stats_metadata_zero_binwidth(self):
+ # both zero
+ ds1 = DataSet(data.getXml(8))
+ ds1.loadStats(data.getStats())
+ ds2 = DataSet(data.getXml(11))
+ ds2.loadStats(data.getStats())
+ ds1.metadata.summaryStats.readLenDist.bins = (
+ [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
+ self.assertEqual(ds1.metadata.summaryStats.readLenDist.bins,
+ [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
+ ds1.metadata.summaryStats.readLenDist.minBinValue = 0
+ ds1.metadata.summaryStats.readLenDist.binWidth = 0
+ ds2.metadata.summaryStats.readLenDist.bins = (
+ [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
+ self.assertEqual(ds2.metadata.summaryStats.readLenDist.bins,
+ [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
+ ds2.metadata.summaryStats.readLenDist.minBinValue = 0
+ ds2.metadata.summaryStats.readLenDist.binWidth = 0
+ ds3 = ds1 + ds2
+ self.assertEqual(len(ds3.metadata.summaryStats.readLenDists), 1)
+ self.assertEqual(ds3.metadata.summaryStats.readLenDist.bins,
+ [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
+
+ # one zero
+ ds1 = DataSet(data.getXml(8))
+ ds1.loadStats(data.getStats())
+ ds2 = DataSet(data.getXml(11))
+ ds2.loadStats(data.getStats())
+ ds1.metadata.summaryStats.readLenDist.bins = (
+ [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
+ self.assertEqual(ds1.metadata.summaryStats.readLenDist.bins,
+ [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
+ ds1.metadata.summaryStats.readLenDist.minBinValue = 0
+ ds1.metadata.summaryStats.readLenDist.binWidth = 0
+ ds2.metadata.summaryStats.readLenDist.bins = (
+ [0, 10, 9, 8, 7, 6, 4, 2, 1, 0, 0, 1])
+ self.assertEqual(ds2.metadata.summaryStats.readLenDist.bins,
+ [0, 10, 9, 8, 7, 6, 4, 2, 1, 0, 0, 1])
+ ds2.metadata.summaryStats.readLenDist.minBinValue = 20
+ ds2.metadata.summaryStats.readLenDist.binWidth = 10
+ ds3 = ds1 + ds2
+ self.assertEqual(len(ds3.metadata.summaryStats.readLenDists), 1)
+ self.assertEqual(ds3.metadata.summaryStats.readLenDist.bins,
+ [0, 10, 9, 8, 7, 6, 4, 2, 1, 0, 0, 1])
+
+ # other zero
+ ds1 = DataSet(data.getXml(8))
+ ds1.loadStats(data.getStats())
+ ds2 = DataSet(data.getXml(11))
+ ds2.loadStats(data.getStats())
+ ds1.metadata.summaryStats.readLenDist.bins = (
+ [0, 10, 9, 8, 7, 6, 4, 2, 1, 0, 0, 1])
+ self.assertEqual(ds1.metadata.summaryStats.readLenDist.bins,
+ [0, 10, 9, 8, 7, 6, 4, 2, 1, 0, 0, 1])
+ ds1.metadata.summaryStats.readLenDist.minBinValue = 10
+ ds1.metadata.summaryStats.readLenDist.binWidth = 10
+ ds2.metadata.summaryStats.readLenDist.bins = (
+ [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
+ self.assertEqual(ds2.metadata.summaryStats.readLenDist.bins,
+ [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
+ ds2.metadata.summaryStats.readLenDist.minBinValue = 0
+ ds2.metadata.summaryStats.readLenDist.binWidth = 0
+ ds3 = ds1 + ds2
+ self.assertEqual(len(ds3.metadata.summaryStats.readLenDists), 1)
+ self.assertEqual(ds3.metadata.summaryStats.readLenDist.bins,
+ [0, 10, 9, 8, 7, 6, 4, 2, 1, 0, 0, 1])
+
+ # three-way merge: zero, populated, zero
+ ds1 = DataSet(data.getXml(8))
+ ds1.loadStats(data.getStats())
+ ds2 = DataSet(data.getXml(11))
+ ds2.loadStats(data.getStats())
+ ds3 = DataSet(data.getXml(11))
+ ds3.loadStats(data.getStats())
+ ds1.metadata.summaryStats.readLenDist.bins = (
+ [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
+ self.assertEqual(ds1.metadata.summaryStats.readLenDist.bins,
+ [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
+ ds1.metadata.summaryStats.readLenDist.minBinValue = 0
+ ds1.metadata.summaryStats.readLenDist.binWidth = 0
+ ds2.metadata.summaryStats.readLenDist.bins = (
+ [0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1])
+ self.assertEqual(ds2.metadata.summaryStats.readLenDist.bins,
+ [0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1])
+ ds2.metadata.summaryStats.readLenDist.minBinValue = 20
+ ds2.metadata.summaryStats.readLenDist.binWidth = 10
+ ds3.metadata.summaryStats.readLenDist.bins = (
+ [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
+ self.assertEqual(ds3.metadata.summaryStats.readLenDist.bins,
+ [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
+ ds3.metadata.summaryStats.readLenDist.minBinValue = 0
+ ds3.metadata.summaryStats.readLenDist.binWidth = 0
+ ds4 = ds1 + ds2 + ds3
+ self.assertEqual(len(ds4.metadata.summaryStats.readLenDists), 1)
+ self.assertEqual(ds4.metadata.summaryStats.readLenDist.bins,
+ [0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1])
+
+ # three-way merge: populated, zero, zero
+ ds1 = DataSet(data.getXml(8))
+ ds1.loadStats(data.getStats())
+ ds2 = DataSet(data.getXml(11))
+ ds2.loadStats(data.getStats())
+ ds3 = DataSet(data.getXml(11))
+ ds3.loadStats(data.getStats())
+ ds1.metadata.summaryStats.readLenDist.bins = (
+ [0, 10, 9, 8, 7, 6, 4, 2, 1, 0, 0, 1])
+ self.assertEqual(ds1.metadata.summaryStats.readLenDist.bins,
+ [0, 10, 9, 8, 7, 6, 4, 2, 1, 0, 0, 1])
+ ds1.metadata.summaryStats.readLenDist.minBinValue = 10
+ ds1.metadata.summaryStats.readLenDist.binWidth = 10
+ ds2.metadata.summaryStats.readLenDist.bins = (
+ [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
+ self.assertEqual(ds2.metadata.summaryStats.readLenDist.bins,
+ [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
+ ds2.metadata.summaryStats.readLenDist.minBinValue = 0
+ ds2.metadata.summaryStats.readLenDist.binWidth = 0
+ ds3.metadata.summaryStats.readLenDist.bins = (
+ [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
+ self.assertEqual(ds3.metadata.summaryStats.readLenDist.bins,
+ [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
+ ds3.metadata.summaryStats.readLenDist.minBinValue = 0
+ ds3.metadata.summaryStats.readLenDist.binWidth = 0
+ ds4 = ds1 + ds2 + ds3
+ self.assertEqual(len(ds4.metadata.summaryStats.readLenDists), 1)
+ self.assertEqual(ds4.metadata.summaryStats.readLenDist.bins,
+ [0, 10, 9, 8, 7, 6, 4, 2, 1, 0, 0, 1])
+
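The assertions above exercise a simple merge rule for fixed-bin read-length distributions: a distribution whose binWidth is 0 (all-empty bins) contributes nothing, while compatible populated distributions are combined bin by bin (as test_fixed_bin_sts checks further down). A minimal sketch of that rule on plain dicts, assuming equal binWidth and minBinValue in the populated case; an illustration, not the pbcore implementation:

    def merge_read_len_dists(dist_a, dist_b):
        # An empty (binWidth == 0) distribution is ignored entirely.
        if dist_a['binWidth'] == 0:
            return dict(dist_b)
        if dist_b['binWidth'] == 0:
            return dict(dist_a)
        # Assumes aligned fixed bins; offset bins would need re-binning first.
        assert dist_a['binWidth'] == dist_b['binWidth']
        assert dist_a['minBinValue'] == dist_b['minBinValue']
        merged = dict(dist_a)
        merged['bins'] = [a + b for a, b in
                          zip(dist_a['bins'], dist_b['bins'])]
        return merged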
def test_stats_metadata(self):
ds = DataSet(data.getBam())
ds.loadStats(data.getStats())
@@ -1527,4 +2009,439 @@ class TestDataSet(unittest.TestCase):
self.assertEqual(
2876.0, ss.subdatasets[0].metadata.summaryStats.numSequencingZmws)
self.assertEqual(
- 150292.0, ss.subdatasets[1].metadata.summaryStats.numSequencingZmws)
+ 150292.0,
+ ss.subdatasets[1].metadata.summaryStats.numSequencingZmws)
+
+ @unittest.skipIf(not _internal_data(),
+ "Internal data not available")
+ def test_merged_cmp(self):
+ cmp1 = ("/pbi/dept/secondary/siv/testdata/pbreports"
+ "-unittest/data/sat/aligned_reads.cmp.h5")
+ cmp2 = upstreamdata.getBamAndCmpH5()[1]
+ aln0 = AlignmentSet(cmp1)
+ self.assertEqual(aln0.referenceInfoTable['EndRow'][0], 338001)
+ self.assertEqual(len(aln0), 338002)
+ aln1 = AlignmentSet(cmp2)
+ self.assertEqual(aln1.referenceInfoTable['EndRow'][0], 111)
+ self.assertEqual(len(aln1), 112)
+ aln = AlignmentSet(cmp1, cmp2)
+ refnames = aln.refNames
+ self.assertEqual(refnames, ['lambda_NEB3011'])
+ self.assertEqual(aln.refIds[aln.refNames[0]], 1)
+ self.assertEqual(aln.referenceInfoTable['EndRow'][0], 338113)
+ self.assertEqual(len(aln), 338114)
+
+ @unittest.skipIf(not _internal_data(),
+ "Internal data not available")
+ def test_two_cmpH5(self):
+ cmp1 = ("/pbi/dept/secondary/siv/testdata/pbreports"
+ "-unittest/data/sat/aligned_reads.cmp.h5")
+ cmp2 = upstreamdata.getBamAndCmpH5()[1]
+ len1 = len(AlignmentSet(cmp1))
+ len2 = len(AlignmentSet(cmp2))
+ aln = AlignmentSet(cmp1, cmp2)
+ len3 = len(aln)
+ self.assertEqual(len1 + len2, len3)
+ self.assertEqual(len3, 338114)
+ self.assertEqual(
+ str(aln.referenceInfoTable),
+ ("[ (1, 1, 'lambda_NEB3011', 'lambda_NEB3011', "
+ "48502, 'a1319ff90e994c8190a4fe6569d0822a', 0L, 338113L)]"))
+ self.assertEqual(set(aln.tId), {1})
+ # + 1, because bounds are inclusive, rather than exclusive
+ self.assertEqual(len3, (aln.referenceInfoTable[0].EndRow -
+ aln.referenceInfoTable[0].StartRow) + 1)
+ self.assertEqual(aln.referenceInfo('lambda_NEB3011'),
+ aln.referenceInfo(1))
+ # ask for the wrong one:
+ self.assertEqual(aln.referenceInfo('ecoliK12_pbi_March2013'),
+ None)
+
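Since StartRow and EndRow in the referenceInfoTable are inclusive bounds, the record count for a reference is EndRow - StartRow + 1 (e.g. 338113 - 0 + 1 == 338114 above); a tiny hypothetical helper:

    def rows_for_reference(info):
        # info is one row of referenceInfoTable; bounds are inclusive.
        return info.EndRow - info.StartRow + 1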
+ @unittest.skipIf(not _internal_data(),
+ "Internal data not available")
+ def test_two_ref_cmpH5(self):
+ cmp1 = upstreamdata.getBamAndCmpH5()[1]
+ cmp2 = ("/pbi/dept/secondary/siv/testdata/"
+ "genomic_consensus-unittest/bam_c4p6_tests/"
+ "ecoli_c4p6.cmp.h5")
+ len1 = len(AlignmentSet(cmp1))
+ len2 = len(AlignmentSet(cmp2))
+ aln = AlignmentSet(cmp1, cmp2)
+ len3 = len(aln)
+ self.assertEqual(len1 + len2, len3)
+ self.assertEqual(len3, 57147)
+ self.assertEqual(
+ str(aln.referenceInfoTable),
+ ("[ (0, 0, 'ecoliK12_pbi_March2013', 'ecoliK12_pbi_March2013', "
+ "4642522, '52cd7c5fa92877152fa487906ae484c5', 0L, 57034L)\n"
+ " (1, 1, 'lambda_NEB3011', 'lambda_NEB3011', 48502, "
+ "'a1319ff90e994c8190a4fe6569d0822a', 57035L, 57146L)]"))
+ self.assertEqual(set(aln.tId), {0, 1})
+ # + 1, because bounds are inclusive, rather than exclusive
+ self.assertEqual(len1, (aln.referenceInfoTable[1].EndRow -
+ aln.referenceInfoTable[1].StartRow) + 1)
+ self.assertEqual(len2, (aln.referenceInfoTable[0].EndRow -
+ aln.referenceInfoTable[0].StartRow) + 1)
+ self.assertEqual(aln.referenceInfo('ecoliK12_pbi_March2013'),
+ aln.referenceInfo(0))
+ self.assertEqual(aln.referenceInfo('lambda_NEB3011'),
+ aln.referenceInfo(1))
+
+
+ @unittest.skipIf(not _internal_data(),
+ "Internal data not available")
+ def test_two_bam(self):
+ cmp1 = ("/pbi/dept/secondary/siv/testdata/SA3-RS/ecoli/"
+ "2590953/0001/Alignment_Results/"
+ "m140913_005018_42139_c100713652400000001823152"
+ "404301534_s1_p0.1.aligned.bam")
+ cmp2 = ("/pbi/dept/secondary/siv/testdata/SA3-RS/ecoli/"
+ "2590953/0001/Alignment_Results/"
+ "m140913_005018_42139_c100713652400000001823152"
+ "404301534_s1_p0.2.aligned.bam")
+ len1 = len(AlignmentSet(cmp1))
+ len2 = len(AlignmentSet(cmp2))
+ aln = AlignmentSet(cmp1, cmp2)
+ len3 = len(aln)
+ self.assertEqual(len1 + len2, len3)
+ self.assertEqual(len3, 65346)
+ self.assertEqual(
+ str(aln.referenceInfoTable),
+ ("[ (0, 0, 'ecoliK12_pbi_March2013', 'ecoliK12_pbi_March2013', "
+ "4642522, '52cd7c5fa92877152fa487906ae484c5', 0L, 0L)]"))
+ self.assertEqual(set(aln.tId), {0})
+ self.assertEqual(aln.referenceInfo('ecoliK12_pbi_March2013'),
+ aln.referenceInfo(0))
+
+ @unittest.skipIf(not _internal_data(),
+ "Internal data not available")
+ def test_two_xml(self):
+ cmp1 = ("/pbi/dept/secondary/siv/testdata/"
+ "SA3-DS/ecoli/2590953/0001/Alignment_Results/"
+ "m140913_005018_42139_c1007136524000000018231"
+ "52404301534_s1_p0.all.alignmentset.xml")
+ cmp2 = ("/pbi/dept/secondary/siv/testdata/"
+ "SA3-DS/ecoli/2590956/0003/Alignment_Results/"
+ "m140913_222218_42240_c1006999524000000018231"
+ "39203261564_s1_p0.all.alignmentset.xml")
+ len1 = len(AlignmentSet(cmp1))
+ len2 = len(AlignmentSet(cmp2))
+ aln = AlignmentSet(cmp1, cmp2)
+ len3 = len(aln)
+ self.assertEqual(len1 + len2, len3)
+ self.assertEqual(len3, 160264)
+ self.assertEqual(
+ str(aln.referenceInfoTable),
+ ("[ (0, 0, 'ecoliK12_pbi_March2013', 'ecoliK12_pbi_March2013', "
+ "4642522, '52cd7c5fa92877152fa487906ae484c5', 0L, 0L)]"))
+ self.assertEqual(set(aln.tId), {0})
+ self.assertEqual(aln.referenceInfo('ecoliK12_pbi_March2013'),
+ aln.referenceInfo(0))
+
+ @unittest.skipIf(not _internal_data(),
+ "Internal data not available")
+ def test_two_ref_bam(self):
+ cmp1 = upstreamdata.getBamAndCmpH5()[0]
+ # this is supposedly the same data as above:
+ cmp2 = ("/pbi/dept/secondary/siv/testdata/"
+ "SA3-DS/ecoli/2590956/0003/Alignment_Results/"
+ "m140913_222218_42240_c1006999524000000018231"
+ "39203261564_s1_p0.all.alignmentset.xml")
+ len1 = len(AlignmentSet(cmp1))
+ len2 = len(AlignmentSet(cmp2))
+ aln = AlignmentSet(cmp1, cmp2)
+ len3 = len(aln)
+ self.assertEqual(len1 + len2, len3)
+ self.assertEqual(len3, 57344)
+ # TODO(mdsmith)(2016-01-25) I would like to be able to use the startrow
+ # and endrow fields for bams someday...
+ self.assertEqual(
+ str(aln.referenceInfoTable),
+ ("[ (0, 0, 'ecoliK12_pbi_March2013', 'ecoliK12_pbi_March2013', "
+ "4642522, '52cd7c5fa92877152fa487906ae484c5', 0L, 0L)\n"
+ " (1, 1, 'lambda_NEB3011', 'lambda_NEB3011', 48502, "
+ "'a1319ff90e994c8190a4fe6569d0822a', 0L, 0L)]"))
+ self.assertEqual(set(aln.tId), {0, 1})
+ self.assertEqual(aln.referenceInfo('ecoliK12_pbi_March2013'),
+ aln.referenceInfo(0))
+ self.assertEqual(aln.referenceInfo('lambda_NEB3011'),
+ aln.referenceInfo(1))
+
+ @unittest.skipIf(not _internal_data(),
+ "Internal data not available")
+ def test_two_ref_three_bam(self):
+ # Here we test whether references duplicated across otherwise
+ # non-identical reference sets remain duplicates or are collapsed
+ cmp1 = upstreamdata.getBamAndCmpH5()[0]
+ # this is supposedly the same data as above:
+ cmp2 = ("/pbi/dept/secondary/siv/testdata/"
+ "SA3-DS/ecoli/2590956/0003/Alignment_Results/"
+ "m140913_222218_42240_c1006999524000000018231"
+ "39203261564_s1_p0.all.alignmentset.xml")
+ cmp3 = ("/pbi/dept/secondary/siv/testdata/"
+ "SA3-DS/ecoli/2590953/0001/Alignment_Results/"
+ "m140913_005018_42139_c1007136524000000018231"
+ "52404301534_s1_p0.all.alignmentset.xml")
+ len1 = len(AlignmentSet(cmp1))
+ len2 = len(AlignmentSet(cmp2))
+ len3 = len(AlignmentSet(cmp3))
+ aln = AlignmentSet(cmp1, cmp2, cmp3)
+ len4 = len(aln)
+ self.assertEqual(len1 + len2 + len3, len4)
+ self.assertEqual(len4, 160376)
+ self.assertEqual(
+ str(aln.referenceInfoTable),
+ ("[ (0, 0, 'ecoliK12_pbi_March2013', 'ecoliK12_pbi_March2013', "
+ "4642522, '52cd7c5fa92877152fa487906ae484c5', 0L, 0L)\n"
+ " (1, 1, 'lambda_NEB3011', 'lambda_NEB3011', 48502, "
+ "'a1319ff90e994c8190a4fe6569d0822a', 0L, 0L)]"))
+ self.assertEqual(set(aln.tId), {0, 1})
+ self.assertEqual(aln.referenceInfo('ecoliK12_pbi_March2013'),
+ aln.referenceInfo(0))
+ self.assertEqual(aln.referenceInfo('lambda_NEB3011'),
+ aln.referenceInfo(1))
+
+ @unittest.skipIf(not _internal_data(),
+ "Internal data not available")
+ def test_movie_filter(self):
+ # unaligned bam
+ bam0 = ("/pbi/dept/secondary/siv/testdata/"
+ "SA3-DS/ecoli/2590956/0003/"
+ "Analysis_Results/m140913_222218_42240_c10069"
+ "9952400000001823139203261564_s1_p0.all.subreadset.xml")
+ bam1 = ("/pbi/dept/secondary/siv/testdata/"
+ "SA3-DS/ecoli/2590953/0001/"
+ "Analysis_Results/m140913_005018_42139_c10071"
+ "3652400000001823152404301534_s1_p0.all.subreadset.xml")
+ aln = SubreadSet(bam0, bam1)
+ self.assertEqual(len(set(aln.readGroupTable['ID'])),
+ len(aln.readGroupTable['ID']))
+ self.assertEqual(len(set(aln.readGroupTable['ID'])), 2)
+ self.assertEqual(len(set(aln.readGroupTable['ID'])),
+ len(set(aln.index.qId)))
+ self.assertEqual(len(aln), 178570)
+ aln.filters.addRequirement(movie=[(
+ '=',
+ 'm140913_005018_42139_c100713652400000001823152404301534_s1_p0')])
+ self.assertEqual(len(SubreadSet(bam1)), len(aln))
+
+ # aligned bam
+ #bam0 = ("/pbi/dept/secondary/siv/testdata/"
+ # "SA3-DS/ecoli/2590956/0003/Alignment_Results/"
+ # "m140913_222218_42240_c1006999524000000018231"
+ # "39203261564_s1_p0.all.alignmentset.xml")
+ bam0 = upstreamdata.getBamAndCmpH5()[0]
+ bam1 = ("/pbi/dept/secondary/siv/testdata/"
+ "SA3-DS/ecoli/2590953/0001/Alignment_Results/"
+ "m140913_005018_42139_c1007136524000000018231"
+ "52404301534_s1_p0.all.alignmentset.xml")
+ aln = AlignmentSet(bam0, bam1)
+ self.assertEqual(len(set(aln.readGroupTable['ID'])),
+ len(aln.readGroupTable['ID']))
+ self.assertEqual(len(set(aln.readGroupTable['ID'])), 2)
+ self.assertEqual(len(set(aln.readGroupTable['ID'])),
+ len(set(aln.index.qId)))
+ self.assertEqual(len(aln), 103144)
+ aln.filters.addRequirement(movie=[(
+ '=',
+ 'm140913_005018_42139_c100713652400000001823152404301534_s1_p0')])
+ self.assertEqual(len(AlignmentSet(bam1)), len(aln))
+
+ # cmpH5
+ cmp1 = upstreamdata.getBamAndCmpH5()[1]
+ cmp2 = ("/pbi/dept/secondary/siv/testdata/"
+ "genomic_consensus-unittest/bam_c4p6_tests/"
+ "ecoli_c4p6.cmp.h5")
+ aln = AlignmentSet(cmp1, cmp2)
+ self.assertEqual(len(set(aln.readGroupTable['ID'])),
+ len(aln.readGroupTable['ID']))
+ self.assertEqual(len(set(aln.readGroupTable['ID'])),
+ len(set(aln.index.MovieID)))
+ self.assertEqual(len(set(aln.readGroupTable['ID'])), 2)
+ self.assertEqual(len(aln), 57147)
+ aln.filters.addRequirement(movie=[(
+ '=',
+ 'm140905_042212_sidney_c100564852550000001823085912221377_s1_X0')])
+ len1 = len(AlignmentSet(cmp1))
+ self.assertEqual(len1, len(aln))
+
+ aln.filters.removeRequirement('movie')
+ self.assertEqual(len(aln), 57147)
+
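The movie filtering pattern used above, restated as a small sketch built only from calls this test makes (AlignmentSet, filters.addRequirement, filters.removeRequirement); the helper name is illustrative:

    from pbcore.io import AlignmentSet

    def count_records_for_movie(paths, movie_name):
        # Open one or more alignment files, restrict to a single movie,
        # count, then drop the requirement so the set is unfiltered again.
        aln = AlignmentSet(*paths)
        aln.filters.addRequirement(movie=[('=', movie_name)])
        n = len(aln)
        aln.filters.removeRequirement('movie')
        return n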
+ def test_exceptions(self):
+ try:
+ raise InvalidDataSetIOError("Wrong!")
+ except InvalidDataSetIOError as e:
+ self.assertEqual(e.message, "Wrong!")
+
+ def test_createdAt(self):
+ aln = AlignmentSet(data.getXml(8))
+ self.assertEqual(aln.createdAt, '2015-08-05T10:25:18')
+
+ def test_divideKeys_keysToRanges(self):
+ keys = [0, 1, 2, 3, 5, 8, 50]
+ res = divideKeys(keys, 0)
+ self.assertEqual(res, [])
+ res = keysToRanges(res)
+ self.assertEqual(res, [])
+
+ res = divideKeys(keys, 1)
+ self.assertEqual(res, [[0, 1, 2, 3, 5, 8, 50]])
+ res = keysToRanges(res)
+ self.assertEqual(res, [[0, 50]])
+
+ res = divideKeys(keys, 2)
+ self.assertEqual(res, [[0, 1, 2], [3, 5, 8, 50]])
+ res = keysToRanges(res)
+ self.assertEqual(res, [[0, 2], [3, 50]])
+
+ res = divideKeys(keys, 3)
+ self.assertEqual(res, [[0, 1], [2, 3], [5, 8, 50]])
+ res = keysToRanges(res)
+ self.assertEqual(res, [[0, 1], [2, 3], [5, 50]])
+
+ res = divideKeys(keys, 7)
+ self.assertEqual(res, [[0], [1], [2], [3], [5], [8], [50]])
+ res = keysToRanges(res)
+ self.assertEqual(res, [[0, 0], [1, 1], [2, 2], [3, 3],
+ [5, 5], [8, 8], [50, 50]])
+
+ res = divideKeys(keys, 8)
+ self.assertEqual(res, [[0], [1], [2], [3], [5], [8], [50]])
+ res = keysToRanges(res)
+ self.assertEqual(res, [[0, 0], [1, 1], [2, 2], [3, 3],
+ [5, 5], [8, 8], [50, 50]])
+
+
+ keys = [0, 1, 2, 2, 3, 5, 8, 50, 50]
+ res = divideKeys(keys, 0)
+ self.assertEqual(res, [])
+ res = keysToRanges(res)
+ self.assertEqual(res, [])
+
+ res = divideKeys(keys, 1)
+ self.assertEqual(res, [[0, 1, 2, 2, 3, 5, 8, 50, 50]])
+ res = keysToRanges(res)
+ self.assertEqual(res, [[0, 50]])
+
+ res = divideKeys(keys, 2)
+ self.assertEqual(res, [[0, 1, 2, 2], [3, 5, 8, 50, 50]])
+ res = keysToRanges(res)
+ self.assertEqual(res, [[0, 2], [3, 50]])
+
+ res = divideKeys(keys, 3)
+ self.assertEqual(res, [[0, 1, 2], [2, 3, 5], [8, 50, 50]])
+ res = keysToRanges(res)
+ self.assertEqual(res, [[0, 2], [2, 5], [8, 50]])
+
+ res = divideKeys(keys, 9)
+ self.assertEqual(res, [[0], [1], [2], [2], [3], [5], [8], [50], [50]])
+ res = keysToRanges(res)
+ self.assertEqual(res, [[0, 0], [1, 1], [2, 2], [2, 2], [3, 3],
+ [5, 5], [8, 8], [50, 50], [50, 50]])
+
+ res = divideKeys(keys, 10)
+ self.assertEqual(res, [[0], [1], [2], [2], [3], [5], [8], [50], [50]])
+ res = keysToRanges(res)
+ self.assertEqual(res, [[0, 0], [1, 1], [2, 2], [2, 2], [3, 3],
+ [5, 5], [8, 8], [50, 50], [50, 50]])
+
+
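A minimal sketch consistent with the expectations above (not necessarily the real pbcore.io.dataset implementation): divideKeys splits a sorted key list into at most n chunks, with the remainder folded into the last chunk, and keysToRanges collapses each chunk to its inclusive [min, max] bounds:

    def divideKeys_sketch(keys, n):
        # Returns at most n chunks; the last chunk absorbs the remainder.
        if n < 1 or not keys:
            return []
        n = min(n, len(keys))
        size = len(keys) // n
        chunks = [keys[i * size:(i + 1) * size] for i in range(n - 1)]
        chunks.append(keys[(n - 1) * size:])
        return chunks

    def keysToRanges_sketch(chunks):
        # Keys are assumed sorted, so first/last give the inclusive bounds.
        return [[chunk[0], chunk[-1]] for chunk in chunks]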
+ @unittest.skipIf(not _internal_data(),
+ "Internal data not available")
+ def test_load_sts_from_extres(self):
+ # don't have a subreadset.xml with loaded sts.xml in testdata,
+ # fabricate one here:
+ ss = SubreadSet(data.getXml(10))
+ ss.externalResources[0].sts = ('/pbi/dept/secondary/siv/testdata/'
+ 'SA3-Sequel/lambda/roche_SAT/'
+ 'm54013_151205_032353.sts.xml')
+ outdir = tempfile.mkdtemp(suffix="dataset-unittest")
+ outXml = os.path.join(outdir, 'tempfile.xml')
+ ss.write(outXml)
+ ss = SubreadSet(outXml)
+ self.assertTrue(ss.metadata.summaryStats)
+ # test validation on write with loaded stats:
+ outXml = os.path.join(outdir, 'tempfileWithStats.xml')
+ ss.write(outXml, validate=False)
+ ss.write(outXml)
+
+ @unittest.skipIf(not _internal_data(),
+ "Internal data not available")
+ def test_fixed_bin_sts(self):
+ # don't have a subreadset.xml with loaded sts.xml in testdata,
+ # fabricate one here:
+ ss = SubreadSet(data.getXml(10))
+ ss.externalResources[0].sts = ('/pbi/dept/secondary/siv/testdata/'
+ 'pbreports-unittest/data/sts_xml/'
+ '3120134-r54009_20160323_173308-'
+ '1_A01-Bug30772/m54009_160323_'
+ '173323.sts.xml')
+ outdir = tempfile.mkdtemp(suffix="dataset-unittest")
+ outXml = os.path.join(outdir, 'tempfile.xml')
+ outXml2 = os.path.join(outdir, 'tempfile2.xml')
+ ss.write(outXml)
+ ss.write(outXml2)
+ ss = SubreadSet(outXml)
+ ss2 = SubreadSet(outXml2)
+ ss3 = ss + ss2
+ self.assertEqual(ss3.metadata.summaryStats.readLenDist.bins,
+ [b1 + b2 for b1, b2 in
+ zip(ss.metadata.summaryStats.readLenDist.bins,
+ ss2.metadata.summaryStats.readLenDist.bins)])
+
+ @unittest.skipIf(not _internal_data(),
+ "Internal data not available")
+ def test_missing_extres(self):
+ # copy a file with relative paths, rescue the ResourceIds
+ test_file = ('/pbi/dept/secondary/siv/testdata/'
+ 'SA3-Sequel/lambda/roche_SAT/'
+ 'm54013_151205_032353.subreadset.xml')
+ outdir = tempfile.mkdtemp(suffix="dataset-unittest")
+ outXml = os.path.join(outdir, 'tempfile.xml')
+ resXml = os.path.join(outdir, 'tempfile.rescued.xml')
+ sset = SubreadSet(test_file)
+
+ # record the original paths:
+ path_map = {}
+ recorder = lambda x, m=path_map: m.setdefault(os.path.split(x)[1], x)
+ sset._changePaths(recorder)
+
+ # make the paths relative and write out dataset with all missing:
+ sset.makePathsRelative(os.path.dirname(test_file))
+ sset.write(outXml, validate=False)
+
+ # check that it is broken:
+ with self.assertRaises(InvalidDataSetIOError):
+ sset = SubreadSet(outXml)
+
+ # check that rescuing fixes it:
+ replacer = lambda x, m=path_map: m[x]
+ sset = SubreadSet(outXml, skipMissing=True)
+ sset._changePaths(replacer)
+ sset.write(resXml, validate=False)
+ sset = SubreadSet(resXml)
+
+ # check that removing any one breaks it:
+ for key in path_map.keys():
+ mod_pmap = path_map.copy()
+
+ # remove a resourceId from the map:
+ mod_pmap.pop(key)
+ log.debug(key)
+ # use dict.get to maintain the breakage:
+ replacer = lambda x, m=mod_pmap: m.get(x, x)
+ sset = SubreadSet(outXml, skipMissing=True)
+ sset._changePaths(replacer)
+ sset.write(resXml, validate=False)
+ with self.assertRaises(InvalidDataSetIOError):
+ sset = SubreadSet(resXml)
+
+
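The rescue recipe this test walks through, condensed into a sketch that uses only the calls made above (SubreadSet with skipMissing=True, _changePaths, write with validate=False); original_paths is a caller-supplied map from the broken relative ResourceIds back to their absolute locations:

    from pbcore.io import SubreadSet

    def rescue_resource_ids(broken_xml, fixed_xml, original_paths):
        # skipMissing=True lets the dataset open even though its
        # ResourceIds currently point at files that do not exist.
        sset = SubreadSet(broken_xml, skipMissing=True)
        # Rewrite every ResourceId; unknown entries are left untouched.
        sset._changePaths(lambda p: original_paths.get(p, p))
        sset.write(fixed_xml, validate=False)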
+ @unittest.skip("Too expensive")
+ def test_huge_zmw_split(self):
+ human = ('/pbi/dept/secondary/siv/testdata/SA3-DS/'
+ 'human/JCV_85x_v030/jcv_85x_v030.subreadset.xml')
+ sset = SubreadSet(human)
+ ssets = sset.split(zmws=True, maxChunks=5)
+
diff --git a/tests/test_pbdataset_subtypes.py b/tests/test_pbdataset_subtypes.py
index bf8637e..7741b36 100644
--- a/tests/test_pbdataset_subtypes.py
+++ b/tests/test_pbdataset_subtypes.py
@@ -5,15 +5,18 @@ import unittest
import tempfile
import os
import itertools
+import numpy as np
+
+import pysam
from pbcore.util.Process import backticks
from pbcore.io.dataset.utils import (consolidateBams, _infixFname,
BamtoolsVersion)
from pbcore.io import (DataSet, SubreadSet, ConsensusReadSet,
- ReferenceSet, ContigSet, AlignmentSet,
+ ReferenceSet, ContigSet, AlignmentSet, BarcodeSet,
FastaReader, FastaWriter, IndexedFastaReader,
HdfSubreadSet, ConsensusAlignmentSet,
- openDataFile, FastaWriter)
+ openDataFile, FastaWriter, FastqReader)
import pbcore.data as upstreamData
import pbcore.data.datasets as data
from pbcore.io.dataset.DataSetValidator import validateXml
@@ -22,11 +25,6 @@ import xml.etree.ElementTree as ET
log = logging.getLogger(__name__)
def _check_constools():
- cmd = "dataset.py"
- o, r, m = backticks(cmd)
- if r != 2:
- return False
-
if not BamtoolsVersion().good:
log.warn("Bamtools not found or out of date")
return False
@@ -53,8 +51,8 @@ class TestDataSet(unittest.TestCase):
def test_subread_build(self):
- ds1 = SubreadSet(data.getXml(no=5))
- ds2 = SubreadSet(data.getXml(no=5))
+ ds1 = SubreadSet(data.getXml(no=5), skipMissing=True)
+ ds2 = SubreadSet(data.getXml(no=5), skipMissing=True)
self.assertEquals(type(ds1).__name__, 'SubreadSet')
self.assertEquals(ds1._metadata.__class__.__name__,
'SubreadSetMetadata')
@@ -64,7 +62,7 @@ class TestDataSet(unittest.TestCase):
self.assertEquals(len(ds2.metadata.collections), 1)
ds3 = ds1 + ds2
self.assertEquals(len(ds3.metadata.collections), 2)
- ds4 = SubreadSet(data.getSubreadSet())
+ ds4 = SubreadSet(data.getSubreadSet(), skipMissing=True)
self.assertEquals(type(ds4).__name__, 'SubreadSet')
self.assertEquals(type(ds4._metadata).__name__, 'SubreadSetMetadata')
self.assertEquals(len(ds4.metadata.collections), 1)
@@ -136,11 +134,33 @@ class TestDataSet(unittest.TestCase):
for name in names:
self.assertTrue(ds[name].id == name)
+ def test_contigset_split(self):
+ ref = ReferenceSet(data.getXml(9))
+ exp_n_contigs = len(ref)
+ refs = ref.split(10)
+ self.assertEqual(len(refs), 10)
+ obs_n_contigs = 0
+ for r in refs:
+ obs_n_contigs += sum(1 for _ in r)
+ self.assertEqual(obs_n_contigs, exp_n_contigs)
+
+
+ def test_contigset_len(self):
+ ref = ReferenceSet(data.getXml(9))
+ exp_n_contigs = len(ref)
+ refs = ref.split(10)
+ self.assertEqual(len(refs), 10)
+ obs_n_contigs = 0
+ for r in refs:
+ obs_n_contigs += len(r)
+ self.assertEqual(obs_n_contigs, exp_n_contigs)
+
+
def test_ccsread_build(self):
- ds1 = ConsensusReadSet(data.getXml(2), strict=False)
+ ds1 = ConsensusReadSet(data.getXml(2), strict=False, skipMissing=True)
self.assertEquals(type(ds1).__name__, 'ConsensusReadSet')
self.assertEquals(type(ds1._metadata).__name__, 'SubreadSetMetadata')
- ds2 = ConsensusReadSet(data.getXml(2), strict=False)
+ ds2 = ConsensusReadSet(data.getXml(2), strict=False, skipMissing=True)
self.assertEquals(type(ds2).__name__, 'ConsensusReadSet')
self.assertEquals(type(ds2._metadata).__name__, 'SubreadSetMetadata')
@@ -161,7 +181,8 @@ class TestDataSet(unittest.TestCase):
ds1.write(fn)
def test_ccsalignment_build(self):
- ds1 = ConsensusAlignmentSet(data.getXml(20), strict=False)
+ ds1 = ConsensusAlignmentSet(data.getXml(20), strict=False,
+ skipMissing=True)
self.assertEquals(type(ds1).__name__, 'ConsensusAlignmentSet')
self.assertEquals(type(ds1._metadata).__name__, 'SubreadSetMetadata')
# XXX strict=True requires actual existing .bam files
@@ -223,10 +244,10 @@ class TestDataSet(unittest.TestCase):
def test_contigset_build(self):
- ds1 = ContigSet(data.getXml(3))
+ ds1 = ContigSet(data.getXml(3), skipMissing=True)
self.assertEquals(type(ds1).__name__, 'ContigSet')
self.assertEquals(type(ds1._metadata).__name__, 'ContigSetMetadata')
- ds2 = ContigSet(data.getXml(3))
+ ds2 = ContigSet(data.getXml(3), skipMissing=True)
self.assertEquals(type(ds2).__name__, 'ContigSet')
self.assertEquals(type(ds2._metadata).__name__, 'ContigSetMetadata')
for contigmd in ds2.metadata.contigs:
@@ -240,7 +261,22 @@ class TestDataSet(unittest.TestCase):
self.assertEqual(len(aln.toExternalFiles()), 2)
outdir = tempfile.mkdtemp(suffix="dataset-unittest")
outfn = os.path.join(outdir, 'merged.bam')
- consolidateBams(aln.toExternalFiles(), outfn, filterDset=aln)
+ consolidateBams(aln.toExternalFiles(), outfn, filterDset=aln,
+ useTmp=False)
+ self.assertTrue(os.path.exists(outfn))
+ consAln = AlignmentSet(outfn)
+ self.assertEqual(len(consAln.toExternalFiles()), 1)
+ for read1, read2 in zip(sorted(list(aln)), sorted(list(consAln))):
+ self.assertEqual(read1, read2)
+ self.assertEqual(len(aln), len(consAln))
+
+ log.debug("Test methods directly in tmp")
+ aln = AlignmentSet(data.getXml(12))
+ self.assertEqual(len(aln.toExternalFiles()), 2)
+ outdir = tempfile.mkdtemp(suffix="dataset-unittest")
+ outfn = os.path.join(outdir, 'merged.bam')
+ consolidateBams(aln.toExternalFiles(), outfn, filterDset=aln,
+ useTmp=True)
self.assertTrue(os.path.exists(outfn))
consAln = AlignmentSet(outfn)
self.assertEqual(len(consAln.toExternalFiles()), 1)
@@ -326,16 +362,17 @@ class TestDataSet(unittest.TestCase):
self.assertEqual(read1, read2)
self.assertEqual(len(list(aln)), len(list(nonCons)))
- log.debug("Test cli")
- outdir = tempfile.mkdtemp(suffix="dataset-unittest")
- datafile = os.path.join(outdir, "merged.bam")
- xmlfile = os.path.join(outdir, "merged.xml")
- cmd = "dataset.py consolidate {i} {d} {x}".format(i=data.getXml(12),
- d=datafile,
- x=xmlfile)
- log.debug(cmd)
- o, r, m = backticks(cmd)
- self.assertEqual(r, 0)
+ def test_accuracy_filter(self):
+ aln = AlignmentSet(data.getXml(12))
+ self.assertEqual(len(list(aln)), 177)
+ aln.filters.addRequirement(accuracy=[('>', '.85')])
+ self.assertEqual(len(list(aln)), 174)
+
+ def test_contigset_filter(self):
+ ref = ReferenceSet(data.getXml(9))
+ self.assertEqual(len(list(ref)), 59)
+ ref.filters.addRequirement(length=[('>', '1450')])
+ self.assertEqual(len(list(ref)), 34)
@unittest.skipIf(not _check_constools() or not _internal_data(),
"bamtools, pbindex or data not found, skipping")
@@ -360,15 +397,6 @@ class TestDataSet(unittest.TestCase):
self.assertEqual(read1, read2)
self.assertEqual(len(aln), len(nonCons))
- log.debug("Test cli")
- outdir = tempfile.mkdtemp(suffix="dataset-unittest")
- datafile = os.path.join(outdir, "merged.bam")
- xmlfile = os.path.join(outdir, "merged.xml")
- cmd = "dataset.py consolidate --numFiles 2 {i} {d} {x}".format(
- i=testFile, d=datafile, x=xmlfile)
- log.debug(cmd)
- o, r, m = backticks(cmd)
- self.assertEqual(r, 0)
@unittest.skipIf(not _check_constools(),
"bamtools or pbindex not found, skipping")
@@ -435,7 +463,10 @@ class TestDataSet(unittest.TestCase):
exp_single_seqs = [rec.sequence for rec in exp_singles]
acc_file = ContigSet(outFas1, outFas2)
+ acc_file.induceIndices()
log.debug(acc_file.toExternalFiles())
+ self.assertEqual(len(acc_file), 4)
+ self.assertEqual(len(list(acc_file)), 4)
acc_file.consolidate()
log.debug(acc_file.toExternalFiles())
@@ -445,6 +476,47 @@ class TestDataSet(unittest.TestCase):
self.assertEqual(acc_file.get_contig(double).sequence[:],
exp_double_seq)
+ self.assertEqual(len(acc_file._openReaders), 1)
+ self.assertEqual(len(acc_file.index), 3)
+ self.assertEqual(len(acc_file._indexMap), 3)
+ self.assertEqual(len(acc_file), 3)
+ self.assertEqual(len(list(acc_file)), 3)
+
+ def test_contigset_consolidate_int_names(self):
+ # build set to merge
+ outdir = tempfile.mkdtemp(suffix="dataset-unittest")
+
+ inFas = os.path.join(outdir, 'infile.fasta')
+ outFas1 = os.path.join(outdir, 'tempfile1.fasta')
+ outFas2 = os.path.join(outdir, 'tempfile2.fasta')
+
+ # copy fasta reference to hide fai and ensure FastaReader is used
+ backticks('cp {i} {o}'.format(
+ i=ReferenceSet(data.getXml(9)).toExternalFiles()[0],
+ o=inFas))
+ rs1 = ContigSet(inFas)
+
+ double = 'B.cereus.1'
+ exp_double = rs1.get_contig(double)
+
+ # todo: modify the names first:
+ with FastaWriter(outFas1) as writer:
+ writer.writeRecord('5141', exp_double.sequence)
+ with FastaWriter(outFas2) as writer:
+ writer.writeRecord('5142', exp_double.sequence)
+
+ exp_double_seqs = [exp_double.sequence, exp_double.sequence]
+ exp_names = ['5141', '5142']
+
+ obs_file = ContigSet(outFas1, outFas2)
+ log.debug(obs_file.toExternalFiles())
+ obs_file.consolidate()
+ log.debug(obs_file.toExternalFiles())
+
+ # open obs and compare to exp
+ for name, seq in zip(exp_names, exp_double_seqs):
+ self.assertEqual(obs_file.get_contig(name).sequence[:], seq)
+
def test_contigset_consolidate_genomic_consensus(self):
"""
Verify that the contigs output by GenomicConsensus (e.g. quiver) can
@@ -459,20 +531,20 @@ class TestDataSet(unittest.TestCase):
files = []
for i, (header, seq) in enumerate([FASTA1, FASTA2, FASTA3]):
_files = []
- for suffix in ["", "|quiver", "|plurality", "|arrow"]:
+ for suffix in ["", "|quiver", "|plurality", "|arrow", "|poa"]:
tmpfile = tempfile.NamedTemporaryFile(suffix=".fasta").name
with open(tmpfile, "w") as f:
f.write(">{h}{s}\n{q}".format(h=header, s=suffix, q=seq))
_files.append(tmpfile)
files.append(_files)
for i in range(3):
- ds = ContigSet(*[ f[i] for f in files ])
+ ds = ContigSet(*[f[i] for f in files])
out1 = tempfile.NamedTemporaryFile(suffix=".contigset.xml").name
fa1 = tempfile.NamedTemporaryFile(suffix=".fasta").name
ds.consolidate(fa1)
ds.write(out1)
with ContigSet(out1) as ds_new:
- self.assertEqual(len([ rec for rec in ds_new ]), 1,
+ self.assertEqual(len([rec for rec in ds_new]), 1,
"failed on %d" % i)
def test_split_hdfsubreadset(self):
@@ -483,6 +555,52 @@ class TestDataSet(unittest.TestCase):
self.assertEqual(len(hdfdss[0].toExternalFiles()), 2)
self.assertEqual(len(hdfdss[1].toExternalFiles()), 1)
+ @unittest.skipIf(not _internal_data(),
+ "Internal data not found, skipping")
+ def test_len_fastq(self):
+ fn = ('/pbi/dept/secondary/siv/testdata/SA3-RS/'
+ 'lambda/2590980/0008/Analysis_Results/'
+ 'm141115_075238_ethan_c100699872550000001'
+ '823139203261572_s1_p0.1.subreads.fastq')
+ fq_out = tempfile.NamedTemporaryFile(suffix=".fastq").name
+ with open(fq_out, 'w') as fqh:
+ with open(fn, 'r') as fih:
+ for line in itertools.islice(fih, 24):
+ fqh.write(line)
+ cset = ContigSet(fq_out)
+ self.assertFalse(cset.isIndexed)
+ self.assertTrue(isinstance(cset.resourceReaders()[0], FastqReader))
+ self.assertEqual(sum(1 for _ in cset),
+ sum(1 for _ in FastqReader(fq_out)))
+ self.assertEqual(sum(1 for _ in cset), 6)
+ # XXX not possible, fastq files can't be indexed:
+ #self.assertEqual(len(cset), sum(1 for _ in cset))
+
+ @unittest.skipIf(not _internal_data(),
+ "Internal data not found, skipping")
+ def test_fastq_consolidate(self):
+ fn = ('/pbi/dept/secondary/siv/testdata/SA3-RS/'
+ 'lambda/2590980/0008/Analysis_Results/'
+ 'm141115_075238_ethan_c100699872550000001'
+ '823139203261572_s1_p0.1.subreads.fastq')
+ fq_out = tempfile.NamedTemporaryFile(suffix=".fastq").name
+ cfq_out = tempfile.NamedTemporaryFile(suffix=".fastq").name
+ with open(fq_out, 'w') as fqh:
+ with open(fn, 'r') as fih:
+ for line in itertools.islice(fih, 240):
+ fqh.write(line)
+ cset = ContigSet(fq_out)
+ cset_l = sum(1 for _ in cset)
+ self.assertEqual(cset_l, 60)
+ cset.filters.addRequirement(length=[('>', 1000)])
+ cset_l = sum(1 for _ in cset)
+ self.assertEqual(cset_l, 23)
+ cset.consolidate(cfq_out)
+ cset_l = sum(1 for _ in cset)
+ cfq = FastqReader(cfq_out)
+ self.assertEqual(cset_l, 23)
+ self.assertEqual(cset_l, sum(1 for _ in cfq))
+
def test_len(self):
# AlignmentSet
@@ -498,6 +616,8 @@ class TestDataSet(unittest.TestCase):
aln.updateCounts()
self.assertEqual(aln.totalLength, 123588)
self.assertEqual(aln.numRecords, 92)
+ self.assertEqual(sum(1 for _ in aln), 92)
+ self.assertEqual(sum(len(rec) for rec in aln), 123588)
# AlignmentSet with filters
aln = AlignmentSet(data.getXml(15), strict=True)
@@ -514,18 +634,18 @@ class TestDataSet(unittest.TestCase):
self.assertEqual(aln.numRecords, 40)
# AlignmentSet with cmp.h5
- aln = AlignmentSet(upstreamData.getCmpH5(), strict=True)
- self.assertEqual(len(aln), 84)
- self.assertEqual(aln._length, (84, 26103))
- self.assertEqual(aln.totalLength, 26103)
- self.assertEqual(aln.numRecords, 84)
+ aln = AlignmentSet(upstreamData.getBamAndCmpH5()[1], strict=True)
+ self.assertEqual(len(aln), 112)
+ self.assertEqual(aln._length, (112, 59970))
+ self.assertEqual(aln.totalLength, 59970)
+ self.assertEqual(aln.numRecords, 112)
aln.totalLength = -1
aln.numRecords = -1
self.assertEqual(aln.totalLength, -1)
self.assertEqual(aln.numRecords, -1)
aln.updateCounts()
- self.assertEqual(aln.totalLength, 26103)
- self.assertEqual(aln.numRecords, 84)
+ self.assertEqual(aln.totalLength, 59970)
+ self.assertEqual(aln.numRecords, 112)
# SubreadSet
@@ -541,6 +661,8 @@ class TestDataSet(unittest.TestCase):
sset.updateCounts()
self.assertEqual(sset.totalLength, 124093)
self.assertEqual(sset.numRecords, 92)
+ self.assertEqual(sum(1 for _ in sset), 92)
+ self.assertEqual(sum(len(rec) for rec in sset), 124093)
# HdfSubreadSet
# len means something else in bax/bas land. These numbers may actually
@@ -602,7 +724,7 @@ class TestDataSet(unittest.TestCase):
def test_nested_external_resources(self):
log.debug("Testing nested externalResources in AlignmentSets")
- aln = AlignmentSet(data.getXml(0))
+ aln = AlignmentSet(data.getXml(0), skipMissing=True)
self.assertTrue(aln.externalResources[0].pbi)
self.assertTrue(aln.externalResources[0].reference)
self.assertEqual(
@@ -611,7 +733,7 @@ class TestDataSet(unittest.TestCase):
self.assertEqual(aln.externalResources[0].scraps, None)
log.debug("Testing nested externalResources in SubreadSets")
- subs = SubreadSet(data.getXml(5))
+ subs = SubreadSet(data.getXml(5), skipMissing=True)
self.assertTrue(subs.externalResources[0].scraps)
self.assertEqual(
subs.externalResources[0].externalResources[0].metaType,
@@ -626,6 +748,11 @@ class TestDataSet(unittest.TestCase):
self.assertEqual(
subs.externalResources[0].externalResources[0].metaType,
'PacBio.SubreadFile.ScrapsBamFile')
+ subs.externalResources[0].barcodes = 'bc.fasta'
+ self.assertTrue(subs.externalResources[0].barcodes)
+ self.assertEqual(
+ subs.externalResources[0].externalResources[1].metaType,
+ "PacBio.DataSet.BarcodeSet")
log.debug("Testing adding nested externalResources to AlignmetnSet "
"manually")
@@ -670,8 +797,100 @@ class TestDataSet(unittest.TestCase):
self.assertEqual(ds[0].name, "lambda_NEB3011")
def test_alignmentset_index(self):
- aln = AlignmentSet(upstreamData.getCmpH5(), strict=True)
+ aln = AlignmentSet(upstreamData.getBamAndCmpH5()[1], strict=True)
reads = aln.readsInRange(aln.refNames[0], 0, 1000)
self.assertEqual(len(list(reads)), 2)
- self.assertEqual(len(list(aln)), 84)
- self.assertEqual(len(aln.index), 84)
+ self.assertEqual(len(list(aln)), 112)
+ self.assertEqual(len(aln.index), 112)
+
+ def test_barcodeset(self):
+ fa_out = tempfile.NamedTemporaryFile(suffix=".fasta").name
+ with open(fa_out, "w") as f:
+ f.write(">bc1\nAAAAAAAAAAAAAAAA\n>bc2\nCCCCCCCCCCCCCCCC")
+ ds = BarcodeSet(fa_out)
+ ds.induceIndices()
+ self.assertEqual([r.id for r in ds], ["bc1","bc2"])
+ ds_out = tempfile.NamedTemporaryFile(suffix=".barcodeset.xml").name
+ ds.write(ds_out)
+
+ @unittest.skipIf(not _internal_data(),
+ "Internal data not found, skipping")
+ def test_barcode_split_cornercases(self):
+ fn = ('/pbi/dept/secondary/siv/testdata/'
+ 'pblaa-unittest/Sequel/Phi29/m54008_160219_003234'
+ '.tiny.subreadset.xml')
+ sset = SubreadSet(fn)
+ ssets = sset.split(chunks=3, barcodes=True)
+ self.assertEqual([str(ss.filters) for ss in ssets],
+ ["( bc = [0, 0] )",
+ "( bc = [1, 1] )",
+ "( bc = [2, 2] )"])
+ sset = SubreadSet(fn)
+ self.assertEqual(len(sset), 15133)
+ sset.filters = None
+ self.assertEqual(str(sset.filters), "")
+ sset.updateCounts()
+ self.assertEqual(len(sset), 2667562)
+
+ sset.filters.addRequirement(bc=[('=', '[2, 2]')])
+ self.assertEqual(str(sset.filters), "( bc = [2, 2] )")
+ sset.updateCounts()
+ self.assertEqual(len(sset), 4710)
+
+ sset.filters = None
+ self.assertEqual(str(sset.filters), "")
+ sset.updateCounts()
+ self.assertEqual(len(sset), 2667562)
+
+ sset.filters.addRequirement(bc=[('=', '[2,2]')])
+ self.assertEqual(str(sset.filters), "( bc = [2,2] )")
+ sset.updateCounts()
+ self.assertEqual(len(sset), 4710)
+
+ def test_merged_contigset(self):
+ fn = tempfile.NamedTemporaryFile(suffix=".contigset.xml").name
+ with ContigSet(upstreamData.getLambdaFasta(),
+ upstreamData.getFasta()) as cset:
+ self.assertEqual(len(list(cset)), 49)
+ self.assertEqual(len(cset), 49)
+ cset.consolidate()
+ cset.write(fn)
+ log.debug("Writing to {f}".format(f=fn))
+ self.assertEqual(len(list(cset)), 49)
+ self.assertEqual(len(cset), 49)
+ with ContigSet(fn) as cset:
+ self.assertEqual(len(list(cset)), 49)
+ self.assertEqual(len(cset), 49)
+
+ def test_incorrect_len_getitem(self):
+ types = [AlignmentSet(data.getXml(8)),
+ ReferenceSet(data.getXml(9)),
+ SubreadSet(data.getXml(10)),
+ HdfSubreadSet(data.getXml(19))]
+ fn = tempfile.NamedTemporaryFile(suffix=".xml").name
+ for ds in types:
+ explen = -2
+ with openDataFile(ds.toExternalFiles()[0]) as mystery:
+ # try to avoid crashes...
+ explen = len(mystery)
+ mystery.numRecords = 1000000000
+ mystery.write(fn)
+ with openDataFile(fn) as mystery:
+ self.assertEqual(len(list(mystery)), explen)
+
+ def test_missing_fai_error_message(self):
+ outdir = tempfile.mkdtemp(suffix="dataset-unittest")
+
+ inFas = os.path.join(outdir, 'infile.fasta')
+
+ # copy fasta reference to hide fai and ensure FastaReader is used
+ backticks('cp {i} {o}'.format(
+ i=ReferenceSet(data.getXml(9)).toExternalFiles()[0],
+ o=inFas))
+ rs1 = ContigSet(inFas)
+ with self.assertRaises(IOError) as cm:
+ rs1.assertIndexed()
+ self.assertEqual(
+ str(cm.exception),
+ ( "Companion FASTA index (.fai) file not found or malformatted! "
+ "Use 'samtools faidx' to generate FASTA index."))
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/python-pbcore.git