[med-svn] [python-pbcore] 01/04: Imported Upstream version 1.2.11+dfsg
Afif Elghraoui
afif at moszumanska.debian.org
Thu Jan 26 08:28:50 UTC 2017
This is an automated email from the git hooks/post-receive script.
afif pushed a commit to branch master
in repository python-pbcore.
commit 114e57037464896e7e8bf654d91f7e489f694278
Author: Afif Elghraoui <afif at debian.org>
Date: Wed Jan 25 22:47:43 2017 -0800
Imported Upstream version 1.2.11+dfsg
---
.circleci/installHDF5.sh | 2 +-
CHANGELOG.org | 5 +
Makefile | 5 +
pbcore/__init__.py | 2 +-
pbcore/chemistry/resources/mapping.xml | 60 +-
pbcore/data/__init__.py | 3 +
pbcore/data/empty.aligned_subreads.bam | Bin 0 -> 728 bytes
pbcore/data/empty.aligned_subreads.bam.pbi | Bin 0 -> 67 bytes
pbcore/io/GffIO.py | 70 +-
pbcore/io/align/BamIO.py | 26 +-
pbcore/io/align/CmpH5IO.py | 6 +-
pbcore/io/align/PacBioBamIndex.py | 10 +-
pbcore/io/dataset/DataSetErrors.py | 34 +
pbcore/io/dataset/DataSetIO.py | 339 +++---
pbcore/io/dataset/DataSetMembers.py | 850 +++++++++------
pbcore/io/dataset/DataSetMetaTypes.py | 86 ++
pbcore/io/dataset/DataSetReader.py | 46 +-
pbcore/{__init__.py => io/dataset/DataSetUtils.py} | 61 +-
pbcore/io/dataset/DataSetValidator.py | 41 +-
pbcore/io/dataset/DataSetWriter.py | 43 +-
pbcore/io/dataset/__init__.py | 35 +
pbcore/io/dataset/utils.py | 172 ++-
requirements-dev.txt | 2 +-
tests/test_pbcore_io_AlnFileReaders.py | 12 +-
tests/test_pbcore_io_GffIO.py | 11 +
tests/test_pbdataset.py | 1090 ++++----------------
tests/test_pbdataset_filters.py | 373 +++++++
tests/test_pbdataset_metadata.py | 114 ++
tests/test_pbdataset_split.py | 506 +++++++++
tests/test_pbdataset_subtypes.py | 205 ++--
tests/test_pbdataset_utils.py | 183 ++++
tests/utils.py | 40 +
32 files changed, 2842 insertions(+), 1590 deletions(-)
diff --git a/.circleci/installHDF5.sh b/.circleci/installHDF5.sh
index 3e7ab39..15b9049 100644
--- a/.circleci/installHDF5.sh
+++ b/.circleci/installHDF5.sh
@@ -1,7 +1,7 @@
set -x
set -e
if [ ! -e prefix/lib/libhdf5.so ]; then
- wget https://www.hdfgroup.org/ftp/HDF5//releases/hdf5-1.8.12/src/hdf5-1.8.12.tar.gz
+ wget https://support.hdfgroup.org/ftp/HDF5/releases/hdf5-1.8.12/src/hdf5-1.8.12.tar.gz
tar xzf hdf5-1.8.12.tar.gz
mkdir -p prefix
PREFIX=$PWD/prefix
diff --git a/CHANGELOG.org b/CHANGELOG.org
index daccd48..baf6708 100644
--- a/CHANGELOG.org
+++ b/CHANGELOG.org
@@ -1,3 +1,8 @@
+* Version 1.2.11
+ - Support for 3.2 basecaller and chemistry
+ - GFF bug fixes
+ - Misc fixes and enhancements
+
* Version 1.2.10 (> SMRTanalysis 3.1)
- Update to pysam 0.9.0
- Recognize new and prospective future partnumbers for the 3.1 timeframe
diff --git a/Makefile b/Makefile
index 64e10c7..2ea1010 100644
--- a/Makefile
+++ b/Makefile
@@ -50,3 +50,8 @@ pip-install:
&& pip uninstall -y pbcore \
|| echo -n ''
@pip install --no-index ./
+
+
+publish-to-pypi:
+ @echo "I'm not going to do this for you, but you can do it by:"
+ @echo " % python setup.py sdist upload -r pypi"
diff --git a/pbcore/__init__.py b/pbcore/__init__.py
index 7375688..3e5263b 100644
--- a/pbcore/__init__.py
+++ b/pbcore/__init__.py
@@ -28,4 +28,4 @@
# POSSIBILITY OF SUCH DAMAGE.
#################################################################################
-__VERSION__ = "1.2.10"
+__VERSION__ = "1.2.11"
diff --git a/pbcore/chemistry/resources/mapping.xml b/pbcore/chemistry/resources/mapping.xml
index 0f7e95b..528b7a4 100644
--- a/pbcore/chemistry/resources/mapping.xml
+++ b/pbcore/chemistry/resources/mapping.xml
@@ -175,6 +175,8 @@
<SequencingKit>100612400</SequencingKit>
<SoftwareVersion>2.3</SoftwareVersion>
</Mapping>
+
+ <!-- 3.0 ("Dromedary") chemistry -->
<Mapping>
<SequencingChemistry>S/P1-C1/beta</SequencingChemistry>
<BindingKit>100-619-300</BindingKit>
@@ -188,71 +190,41 @@
<SoftwareVersion>3.1</SoftwareVersion>
</Mapping>
- <!-- 1.0/1.1: a 1/1 compabible new SK condition -->
+ <!-- 3.1 ("Echidna") chemistry -->
<Mapping>
- <SequencingChemistry>S/P1-C1</SequencingChemistry>
+ <SequencingChemistry>S/P1-C1.1</SequencingChemistry>
<BindingKit>100-619-300</BindingKit>
<SequencingKit>100-867-300</SequencingKit>
<SoftwareVersion>3.1</SoftwareVersion>
</Mapping>
- <!-- 1.1/1.1: a 1/1 compatible new BK and SK condition -->
<Mapping>
- <SequencingChemistry>S/P1-C1</SequencingChemistry>
- <BindingKit>100-867-500</BindingKit>
+ <SequencingChemistry>S/P1-C1.1</SequencingChemistry>
+ <BindingKit>100-619-300</BindingKit>
<SequencingKit>100-867-300</SequencingKit>
- <SoftwareVersion>3.1</SoftwareVersion>
- </Mapping>
-
- <!-- 2.0/2.0: new kits... 1/1 compatible... -->
- <Mapping>
- <SequencingChemistry>S/P2-C2/prospective-compatible</SequencingChemistry>
- <BindingKit>100-864-400</BindingKit>
- <SequencingKit>100-864-100</SequencingKit>
- <SoftwareVersion>3.1</SoftwareVersion>
+ <SoftwareVersion>3.2</SoftwareVersion>
</Mapping>
- <!-- 2.1/2.1, ibid -->
+ <!-- 3.1.1 ("Flea") chemistry -->
<Mapping>
- <SequencingChemistry>S/P2-C2/prospective-compatible</SequencingChemistry>
- <BindingKit>100-864-600</BindingKit>
- <SequencingKit>100-864-200</SequencingKit>
+ <SequencingChemistry>S/P1-C1.2</SequencingChemistry>
+ <BindingKit>100-619-300</BindingKit>
+ <SequencingKit>100-902-100</SequencingKit>
<SoftwareVersion>3.1</SoftwareVersion>
</Mapping>
- <!-- 3.2 basecaller support -->
<Mapping>
- <SequencingChemistry>S/P1-C1/beta</SequencingChemistry>
+ <SequencingChemistry>S/P1-C1.2</SequencingChemistry>
<BindingKit>100-619-300</BindingKit>
- <SequencingKit>100-620-000</SequencingKit>
+ <SequencingKit>100-902-100</SequencingKit>
<SoftwareVersion>3.2</SoftwareVersion>
</Mapping>
+ <!---3.2 ("Goat") chemistry -->
<Mapping>
- <SequencingChemistry>S/P1-C1</SequencingChemistry>
+ <SequencingChemistry>S/P1-C1.3</SequencingChemistry>
<BindingKit>100-619-300</BindingKit>
- <SequencingKit>100-867-300</SequencingKit>
- <SoftwareVersion>3.2</SoftwareVersion>
- </Mapping>
-
- <Mapping>
- <SequencingChemistry>S/P1-C1</SequencingChemistry>
- <BindingKit>100-867-500</BindingKit>
- <SequencingKit>100-867-300</SequencingKit>
- <SoftwareVersion>3.2</SoftwareVersion>
- </Mapping>
-
- <Mapping>
- <SequencingChemistry>S/P2-C2/prospective-compatible</SequencingChemistry>
- <BindingKit>100-864-400</BindingKit>
- <SequencingKit>100-864-100</SequencingKit>
- <SoftwareVersion>3.2</SoftwareVersion>
- </Mapping>
-
- <Mapping>
- <SequencingChemistry>S/P2-C2/prospective-compatible</SequencingChemistry>
- <BindingKit>100-864-600</BindingKit>
- <SequencingKit>100-864-200</SequencingKit>
+ <SequencingKit>100-972-200</SequencingKit>
<SoftwareVersion>3.2</SoftwareVersion>
</Mapping>
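[The mappings above tie each (BindingKit, SequencingKit, SoftwareVersion) triple to a SequencingChemistry name. A rough sketch of loading such a table from mapping.xml — not pbcore's actual decoder, and assuming only the element names visible above with no XML namespaces:]

    # Sketch: build a {(binding_kit, sequencing_kit, software_version): chemistry}
    # lookup from a mapping.xml shaped like the hunks above. Illustrative only;
    # pbcore's real decoder lives in pbcore.chemistry and may differ.
    import xml.etree.ElementTree as ET

    def load_chemistry_map(mapping_xml):
        table = {}
        root = ET.parse(mapping_xml).getroot()
        for m in root.iter("Mapping"):
            triple = (m.findtext("BindingKit"),
                      m.findtext("SequencingKit"),
                      m.findtext("SoftwareVersion"))
            table[triple] = m.findtext("SequencingChemistry")
        return table

    # e.g. the 3.2 ("Goat") entry added above would give:
    # load_chemistry_map("mapping.xml")[("100-619-300", "100-972-200", "3.2")]
    #   -> "S/P1-C1.3"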
diff --git a/pbcore/data/__init__.py b/pbcore/data/__init__.py
index 433b5bf..eec7adf 100644
--- a/pbcore/data/__init__.py
+++ b/pbcore/data/__init__.py
@@ -171,3 +171,6 @@ def getUnalignedBam():
def getEmptyBam():
return _getAbsPath("empty.ccs.bam")
+
+def getEmptyAlignedBam():
+ return _getAbsPath("empty.aligned_subreads.bam")
diff --git a/pbcore/data/empty.aligned_subreads.bam b/pbcore/data/empty.aligned_subreads.bam
new file mode 100644
index 0000000..18f0531
Binary files /dev/null and b/pbcore/data/empty.aligned_subreads.bam differ
diff --git a/pbcore/data/empty.aligned_subreads.bam.pbi b/pbcore/data/empty.aligned_subreads.bam.pbi
new file mode 100644
index 0000000..e398d79
Binary files /dev/null and b/pbcore/data/empty.aligned_subreads.bam.pbi differ
diff --git a/pbcore/io/GffIO.py b/pbcore/io/GffIO.py
index a08f0b7..c7decaf 100644
--- a/pbcore/io/GffIO.py
+++ b/pbcore/io/GffIO.py
@@ -42,8 +42,9 @@ __all__ = [ "Gff3Record",
"GffWriter" ]
from .base import ReaderBase, WriterBase
-from collections import OrderedDict, defaultdict
+from collections import OrderedDict, defaultdict, namedtuple
from copy import copy as shallow_copy
+import logging
import tempfile
import os.path
@@ -295,46 +296,37 @@ def _merge_gff_headers(gff_files):
return headers, header_keys
-def _merge_gffs_sorted(gff1, gff2, out_file):
- import logging
- logging.warn("Merging %s and %s" % (gff1, gff2))
- with GffWriter(out_file) as out:
- n_rec = 0
- headers, header_keys = _merge_gff_headers([gff1, gff2])
- with GffReader(gff1) as f1:
- for key in header_keys:
- for value in headers[key]:
- out.writeHeader("##%s %s" % (key, value))
- with GffReader(gff2) as f2:
- rec1 = [ rec for rec in f1 ]
- rec2 = [ rec for rec in f2 ]
- i = j = 0
- while i < len(rec1) and j < len(rec2):
- if (rec1[i] > rec2[j]):
- out.writeRecord(rec2[j])
- j += 1
- else:
- out.writeRecord(rec1[i])
- i += 1
- for rec in rec1[i:]:
- out.writeRecord(rec)
- i += 1
- for rec in rec2[j:]:
- out.writeRecord(rec)
- j += 2
- return i + j
-
+GffHead = namedtuple("GffHead", ("file_name", "record", "headers"))
def merge_gffs_sorted(gff_files, output_file_name):
"""
Utility function to combine a set of N (>= 2) GFF files, with records
- ordered by genomic location. (Assuming each input file is already sorted.)
+ ordered by genomic location. This assumes that each input file is already
+ sorted, and forms a contiguous set of records.
"""
- if len(gff_files) == 1: return gff_files[0]
- while len(gff_files) > 2:
- tmpout = tempfile.NamedTemporaryFile(suffix=".gff").name
- _merge_gffs_sorted(gff_files[0], gff_files[1], tmpout)
- gff_files = [tmpout] + gff_files[2:]
- with open(output_file_name, "w") as f:
- _merge_gffs_sorted(gff_files[0], gff_files[1], f)
- return output_file_name
+ # collect the very first record of each GFF and use that to sort the files
+ first_records = []
+ empty_files = []
+ for file_name in gff_files:
+ with GffReader(file_name) as f:
+ for rec in f:
+ first_records.append(GffHead(file_name, rec, f.headers))
+ break
+ else:
+ empty_files.append(file_name)
+ first_records.sort(lambda a,b: cmp(a.record, b.record))
+ gff_files = [f.file_name for f in first_records]
+ gff_files.extend(empty_files)
+ headers, header_keys = _merge_gff_headers(gff_files)
+ nrec = 0
+ with GffWriter(output_file_name) as out:
+ for key in header_keys:
+ for value in headers[key]:
+ out.writeHeader("##%s %s" % (key, value))
+ for file_name in gff_files:
+ logging.info("Merging {f}".format(f=file_name))
+ with GffReader(file_name) as gff:
+ for rec in gff:
+ out.writeRecord(rec)
+ nrec += 1
+ return nrec
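[The rewritten merge_gffs_sorted drops the old pairwise merges through temporary files: it reads the first record of each input, orders whole files by that record (empty files go last), then streams each file's records into the output in that order — hence the new docstring's requirement that each input be sorted and cover a contiguous range. A stripped-down sketch of the same strategy, with read_records/write_record standing in for GffReader/GffWriter and records assumed comparable by genomic position:]

    # Sketch of merge-by-first-record: because each input is sorted and its
    # records do not interleave with another input's range, whole files can
    # be ordered by their first record and then copied out one after another.
    def merge_sorted_files(files, read_records, write_record):
        firsts, empties = [], []
        for fn in files:
            recs = iter(read_records(fn))
            try:
                firsts.append((next(recs), fn))
            except StopIteration:
                empties.append(fn)          # files with no records go last
        ordered = [fn for _, fn in sorted(firsts)] + empties
        n = 0
        for fn in ordered:
            for rec in read_records(fn):    # re-read each file from the start
                write_record(rec)
                n += 1
        return n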
diff --git a/pbcore/io/align/BamIO.py b/pbcore/io/align/BamIO.py
index a9e7fa7..f0d5d99 100644
--- a/pbcore/io/align/BamIO.py
+++ b/pbcore/io/align/BamIO.py
@@ -114,7 +114,6 @@ class _BamReaderBase(ReaderBase):
rgChem = decodeTriple(*triple)
rgReadType = ds["READTYPE"]
rgFrameRate = ds["FRAMERATEHZ"]
- readGroupTable_.append((rgID, rgName, rgReadType, rgChem, rgFrameRate))
# Look for the features manifest entries within the DS tag,
# and build an "indirection layer", i.e. to get from
@@ -130,13 +129,17 @@ class _BamReaderBase(ReaderBase):
self._baseFeatureNameMappings[rgID] = baseFeatureNameMapping
self._pulseFeatureNameMappings[rgID] = pulseFeatureNameMapping
+ readGroupTable_.append((rgID, rgName, rgReadType, rgChem, rgFrameRate,
+ frozenset(baseFeatureNameMapping.iterkeys())))
+
self._readGroupTable = np.rec.fromrecords(
readGroupTable_,
dtype=[("ID" , np.int32),
("MovieName" , "O"),
("ReadType" , "O"),
("SequencingChemistry", "O"),
- ("FrameRate", float)])
+ ("FrameRate", float),
+ ("BaseFeatures", "O")])
assert len(set(self._readGroupTable.ID)) == len(self._readGroupTable), \
"First 8 chars of read group IDs must be unique!"
@@ -176,19 +179,16 @@ class _BamReaderBase(ReaderBase):
def _checkFileCompatibility(self):
# Verify that this is a "pacbio" BAM file of version at least
# 3.0.1
- try:
- checkedVersion = self.version
- if "b" in checkedVersion:
- raise Exception()
- else:
- major, minor, patch = checkedVersion.split('.')
- assert major >= 3
- assert minor >= 0
- assert patch >= 1
- except:
- raise IncompatibleFile(
+ badVersionException = IncompatibleFile(
"This BAM file is incompatible with this API " +
"(only PacBio BAM files version >= 3.0.1 are supported)")
+ checkedVersion = self.version
+ if "b" in checkedVersion:
+ raise badVersionException
+ else:
+ major, minor, patch = checkedVersion.split('.')
+ if not (major, minor, patch) >= (3, 0, 1):
+ raise badVersionException
def __init__(self, fname, referenceFastaFname=None):
self.filename = fname = abspath(expanduser(fname))
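[The compatibility check now compares the split version against the tuple (3, 0, 1) rather than asserting on each component separately. Note that split('.') yields strings, and in Python 2 a string always compares greater than an int, so the numeric part of this comparison will always pass; a sketch of a check that converts the components first (an illustration of the apparent intent, not pbcore's code):]

    # Sketch: compare a dotted version string against a minimum as int tuples.
    # In Python 2, ('3', '0', '1') >= (3, 0, 1) is always True because str
    # compares greater than int; converting to integers avoids that pitfall.
    def version_at_least(version, minimum=(3, 0, 1)):
        if "b" in version:                   # beta versions rejected outright
            return False
        parts = tuple(int(x) for x in version.split("."))
        return parts >= minimum

    # version_at_least("3.0.1") -> True; version_at_least("2.9.9") -> False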
diff --git a/pbcore/io/align/CmpH5IO.py b/pbcore/io/align/CmpH5IO.py
index aa5e83b..2e18108 100755
--- a/pbcore/io/align/CmpH5IO.py
+++ b/pbcore/io/align/CmpH5IO.py
@@ -796,12 +796,14 @@ class CmpH5Reader(ReaderBase, IndexedAlignmentReaderMixin):
self._movieInfoTable.Name,
[self.readType] * len(self._movieInfoTable.ID),
self.sequencingChemistry,
- self._movieInfoTable.FrameRate),
+ self._movieInfoTable.FrameRate,
+ [frozenset(self.baseFeaturesAvailable())] * len(self._movieInfoTable.ID)),
dtype=[("ID" , np.int32),
("MovieName" , "O"),
("ReadType" , "O"),
("SequencingChemistry", "O"),
- ("FrameRate" , float)])
+ ("FrameRate" , float),
+ ("BaseFeatures" , "O")])
self._readGroupDict = { rg.ID : rg
for rg in self._readGroupTable }
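[Both the BAM and cmp.h5 readers now carry a per-read-group "BaseFeatures" column: a frozenset of available feature names stored in an object ("O") field of the read-group recarray. A minimal illustration of such a recarray; the field values below are made up:]

    import numpy as np

    # Sketch: a recarray whose last field holds arbitrary Python objects
    # (frozensets of feature names), mirroring the "BaseFeatures" column added
    # above. IDs, movie names, and feature names are illustrative only.
    records = [
        (1, "movie_a", "SUBREAD", "S/P1-C1.2", 80.0, frozenset(["Ipd", "PulseWidth"])),
        (2, "movie_b", "SUBREAD", "S/P1-C1.3", 80.0, frozenset(["Ipd"])),
    ]
    rgTable = np.rec.fromrecords(
        records,
        dtype=[("ID", np.int32),
               ("MovieName", "O"),
               ("ReadType", "O"),
               ("SequencingChemistry", "O"),
               ("FrameRate", float),
               ("BaseFeatures", "O")])
    # rgTable.BaseFeatures[0] -> frozenset(['Ipd', 'PulseWidth'])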
diff --git a/pbcore/io/align/PacBioBamIndex.py b/pbcore/io/align/PacBioBamIndex.py
index 89ae316..c959c4d 100644
--- a/pbcore/io/align/PacBioBamIndex.py
+++ b/pbcore/io/align/PacBioBamIndex.py
@@ -71,7 +71,15 @@ class PacBioBamIndex(object):
@property
def hasMappingInfo(self):
- return (self.pbiFlags & PBI_FLAGS_MAPPED)
+ # N.B.: Or'ing in (nReads==0) is HACKish fix for issue with
+ # empty mapped BAM files---the "Mapped" bit doesn't get set
+ # correctly in pbi_flags if the file is empty. So in the empty
+ # file case, assume it may be mapped. The downside is we
+ # don't get an exception on attempting to access the mapped
+ # columns for empty unmapped BAM pbis. Not a big deal, I
+ # suppose
+ return ((self.nReads == 0) or
+ (self.pbiFlags & PBI_FLAGS_MAPPED))
@property
def hasCoordinateSortedInfo(self):
diff --git a/pbcore/io/dataset/DataSetErrors.py b/pbcore/io/dataset/DataSetErrors.py
index 4695df9..1623475 100644
--- a/pbcore/io/dataset/DataSetErrors.py
+++ b/pbcore/io/dataset/DataSetErrors.py
@@ -1,3 +1,37 @@
+###############################################################################
+# Copyright (c) 2011-2016, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
+# NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+# OR
+# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+###############################################################################
+
+# Author: Martin D. Smith
+
class InvalidDataSetIOError(Exception):
"""The base class for all DataSetIO related custom exceptions
diff --git a/pbcore/io/dataset/DataSetIO.py b/pbcore/io/dataset/DataSetIO.py
index cbde652..3ad720a 100755
--- a/pbcore/io/dataset/DataSetIO.py
+++ b/pbcore/io/dataset/DataSetIO.py
@@ -1,9 +1,43 @@
+###############################################################################
+# Copyright (c) 2011-2016, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
+# NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+# OR
+# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+###############################################################################
+
+# Author: Martin D. Smith
+
+
"""
Classes representing DataSets of various types.
"""
import hashlib
-import datetime
import copy
import os, sys
import re
@@ -39,39 +73,23 @@ from pbcore.io.dataset.DataSetMembers import (DataSetMetadata,
ExternalResource, Filters)
from pbcore.io.dataset.utils import (consolidateBams, _infixFname, _pbindexBam,
_indexBam, _indexFasta, _fileCopy,
- _swapPath, which, consolidateXml)
+ _swapPath, which, consolidateXml,
+ getTimeStampedName)
from pbcore.io.dataset.DataSetErrors import (InvalidDataSetIOError,
ResourceMismatchError)
+from pbcore.io.dataset.DataSetMetaTypes import (DataSetMetaTypes, toDsId,
+ dsIdToSuffix)
+from pbcore.io.dataset.DataSetUtils import fileType
log = logging.getLogger(__name__)
-def filtered(generator):
- """Wrap a generator with postfiltering"""
- @wraps(generator)
- def wrapper(dset, *args, **kwargs):
- filter_tests = dset.processFilters()
- no_filter = dset.noFiltering
- if no_filter:
- for read in generator(dset, *args, **kwargs):
- yield read
- else:
- for read in generator(dset, *args, **kwargs):
- if any(filt(read) for filt in filter_tests):
- yield read
- return wrapper
-
-def _getTimeStampedName(mType):
- """Generate a timestamped name using the given metatype 'mType' and the
- current UTC time"""
- mType = mType.lower()
- mType = '_'.join(mType.split('.'))
- time = datetime.datetime.utcnow().strftime("%y%m%d_%H%M%S%f")[:-3]
- return "{m}-{t}".format(m=mType, t=time)
-
-def _toDsId(name):
- """Translate a class name into a MetaType/ID"""
- return "PacBio.DataSet.{x}".format(x=name)
+def openDataSet(*files, **kwargs):
+ """Factory function for DataSet types as suggested by the first file"""
+ if files[0].endswith('xml'):
+ tbrType = _typeDataSet(files[0])
+ return tbrType(*files, **kwargs)
+ return openDataFile(*files, **kwargs)
def _dsIdToName(dsId):
"""Translate a MetaType/ID into a class name"""
@@ -88,23 +106,10 @@ def _dsIdToType(dsId):
else:
raise InvalidDataSetIOError("Invalid DataSet MetaType")
-def _dsIdToSuffix(dsId):
- """Translate a MetaType/ID into a file suffix"""
- dsIds = DataSetMetaTypes.ALL
- suffixMap = {dsId: _dsIdToName(dsId) for dsId in dsIds}
- suffixMap[_toDsId("DataSet")] = 'DataSet'
- if DataSetMetaTypes.isValid(dsId):
- suffix = suffixMap[dsId]
- suffix = suffix.lower()
- suffix += '.xml'
- return suffix
- else:
- raise InvalidDataSetIOError("Invalid DataSet MetaType")
-
def _typeDataSet(dset):
"""Determine the type of a dataset from the xml file without opening it"""
xml_rt = xmlRootType(dset)
- dsId = _toDsId(xml_rt)
+ dsId = toDsId(xml_rt)
tbrType = _dsIdToType(dsId)
return tbrType
@@ -116,39 +121,6 @@ def isDataSet(xmlfile):
except Exception:
return False
-def getDataSetUuid(xmlfile):
- """
- Quickly retrieve the uuid from the root element of a dataset XML file,
- using a streaming parser to avoid loading the entire dataset into memory.
- Returns None if the parsing fails.
- """
- try:
- import xml.etree.cElementTree as ET
- for event, element in ET.iterparse(xmlfile, events=("start",)):
- return element.get("UniqueId")
- except Exception:
- return None
-
-
-def getDataSetMetaType(xmlfile):
- """
- Quickly retrieve the MetaType from the root element of a dataset XML file,
- using a streaming parser to avoid loading the entire dataset into memory.
- Returns None if the parsing fails.
- """
- try:
- import xml.etree.cElementTree as ET
- for event, element in ET.iterparse(xmlfile, events=("start",)):
- return element.get("MetaType")
- except Exception:
- return None
-
-
-def openDataSet(*files, **kwargs):
- """Factory function for DataSet types as suggested by the first file"""
- tbrType = _typeDataSet(files[0])
- return tbrType(*files, **kwargs)
-
def openDataFile(*files, **kwargs):
"""Factory function for DataSet types determined by the first data file"""
possibleTypes = [AlignmentSet, SubreadSet, ConsensusReadSet,
@@ -158,10 +130,10 @@ def openDataFile(*files, **kwargs):
for dstype in possibleTypes:
for ftype in dstype._metaTypeMapping():
fileMap[ftype].append(dstype)
- ext = _fileType(files[0])
+ ext = fileType(files[0])
if ext == 'fofn':
files = openFofnFile(files[0])
- ext = _fileType(files[0])
+ ext = fileType(files[0])
if ext == 'xml':
dsType = _typeDataSet(files[0])
return dsType(*origFiles, **kwargs)
@@ -191,11 +163,40 @@ def openDataFile(*files, **kwargs):
else:
return SubreadSet(*origFiles, **kwargs)
+def filtered(generator):
+ """Wrap a generator with postfiltering"""
+ @wraps(generator)
+ def wrapper(dset, *args, **kwargs):
+ filter_tests = dset.processFilters()
+ no_filter = dset.noFiltering
+ if no_filter:
+ for read in generator(dset, *args, **kwargs):
+ yield read
+ else:
+ for read in generator(dset, *args, **kwargs):
+ if any(filt(read) for filt in filter_tests):
+ yield read
+ return wrapper
+
def _stackRecArrays(recArrays):
"""Stack recarrays into a single larger recarray"""
- tbr = np.concatenate(recArrays)
- tbr = tbr.view(np.recarray)
- return tbr
+
+ empties = []
+ nonempties = []
+ for array in recArrays:
+ if len(array) == 0:
+ empties.append(array)
+ else:
+ nonempties.append(array)
+ if nonempties:
+ tbr = np.concatenate(nonempties)
+ tbr = tbr.view(np.recarray)
+ return tbr
+ else:
+ # this will check dtypes...
+ tbr = np.concatenate(empties)
+ tbr = tbr.view(np.recarray)
+ return tbr
def _uniqueRecords(recArray):
"""Remove duplicate records"""
@@ -256,46 +257,12 @@ def keysToRanges(keys):
key_ranges = [[min(k), max(k)] for k in keys]
return key_ranges
-class DataSetMetaTypes(object):
- """
- This mirrors the PacBioSecondaryDataModel.xsd definitions and be used
- to reference a specific dataset type.
- """
- SUBREAD = _toDsId("SubreadSet")
- HDF_SUBREAD = _toDsId("HdfSubreadSet")
- ALIGNMENT = _toDsId("AlignmentSet")
- BARCODE = _toDsId("BarcodeSet")
- CCS_ALIGNMENT = _toDsId("ConsensusAlignmentSet")
- CCS = _toDsId("ConsensusReadSet")
- REFERENCE = _toDsId("ReferenceSet")
- CONTIG = _toDsId("ContigSet")
-
- ALL = (SUBREAD, HDF_SUBREAD, ALIGNMENT,
- BARCODE, CCS, CCS_ALIGNMENT, REFERENCE, CONTIG)
-
- @classmethod
- def isValid(cls, dsId):
- return dsId in cls.ALL
-
-
-def _fileType(fname):
- """Get the extension of fname (with h5 type)"""
- remainder, ftype = os.path.splitext(fname)
- if ftype == '.h5':
- _, prefix = os.path.splitext(remainder)
- ftype = prefix + ftype
- elif ftype == '.index':
- _, prefix = os.path.splitext(remainder)
- if prefix == '.contig':
- ftype = prefix + ftype
- ftype = ftype.strip('.')
- return ftype
def _fileExists(fname):
"""Assert that a file exists with a useful failure mode"""
if not isinstance(fname, str):
fname = fname.resourceId
- if not os.path.isfile(fname):
+ if not os.path.exists(fname):
raise InvalidDataSetIOError("Resource {f} not found".format(f=fname))
return True
@@ -318,21 +285,17 @@ def _pathChanger(osPathFunc, metaTypeFunc, resource):
metaTypeFunc(resource)
def _copier(dest, resource, subfolder=None):
- """Apply these two functions to the resource or ResourceId"""
if subfolder is None:
subfolder = [uuid.uuid4()]
resId = resource.resourceId
currentPath = urlparse(resId)
if currentPath.scheme == 'file' or not currentPath.scheme:
currentPath = currentPath.path
- try:
+ if resource.KEEP_WITH_PARENT:
+ currentPath = _fileCopy(dest, currentPath, uuid=subfolder[0])
+ else:
currentPath = _fileCopy(dest, currentPath, uuid=resource.uniqueId)
subfolder[0] = resource.uniqueId
- except AttributeError:
- if subfolder:
- currentPath = _fileCopy(dest, currentPath, uuid=subfolder[0])
- else:
- raise
resource.resourceId = currentPath
@@ -450,9 +413,9 @@ class DataSet(object):
dsType = self.objMetadata.setdefault("MetaType", self.datasetType)
else:
dsType = self.objMetadata.setdefault("MetaType",
- _toDsId('DataSet'))
+ toDsId('DataSet'))
if not "TimeStampedName" in self.objMetadata:
- self.objMetadata["TimeStampedName"] = _getTimeStampedName(
+ self.objMetadata["TimeStampedName"] = getTimeStampedName(
self.objMetadata["MetaType"])
self.objMetadata.setdefault("Name",
self.objMetadata["TimeStampedName"])
@@ -514,7 +477,7 @@ class DataSet(object):
if _induceIndices:
self.induceIndices()
- def induceIndices(self):
+ def induceIndices(self, force=False):
"""Generate indices for ExternalResources.
Not compatible with DataSet base type"""
@@ -709,7 +672,7 @@ class DataSet(object):
self.updateCounts()
return self.numRecords
- def newUuid(self, setter=True):
+ def newUuid(self, setter=True, random=False):
"""Generate and enforce the uniqueness of an ID for a new DataSet.
While user setable fields are stripped out of the Core DataSet object
used for comparison, the previous UniqueId is not. That means that
@@ -730,12 +693,15 @@ class DataSet(object):
>>> old != ds.uuid
True
"""
- coreXML = toXml(self, core=True)
- newId = str(hashlib.md5(coreXML).hexdigest())
+ if random:
+ newId = str(uuid.uuid4())
+ else:
+ coreXML = toXml(self, core=True)
+ newId = str(hashlib.md5(coreXML).hexdigest())
- # Group appropriately
- newId = '-'.join([newId[:8], newId[8:12], newId[12:16], newId[16:20],
- newId[20:]])
+ # Group appropriately
+ newId = '-'.join([newId[:8], newId[8:12], newId[12:16],
+ newId[16:20], newId[20:]])
if setter:
self.objMetadata['UniqueId'] = newId
@@ -863,14 +829,14 @@ class DataSet(object):
Args:
:chunks: the number of chunks to split the DataSet.
- :ignoreSubDatasets: (True) do not split on subdatasets
- :contigs: split on contigs instead of external resources etc
+ :ignoreSubDatasets: (True) do not split by subdatasets
+ :contigs: split on contigs instead of external resources
+ :zmws: Split by zmws instead of external resources
+ :barcodes: Split by barcodes instead of external resources
:maxChunks: The upper limit on the number of chunks.
- :breakContigs: Whether or not to break contigs when using maxChunks
- :targetSize: The target number of reads per chunk
- :zmws: Split by zmws
- :barcodes: Split by barcodes
+ :breakContigs: Whether or not to break contigs
:byRecords: Split contigs by mapped records, rather than ref length
+ :targetSize: The target minimum number of reads per chunk
:updateCounts: Update the count metadata in each chunk
Returns:
@@ -975,6 +941,8 @@ class DataSet(object):
# whole, therefore we need to regenerate it again here
log.debug("Generating new UUID")
for result in results:
+ if updateCounts:
+ result.updateCounts()
result.newUuid()
# Update the basic metadata for the new DataSets from external
@@ -1084,11 +1052,11 @@ class DataSet(object):
validation_errors.append(e)
fileName = urlparse(outFile).path.strip()
if self._strict and not isinstance(self.datasetType, tuple):
- if not fileName.endswith(_dsIdToSuffix(self.datasetType)):
+ if not fileName.endswith(dsIdToSuffix(self.datasetType)):
raise IOError(errno.EIO,
"Given filename does not meet standards, "
"should end with {s}".format(
- s=_dsIdToSuffix(self.datasetType)),
+ s=dsIdToSuffix(self.datasetType)),
fileName)
with open(fileName, 'w') as outFile:
outFile.writelines(xml_string)
@@ -1127,11 +1095,6 @@ class DataSet(object):
else:
statsMetadata = filename
if self.metadata.summaryStats:
- newSub = self.copy()
- newSub.metadata.removeChildren('SummaryStats')
- newSub.loadStats(statsMetadata)
- self.addDatasets(self.copy())
- self.addDatasets(newSub)
self.metadata.summaryStats.merge(statsMetadata)
else:
self.metadata.append(statsMetadata)
@@ -1142,15 +1105,22 @@ class DataSet(object):
return sorted(reads, key=lambda a: a.readStart)
def loadMetadata(self, filename):
- """Load pipeline metadata from a <moviename>.run.metadata.xml file.
+ """Load pipeline metadata from a <moviename>.metadata.xml file (or
+ other DataSet)
Args:
- :filename: the filename of a <moviename>.run.metadata.xml file
+ :filename: the filename of a <moviename>.metadata.xml file
"""
if isinstance(filename, basestring):
- metadata = parseMetadata(str(filename))
+ if isDataSet(filename):
+ # path to DataSet
+ metadata = openDataSet(filename).metadata
+ else:
+ # path to metadata.xml
+ metadata = parseMetadata(str(filename))
else:
+ # DataSetMetadata object
metadata = filename
self.addMetadata(metadata)
self.updateCounts()
@@ -1250,11 +1220,11 @@ class DataSet(object):
"""Infer and set the metatype of 'resource' if it doesn't already have
one."""
if not resource.metaType:
- file_type = _fileType(resource.resourceId)
+ file_type = fileType(resource.resourceId)
resource.metaType = self._metaTypeMapping().get(file_type, "")
if not resource.timeStampedName:
mtype = resource.metaType
- tsName = _getTimeStampedName(mtype)
+ tsName = getTimeStampedName(mtype)
resource.timeStampedName = tsName
@staticmethod
@@ -1633,7 +1603,7 @@ class DataSet(object):
if isinstance(self.datasetType, tuple):
return self.datasetType
else:
- return (_toDsId('DataSet'), self.datasetType)
+ return (toDsId('DataSet'), self.datasetType)
@classmethod
def castableTypes(cls):
@@ -1655,6 +1625,7 @@ class DataSet(object):
'ConsensusReadSet': ConsensusReadSet,
'ConsensusAlignmentSet': ConsensusAlignmentSet,
'ReferenceSet': ReferenceSet,
+ 'GmapReferenceSet': GmapReferenceSet,
'BarcodeSet': BarcodeSet}
@property
def metadata(self):
@@ -1739,6 +1710,11 @@ class DataSet(object):
return self.objMetadata.get('UniqueId')
@property
+ def uniqueId(self):
+ """The UniqueId of this DataSet"""
+ return self.objMetadata.get('UniqueId')
+
+ @property
def name(self):
"""The name of this DataSet"""
return self.objMetadata.get('Name', '')
@@ -1900,20 +1876,27 @@ class ReadSet(DataSet):
super(ReadSet, self).__init__(*files, **kwargs)
self._metadata = SubreadSetMetadata(self._metadata)
- def induceIndices(self):
+ def induceIndices(self, force=False):
for res in self.externalResources:
fname = res.resourceId
newInds = []
if not res.pbi:
- newInds.append(_pbindexBam(fname))
+ iname = fname + '.pbi'
+ if not os.path.isfile(iname) or force:
+ iname = _pbindexBam(fname)
+ newInds.append(iname)
self.close()
if not res.bai:
- newInds.append(_indexBam(fname))
+ iname = fname + '.bai'
+ if not os.path.isfile(iname) or force:
+ iname = _indexBam(fname)
+ newInds.append(iname)
self.close()
if newInds:
res.addIndices(newInds)
self._populateMetaTypes()
self.updateCounts()
+ return newInds
@property
def isMapped(self):
@@ -2252,8 +2235,6 @@ class ReadSet(DataSet):
_indexMap = []
for rrNum, rr in enumerate(self.resourceReaders()):
indices = rr.index
- if len(indices) == 0:
- continue
self._fixQIds(indices, rr)
@@ -2433,11 +2414,11 @@ class HdfSubreadSet(ReadSet):
# The metatype for this dataset type is inconsistent, plaster over it
# here:
self.objMetadata["MetaType"] = "PacBio.DataSet.HdfSubreadSet"
- self.objMetadata["TimeStampedName"] = _getTimeStampedName(
+ self.objMetadata["TimeStampedName"] = getTimeStampedName(
self.objMetadata["MetaType"])
- def induceIndices(self):
- log.debug("Bax files already indexed")
+ def induceIndices(self, force=False):
+ log.debug("Bax files don't have external indices")
@property
def isIndexed(self):
@@ -2605,11 +2586,11 @@ class AlignmentSet(ReadSet):
else:
return super(AlignmentSet, self).consolidate(*args, **kwargs)
- def induceIndices(self):
+ def induceIndices(self, force=False):
if self.isCmpH5:
log.debug("Cmp.h5 files already indexed")
else:
- return super(AlignmentSet, self).induceIndices()
+ return super(AlignmentSet, self).induceIndices(force=force)
@property
def isIndexed(self):
@@ -2662,8 +2643,6 @@ class AlignmentSet(ReadSet):
for rrNum, rr in enumerate(self.resourceReaders()):
indices = rr.index
# pbi files lack e.g. mapping cols when bam emtpy, ignore
- if len(indices) == 0:
- continue
# TODO(mdsmith)(2016-01-19) rename the fields instead of branching:
#if self.isCmpH5:
# _renameField(indices, 'MovieID', 'qId')
@@ -3833,10 +3812,13 @@ class ContigSet(DataSet):
return '_'.join(name.split('_')[:-2]) + suff
return name
- def induceIndices(self):
+ def induceIndices(self, force=False):
if not self.isIndexed:
for extRes in self.externalResources:
- extRes.addIndices([_indexFasta(extRes.resourceId)])
+ iname = extRes.resourceId + '.fai'
+ if not os.path.isfile(iname) or force:
+ iname = _indexFasta(extRes.resourceId)
+ extRes.addIndices([iname])
self.close()
self._populateMetaTypes()
self.updateCounts()
@@ -4154,6 +4136,35 @@ class ReferenceSet(ContigSet):
}
+class GmapReferenceSet(ReferenceSet):
+ """DataSet type specific to GMAP References"""
+
+ datasetType = DataSetMetaTypes.GMAPREFERENCE
+
+ def __init__(self, *files, **kwargs):
+ super(GmapReferenceSet, self).__init__(*files, **kwargs)
+
+ @property
+ def gmap(self):
+ return self.externalResources[0].gmap
+
+ @gmap.setter
+ def gmap(self, value):
+ self.externalResources[0].gmap = value
+
+ @staticmethod
+ def _metaTypeMapping():
+ return {'fasta':'PacBio.ContigFile.ContigFastaFile',
+ 'fa':'PacBio.ContigFile.ContigFastaFile',
+ 'fas':'PacBio.ContigFile.ContigFastaFile',
+ 'fai':'PacBio.Index.SamIndex',
+ 'contig.index':'PacBio.Index.FastaContigIndex',
+ 'index':'PacBio.Index.Indexer',
+ 'sa':'PacBio.Index.SaWriterIndex',
+ 'gmap': 'PacBio.GmapDB.GmapDBPath',
+ }
+
+
class BarcodeSet(ContigSet):
"""DataSet type specific to Barcodes"""
diff --git a/pbcore/io/dataset/DataSetMembers.py b/pbcore/io/dataset/DataSetMembers.py
index 99090ae..6ef9449 100755
--- a/pbcore/io/dataset/DataSetMembers.py
+++ b/pbcore/io/dataset/DataSetMembers.py
@@ -1,3 +1,43 @@
+###############################################################################
+# Copyright (c) 2011-2016, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED
+# BY
+# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
+# BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES
+# OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR
+# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+# WHETHER
+# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+# OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE
+# POSSIBILITY OF SUCH DAMAGE.
+###############################################################################
+
+# Author: Martin D. Smith
+
+
"""DataSetMetadata (also the tag of the Element in the DataSet XML
representation) is somewhat challening to store, access, and (de)serialize
efficiently. Here, we maintain a bulk representation of all of the dataset
@@ -49,29 +89,29 @@ serve two pruposes:
['WellSample']['BioSamplePointers']\
['BioSamplePointer'].record['text'])
+Notes:
+ - If you want temporary children to be retained for a classes's children,
+ pass parent=self to the child's constructor.
+ - it helps to add a TAG member...
+
"""
-#import hashlib
import ast
import uuid
-import datetime
import copy
+import logging
import operator as OP
import numpy as np
-import logging
from functools import partial as P
-from collections import Counter
-from pbcore.io.dataset.DataSetWriter import namespaces
+from collections import Counter, defaultdict
+from pbcore.io.dataset.utils import getTimeStampedName
+from pbcore.io.dataset.DataSetUtils import getDataSetUuid
log = logging.getLogger(__name__)
-DISTLIST = ["ProdDist", "ReadTypeDist", "ReadLenDist", "ReadQualDist",
- "MedianInsertDist", "InsertReadQualDist",
- "InsertReadLenDist", "ControlReadQualDist",
- "ControlReadLenDist"]
-
def newUuid(record):
# At some point the uuid may need to be a digest
+ #import hashlib
#newId = str(hashlib.md5(str(record)).hexdigest())
# Group appropriately
@@ -82,12 +122,6 @@ def newUuid(record):
# Today is not that day
return str(uuid.uuid4())
-def getTime():
- return datetime.datetime.utcnow().strftime("%y%m%d_%H%M%S")
-
-def getTimeStampedName(mType):
- return "{m}-{t}".format(m=mType, t=getTime())
-
OPMAP = {'==': OP.eq,
'=': OP.eq,
'eq': OP.eq,
@@ -105,6 +139,7 @@ OPMAP = {'==': OP.eq,
'<': OP.lt,
'<': OP.lt,
'lt': OP.lt,
+ 'in': np.in1d,
'&': lambda x, y: OP.and_(x, y).view(np.bool_),
'~': lambda x, y: np.logical_not(OP.and_(x, y).view(np.bool_)),
}
@@ -112,6 +147,21 @@ OPMAP = {'==': OP.eq,
def mapOp(op):
return OPMAP[op]
+def setify(value):
+ # This is a string of a python or numpy list, needs to be made into a
+ # python set:
+ value = value.strip('set')
+ value = value.strip('()')
+ value = value.strip('[]')
+ if ',' in value:
+ value = value.split(',')
+ else:
+ value = value.split()
+ value = [v.strip() for v in value]
+ value = [v.strip("'") for v in value]
+ return set(value)
+
+
class PbiFlags(object):
NO_LOCAL_CONTEXT = 0
ADAPTER_BEFORE = 1
@@ -128,6 +178,52 @@ class PbiFlags(object):
return reduce(OP.or_,
(getattr(cls, fl.strip()) for fl in flag.split('|')))
+def subgetter(key, container='text', default=None, asType=(lambda x: x)):
+ def get(self):
+ return self.getMemberV(key, container=container, default=default,
+ asType=asType)
+ return property(get)
+
+def subsetter(key, container='text'):
+ def set(self, value):
+ self._runCallbacks()
+ self.setMemberV(key, str(value), container=container)
+ return set
+
+def subaccs(key, container='text', default=None, asType=(lambda x: x)):
+ get = subgetter(key, container=container, default=default, asType=asType)
+ get = get.setter(subsetter(key, container=container))
+ return get
+
+def getter(key, container='attrib', asType=(lambda x: x)):
+ def get(self):
+ return asType(self.getV(container, key))
+ return property(get)
+
+def setter(key, container='attrib'):
+ def set(self, value):
+ self._runCallbacks()
+ self.setV(str(value), container, key)
+ return set
+
+def accs(key, container='attrib', asType=(lambda x: x)):
+ get = getter(key, container, asType)
+ get = get.setter(setter(key, container))
+ return get
+
+def runonce(func):
+ def runner():
+ if not runner.hasrun:
+ try:
+ return func()
+ finally:
+ runner.hasrun = True
+ runner.hasrun = False
+ return runner
+
+def updateTag(ele, tag):
+ if ele.metaname == '':
+ ele.metaname = tag
class RecordWrapper(object):
"""The base functionality of a metadata element.
@@ -136,16 +232,19 @@ class RecordWrapper(object):
RecordWrapper (e.g. append, extend). Methods in child classes often provide
similar functionality for more raw inputs (e.g. resourceIds as strings)"""
+ # only things that should be kept with their parents (indices) should be
+ # True
+ KEEP_WITH_PARENT = False
- def __init__(self, record=None):
+ def __init__(self, record=None, parent=None):
"""Here, record is any element in the Metadata Element tree and a
- dictionary with four members: 'tag', 'attrib', 'text', and 'children'.
-
- Now with a new member! 'namespace'
+ dictionary with five members: 'tag', 'attrib', 'text', 'children', and
+ 'namespace'
Do not deepcopy, we rely on side effects for all persistent
modifications.
"""
+ self._callbacks = []
if record:
try:
self.record = record.record
@@ -153,8 +252,30 @@ class RecordWrapper(object):
self.record = record
else:
self.record = _emptyMember()
+ # register a callback to set the XML element 'tag' to the
+ # class's TAG member if it has one, with the class name as a
+ # fallback
+ self.registerCallback(runonce(
+ P(updateTag, self,
+ getattr(self, 'TAG', self.__class__.__name__))))
+ if not parent is None:
+ # register a callback to append this object to the parent, so
+ # that it will be added to the XML file
+ self.registerCallback(runonce(P(parent.append, self.record)))
assert 'tag' in self.record.keys()
+
+ def registerCallback(self, func):
+ if func not in self._callbacks:
+ self._callbacks.append(func)
+
+ def clearCallbacks(self):
+ self._callbacks = []
+
+ def _runCallbacks(self):
+ for func in self._callbacks:
+ func()
+
def __len__(self):
"""Return the number of children in this node"""
return len(self.record['children'])
@@ -277,9 +398,11 @@ class RecordWrapper(object):
def append(self, newMember):
"""Append to the actual list of child elements"""
if isinstance(newMember, RecordWrapper):
+ newMember._runCallbacks()
self.record['children'].append(newMember.record)
else:
self.record['children'].append(newMember)
+ self._runCallbacks()
def index(self, tag):
"""Return the index in 'children' list of item with 'tag' member"""
@@ -369,33 +492,12 @@ class RecordWrapper(object):
# Some common attributes (to reduce code duplication):
- @property
- def name(self):
- return self.getV('attrib', 'Name')
-
- @name.setter
- def name(self, value):
- self.setV(value, 'attrib', 'Name')
-
- @property
- def value(self):
- return self.getV('attrib', 'Value')
-
- @value.setter
- def value(self, value):
- self.setV(value, 'attrib', 'Value')
-
- @property
- def version(self):
- return self.getV('attrib', 'Version')
-
- @property
- def description(self):
- return self.getV('attrib', 'Description')
-
- @description.setter
- def description(self, value):
- return self.setV(value, 'attrib', 'Description')
+ name = accs('Name')
+ value = accs('Value')
+ version = accs('Version')
+ description = accs('Description')
+ uniqueId = accs('UniqueId')
+ createdAt = accs('CreatedAt')
def filter_read(accessor, operator, value, read):
return operator(accessor(read), value)
@@ -406,18 +508,6 @@ class Filters(RecordWrapper):
def __init__(self, record=None):
super(self.__class__, self).__init__(record)
self.record['tag'] = self.__class__.__name__
- self._callbacks = []
-
- def registerCallback(self, func):
- if func not in self._callbacks:
- self._callbacks.append(func)
-
- def clearCallbacks(self):
- self._callbacks = []
-
- def _runCallbacks(self):
- for func in self._callbacks:
- func()
def __getitem__(self, index):
return Filter(self.record['children'][index])
@@ -477,8 +567,8 @@ class Filters(RecordWrapper):
for i, filt in enumerate(self):
for req in filt:
if req.name == param:
- if not mapOp(oper)(testType(req.value),
- testType(value)):
+ if not mapOp(oper)(testType(value),
+ testType(req.value)):
options[i] = False
return any(options)
@@ -489,8 +579,8 @@ class Filters(RecordWrapper):
for req in filt:
if req.name == param:
tested = True
- passes |= mapOp(oper)(testType(req.value),
- values)
+ passes |= mapOp(oper)(values,
+ testType(req.value))
if not tested:
return np.ones(len(values), dtype=np.bool_)
return passes
@@ -499,7 +589,7 @@ class Filters(RecordWrapper):
def _bamAccMap(self):
return {'rname': (lambda x: x.referenceName),
'length': (lambda x: int(x.readLength)),
- 'qname': (lambda x: x.qNameA),
+ 'qname': (lambda x: x.qName),
'movie': (lambda x: x.movieName),
'zm': (lambda x: int(x.HoleNumber)),
# not implemented yet:
@@ -538,7 +628,7 @@ class Filters(RecordWrapper):
'tend': (lambda x: x.tEnd),
'accuracy': (
lambda x: (np.ones(len(x.nMM), dtype='f4') -
- (x.nMM + x.nIns + x.nDel).astype(np.float32)/
+ (x.nMM + x.nIns + x.nDel).astype(np.float)/
(x.aEnd - x.aStart + x.tEnd - x.tStart -
x.nM - x.nMM)))
}
@@ -580,11 +670,11 @@ class Filters(RecordWrapper):
'bcq': int,
'bq': int,
'qs': int,
- 'rq': np.float32,
+ 'rq': np.float,
'pos': int,
'tstart': int,
'tend': int,
- 'accuracy': np.float32,
+ 'accuracy': np.float,
'readstart': int,
'cx': PbiFlags.flagMap,
'n_subreads': int,
@@ -618,8 +708,7 @@ class Filters(RecordWrapper):
value = typeMap[param](req.value)
operator = mapOp(req.operator)
reqTests.append(P(filter_read, accMap[param], operator, value))
- tests.append(
- lambda x, reqTests=reqTests: all([f(x) for f in reqTests]))
+ tests.append(lambda x, rt=reqTests: all([f(x) for f in rt]))
return tests
def filterIndexRecords(self, indexRecords, nameMap, movieMap,
@@ -637,6 +726,7 @@ class Filters(RecordWrapper):
accMap['movie'] = (lambda x: x.MovieID)
accMap['qname'] = (lambda x: x.MovieID)
accMap['zm'] = (lambda x: x.HoleNumber)
+ accMap['length'] = (lambda x: x.rEnd - x.rStart)
elif readType == 'fasta':
accMap = {'id': (lambda x: x.id),
'length': (lambda x: x.length.astype(int)),
@@ -651,7 +741,10 @@ class Filters(RecordWrapper):
for req in filt:
param = req.name
if param in accMap.keys():
- value = typeMap[param](req.value)
+ if req.operator == 'in':
+ value = map(typeMap[param], setify(req.value))
+ else:
+ value = typeMap[param](req.value)
if param == 'rname':
value = nameMap[value]
if param == 'movie':
@@ -670,27 +763,45 @@ class Filters(RecordWrapper):
reqResultsForRecords &= operator(
accMap[param](indexRecords), value)
elif param == 'qname':
- movie, hole, span = value.split('/')
- operator = mapOp(req.operator)
-
- value = movieMap[movie]
- reqResultsForRecords = operator(
- accMap[param](indexRecords), value)
-
- param = 'zm'
- value = typeMap[param](hole)
- reqResultsForRecords &= operator(
- accMap[param](indexRecords), value)
-
- param = 'qstart'
- value = typeMap[param](span.split('_')[0])
- reqResultsForRecords &= operator(
- accMap[param](indexRecords), value)
-
- param = 'qend'
- value = typeMap[param](span.split('_')[1])
- reqResultsForRecords &= operator(
- accMap[param](indexRecords), value)
+ # TODO: optimize "in" version...
+ if not isinstance(value, list):
+ values = [value]
+ else:
+ values = value
+ reqResultsForRecords = np.zeros(len(indexRecords), dtype=np.bool_)
+ #tempRes = np.ones(len(indexRecords), dtype=np.bool_)
+ for value in values:
+ movie, hole, span = value.split('/')
+ operator = mapOp(req.operator)
+
+ param = 'movie'
+ value = movieMap[movie]
+ #reqResultsForRecords = operator(
+ # accMap[param](indexRecords), value)
+ tempRes = operator(
+ accMap[param](indexRecords), value)
+
+ param = 'zm'
+ value = typeMap[param](hole)
+ #reqResultsForRecords &= operator(
+ # accMap[param](indexRecords), value)
+ tempRes &= operator(
+ accMap[param](indexRecords), value)
+
+ param = 'qstart'
+ value = typeMap[param](span.split('_')[0])
+ #reqResultsForRecords &= operator(
+ # accMap[param](indexRecords), value)
+ tempRes &= operator(
+ accMap[param](indexRecords), value)
+
+ param = 'qend'
+ value = typeMap[param](span.split('_')[1])
+ #reqResultsForRecords &= operator(
+ # accMap[param](indexRecords), value)
+ tempRes &= operator(
+ accMap[param](indexRecords), value)
+ reqResultsForRecords |= tempRes
else:
operator = mapOp(req.operator)
@@ -731,6 +842,8 @@ class Filters(RecordWrapper):
for name, options in kwargs.items():
for i, (oper, val) in enumerate(options):
for filt in newFilts[i]:
+ if isinstance(val, np.ndarray):
+ val = list(val)
filt.addRequirement(name, oper, val)
for filtList in newFilts:
self.extend(filtList)
@@ -738,6 +851,8 @@ class Filters(RecordWrapper):
newFilts = [Filter() for _ in kwargs.values()[0]]
for name, options in kwargs.items():
for i, (oper, val) in enumerate(options):
+ if isinstance(val, np.ndarray):
+ val = list(val)
newFilts[i].addRequirement(name, oper, val)
self.extend(newFilts)
#log.debug("Current filters: {s}".format(s=str(self)))
@@ -965,6 +1080,8 @@ class ExternalResources(RecordWrapper):
Args:
resourceIds: a list of uris as strings
"""
+ if not isinstance(resourceIds, list):
+ resourceIds = [resourceIds]
templist = []
self._resourceIds = []
for res in resourceIds:
@@ -1015,10 +1132,6 @@ class ExternalResource(RecordWrapper):
return True
return False
- @property
- def uniqueId(self):
- return self.getV('attrib', 'UniqueId')
-
def merge(self, other):
if self.metaType:
if self.metaType != other.metaType:
@@ -1027,21 +1140,9 @@ class ExternalResource(RecordWrapper):
if self.tags:
self.tags = ', '.join([self.tags, other.tags])
- @property
- def metaType(self):
- return self.getV('attrib', 'MetaType')
-
- @metaType.setter
- def metaType(self, value):
- return self.setV(value, 'attrib', 'MetaType')
-
- @property
- def timeStampedName(self):
- return self.getV('attrib', 'TimeStampedName')
-
- @timeStampedName.setter
- def timeStampedName(self, value):
- return self.setV(value, 'attrib', 'TimeStampedName')
+ metaType = accs('MetaType')
+ timeStampedName = accs('TimeStampedName')
+ tags = accs('Tags')
@property
def resourceId(self):
@@ -1050,14 +1151,9 @@ class ExternalResource(RecordWrapper):
@resourceId.setter
def resourceId(self, value):
self.setV(value, 'attrib', 'ResourceId')
-
- @property
- def tags(self):
- return self.getV('attrib', 'Tags')
-
- @tags.setter
- def tags(self, value):
- self.setV(value, 'attrib', 'Tags')
+ dsuuid = getDataSetUuid(value)
+ if dsuuid:
+ self.uniqueId = dsuuid
@property
def bam(self):
@@ -1070,6 +1166,10 @@ class ExternalResource(RecordWrapper):
if index.metaType == 'PacBio.Index.PacBioIndex':
return index.resourceId
+ @pbi.setter
+ def pbi(self, value):
+ self._setIndResByMetaType('PacBio.Index.PacBioIndex', value)
+
@property
def bai(self):
indices = self.indices
@@ -1078,6 +1178,16 @@ class ExternalResource(RecordWrapper):
return index.resourceId
@property
+ def gmap(self):
+ """Unusual: returns the gmap external resource instead of the resId"""
+ return self._getSubExtResByMetaType('PacBio.GmapDB.GmapDBPath')
+
+ @gmap.setter
+ def gmap(self, value):
+ """Sets the resourceId"""
+ self._setSubResByMetaType('PacBio.GmapDB.GmapDBPath', value)
+
+ @property
def sts(self):
return self._getSubResByMetaType('PacBio.SubreadFile.ChipStatsFile')
@@ -1119,11 +1229,70 @@ class ExternalResource(RecordWrapper):
self._setSubResByMetaType('PacBio.ReferenceFile.ReferenceFastaFile',
value)
- def _getSubResByMetaType(self, mType):
+ def _deleteIndByMetaType(self, mType):
+ rm = []
+ for i, res in enumerate(self.indices):
+ if res.metaType == mType:
+ rm.append(i)
+ for i in sorted(rm, reverse=True):
+ self.indices.pop(i)
+
+ def _getIndByMetaType(self, mType):
+ resources = self.indices
+ for res in resources:
+ if res.metaType == mType:
+ return res
+
+ def _getIndResByMetaType(self, mType):
+ res = self._getIndByMetaType(mType)
+ if not res is None:
+ return res.resourceId
+
+ def _setIndResByMetaType(self, mType, value):
+ if not isinstance(value, FileIndex):
+ tmp = FileIndex()
+ tmp.resourceId = value
+ else:
+ tmp = value
+ extant = self._getIndByMetaType(mType)
+ if extant:
+ if value is None:
+ self._deleteIndByMetaType(mType)
+ else:
+ extant.resourceId = value
+ else:
+ tmp.metaType = mType
+ tmp.timeStampedName = getTimeStampedName(mType)
+ resources = self.indices
+ # externalresources objects have a tag by default, which means their
+ # truthiness is true. Perhaps a truthiness change is in order
+ # TODO: (mdsmith 20160728) this can be updated now that the
+ # retention and tag system has been refactored
+ if len(resources) == 0:
+ resources = FileIndices()
+ resources.append(tmp)
+ self.append(resources)
+ else:
+ resources.append(tmp)
+
+ def _deleteExtResByMetaType(self, mType):
+ rm = []
+ for i, res in enumerate(self.externalResources):
+ if res.metaType == mType:
+ rm.append(i)
+ for i in sorted(rm, reverse=True):
+ self.externalResources.pop(i)
+
+ def _getSubExtResByMetaType(self, mType):
resources = self.externalResources
for res in resources:
if res.metaType == mType:
- return res.resourceId
+ return res
+
+ def _getSubResByMetaType(self, mType):
+ res = self._getSubExtResByMetaType(mType)
+ if not res is None:
+ return res.resourceId
def _setSubResByMetaType(self, mType, value):
if not isinstance(value, ExternalResource):
@@ -1131,17 +1300,26 @@ class ExternalResource(RecordWrapper):
tmp.resourceId = value
else:
tmp = value
- tmp.metaType = mType
- tmp.timeStampedName = getTimeStampedName(mType)
- resources = self.externalResources
- # externalresources objects have a tag by default, which means their
- # truthiness is true. Perhaps a truthiness change is in order
- if len(resources) == 0:
- resources = ExternalResources()
- resources.append(tmp)
- self.append(resources)
+ extant = self._getSubExtResByMetaType(mType)
+ if extant:
+ if value is None:
+ self._deleteExtResByMetaType(mType)
+ else:
+ extant.resourceId = value
else:
- resources.append(tmp)
+ tmp.metaType = mType
+ tmp.timeStampedName = getTimeStampedName(mType)
+ resources = self.externalResources
+ # externalresources objects have a tag by default, which means their
+ # truthiness is true. Perhaps a truthiness change is in order
+ # TODO: (mdsmith 20160728) this can be updated now that the
+ # retention and tag system has been refactored
+ if len(resources) == 0:
+ resources = ExternalResources()
+ resources.append(tmp)
+ self.append(resources)
+ else:
+ resources.append(tmp)
@property
def externalResources(self):
@@ -1198,6 +1376,7 @@ class FileIndices(RecordWrapper):
class FileIndex(RecordWrapper):
+ KEEP_WITH_PARENT = True
def __init__(self, record=None):
super(self.__class__, self).__init__(record)
@@ -1205,40 +1384,21 @@ class FileIndex(RecordWrapper):
self.attrib.setdefault('UniqueId', newUuid(self.record))
self.attrib.setdefault('TimeStampedName', '')
- @property
- def resourceId(self):
- return self.getV('attrib', 'ResourceId')
-
- @resourceId.setter
- def resourceId(self, value):
- self.setV(value, 'attrib', 'ResourceId')
-
- @property
- def metaType(self):
- return self.getV('attrib', 'MetaType')
-
- @metaType.setter
- def metaType(self, value):
- return self.setV(value, 'attrib', 'MetaType')
-
- @property
- def timeStampedName(self):
- return self.getV('attrib', 'TimeStampedName')
-
- @timeStampedName.setter
- def timeStampedName(self, value):
- return self.setV(value, 'attrib', 'TimeStampedName')
+ resourceId = accs('ResourceId')
+ metaType = accs('MetaType')
+ timeStampedName = accs('TimeStampedName')
class DataSetMetadata(RecordWrapper):
"""The root of the DataSetMetadata element tree, used as base for subtype
specific DataSet or for generic "DataSet" records."""
+ TAG = 'DataSetMetadata'
def __init__(self, record=None):
"""Here, record is the root element of the Metadata Element tree"""
super(DataSetMetadata, self).__init__(record)
- self.record['tag'] = 'DataSetMetadata'
+ self.record['tag'] = self.TAG
def merge(self, other):
self.numRecords += other.numRecords
@@ -1301,6 +1461,8 @@ class SubreadSetMetadata(DataSetMetadata):
"""The DataSetMetadata subtype specific to SubreadSets. Deals explicitly
with the merging of Collections and BioSamples metadata hierarchies."""
+ TAG = 'DataSetMetadata'
+
def __init__(self, record=None):
# This doesn't really need to happen unless there are contextual
# differences in the meanings of subtypes (e.g. BioSamples mean
@@ -1329,7 +1491,8 @@ class SubreadSetMetadata(DataSetMetadata):
"""Return a list of wrappers around Collections elements of the
Metadata Record"""
return CollectionsMetadata(self.getV(tag='Collections',
- container='children'))
+ container='children'),
+ parent=self)
@collections.setter
def collections(self, value):
@@ -1348,6 +1511,7 @@ class SubreadSetMetadata(DataSetMetadata):
class ContigSetMetadata(DataSetMetadata):
"""The DataSetMetadata subtype specific to ContigSets."""
+ TAG = 'DataSetMetadata'
def __init__(self, record=None):
if record:
@@ -1365,21 +1529,8 @@ class ContigSetMetadata(DataSetMetadata):
else:
self.contigs = other.contigs
- @property
- def organism(self):
- return self.getMemberV('Organism')
-
- @organism.setter
- def organism(self, value):
- self.setMemberV('Organism', value)
-
- @property
- def ploidy(self):
- return self.getMemberV('Ploidy')
-
- @ploidy.setter
- def ploidy(self, value):
- self.setMemberV('Ploidy', value)
+ organism = subaccs('Organism')
+ ploidy = subaccs('Ploidy')
@property
def contigs(self):
@@ -1410,6 +1561,7 @@ class ContigSetMetadata(DataSetMetadata):
class BarcodeSetMetadata(DataSetMetadata):
"""The DataSetMetadata subtype specific to BarcodeSets."""
+ TAG = 'DataSetMetadata'
def __init__(self, record=None):
if record:
@@ -1420,20 +1572,16 @@ class BarcodeSetMetadata(DataSetMetadata):
"{t}".format(t=type(record).__name__))
super(BarcodeSetMetadata, self).__init__(record)
- @property
- def barcodeConstruction(self):
- return self.getMemberV('BarcodeConstruction')
-
- @barcodeConstruction.setter
- def barcodeConstruction(self, value):
- self.setMemberV('BarcodeConstruction', value)
+ barcodeConstruction = subaccs('BarcodeConstruction')
class ContigsMetadata(RecordWrapper):
+ TAG = 'Contigs'
+
def __init__(self, record=None):
super(self.__class__, self).__init__(record)
- self.record['tag'] = 'Contigs'
+ self.record['tag'] = self.TAG
def __getitem__(self, index):
return ContigMetadata(self.record['children'][index])
@@ -1448,9 +1596,11 @@ class ContigsMetadata(RecordWrapper):
class ContigMetadata(RecordWrapper):
+ TAG = 'Contig'
+
def __init__(self, record=None):
super(self.__class__, self).__init__(record)
- self.record['tag'] = 'Contig'
+ self.record['tag'] = self.TAG
@property
def digest(self):
@@ -1473,6 +1623,7 @@ class CollectionsMetadata(RecordWrapper):
"""The Element should just have children: a list of
CollectionMetadataTags"""
+ TAG = 'Collections'
def __getitem__(self, index):
return CollectionMetadata(self.record['children'][index])
@@ -1485,65 +1636,14 @@ class CollectionsMetadata(RecordWrapper):
self.extend([child for child in other])
-class CollectionMetadata(RecordWrapper):
- """The metadata for a single collection. It contains Context,
- InstrumentName etc. as attribs, InstCtrlVer etc. for children"""
-
-
- @property
- def context(self):
- return self.getV('attrib', 'Context')
-
- @property
- def instrumentName(self):
- return self.getV('attrib', 'InstrumentName')
-
- @property
- def instrumentId(self):
- return self.getV('attrib', 'InstrumentId')
-
- @property
- def instCtrlVer(self):
- return self.getMemberV('InstCtrlVer')
-
- @property
- def sigProcVer(self):
- return self.getMemberV('SigProcVer')
-
- @property
- def automation(self):
- return Automation(self.getMemberV('Automation'))
-
- @property
- def collectionNumber(self):
- return self.getMemberV('collectionNumber')
-
- @property
- def cellIndex(self):
- return self.getMemberV('cellIndex')
-
- @property
- def cellPac(self):
- return self.getMemberV('cellPac', 'attrib')
-
- @property
- def runDetails(self):
- return RunDetailsMetadata(self.getV('children', 'RunDetails'))
-
- @property
- def wellSample(self):
- return WellSampleMetadata(self.getV('children', 'WellSample'))
+class AutomationParameter(RecordWrapper):
- @property
- def primary(self):
- return PrimaryMetadata(self.getV('children', 'Primary'))
+ def __init__(self, record=None):
+ super(self.__class__, self).__init__(record)
+ self.record['tag'] = self.__class__.__name__
-class Automation(RecordWrapper):
+ value = accs('SimpleValue')
- @property
- def automationParameters(self):
- return AutomationParameters(self.getV('children',
- 'AutomationParameters'))
class AutomationParameters(RecordWrapper):
@@ -1551,10 +1651,8 @@ class AutomationParameters(RecordWrapper):
super(self.__class__, self).__init__(record)
self.record['tag'] = self.__class__.__name__
- @property
- def automationParameter(self):
- return AutomationParameter(self.getV('children',
- 'AutomationParameter'))
+ automationParameter = accs('AutomationParameter', container='children',
+ asType=AutomationParameter)
def addParameter(self, key, value):
temp = AutomationParameter()
@@ -1564,46 +1662,103 @@ class AutomationParameters(RecordWrapper):
temp.value = value
self.append(temp)
-class AutomationParameter(RecordWrapper):
+ def __getitem__(self, tag):
+ """Override to use tag as Name instead of strictly tag"""
+ if isinstance(tag, str):
+ for child in self:
+ child = AutomationParameter(child)
+ if child.name == tag:
+ return child
+ return RecordWrapper(self.getV('children', tag))
+ elif isinstance(tag, int):
+ return RecordWrapper(self.record['children'][tag])
- def __init__(self, record=None):
- super(self.__class__, self).__init__(record)
- self.record['tag'] = self.__class__.__name__
+ @property
+ def parameterNames(self):
+ return [c.name for c in self]
-class Provenance(RecordWrapper):
- """The metadata concerning this dataset's provenance"""
- @property
- def createdBy(self):
- return self.getV(container='attrib', tag='CreatedBy')
+class Automation(RecordWrapper):
+
+ automationParameters = accs('AutomationParameters', container='children',
+ asType=AutomationParameters)
- @property
- def parentTool(self):
- return ParentTool(self.getV('children', 'ParentTool'))
class ParentTool(RecordWrapper):
pass
+
+class Provenance(RecordWrapper):
+ """The metadata concerning this dataset's provenance"""
+
+ createdBy = accs('CreatedBy')
+ parentTool = accs('ParentTool', container='children', asType=ParentTool)
+
+
class StatsMetadata(RecordWrapper):
"""The metadata from the machine sts.xml"""
+ # merged dists:
+ MERGED_DISTS = ["ProdDist", "ReadTypeDist", "ReadLenDist", "ReadQualDist",
+ "MedianInsertDist", "InsertReadQualDist",
+ "InsertReadLenDist", "ControlReadQualDist",
+ "ControlReadLenDist"]
+
+ # continuous channel dists:
+ CHANNEL_DISTS = ['BaselineLevelDist', 'BaselineStdDist', 'SnrDist',
+ 'HqRegionSnrDist', 'HqBasPkMidDist',
+ 'BaselineLevelSequencingDist']
+
+ # continuous misc. dists:
+ OTHER_DISTS = ['PausinessDist', 'PulseRateDist', 'PulseWidthDist',
+ 'BaseRateDist', 'BaseWidthDist', 'BaseIpdDist',
+ 'LocalBaseRateDist', 'NumUnfilteredBasecallsDist']
+
+ UNMERGED_DISTS = CHANNEL_DISTS + OTHER_DISTS
+
+ @property
+ def channelDists(self):
+ tbr = {}
+ for dist in self.CHANNEL_DISTS:
+ chans = defaultdict(list)
+ for chan in self.findChildren(dist):
+ chans[chan.attrib['Channel']].append(ContinuousDistribution(chan))
+ tbr[dist] = chans
+ return tbr
+
+ @property
+ def otherDists(self):
+ tbr = defaultdict(list)
+ for disttype in self.OTHER_DISTS:
+ for dist in self.findChildren(disttype):
+ tbr[disttype].append(ContinuousDistribution(dist))
+ return tbr
+
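The two new properties expose the continuous distributions grouped by type, and additionally by channel for the channel distributions. A rough usage sketch, assuming a parsed sts.xml (the path below is hypothetical):

    from pbcore.io.dataset.DataSetReader import parseStats

    stats = parseStats('movie.sts.xml')                # hypothetical sts.xml path
    by_chan = stats.channelDists['BaselineLevelDist']
    # dict keyed by the Channel attribute; each value is a list of
    # ContinuousDistribution wrappers
    pulse_rates = stats.otherDists['PulseRateDist']
    # list of ContinuousDistribution wrappers, one per occurrence in the file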
def merge(self, other):
- if other.shortInsertFraction and other.prodDist:
+ if (other.shortInsertFraction and other.prodDist and
+ self.shortInsertFraction and self.prodDist):
self.shortInsertFraction = (self.shortInsertFraction *
self.prodDist.bins[1] +
other.shortInsertFraction *
other.prodDist.bins[1])/(
self.prodDist.bins[1]
+ other.prodDist.bins[1])
- if other.adapterDimerFraction and other.prodDist:
+ if (other.adapterDimerFraction and other.prodDist and
+ self.shortInsertFraction and self.prodDist):
self.adapterDimerFraction = (self.adapterDimerFraction *
self.prodDist.bins[1] +
other.adapterDimerFraction *
other.prodDist.bins[1])/(
self.prodDist.bins[1]
+ other.prodDist.bins[1])
+ if other.shortInsertFraction and not self.shortInsertFraction:
+ self.shortInsertFraction = other.shortInsertFraction
+ if other.adapterDimerFraction and not self.adapterDimerFraction:
+ self.adapterDimerFraction = other.adapterDimerFraction
+ if other.prodDist and not self.prodDist:
+ self.append(other.prodDist)
self.numSequencingZmws += other.numSequencingZmws
- for dist in DISTLIST:
+ for dist in self.MERGED_DISTS:
selfDist = getattr(self, dist[0].lower() + dist[1:])
otherDist = getattr(other, dist[0].lower() + dist[1:])
if not selfDist:
@@ -1617,6 +1772,11 @@ class StatsMetadata(RecordWrapper):
self.append(otherDist)
except BinMismatchError:
self.append(otherDist)
+ for dist in self.UNMERGED_DISTS:
+ otherDists = other.findChildren(dist)
+ for otherDist in otherDists:
+ if otherDist:
+ self.append(otherDist)
@property
def prodDist(self):
@@ -1918,66 +2078,55 @@ class DiscreteDistribution(RecordWrapper):
class RunDetailsMetadata(RecordWrapper):
+ TAG = 'RunDetails'
- @property
- def timeStampedName(self):
- return self.getMemberV('TimeStampedName')
+ timeStampedName = subgetter('TimeStampedName')
+ name = subaccs('Name')
- @property
- def name(self):
- return self.getMemberV('Name')
- @name.setter
- def name(self, value):
- return self.setMemberV('Name', value)
+class BioSamplePointersMetadata(RecordWrapper):
+ """The BioSamplePointer members don't seem complex enough to justify
+ class representation, instead rely on base class methods to provide
+ iterators and accessors"""
+
+ TAG = 'BioSamplePointers'
class WellSampleMetadata(RecordWrapper):
+ TAG = 'WellSample'
- @property
- def uniqueId(self):
- return self.getV('attrib', 'UniqueId')
+ wellName = subaccs('WellName')
+ concentration = subaccs('Concentration')
+ sampleReuseEnabled = subgetter('SampleReuseEnabled')
+ stageHotstartEnabled = subgetter('StageHotstartEnabled')
+ sizeSelectionEnabled = subgetter('SizeSelectionEnabled')
+ useCount = subaccs('UseCount')
+ comments = subaccs('Comments')
+ bioSamplePointers = accs('BioSamplePointers', container='children',
+ asType=BioSamplePointersMetadata)
- @property
- def wellName(self):
- return self.getMemberV('WellName')
- @property
- def concentration(self):
- return self.getMemberV('Concentration')
+class CopyFilesMetadata(RecordWrapper):
+ """The CopyFile members don't seem complex enough to justify
+ class representation, instead rely on base class methods"""
- @property
- def sampleReuseEnabled(self):
- return self.getMemberV('SampleReuseEnabled')
+ TAG = 'CopyFiles'
- @property
- def stageHotstartEnabled(self):
- return self.getMemberV('StageHotstartEnabled')
- @property
- def sizeSelectionEnabled(self):
- return self.getMemberV('SizeSelectionEnabled')
+class OutputOptions(RecordWrapper):
- @property
- def useCount(self):
- return self.getMemberV('UseCount')
+ resultsFolder = subaccs('ResultsFolder')
+ collectionPathUri = subaccs('CollectionPathUri')
+ copyFiles = accs('CopyFiles', container='children',
+ asType=CopyFilesMetadata)
- @property
- def comments(self):
- return self.getMemberV('Comments')
- @property
- def bioSamplePointers(self):
- return BioSamplePointersMetadata(
- self.getV('children', 'BioSamplePointers'))
+class SecondaryMetadata(RecordWrapper):
+ TAG = 'Secondary'
-class BioSamplePointersMetadata(RecordWrapper):
- """The BioSamplePointer members don't seem complex enough to justify
- class representation, instead rely on base class methods to provide
- iterators and accessors"""
- pass
+ cellCountInJob = subaccs('CellCountInJob')
class PrimaryMetadata(RecordWrapper):
@@ -2002,45 +2151,13 @@ class PrimaryMetadata(RecordWrapper):
'BetterAnalysis_Results'
"""
- @property
- def automationName(self):
- return self.getMemberV('AutomationName')
-
- @property
- def configFileName(self):
- return self.getMemberV('ConfigFileName')
-
- @property
- def sequencingCondition(self):
- return self.getMemberV('SequencingCondition')
-
- @property
- def outputOptions(self):
- return OutputOptions(self.getV('children', 'OutputOptions'))
-
-
-class OutputOptions(RecordWrapper):
- @property
- def resultsFolder(self):
- return self.getMemberV('ResultsFolder')
+ TAG = 'Primary'
- @resultsFolder.setter
- def resultsFolder(self, value):
- self.setMemberV('ResultsFolder', value)
-
- @property
- def collectionPathUri(self):
- return self.getMemberV('CollectionPathUri')
-
- @property
- def copyFiles(self):
- return CopyFilesMetadata(self.getV('children', 'CopyFiles'))
-
-
-class CopyFilesMetadata(RecordWrapper):
- """The CopyFile members don't seem complex enough to justify
- class representation, instead rely on base class methods"""
- pass
+ automationName = subaccs('AutomationName')
+ configFileName = subaccs('ConfigFileName')
+ sequencingCondition = subaccs('SequencingCondition')
+ outputOptions = accs('OutputOptions', container='children',
+ asType=OutputOptions)
class BioSamplesMetadata(RecordWrapper):
@@ -2062,6 +2179,7 @@ class BioSamplesMetadata(RecordWrapper):
'great biosample'
"""
+ TAG = 'BioSamples'
def __getitem__(self, index):
"""Get a biosample"""
@@ -2079,28 +2197,74 @@ class BioSamplesMetadata(RecordWrapper):
class BioSampleMetadata(RecordWrapper):
"""The metadata for a single BioSample"""
+ TAG = 'BioSample'
+
+class Kit(RecordWrapper):
+ partNumber = accs('PartNumber')
+ lotNumber = accs('LotNumber')
+ barcode = accs('Barcode')
+ expirationDate = accs('ExpirationDate')
+
+class CellPac(Kit):
+ """CellPac metadata"""
+
+class TemplatePrepKit(Kit):
+ """TemplatePrepKit metadata"""
+
+ rightAdaptorSequence = subaccs('RightAdaptorSequence')
+ leftAdaptorSequence = subaccs('LeftAdaptorSequence')
+
+class BindingKit(Kit):
+ pass
+
+class SequencingKitPlate(Kit):
+ pass
+
+class CollectionMetadata(RecordWrapper):
+ """The metadata for a single collection. It contains Context,
+ InstrumentName etc. as attribs, InstCtrlVer etc. for children"""
+
+ TAG = 'CollectionMetadata'
+
+ context = accs('Context')
+ instrumentName = accs('InstrumentName')
+ instrumentId = accs('InstrumentId')
+ instCtrlVer = subaccs('InstCtrlVer')
+ sigProcVer = subaccs('SigProcVer')
+ collectionNumber = subaccs('CollectionNumber')
+ cellIndex = subaccs('CellIndex')
+ cellPac = accs('CellPac', 'children', CellPac)
+ templatePrepKit = accs('TemplatePrepKit', 'children', TemplatePrepKit)
+ bindingKit = accs('BindingKit', 'children', BindingKit)
+ sequencingKitPlate = accs('SequencingKitPlate', 'children',
+ SequencingKitPlate)
+ automation = accs('Automation', 'children', Automation)
+ primary = accs('Primary', 'children', PrimaryMetadata)
+ secondary = accs('Secondary', 'children', SecondaryMetadata)
@property
- def uniqueId(self):
- return self.getV('attrib', 'UniqueId')
+ def runDetails(self):
+ return RunDetailsMetadata(self.getV('children', 'RunDetails'),
+ parent=self)
@property
- def createdAt(self):
- return self.getV('attrib', 'CreatedAt')
+ def wellSample(self):
+ return WellSampleMetadata(self.getV('children', 'WellSample'),
+ parent=self)
def _emptyMember(tag=None, text=None, attrib=None, children=None,
namespace=None):
"""Return an empty stock Element representation"""
- if not tag:
+ if tag is None:
tag = ''
- if not namespace:
+ if namespace is None:
namespace = ''
- if not text:
+ if text is None:
text = ''
- if not attrib:
+ if attrib is None:
attrib = {}
- if not children:
+ if children is None:
children = []
return {'tag': tag, 'text': text, 'attrib': attrib, 'children': children,
'namespace': namespace}
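The switch from truthiness tests to explicit `is None` checks means a caller-supplied falsy value, such as an empty dict the caller intends to keep a reference to, is no longer replaced by a fresh default. A small illustration against the private helper shown above:

    from pbcore.io.dataset.DataSetMembers import _emptyMember

    attrib = {}
    member = _emptyMember(attrib=attrib)
    member['attrib']['Name'] = 'x'
    # Under the old `if not attrib:` check a fresh dict was substituted and the
    # caller's dict stayed empty; with `is None` the same object is reused:
    assert attrib == {'Name': 'x'}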
diff --git a/pbcore/io/dataset/DataSetMetaTypes.py b/pbcore/io/dataset/DataSetMetaTypes.py
new file mode 100644
index 0000000..e04305e
--- /dev/null
+++ b/pbcore/io/dataset/DataSetMetaTypes.py
@@ -0,0 +1,86 @@
+###############################################################################
+# Copyright (c) 2011-2016, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
+# NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+# OR
+# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+###############################################################################
+
+# Author: Martin D. Smith
+
+
+import logging
+from pbcore.io.dataset.DataSetErrors import (InvalidDataSetIOError,)
+
+log = logging.getLogger(__name__)
+
+def toDsId(name):
+ """Translate a class name into a MetaType/ID"""
+ return "PacBio.DataSet.{x}".format(x=name)
+
+class DataSetMetaTypes(object):
+ """
+    This mirrors the PacBioSecondaryDataModel.xsd definitions and can be used
+ to reference a specific dataset type.
+ """
+ SUBREAD = toDsId("SubreadSet")
+ HDF_SUBREAD = toDsId("HdfSubreadSet")
+ ALIGNMENT = toDsId("AlignmentSet")
+ BARCODE = toDsId("BarcodeSet")
+ CCS_ALIGNMENT = toDsId("ConsensusAlignmentSet")
+ CCS = toDsId("ConsensusReadSet")
+ REFERENCE = toDsId("ReferenceSet")
+ GMAPREFERENCE = toDsId("GmapReferenceSet")
+ CONTIG = toDsId("ContigSet")
+
+ ALL = (SUBREAD, HDF_SUBREAD, ALIGNMENT,
+ BARCODE, CCS, CCS_ALIGNMENT, REFERENCE, CONTIG, GMAPREFERENCE)
+
+ @classmethod
+ def isValid(cls, dsId):
+ return dsId in cls.ALL
+
+def dsIdToName(dsId):
+ """Translate a MetaType/ID into a class name"""
+ if DataSetMetaTypes.isValid(dsId):
+ return dsId.split('.')[-1]
+ else:
+ raise InvalidDataSetIOError("Invalid DataSet MetaType")
+
+def dsIdToSuffix(dsId):
+ """Translate a MetaType/ID into a file suffix"""
+ dsIds = DataSetMetaTypes.ALL
+ suffixMap = {dsId: dsIdToName(dsId) for dsId in dsIds}
+ suffixMap[toDsId("DataSet")] = 'DataSet'
+ if DataSetMetaTypes.isValid(dsId):
+ suffix = suffixMap[dsId]
+ suffix = suffix.lower()
+ suffix += '.xml'
+ return suffix
+ else:
+ raise InvalidDataSetIOError("Invalid DataSet MetaType")
+
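A quick usage sketch of the new module, exercising only the names defined above:

    from pbcore.io.dataset.DataSetMetaTypes import (DataSetMetaTypes, toDsId,
                                                    dsIdToName, dsIdToSuffix)

    assert toDsId("SubreadSet") == "PacBio.DataSet.SubreadSet"
    assert DataSetMetaTypes.isValid(DataSetMetaTypes.SUBREAD)
    assert dsIdToName(DataSetMetaTypes.SUBREAD) == "SubreadSet"
    assert dsIdToSuffix(DataSetMetaTypes.SUBREAD) == "subreadset.xml"
    # Invalid MetaTypes raise InvalidDataSetIOError rather than returning None.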
diff --git a/pbcore/io/dataset/DataSetReader.py b/pbcore/io/dataset/DataSetReader.py
index 1ba4c27..95b277c 100755
--- a/pbcore/io/dataset/DataSetReader.py
+++ b/pbcore/io/dataset/DataSetReader.py
@@ -1,3 +1,38 @@
+###############################################################################
+# Copyright (c) 2011-2016, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
+# NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+# OR
+# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+###############################################################################
+
+# Author: Martin D. Smith
+
+
""" Input and output functions for DataSet XML files"""
import os.path
@@ -9,7 +44,7 @@ from pbcore.io.dataset.DataSetMembers import (ExternalResource,
ExternalResources,
DataSetMetadata,
Filters, AutomationParameters,
- StatsMetadata, DISTLIST)
+ StatsMetadata)
from pbcore.io.dataset.DataSetWriter import _eleFromDictList
from pbcore.io.dataset.DataSetErrors import InvalidDataSetIOError
@@ -97,7 +132,11 @@ def _addUnknownFile(dset, path):
elif path.endswith('fofn'):
return _addFofnFile(dset, path)
else:
- return _addGenericFile(dset, path)
+ # Give it a shot as an XML file:
+ try:
+ return _addXmlFile(dset, path)
+ except ET.ParseError:
+ return _addGenericFile(dset, path)
SUB_RESOURCES = ['.scraps.bam', '.control.subreads.bam']
FILE_INDICES = ['.fai', '.pbi', '.bai', '.metadata.xml',
@@ -270,9 +309,6 @@ def parseStats(filename):
root = _updateStats(root)
stats = StatsMetadata(_eleToDictList(root))
stats.record['tag'] = 'SummaryStats'
- whitelist = ['ShortInsertFraction', 'AdapterDimerFraction',
- 'NumSequencingZmws'] + DISTLIST
- stats.pruneChildrenTo(whitelist)
return stats
def parseMetadata(filename):
diff --git a/pbcore/__init__.py b/pbcore/io/dataset/DataSetUtils.py
similarity index 52%
copy from pbcore/__init__.py
copy to pbcore/io/dataset/DataSetUtils.py
index 7375688..1c6e083 100644
--- a/pbcore/__init__.py
+++ b/pbcore/io/dataset/DataSetUtils.py
@@ -1,5 +1,5 @@
-#################################################################################
-# Copyright (c) 2011-2015, Pacific Biosciences of California, Inc.
+###############################################################################
+# Copyright (c) 2011-2016, Pacific Biosciences of California, Inc.
#
# All rights reserved.
#
@@ -16,16 +16,65 @@
#
# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
-# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
+# NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+# OR
# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
-#################################################################################
+###############################################################################
+
+# Author: Martin D. Smith
+
+
+import os
+import logging
+
+log = logging.getLogger(__name__)
+
+def fileType(fname):
+ """Get the extension of fname (with h5 type)"""
+ remainder, ftype = os.path.splitext(fname)
+ if ftype == '.h5':
+ _, prefix = os.path.splitext(remainder)
+ ftype = prefix + ftype
+ elif ftype == '.index':
+ _, prefix = os.path.splitext(remainder)
+ if prefix == '.contig':
+ ftype = prefix + ftype
+ ftype = ftype.strip('.')
+ return ftype
+
+def getDataSetUuid(xmlfile):
+ """
+ Quickly retrieve the uuid from the root element of a dataset XML file,
+ using a streaming parser to avoid loading the entire dataset into memory.
+ Returns None if the parsing fails.
+ """
+ try:
+ import xml.etree.cElementTree as ET
+ for event, element in ET.iterparse(xmlfile, events=("start",)):
+ return element.get("UniqueId")
+ except Exception:
+ return None
+
+
+def getDataSetMetaType(xmlfile):
+ """
+ Quickly retrieve the MetaType from the root element of a dataset XML file,
+ using a streaming parser to avoid loading the entire dataset into memory.
+ Returns None if the parsing fails.
+ """
+ try:
+ import xml.etree.cElementTree as ET
+ for event, element in ET.iterparse(xmlfile, events=("start",)):
+ return element.get("MetaType")
+ except Exception:
+ return None
-__VERSION__ = "1.2.10"
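A usage sketch for the new helpers in DataSetUtils.py; the XML path below is hypothetical:

    from pbcore.io.dataset.DataSetUtils import (fileType, getDataSetUuid,
                                                getDataSetMetaType)

    fileType("movie.bas.h5")             # -> 'bas.h5'
    fileType("refs.contig.index")        # -> 'contig.index'
    fileType("aligned.subreads.bam")     # -> 'bam'

    # Both return None if the file cannot be parsed as XML:
    uuid = getDataSetUuid("example.subreadset.xml")       # hypothetical path
    mtype = getDataSetMetaType("example.subreadset.xml")  # hypothetical path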
diff --git a/pbcore/io/dataset/DataSetValidator.py b/pbcore/io/dataset/DataSetValidator.py
index e4fca6a..e2804bb 100755
--- a/pbcore/io/dataset/DataSetValidator.py
+++ b/pbcore/io/dataset/DataSetValidator.py
@@ -1,3 +1,38 @@
+###############################################################################
+# Copyright (c) 2011-2016, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
+# NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+# OR
+# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+###############################################################################
+
+# Author: Martin D. Smith
+
+
"""Validate DataSet XML files"""
import os
@@ -27,10 +62,10 @@ def validateResources(xmlroot, relTo='.'):
if resId:
parsedId = urlparse(resId)
rfn = urlparse(resId).path.strip()
- if not os.path.isfile(rfn):
- if (not os.path.isfile(os.path.join(relTo,
+ if not os.path.exists(rfn):
+ if (not os.path.exists(os.path.join(relTo,
rfn)) and
- not os.path.isfile(os.path.join('.',
+ not os.path.exists(os.path.join('.',
rfn))):
raise IOError, "{f} not found".format(f=rfn)
diff --git a/pbcore/io/dataset/DataSetWriter.py b/pbcore/io/dataset/DataSetWriter.py
index e8989d4..4ee57e4 100755
--- a/pbcore/io/dataset/DataSetWriter.py
+++ b/pbcore/io/dataset/DataSetWriter.py
@@ -1,3 +1,38 @@
+###############################################################################
+# Copyright (c) 2011-2016, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
+# NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+# OR
+# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+###############################################################################
+
+# Author: Martin D. Smith
+
+
""" Input and output functions for DataSet XML files"""
import copy, time
@@ -226,7 +261,8 @@ def _addDataSetMetadataElement(dataSet, root):
if dataSet.metadata.summaryStats:
stats = dataSet.metadata.summaryStats
dataSet.metadata.summaryStats = None
- root.append(_eleFromDictList(dataSet.metadata.record))
+ root.append(_eleFromDictList(dataSet.metadata.record,
+ defaultNS=namespaces()['pbmeta']))
if stats:
dataSet.metadata.summaryStats = stats
# Metadata order matters....
@@ -241,7 +277,7 @@ def _guessNs(tag):
return namespaces()[nsprefix]
return ''
-def _eleFromDictList(eleAsDict, core=False):
+def _eleFromDictList(eleAsDict, core=False, defaultNS=None):
"""A last ditch capture method for uknown Elements"""
if eleAsDict['tag'] == 'DataSets':
print eleAsDict['namespace']
@@ -252,6 +288,9 @@ def _eleFromDictList(eleAsDict, core=False):
newNamespace = _guessNs(eleAsDict['tag'])
if newNamespace != '':
eleAsDict['namespace'] = newNamespace
+ if eleAsDict['namespace'] == '' and not defaultNS is None:
+ eleAsDict['namespace'] = defaultNS
+
if core:
ele = ET.Element("{{{n}}}{t}".format(n=eleAsDict['namespace'],
t=eleAsDict['tag']),
diff --git a/pbcore/io/dataset/__init__.py b/pbcore/io/dataset/__init__.py
index bffd9a8..f9d1346 100755
--- a/pbcore/io/dataset/__init__.py
+++ b/pbcore/io/dataset/__init__.py
@@ -1,4 +1,39 @@
+###############################################################################
+# Copyright (c) 2011-2016, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
+# NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+# OR
+# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+###############################################################################
+
+# Author: Martin D. Smith
+
from DataSetIO import *
+from DataSetUtils import *
import logging
diff --git a/pbcore/io/dataset/utils.py b/pbcore/io/dataset/utils.py
index d9b352b..949ac94 100755
--- a/pbcore/io/dataset/utils.py
+++ b/pbcore/io/dataset/utils.py
@@ -1,3 +1,38 @@
+###############################################################################
+# Copyright (c) 2011-2016, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
+# NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+# OR
+# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+###############################################################################
+
+# Author: Martin D. Smith
+
+
"""
Utils that support fringe DataSet features.
"""
@@ -6,11 +41,21 @@ import tempfile
import logging
import json
import shutil
+import datetime
import pysam
+import numpy as np
from pbcore.util.Process import backticks
log = logging.getLogger(__name__)
+def getTimeStampedName(mType):
+ """Generate a timestamped name using the given metatype 'mType' and the
+ current UTC time"""
+ mType = mType.lower()
+ mType = '_'.join(mType.split('.'))
+ time = datetime.datetime.utcnow().strftime("%y%m%d_%H%M%S%f")[:-3]
+ return "{m}-{t}".format(m=mType, t=time)
+
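For reference, the naming convention this produces (the exact suffix depends on the UTC clock at call time):

    from pbcore.io.dataset.utils import getTimeStampedName

    getTimeStampedName("PacBio.DataSet.SubreadSet")
    # -> 'pacbio_dataset_subreadset-160728_235959123' (date/time will vary)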
def which(exe):
if os.path.exists(exe) and os.access(exe, os.X_OK):
return exe
@@ -21,7 +66,7 @@ def which(exe):
return this_path
return None
-def consolidateXml(indset, outbam, useTmp=True):
+def consolidateXml(indset, outbam, useTmp=True, cleanup=True):
tmpout = tempfile.mkdtemp(suffix="consolidate-xml")
tmp_xml = os.path.join(tmpout,
"orig.{t}.xml".format(
@@ -51,8 +96,9 @@ def consolidateXml(indset, outbam, useTmp=True):
_pbmergeXML(tmp_xml, outbam)
if useTmp:
shutil.copy(outbam, origOutBam)
- # cleanup:
- shutil.rmtree(tmpout)
+ shutil.copy(outbam + ".pbi", origOutBam + ".pbi")
+ if cleanup:
+ shutil.rmtree(tmpout)
return outbam
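A hedged sketch of how the updated consolidateXml signature might be used; the dataset and output paths are hypothetical:

    from pbcore.io import AlignmentSet
    from pbcore.io.dataset.utils import consolidateXml

    aset = AlignmentSet("movie.alignmentset.xml")   # hypothetical input
    consolidateXml(aset, "merged.bam", useTmp=True, cleanup=False)
    # cleanup=False keeps the temporary working directory around for inspection;
    # when useTmp is True the .pbi index is now copied next to the merged BAM.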
def consolidateBams(inFiles, outFile, filterDset=None, useTmp=True):
@@ -273,3 +319,123 @@ def _emitFilterScript(filterDset, filtScriptName):
with open(filtScriptName, 'w') as scriptFile:
scriptFile.write(json.dumps(script))
+RS = 65536
+
+def xy_to_hn(x, y):
+ return x * RS + y
+
+def hn_to_xy(hn):
+ x = hn/RS
+ y = hn - (x * RS)
+ return x, y
+
+def shift(cx, cy, d):
+ # 0 is up, 1 is right, 2 is down, 3 is left
+ if d == 0:
+ cy += 1
+ elif d == 1:
+ cx += 1
+ elif d == 2:
+ cy -= 1
+ elif d == 3:
+ cx -= 1
+ return cx, cy, d
+
+def change_d(d):
+ d += 1
+ d %= 4
+ return d
+
+def move(cx, cy, x, y, d):
+ if cx == x and cy == y:
+ return cx - 1, y, 0
+ if abs(x - cx) == abs(y - cy):
+ d = change_d(d)
+ # expand the search
+ if d == 0:
+ cx -= 1
+ return shift(cx, cy, d)
+ else:
+ return shift(cx, cy, d)
+
+def find_closest(x, y, pos, limit=81):
+ found = False
+ cx = x
+ cy = y
+ d = None
+ fails = 0
+ while not found:
+ hn = xy_to_hn(cx, cy)
+ if hn in pos:
+ return hn
+ else:
+ fails += 1
+ cx, cy, d = move(cx, cy, x, y, d)
+ if fails >= limit:
+ return None
+
+def quadratic_expand(lol):
+ samples = [[p] for p in lol[0]]
+ for ps in lol[1:]:
+ newsamples = []
+ for p in ps:
+ for s in samples:
+ newsamples.append(s[:] + [p])
+ samples = newsamples
+ return samples
+
+def prodround(values, target):
+ """Round the floats in values (whose product is <target>) to integers in a
+ way that minimizes the absolute change in values
+ Args:
+ values: a list of numbers
+ target: the product of values (perhaps approximate)
+ Returns:
+ The values array, rounded to integers
+ """
+ opts = [[np.floor(v), round(v), np.ceil(v)] for v in values]
+ combos = quadratic_expand(opts)
+ best = combos[0]
+ for combo in combos[1:]:
+ p = np.prod(combo)
+ err = abs(target - p)
+ berr = abs(target - np.prod(best))
+ rnd = np.sum([abs(v-c) for v, c in zip(values, combo)])
+ brnd = np.sum([abs(v-c) for v, c in zip(values, best)])
+ if (err < berr) or ((err == berr) and (rnd < brnd)):
+ best = combo
+ return best
+
+def sampleUniformly(nsamples, dimbounds):
+    """dimbounds is a list of (min, max) range tuples; bounds are inclusive"""
+ volume = 1
+ for dmin, dmax in dimbounds:
+ volume *= dmax - dmin
+ volume_per_sample = np.true_divide(volume, nsamples)
+ sample_side_length = np.power(volume_per_sample,
+ np.true_divide(1.0, len(dimbounds)))
+ per_axis = [max(1.0, np.true_divide((dmax - dmin), sample_side_length))
+ for dmin, dmax in dimbounds]
+ per_axis = prodround(per_axis, nsamples)
+ # Shrink the stride to account for end margins
+ strides = [np.true_divide(dmax - dmin, nsam + 1)
+ for (dmin, dmax), nsam in zip(dimbounds, per_axis)]
+ # introduce a margin
+ points = [np.linspace(dmin + dstride,
+ dmax - dstride,
+ round(nsamp))
+ for (dmin, dmax), nsamp, dstride in zip(dimbounds, per_axis,
+ strides)]
+ points = [map(round, ps) for ps in points]
+ points = [map(int, ps) for ps in points]
+ samples = quadratic_expand(points)
+ return samples
+
+
+def sampleHolesUniformly(nsamples, samplefrom, faillimit=25, rowstart=64,
+ colstart=64, nrows=1024, ncols=1144):
+ xys = sampleUniformly(nsamples, [(colstart, ncols), (rowstart, nrows)])
+ hns = [find_closest(x, y, samplefrom, limit=faillimit) for x, y in xys]
+ return [hn for hn in hns if not hn is None]
+
+
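A small sketch of the new hole-sampling helpers, using the RS = 65536 x/y packing defined above; the chip bounds are deliberately shrunk so the example stays cheap, and the set of available holes is synthetic:

    from pbcore.io.dataset.utils import xy_to_hn, hn_to_xy, sampleHolesUniformly

    hn = xy_to_hn(100, 200)            # -> 6553800
    assert hn_to_xy(hn) == (100, 200)

    # Pick ~10 hole numbers spread evenly over a 136x136 corner of the chip,
    # restricted to holes actually present in `available`:
    available = set(xy_to_hn(x, y) for x in range(64, 200)
                    for y in range(64, 200))
    picks = sampleHolesUniformly(10, available, nrows=200, ncols=200)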
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 266b3e6..3ae70f4 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -1,3 +1,3 @@
sphinx
nose
-pyxb
+pyxb==1.2.4
diff --git a/tests/test_pbcore_io_AlnFileReaders.py b/tests/test_pbcore_io_AlnFileReaders.py
index 2a128d7..5d95a25 100644
--- a/tests/test_pbcore_io_AlnFileReaders.py
+++ b/tests/test_pbcore_io_AlnFileReaders.py
@@ -320,7 +320,8 @@ class _BasicAlnFileReaderTests(object):
def testReadGroupTable(self):
rgFwd = self.fwdAln.readGroupInfo
- EQ([('ID', '<i4'), ('MovieName', 'O'), ('ReadType', 'O'), ('SequencingChemistry', 'O'), ('FrameRate', '<f8')], rgFwd.dtype)
+ EQ([('ID', '<i4'), ('MovieName', 'O'), ('ReadType', 'O'), ('SequencingChemistry', 'O'), ('FrameRate', '<f8'), ('BaseFeatures', 'O')], rgFwd.dtype)
+ EQ(True, isinstance(rgFwd.BaseFeatures, frozenset))
EQ("P6-C4", rgFwd.SequencingChemistry)
EQ("m140905_042212_sidney_c100564852550000001823085912221377_s1_X0", rgFwd.MovieName)
#EQ("bar", rgFwd.ReadType)
@@ -470,6 +471,15 @@ class TestCCSBam(object):
aln.qEnd
+class TestEmptyBam(object):
+
+ def test_empty_bam_reads_in_range(self):
+ with IndexedBamReader(data.getEmptyAlignedBam()) as bam:
+ reads = bam.readsInRange("lambda_NEB3011", 0, 50000,
+ justIndices=True)
+ EQ(len(reads), 0)
+
+
class TestMultipleReadGroups(object):
"""
Verify that BAM files with multiple read groups are handled sensibly - see
diff --git a/tests/test_pbcore_io_GffIO.py b/tests/test_pbcore_io_GffIO.py
index 24c1b5c..1be13b0 100644
--- a/tests/test_pbcore_io_GffIO.py
+++ b/tests/test_pbcore_io_GffIO.py
@@ -1,5 +1,6 @@
from StringIO import StringIO
+import tempfile
import unittest
import os.path
@@ -191,3 +192,13 @@ chr1\tkinModCall\tmodified_base\t16348\t16348\t42\t-\t.\tcoverage=115;context=CC
start = [ (rec.seqid, rec.start) for rec in f ]
self.assertEqual(start, self.sorted_start)
rm_out(gff_out)
+
+ def test_empty_file(self):
+ gff_tmp = tempfile.NamedTemporaryFile(suffix=".gff").name
+ with open(gff_tmp, "w") as f:
+ f.write("##gff-version 3\n")
+ f.write("##source ipdSummary\n")
+ f.write("##sequence-region lambda_NEB3011 1 48502")
+ gff_out = tempfile.NamedTemporaryFile(suffix=".gff").name
+ merge_gffs(self.files + [gff_tmp], gff_out)
+ rm_out(gff_out)
diff --git a/tests/test_pbdataset.py b/tests/test_pbdataset.py
index dd3577b..8b0d0a5 100644
--- a/tests/test_pbdataset.py
+++ b/tests/test_pbdataset.py
@@ -4,52 +4,28 @@ import re
import logging
import itertools
import tempfile
-
-import numpy as np
import unittest
-import shutil
-import copy
-from random import shuffle
from unittest.case import SkipTest
+import shutil
+import numpy as np
+
from pbcore.io import PacBioBamIndex, IndexedBamReader
from pbcore.io import openIndexedAlignmentFile
-from pbcore.io.dataset.utils import BamtoolsVersion
+from pbcore.io.dataset.utils import consolidateXml
from pbcore.io import (DataSet, SubreadSet, ReferenceSet, AlignmentSet,
- openDataSet, DataSetMetaTypes, HdfSubreadSet,
- ConsensusReadSet, ConsensusAlignmentSet, openDataFile,
- divideKeys, keysToRanges, getDataSetUuid,
- getDataSetMetaType)
-from pbcore.io.dataset.DataSetIO import _dsIdToSuffix, InvalidDataSetIOError
+ openDataSet, HdfSubreadSet,
+ ConsensusReadSet, ConsensusAlignmentSet)
+from pbcore.io.dataset.DataSetMetaTypes import InvalidDataSetIOError
from pbcore.io.dataset.DataSetMembers import ExternalResource, Filters
from pbcore.io.dataset.DataSetValidator import validateFile
from pbcore.util.Process import backticks
import pbcore.data.datasets as data
import pbcore.data as upstreamdata
-log = logging.getLogger(__name__)
-
-def _check_constools():
- if not BamtoolsVersion().good:
- log.warn("Bamtools not found or out of date")
- return False
-
- cmd = "pbindex"
- o, r, m = backticks(cmd)
- if r != 1:
- return False
-
- cmd = "samtools"
- o, r, m = backticks(cmd)
- if r != 1:
- return False
- return True
-
-def _internal_data():
- if os.path.exists("/pbi/dept/secondary/siv/testdata"):
- return True
- return False
+from utils import _pbtestdata, _check_constools, _internal_data
+log = logging.getLogger(__name__)
class TestDataSet(unittest.TestCase):
"""Unit and integrationt tests for the DataSet class and \
@@ -237,7 +213,7 @@ class TestDataSet(unittest.TestCase):
2)
- dset = AlignmentSet(upstreamdata.getEmptyBam())
+ dset = AlignmentSet(upstreamdata.getEmptyAlignedBam())
self.assertEqual(dset.numRecords, 0)
self.assertEqual(dset.totalLength, 0)
self.assertEqual(len(list(dset)), 0)
@@ -250,7 +226,7 @@ class TestDataSet(unittest.TestCase):
1)
# empty and full:
- dset = AlignmentSet(upstreamdata.getEmptyBam(), data.getBam())
+ dset = AlignmentSet(upstreamdata.getEmptyAlignedBam(), data.getBam())
self.assertEqual(dset.numRecords, 92)
self.assertEqual(dset.totalLength, 123588)
self.assertEqual(len(list(dset)), 92)
@@ -270,7 +246,7 @@ class TestDataSet(unittest.TestCase):
dset.index
self.assertEqual(len(dset.resourceReaders()), 1)
- dset = ConsensusAlignmentSet(upstreamdata.getEmptyBam())
+ dset = ConsensusAlignmentSet(upstreamdata.getEmptyAlignedBam())
self.assertEqual(dset.numRecords, 0)
self.assertEqual(dset.totalLength, 0)
self.assertEqual(len(list(dset)), 0)
@@ -283,6 +259,9 @@ class TestDataSet(unittest.TestCase):
outfile = os.path.split(upstreamdata.getEmptyBam())[1]
outpath = os.path.join(outdir, outfile)
shutil.copy(upstreamdata.getEmptyBam(), outpath)
+ alnoutfile = os.path.split(upstreamdata.getEmptyAlignedBam())[1]
+ alnoutpath = os.path.join(outdir, alnoutfile)
+ shutil.copy(upstreamdata.getEmptyAlignedBam(), alnoutpath)
dset = SubreadSet(outpath)
self.assertEqual(dset.numRecords, 0)
self.assertEqual(dset.totalLength, 0)
@@ -309,7 +288,7 @@ class TestDataSet(unittest.TestCase):
1)
- dset = AlignmentSet(outpath)
+ dset = AlignmentSet(alnoutpath)
self.assertEqual(dset.numRecords, 0)
self.assertEqual(dset.totalLength, 0)
self.assertEqual(len(list(dset)), 0)
@@ -320,7 +299,7 @@ class TestDataSet(unittest.TestCase):
1)
# empty and full:
- dset = AlignmentSet(outpath, data.getBam())
+ dset = AlignmentSet(alnoutpath, data.getBam())
# without a pbi, updating counts is broken
self.assertEqual(dset.numRecords, 0)
self.assertEqual(dset.totalLength, 0)
@@ -340,27 +319,83 @@ class TestDataSet(unittest.TestCase):
dset.updateCounts()
self.assertEqual(len(dset.resourceReaders()), 1)
- dset = ConsensusAlignmentSet(outpath)
+ dset = ConsensusAlignmentSet(alnoutpath)
self.assertEqual(dset.numRecords, 0)
self.assertEqual(dset.totalLength, 0)
self.assertEqual(len(list(dset)), 0)
dset.updateCounts()
self.assertEqual(len(dset.resourceReaders()), 1)
dset.induceIndices()
- dset = ConsensusAlignmentSet(outpath)
+ dset = ConsensusAlignmentSet(alnoutpath)
self.assertEqual(dset.numRecords, 0)
self.assertEqual(dset.totalLength, 0)
self.assertEqual(len(list(dset)), 0)
dset.updateCounts()
self.assertEqual(len(dset.resourceReaders()), 1)
+
+ def test_empty_bam_index_dtype(self):
+ # Make sure the BAM and DataSet APIs are consistent
+ empty_bam = upstreamdata.getEmptyBam()
+ sset = SubreadSet(empty_bam)
+ empty = np.array([], dtype=np.int32)
+
+ # The BAM API
+ self.assertTrue(np.array_equal(
+ sset.resourceReaders()[0].index.qId,
+ empty))
+
+ # The DataSet API
+ self.assertTrue(np.array_equal(
+ sset.index.qId,
+ empty))
+
+ # Check to make sure we can stack them:
+ full_bam = upstreamdata.getUnalignedBam()
+ sset = SubreadSet(empty_bam, full_bam)
+
+ # The BAM API
+ self.assertTrue(len(sset.resourceReaders()[1].index.qId) != 0)
+
+ # The DataSet API
+ self.assertTrue(len(sset.index.qId) != 0)
+
+ def test_empty_aligned_bam_index_dtype(self):
+ # Make sure the BAM and DataSet APIs are consistent
+ empty_bam = data.getEmptyAlignedBam()
+ alnFile = AlignmentSet(empty_bam)
+ empty = np.array([], dtype=np.int32)
+
+ # The BAM API
+ self.assertTrue(np.array_equal(
+ alnFile.resourceReaders()[0].tId,
+ empty))
+ self.assertTrue(np.array_equal(
+ alnFile.resourceReaders()[0].index.tId,
+ empty))
+
+ # The DataSet API
+ self.assertTrue(np.array_equal(alnFile.tId, empty))
+ self.assertTrue(np.array_equal(alnFile.index.tId, empty))
+
+ # Check to make sure we can stack them:
+ full_bam = upstreamdata.getBamAndCmpH5()[0]
+ aset = AlignmentSet(empty_bam, full_bam)
+
+ # The BAM API
+ self.assertTrue(len(aset.resourceReaders()[1].index.qId) != 0)
+
+ # The DataSet API
+ self.assertTrue(len(aset.index.qId) != 0)
+
def test_read_ranges(self):
# This models the old and new ways by which Genomic Consensus generates
# lists of paired tStarts and tEnds.
full_bam = upstreamdata.getBamAndCmpH5()[0]
empty_bam = data.getEmptyAlignedBam()
- file_lists = [[full_bam, empty_bam],
+ file_lists = [[empty_bam],
+ [full_bam, empty_bam],
[empty_bam, full_bam]]
refId_list = ['lambda_NEB3011', 0]
minMapQV = 30
@@ -369,16 +404,6 @@ class TestDataSet(unittest.TestCase):
for refId in refId_list:
alnFile = AlignmentSet(*file_list)
- # old GC:
- # TODO(mdsmith)(2016-01-27): Remove assertRaises when tId
- # accession is fixed for empty aligned bam+pbi files
- with self.assertRaises(AttributeError):
- for reader in alnFile.resourceReaders(refId):
- reader.index.tId
- refLen = reader.referenceInfo(refId).Length
- rows = reader.readsInRange(refId, 0, refLen,
- justIndices=True)
-
# new GC (just testing that it doesn't raise exceptions):
rows = alnFile.index[
((alnFile.tId == alnFile.referenceInfo(refId).ID) &
@@ -465,14 +490,6 @@ class TestDataSet(unittest.TestCase):
with self.assertRaises(Exception):
HdfSubreadSet(fasta, strict=True)
- def test_dsIdToSuffix(self):
- suffixes = ['subreadset.xml', 'hdfsubreadset.xml', 'alignmentset.xml',
- 'barcodeset.xml', 'consensusreadset.xml',
- 'consensusalignmentset.xml',
- 'referenceset.xml', 'contigset.xml']
- for dsId, exp in zip(DataSetMetaTypes.ALL, suffixes):
- self.assertEqual(_dsIdToSuffix(dsId), exp)
-
def test_updateCounts_without_pbi(self):
log.info("Testing updateCounts without pbi")
data_fname = data.getBam(0)
@@ -663,148 +680,64 @@ class TestDataSet(unittest.TestCase):
_ = ds.newUuid()
self.assertTrue(old != ds.uuid)
- def test_split(self):
- ds1 = openDataSet(data.getXml(12))
- self.assertTrue(ds1.numExternalResources > 1)
- dss = ds1.split()
- self.assertTrue(len(dss) == ds1.numExternalResources)
- dss = ds1.split(chunks=1)
- self.assertTrue(len(dss) == 1)
- dss = ds1.split(chunks=2, ignoreSubDatasets=True)
- self.assertTrue(len(dss) == 2)
- self.assertFalse(dss[0].uuid == dss[1].uuid)
- self.assertTrue(dss[0].name == dss[1].name)
- # Lets try merging and splitting on subdatasets
- ds1 = openDataSet(data.getXml(8))
- self.assertEquals(ds1.totalLength, 123588)
- ds1tl = ds1.totalLength
- ds2 = openDataSet(data.getXml(11))
- self.assertEquals(ds2.totalLength, 117086)
- ds2tl = ds2.totalLength
- dss = ds1 + ds2
- self.assertTrue(dss.totalLength == (ds1tl + ds2tl))
- ds1, ds2 = sorted(dss.split(2, ignoreSubDatasets=False),
- key=lambda x: x.totalLength,
- reverse=True)
- self.assertTrue(ds1.totalLength == ds1tl)
- self.assertTrue(ds2.totalLength == ds2tl)
-
- def test_split_zmws(self):
- N_RECORDS = 117
- test_file = upstreamdata.getUnalignedBam()
- ds1 = openDataFile(test_file)
- self.assertEqual(len([r for r in ds1]), N_RECORDS)
- self.assertEqual(len(ds1), N_RECORDS)
- dss = ds1.split(chunks=1, zmws=True)
- self.assertEqual(len(dss), 1)
- self.assertEqual(sum([len([r for r in ds_]) for ds_ in dss]),
- N_RECORDS)
- self.assertEqual(sum([len(ds_) for ds_ in dss]),
- N_RECORDS)
-
- # We have a lower limit on the number of zmws, now
- dss = ds1.split(chunks=12, zmws=True)
- self.assertEqual(len(dss), 2)
- self.assertEqual(sum([len([r for r in ds_]) for ds_ in dss]),
- N_RECORDS)
- self.assertEqual(sum([len(ds_) for ds_ in dss]),
- N_RECORDS)
- self.assertEqual(
- dss[0].zmwRanges,
- [('m140905_042212_sidney_c100564852550000001823085912221377_s1_X0',
- 1650, 32328)])
- self.assertEqual(
- dss[-1].zmwRanges,
- [('m140905_042212_sidney_c100564852550000001823085912221377_s1_X0',
- 32560, 54396)])
- ranges = sorted([c.zmwRanges[0][1:] for c in dss])
- interspans = []
- last = None
- for rg in ranges:
- if not last is None:
- interspans.append((last, rg[0]))
- self.assertFalse(last == rg[0])
- last = rg[1]
- for rg in interspans:
- self.assertEqual(len(np.nonzero(np.logical_and(
- ds1.index.holeNumber < rg[1],
- ds1.index.holeNumber > rg[0]))[0]), 0)
-
- #@unittest.skipUnless(os.path.isdir("/pbi/dept/secondary/siv/testdata"),
- # "Missing testadata directory")
- @unittest.skip("Too expensive")
- def test_large_split_zmws(self):
- N_RECORDS = 959539
- test_file = ("/pbi/dept/secondary/siv/testdata/SA3-DS/lambda/"
- "2372215/0007/Analysis_Results/m150404_101626_42"
- "267_c100807920800000001823174110291514_s1_p0.al"
- "l.subreadset.xml")
- ds1 = openDataFile(test_file)
- self.assertEqual(len(ds1), N_RECORDS)
- dss = ds1.split(chunks=1, zmws=True)
- self.assertEqual(len(dss), 1)
- self.assertEqual(sum([len(ds_) for ds_ in dss]),
- N_RECORDS)
- dss = ds1.split(chunks=12, zmws=True)
- self.assertEqual(len(dss), 12)
- self.assertEqual(sum([len(ds_) for ds_ in dss]),
- N_RECORDS)
- self.assertEqual(
- dss[0].zmwRanges,
- [('m150404_101626_42267_c100807920800000001823174110291514_s1_p0',
- 7, 14007)])
- self.assertEqual(
- dss[-1].zmwRanges,
- [('m150404_101626_42267_c100807920800000001823174110291514_s1_p0',
- 149876, 163475)])
- ranges = sorted([c.zmwRanges[0][1:] for c in dss])
- interspans = []
- last = None
- for rg in ranges:
- if not last is None:
- interspans.append((last, rg[0]))
- self.assertFalse(last == rg[0])
- last = rg[1]
- for rg in interspans:
- self.assertEqual(len(np.nonzero(np.logical_and(
- ds1.index.holeNumber < rg[1],
- ds1.index.holeNumber > rg[0]))[0]), 0)
-
-
- @unittest.skipUnless(os.path.isdir("/pbi/dept/secondary/siv/testdata"),
- "Missing testadata directory")
- def test_multi_movie_split_zmws(self):
- N_RECORDS = 1745161
- test_file_1 = ("/pbi/dept/secondary/siv/testdata/SA3-DS/lambda/"
- "2372215/0007/Analysis_Results/m150404_101626_42"
- "267_c100807920800000001823174110291514_s1_p0.al"
- "l.subreadset.xml")
- test_file_2 = ("/pbi/dept/secondary/siv/testdata/SA3-DS/lambda/"
- "2590980/0008/Analysis_Results/m141115_075238_et"
- "han_c100699872550000001823139203261572_s1_p0.al"
- "l.subreadset.xml")
- ds1 = SubreadSet(test_file_1, test_file_2)
- # used to get total:
- #self.assertEqual(sum(1 for _ in ds1), N_RECORDS)
- self.assertEqual(len(ds1), N_RECORDS)
- dss = ds1.split(chunks=1, zmws=True)
- self.assertEqual(len(dss), 1)
- self.assertEqual(sum([len(ds_) for ds_ in dss]),
- N_RECORDS)
-
- dss = ds1.split(chunks=12, zmws=True)
- self.assertEqual(len(dss), 12)
- self.assertEqual(sum([len(ds_) for ds_ in dss]),
- N_RECORDS)
- self.assertEqual(
- dss[0].zmwRanges,
- [('m150404_101626_42267_c100807920800000001823174110291514_s1_p0',
- 7, 22098)])
- self.assertEqual(
- dss[-1].zmwRanges,
- [('m141115_075238_ethan_c100699872550000001823139203261572_s1_p0',
- 127814, 163468)])
-
+ def test_newUuid_repeat(self):
+ ds = DataSet()
+ old = ds.uuid
+ new = ds.newUuid()
+ self.assertTrue(old != ds.uuid)
+ self.assertTrue(old != new)
+ self.assertTrue(ds.uuid == new)
+ reallynew = ds.newUuid()
+        # Note that you can keep calling newUuid, and each time it will be
+ # different:
+ last = ds.uuid
+ for _ in range(10):
+ ds.newUuid()
+ self.assertTrue(ds.uuid != new)
+ self.assertTrue(ds.uuid != last)
+ last = ds.uuid
+ self.assertTrue(reallynew != new)
+ self.assertTrue(reallynew != old)
+
+ def test_newUuid_copy(self):
+ fn_orig = data.getXml(8)
+ fn1 = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
+ fn2 = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
+ shutil.copy(fn_orig, fn1)
+ shutil.copy(fn_orig, fn2)
+ ds1 = openDataSet(fn1)
+ ds2 = openDataSet(fn2)
+ self.assertEqual(ds1.uuid, ds2.uuid)
+ for _ in range(10):
+ ds1.newUuid()
+ ds2.newUuid()
+ self.assertEqual(ds1.uuid, ds2.uuid)
+
+ def test_newUuid_random(self):
+ fn_orig = data.getXml(8)
+ fn1 = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
+ fn2 = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
+ shutil.copy(fn_orig, fn1)
+ shutil.copy(fn_orig, fn2)
+ ds1 = openDataSet(fn1)
+ ds2 = openDataSet(fn2)
+ self.assertEqual(ds1.uuid, ds2.uuid)
+ for _ in range(10):
+ ds1.newUuid(random=True)
+ ds2.newUuid(random=True)
+ self.assertNotEqual(ds1.uuid, ds2.uuid)
+
+ def test_bad_xml_extension(self):
+ fn = tempfile.NamedTemporaryFile(
+ suffix=".alignmentset.xml.disabled").name
+ with AlignmentSet(data.getXml(8)) as aln:
+ aln.write(fn)
+ with AlignmentSet(fn) as aln:
+ self.assertEqual(len(aln), 92)
+ shutil.copy(data.getBam(), fn)
+ with self.assertRaises(IOError):
+ with AlignmentSet(fn) as aln:
+ self.assertEqual(len(aln), 92)
@unittest.skipUnless(os.path.isdir("/pbi/dept/secondary/siv/testdata"),
"Missing testadata directory")
@@ -902,49 +835,6 @@ class TestDataSet(unittest.TestCase):
ds3 = AlignmentSet(data.getBam())
ds3.write(outfile)
- def test_addFilters(self):
- ds1 = DataSet()
- filt = Filters()
- filt.addRequirement(rq=[('>', '0.85')])
- ds1.addFilters(filt)
- self.assertEquals(str(ds1.filters), '( rq > 0.85 )')
- # Or added from a source XML
- ds2 = DataSet(data.getXml(16))
- self.assertTrue(str(ds2.filters).startswith(
- '( rname = E.faecalis'))
-
- def test_setFilters(self):
- ds1 = DataSet()
- filt = Filters()
- filt.addRequirement(rq=[('>', '0.85')])
- ds1.addFilters(filt)
- self.assertEquals(str(ds1.filters), '( rq > 0.85 )')
- # Or added from a source XML
- ds2 = DataSet()
- ds2.filters = ds1.filters
- self.assertEquals(str(ds2.filters), '( rq > 0.85 )')
-
-
- def test_add_double_bound_filters(self):
- ds1 = AlignmentSet(data.getXml(8))
- ds1.filters.addRequirement(rq=[('>', '0.85'),
- ('<', '0.99')])
- self.assertEquals(str(ds1.filters), '( rq > 0.85 ) OR ( rq < 0.99 )')
-
- ds1 = AlignmentSet(data.getXml(8))
- self.assertEquals(str(ds1.filters), '')
- ds1.filters.addFilter(rq=[('>', '0.85'),
- ('<', '0.99')])
- self.assertEquals(str(ds1.filters), '( rq > 0.85 AND rq < 0.99 )')
-
- ds1.filters.addFilter(length=[('>', '1000')])
- self.assertEquals(str(ds1.filters),
- '( rq > 0.85 AND rq < 0.99 ) OR ( length > 1000 )')
-
- ds1.filters.removeFilter(0)
- self.assertEquals(str(ds1.filters),
- '( length > 1000 )')
-
def test_addMetadata(self):
ds = DataSet()
ds.addMetadata(None, Name='LongReadsRock')
@@ -957,47 +847,6 @@ class TestDataSet(unittest.TestCase):
ds2._metadata.totalLength += 100000
self.assertEquals(ds2._metadata.totalLength, 200000)
- @unittest.skipIf(not _internal_data(),
- "Internal data not available")
- def test_loadMetadata(self):
- aln = AlignmentSet(data.getXml(no=8))
- self.assertFalse(aln.metadata.collections)
- aln.loadMetadata('/pbi/dept/secondary/siv/testdata/'
- 'SA3-Sequel/lambda/roche_SAT/'
- 'm54013_151205_032353.run.metadata.xml')
- self.assertTrue(aln.metadata.collections)
- sset_fn = ('/pbi/dept/secondary/siv/testdata/'
- 'SA3-Sequel/lambda/roche_SAT/'
- 'm54013_151205_032353.subreadset.xml')
- sset = SubreadSet(sset_fn)
- orig_metadata = copy.deepcopy(sset.metadata)
- sset.metadata.collections = None
- self.assertFalse(sset.metadata.collections)
- sset.loadMetadata('/pbi/dept/secondary/siv/testdata/'
- 'SA3-Sequel/lambda/roche_SAT/'
- 'm54013_151205_032353.run.metadata.xml')
- stack = zip(sset.metadata, orig_metadata)
- fn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
- sset.write(fn)
- validateFile(fn)
- validateFile(sset_fn)
- self.assertEqual(sset.metadata, orig_metadata)
-
-
- # load the wrong thing...
- sset_fn = ('/pbi/dept/secondary/siv/testdata/'
- 'SA3-Sequel/lambda/roche_SAT/'
- 'm54013_151205_032353.subreadset.xml')
- sset = SubreadSet(sset_fn)
- orig_metadata = copy.deepcopy(sset.metadata)
- sset.metadata.collections = None
- self.assertFalse(sset.metadata.collections)
- with self.assertRaises(InvalidDataSetIOError):
- sset.loadMetadata('/pbi/dept/secondary/siv/testdata/'
- 'SA3-Sequel/lambda/roche_SAT/'
- 'm54013_151205_032353.sts.xml')
-
-
def test_copyTo(self):
aln = AlignmentSet(data.getXml(no=8), strict=True)
explen = len(aln)
@@ -1036,6 +885,24 @@ class TestDataSet(unittest.TestCase):
self.assertNotEqual(sorted(aln.toExternalFiles()),
sorted(aln2.toExternalFiles()))
+ @unittest.skipIf((not _pbtestdata() or not _check_constools()),
+ "Internal data not available")
+ def test_copyTo_same_base_names(self):
+ import pbtestdata
+ # see bug 33778
+ tmp_bam = tempfile.NamedTemporaryFile(suffix=".bam").name
+ log.debug(tmp_bam)
+ ds = AlignmentSet(pbtestdata.get_file("aligned-ds-2"))
+ log.debug(pbtestdata.get_file("aligned-ds-2"))
+ consolidateXml(ds, tmp_bam, cleanup=True)
+ with AlignmentSet(tmp_bam) as f:
+ qnames = set()
+ for rec in f:
+ qnames.add(rec.qName)
+ assert len(qnames) == len([rec for rec in f])
+ assert len(qnames) == len(f)
+
+
def test_addExternalResources(self):
ds = DataSet()
er1 = ExternalResource()
@@ -1250,373 +1117,6 @@ class TestDataSet(unittest.TestCase):
for r1, r2 in itertools.izip(reads1, reads2):
self.assertEqual(r1, r2)
- def test_n_subreads_filter(self):
- ds2 = AlignmentSet(data.getXml(8))
- ds2.filters.addRequirement(n_subreads=[('>', '4')])
- self.assertEqual(len(list(ds2.records)), 87)
- self.assertEqual(len(ds2), 87)
-
- def test_filter(self):
- ds2 = AlignmentSet(data.getXml(8))
- ds2.filters.addRequirement(rname=[('=', 'E.faecalis.1')])
- self.assertEqual(len(list(ds2.records)), 20)
- ds2.disableFilters()
- self.assertEqual(len(list(ds2.records)), 92)
- ds2.enableFilters()
- self.assertEqual(len(list(ds2.records)), 20)
-
- def test_context_filters(self):
- ss = SubreadSet(upstreamdata.getUnalignedBam())
- self.assertEqual(set(ss.index.contextFlag), {0, 1, 2, 3})
- self.assertEqual(
- [len(np.flatnonzero(ss.index.contextFlag == cx))
- for cx in sorted(set(ss.index.contextFlag))],
- [15, 33, 32, 37])
- self.assertEqual(len(ss.index), 117)
-
- # no adapters/barcodes
- ss.filters.addRequirement(cx=[('=', 0)])
- self.assertEqual(len(ss.index), 15)
- ss.filters.removeRequirement('cx')
- self.assertEqual(len(ss.index), 117)
-
- # no adapters/barcodes
- ss.filters.addRequirement(cx=[('=', 'NO_LOCAL_CONTEXT')])
- self.assertEqual(len(ss.index), 15)
- ss.filters.removeRequirement('cx')
- self.assertEqual(len(ss.index), 117)
-
- # some adapters/barcodes
- ss.filters.addRequirement(cx=[('!=', 0)])
- self.assertEqual(len(ss.index), 102)
- ss.filters.removeRequirement('cx')
- self.assertEqual(len(ss.index), 117)
-
- # adapter before
- ss.filters.addRequirement(cx=[('&', 1)])
- self.assertEqual(len(ss.index), 70)
- ss.filters.removeRequirement('cx')
- self.assertEqual(len(ss.index), 117)
-
- # adapter before
- ss.filters.addRequirement(cx=[('&', 'ADAPTER_BEFORE')])
- self.assertEqual(len(ss.index), 70)
- ss.filters.removeRequirement('cx')
- self.assertEqual(len(ss.index), 117)
-
- # adapter after
- ss.filters.addRequirement(cx=[('&', 2)])
- self.assertEqual(len(ss.index), 69)
- ss.filters.removeRequirement('cx')
- self.assertEqual(len(ss.index), 117)
-
- # adapter before or after
- ss.filters.addRequirement(cx=[('&', 3)])
- self.assertEqual(len(ss.index), 102)
- ss.filters.removeRequirement('cx')
- self.assertEqual(len(ss.index), 117)
-
- # adapter before or after
- ss.filters.addRequirement(cx=[('&', 'ADAPTER_BEFORE | ADAPTER_AFTER')])
- self.assertEqual(len(ss.index), 102)
- ss.filters.removeRequirement('cx')
- self.assertEqual(len(ss.index), 117)
-
- # adapter before or after but not both
- ss.filters.addRequirement(cx=[('!=', 0)])
- ss.filters.addRequirement(cx=[('~', 1),
- ('~', 2)])
- self.assertEqual(len(ss.index), 65)
- ss.filters.removeRequirement('cx')
- self.assertEqual(len(ss.index), 117)
-
- # adapter before or after
- ss.filters.addRequirement(cx=[('&', 1),
- ('&', 2)])
- self.assertEqual(len(ss.index), 102)
- ss.filters.removeRequirement('cx')
- self.assertEqual(len(ss.index), 117)
-
- # adapter before and after
- ss.filters.addRequirement(cx=[('&', 1)])
- ss.filters.addRequirement(cx=[('&', 2)])
- self.assertEqual(len(ss.index), 37)
- ss.filters.removeRequirement('cx')
- self.assertEqual(len(ss.index), 117)
-
- # adapter before but not after
- ss.filters.addRequirement(cx=[('&', 1)])
- ss.filters.addRequirement(cx=[('~', 2)])
- self.assertEqual(len(ss.index), 33)
- ss.filters.removeRequirement('cx')
- self.assertEqual(len(ss.index), 117)
-
- # no adapter before
- ss.filters.addRequirement(cx=[('~', 1)])
- self.assertEqual(len(ss.index), 47)
- ss.filters.removeRequirement('cx')
- self.assertEqual(len(ss.index), 117)
-
- # no adapter before or after
- ss.filters.addRequirement(cx=[('~', 1)])
- ss.filters.addRequirement(cx=[('~', 2)])
- self.assertEqual(len(ss.index), 15)
- ss.filters.removeRequirement('cx')
- self.assertEqual(len(ss.index), 117)
-
- # no adapter before or after
- ss.filters.addRequirement(cx=[('~', 3)])
- self.assertEqual(len(ss.index), 15)
- ss.filters.removeRequirement('cx')
- self.assertEqual(len(ss.index), 117)
-
- @SkipTest
- def test_split_by_contigs_presplit(self):
- # Consumes too much memory for Jenkins
-
- # Test to make sure the result of a split by contigs has an appropriate
- # number of records (make sure filters are appropriately aggressive)
- ds2 = DataSet(data.getXml(15))
- bams = ds2.externalResources.resourceIds
- self.assertEqual(len(bams), 2)
- refwindows = ds2.refWindows
- self.assertEqual(refwindows, [(0, 0, 224992)])
- res1 = openIndexedAlignmentFile(bams[0][7:])
- res2 = openIndexedAlignmentFile(bams[1][7:])
- def count(iterable):
- count = 0
- for _ in iterable:
- count += 1
- return count
- self.assertEqual(count(res1.readsInRange(*refwindows[0])), 1409)
- self.assertEqual(count(res2.readsInRange(*refwindows[0])), 1375)
- self.assertEqual(count(ds2.readsInRange(*refwindows[0])), 2784)
- self.assertEqual(count(ds2.records), 2784)
- ds2.disableFilters()
- self.assertEqual(count(ds2.records), 53552)
- self.assertEqual(ds2.countRecords(), 53552)
-
- def test_split_by_contigs_with_split_and_maxChunks(self):
- # test to make sure the refWindows work when chunks == # refs
- ds3 = AlignmentSet(data.getBam())
- dss = ds3.split(contigs=True)
- self.assertEqual(len(dss), 12)
- refWindows = sorted(reduce(lambda x, y: x + y,
- [ds.refWindows for ds in dss]))
- # not all references have something mapped to them, refWindows doesn't
- # care...
- self.assertNotEqual(refWindows, sorted(ds3.refWindows))
- self.assertEqual(refWindows,
- [('B.vulgatus.4', 0, 1449), ('B.vulgatus.5', 0, 1449),
- ('C.beijerinckii.13', 0, 1433), ('C.beijerinckii.14', 0, 1433),
- ('C.beijerinckii.9', 0, 1433), ('E.coli.6', 0, 1463),
- ('E.faecalis.1', 0, 1482), ('E.faecalis.2', 0, 1482),
- ('R.sphaeroides.1', 0, 1386), ('S.epidermidis.2', 0, 1472),
- ('S.epidermidis.3', 0, 1472), ('S.epidermidis.4', 0, 1472)])
- old_refWindows = refWindows
- random_few = [('C.beijerinckii.13', 0, 1433),
- ('B.vulgatus.4', 0, 1449),
- ('E.faecalis.1', 0, 1482)]
-
- dss = ds3.split(contigs=True, maxChunks=1)
- self.assertEqual(len(dss), 1)
- refWindows = sorted(reduce(lambda x, y: x + y,
- [ds.refWindows for ds in dss]))
- self.assertEqual(refWindows, old_refWindows)
-
- dss = ds3.split(contigs=True, maxChunks=24)
- # This isn't expected if num refs >= 100, as map check isn't made
- # for now (too expensive)
- # There are only 12 refs represented in this set, however...
- self.assertEqual(len(dss), 12)
- refWindows = sorted(reduce(lambda x, y: x + y,
- [ds.refWindows for ds in dss]))
-
- for ref in random_few:
- found = False
- for window in refWindows:
- if ref == window:
- found = True
- if not found:
- log.debug(ref)
- self.assertTrue(found)
-
- # test with maxchunks but no breaking contigs
- dss = ds3.split(contigs=True, maxChunks=36)
- self.assertEqual(len(dss), 12)
- refWindows = sorted(reduce(lambda x, y: x + y,
- [ds.refWindows for ds in dss]))
- for ref in random_few:
- found = False
- for window in refWindows:
- if ref == window:
- found = True
- self.assertTrue(found)
-
- # test with maxchunks and breaking contigs is allowed (triggers
- # targetsize, may result in fewer chunks)
- dss = ds3.split(contigs=True, maxChunks=36, breakContigs=True)
- self.assertEqual(len(dss), 2)
- refWindows = sorted(reduce(lambda x, y: x + y,
- [ds.refWindows for ds in dss]))
- for ref in random_few:
- found = False
- for window in refWindows:
- if ref == window:
- found = True
- self.assertTrue(found)
-
- # test with previous setup and smaller targetSize, resulting in more
- # chunks
- dss = ds3.split(contigs=True, maxChunks=36, breakContigs=True,
- targetSize=10)
- self.assertEqual(len(dss), 9)
- refWindows = sorted(reduce(lambda x, y: x + y,
- [ds.refWindows for ds in dss]))
- for ref in random_few:
- found = False
- for window in refWindows:
- if ref == window:
- found = True
- self.assertTrue(found)
-
- # test with byRecords and fewer chunks than atoms
- dss = ds3.split(contigs=True, chunks=3, byRecords=True)
- self.assertEqual(len(dss), 3)
- refWindows = sorted(reduce(lambda x, y: x + y,
- [ds.refWindows for ds in dss]))
- for ref in random_few:
- found = False
- for window in refWindows:
- if ref == window:
- found = True
- self.assertTrue(found)
-
- # test with byRecords and more chunks than atoms
- orf = random_few
- random_few = [('C.beijerinckii.13', 0, 747),
- ('B.vulgatus.4', 0, 1449),
- ('E.faecalis.1', 0, 742)]
- dss = ds3.split(contigs=True, chunks=16, byRecords=True)
- self.assertEqual(len(dss), 16)
- refWindows = sorted(reduce(lambda x, y: x + y,
- [ds.refWindows for ds in dss]))
- for ref in random_few:
- found = False
- for window in refWindows:
- if ref == window:
- found = True
- self.assertTrue(found)
-
- # test with byRecords and updateCounts
- random_few = orf
- dss = ds3.split(contigs=True, chunks=3, byRecords=True,
- updateCounts=True)
- self.assertEqual(len(dss), 3)
- sizes = sorted([dset.numRecords for dset in dss])
- self.assertListEqual(sizes, [30, 31, 31])
- refWindows = sorted(reduce(lambda x, y: x + y,
- [ds.refWindows for ds in dss]))
- for ref in random_few:
- found = False
- for window in refWindows:
- if ref == window:
- found = True
- self.assertTrue(found)
-
- # test with byRefLength and updateCounts
- random_few = orf
- dss = ds3.split(contigs=True, chunks=3, updateCounts=True)
- self.assertEqual(len(dss), 3)
- sizes = sorted([dset.numRecords for dset in dss])
- self.assertListEqual(sizes, [20, 24, 48])
- refWindows = sorted(reduce(lambda x, y: x + y,
- [ds.refWindows for ds in dss]))
- for ref in random_few:
- found = False
- for window in refWindows:
- if ref == window:
- found = True
- self.assertTrue(found)
-
- def test_split_by_contigs_with_split(self):
- # test to make sure the refWindows work when chunks == # refs
- ds3 = AlignmentSet(data.getBam())
- dss = ds3.split(contigs=True)
- self.assertEqual(len(dss), 12)
- refWindows = sorted(reduce(lambda x, y: x + y,
- [ds.refWindows for ds in dss]))
- # not all references have something mapped to them, refWindows doesn't
- # care...
- self.assertNotEqual(refWindows, sorted(ds3.refWindows))
- random_few = [('C.beijerinckii.13', 0, 1433),
- ('B.vulgatus.4', 0, 1449),
- ('E.faecalis.1', 0, 1482)]
- for reference in random_few:
- found = False
- for ref in refWindows:
- if ref == reference:
- found = True
- self.assertTrue(found)
- old_refWindows = refWindows
-
- dss = ds3.split(contigs=True, chunks=1)
- self.assertEqual(len(dss), 1)
- refWindows = sorted(reduce(lambda x, y: x + y,
- [ds.refWindows for ds in dss]))
- self.assertEqual(refWindows, old_refWindows)
-
- dss = ds3.split(contigs=True, chunks=24)
- self.assertEqual(len(dss), 24)
- refWindows = sorted(reduce(lambda x, y: x + y,
- [ds.refWindows for ds in dss]))
-
- random_few = [('E.faecalis.2', 0, 741),
- ('E.faecalis.2', 741, 1482)]
- for ref in random_few:
- found = False
- for window in refWindows:
- if ref == window:
- found = True
- if not found:
- log.debug(ref)
- self.assertTrue(found)
-
- dss = ds3.split(contigs=True, chunks=36)
- self.assertEqual(len(dss), 36)
- refWindows = sorted(reduce(lambda x, y: x + y,
- [ds.refWindows for ds in dss]))
- random_few = [('E.faecalis.2', 0, 494),
- ('E.faecalis.2', 494, 988),
- ('E.faecalis.2', 988, 1482)]
- for ref in random_few:
- found = False
- for window in refWindows:
- if ref == window:
- found = True
- self.assertTrue(found)
-
- def test_filter_reference_contigs(self):
- ds2 = ReferenceSet(data.getRef())
- self.assertEqual(len(list(ds2.refNames)), 59)
- self.assertEqual(len(list(ds2.records)), len(ds2.index))
- filt = Filters()
- filt.addRequirement(id=[('==', 'E.faecalis.1')])
- ds2.addFilters(filt)
- self.assertEqual(str(ds2.filters),
- "( id == E.faecalis.1 )")
- self.assertEqual(len(ds2.refNames), 1)
- self.assertEqual(len(list(ds2.records)), 1)
- self.assertEqual(len(list(ds2.records)), len(ds2.index))
- ds2.disableFilters()
- self.assertEqual(len(list(ds2.refNames)), 59)
- self.assertEqual(len(list(ds2.records)), 59)
- self.assertEqual(len(list(ds2.records)), len(ds2.index))
- ds2.enableFilters()
- self.assertEqual(len(list(ds2.refNames)), 1)
- self.assertEqual(len(list(ds2.records)), 1)
- self.assertEqual(len(list(ds2.records)), len(ds2.index))
-
# TODO: get this working again when adding manual subdatasets is good to go
@SkipTest
def test_reads_in_subdataset(self):
@@ -1644,32 +1144,6 @@ class TestDataSet(unittest.TestCase):
#ds3 = DataSet(data.getXml(13))
#self.assertEqual(len(list(ds3.readsInSubDatasets())), 2)
- def test_refWindows(self):
- ds = AlignmentSet(data.getBam())
- dss = ds.split(chunks=2, contigs=True)
- self.assertEqual(len(dss), 2)
- log.debug(dss[0].filters)
- log.debug(dss[1].filters)
- self.assertTrue(
- '( rname = E.faecalis.2 '
- in str(dss[0].filters)
- or
- '( rname = E.faecalis.2 '
- in str(dss[1].filters))
- ds = AlignmentSet(data.getBam())
- ds.filters.addRequirement(rname=[('=', 'E.faecalis.2'),
- ('=', 'E.faecalis.2')],
- tStart=[('<', '99'),
- ('<', '299')],
- tEnd=[('>', '0'),
- ('>', '100')])
- self.assertEqual(str(ds.filters),
- '( rname = E.faecalis.2 AND tstart '
- '< 99 AND tend > 0 ) OR ( rname = '
- 'E.faecalis.2 AND tstart < 299 AND tend > 100 )')
- self.assertEqual(ds.refWindows, [('E.faecalis.2', 0, 99),
- ('E.faecalis.2', 100, 299)])
-
def test_intervalContour(self):
ds = AlignmentSet(data.getBam(0))
coverage = ds.intervalContour('E.faecalis.1')
@@ -1964,9 +1438,10 @@ class TestDataSet(unittest.TestCase):
self.assertEqual(len(readers[1].readGroupTable), 1)
self.assertEqual(len(readers[2].readGroupTable), 1)
self.assertEqual(len(aln.readGroupTable), 3)
+ self.assertTrue("BaseFeatures" in aln.readGroupTable.dtype.fields)
def test_missing_file(self):
- with self.assertRaises(InvalidDataSetIOError):
+ with self.assertRaises(IOError):
aln = AlignmentSet("NOPE")
def test_repr(self):
@@ -2104,6 +1579,36 @@ class TestDataSet(unittest.TestCase):
self.assertEqual(ds4.metadata.summaryStats.readLenDist.bins,
[0, 10, 9, 8, 7, 6, 4, 2, 1, 0, 0, 1])
+ def test_multi_channel_dists(self):
+ ds = DataSet(data.getBam())
+ ds.loadStats(data.getStats())
+ ds2 = DataSet(data.getBam())
+ ds2.loadStats(data.getStats())
+ self.assertTrue(
+ list(ds.metadata.summaryStats.findChildren('BaselineLevelDist')))
+ self.assertTrue(ds.metadata.summaryStats.channelDists)
+ self.assertTrue(ds.metadata.summaryStats.otherDists)
+ self.assertTrue(ds.metadata.summaryStats.otherDists['PausinessDist'])
+ self.assertTrue(
+ ds.metadata.summaryStats.channelDists['HqBasPkMidDist']['G'])
+
+ # merge two
+ ds3 = ds + ds2
+
+ # unmerged dists should increase in length:
+ self.assertEqual(
+ len(ds3.metadata.summaryStats.channelDists['HqBasPkMidDist']['G']),
+ 2 * len(
+ ds.metadata.summaryStats.channelDists['HqBasPkMidDist']['G']))
+ self.assertEqual(
+ len(ds3.metadata.summaryStats.otherDists['PausinessDist']),
+ 2 * len(
+ ds.metadata.summaryStats.otherDists['PausinessDist']))
+ # merged dists should not:
+ self.assertEqual(
+ len(ds3.metadata.summaryStats.readLenDist),
+ len(ds.metadata.summaryStats.readLenDist))
+
def test_stats_metadata(self):
ds = DataSet(data.getBam())
ds.loadStats(data.getStats())
@@ -2196,15 +1701,18 @@ class TestDataSet(unittest.TestCase):
[1, 1, 1, 0, 2, 2, 2])
# now lets test the subdataset metadata retention:
- ss = SubreadSet(data.getXml(10))
- ss.loadStats(data.getStats(0))
- ss.loadStats(data.getStats(1))
- self.assertEqual(153168.0, ss.metadata.summaryStats.numSequencingZmws)
- self.assertEqual(
- 2876.0, ss.subdatasets[0].metadata.summaryStats.numSequencingZmws)
- self.assertEqual(
- 150292.0,
- ss.subdatasets[1].metadata.summaryStats.numSequencingZmws)
+ # or not, disabling for performance reasons
+ # TODO: make this fast again, then re-enable. Copying that much was
+ # killer
+ #ss = SubreadSet(data.getXml(10))
+ #ss.loadStats(data.getStats(0))
+ #ss.loadStats(data.getStats(1))
+ #self.assertEqual(153168.0, ss.metadata.summaryStats.numSequencingZmws)
+ #self.assertEqual(
+ # 2876.0, ss.subdatasets[0].metadata.summaryStats.numSequencingZmws)
+ #self.assertEqual(
+ # 150292.0,
+ # ss.subdatasets[1].metadata.summaryStats.numSequencingZmws)
@unittest.skipIf(not _internal_data(),
"Internal data not available")
@@ -2395,105 +1903,6 @@ class TestDataSet(unittest.TestCase):
self.assertEqual(aln.referenceInfo('lambda_NEB3011'),
aln.referenceInfo(1))
- @unittest.skipIf(not _internal_data(),
- "Internal data not available")
- def test_qname_filter_scaling(self):
- # unaligned bam
- bam0 = ("/pbi/dept/secondary/siv/testdata/"
- "SA3-DS/ecoli/2590956/0003/"
- "Analysis_Results/m140913_222218_42240_c10069"
- "9952400000001823139203261564_s1_p0.all.subreadset.xml")
- bam1 = ("/pbi/dept/secondary/siv/testdata/"
- "SA3-DS/ecoli/2590953/0001/"
- "Analysis_Results/m140913_005018_42139_c10071"
- "3652400000001823152404301534_s1_p0.all.subreadset.xml")
- sset = SubreadSet(bam0, bam1)
- self.assertEqual(len(sset), 178570)
- size = 10
- qn = [r.qName for r in sset[:size]]
- good_qn = [('=', name) for name in qn]
- sset.filters.addRequirement(qname=good_qn)
- self.assertEqual(size, sum(1 for _ in sset))
- self.assertEqual(size, len(sset))
-
-
- sset = SubreadSet(data.getXml(10))
- self.assertEqual(len(sset), 92)
- size = 10
- qn = [r.qName for r in sset[:size]]
- good_qn = [('=', name) for name in qn]
- sset.filters.addRequirement(qname=good_qn)
- self.assertEqual(size, sum(1 for _ in sset))
- self.assertEqual(size, len(sset))
-
-
- @unittest.skipIf(not _internal_data(),
- "Internal data not available")
- def test_movie_filter(self):
- # unaligned bam
- bam0 = ("/pbi/dept/secondary/siv/testdata/"
- "SA3-DS/ecoli/2590956/0003/"
- "Analysis_Results/m140913_222218_42240_c10069"
- "9952400000001823139203261564_s1_p0.all.subreadset.xml")
- bam1 = ("/pbi/dept/secondary/siv/testdata/"
- "SA3-DS/ecoli/2590953/0001/"
- "Analysis_Results/m140913_005018_42139_c10071"
- "3652400000001823152404301534_s1_p0.all.subreadset.xml")
- aln = SubreadSet(bam0, bam1)
- self.assertEqual(len(set(aln.readGroupTable['ID'])),
- len(aln.readGroupTable['ID']))
- self.assertEqual(len(set(aln.readGroupTable['ID'])), 2)
- self.assertEqual(len(set(aln.readGroupTable['ID'])),
- len(set(aln.index.qId)))
- self.assertEqual(len(aln), 178570)
- aln.filters.addRequirement(movie=[(
- '=',
- 'm140913_005018_42139_c100713652400000001823152404301534_s1_p0')])
- self.assertEqual(len(SubreadSet(bam1)), len(aln))
-
- # aligned bam
- #bam0 = ("/pbi/dept/secondary/siv/testdata/"
- # "SA3-DS/ecoli/2590956/0003/Alignment_Results/"
- # "m140913_222218_42240_c1006999524000000018231"
- # "39203261564_s1_p0.all.alignmentset.xml")
- bam0 = upstreamdata.getBamAndCmpH5()[0]
- bam1 = ("/pbi/dept/secondary/siv/testdata/"
- "SA3-DS/ecoli/2590953/0001/Alignment_Results/"
- "m140913_005018_42139_c1007136524000000018231"
- "52404301534_s1_p0.all.alignmentset.xml")
- aln = AlignmentSet(bam0, bam1)
- self.assertEqual(len(set(aln.readGroupTable['ID'])),
- len(aln.readGroupTable['ID']))
- self.assertEqual(len(set(aln.readGroupTable['ID'])), 2)
- self.assertEqual(len(set(aln.readGroupTable['ID'])),
- len(set(aln.index.qId)))
- self.assertEqual(len(aln), 103144)
- aln.filters.addRequirement(movie=[(
- '=',
- 'm140913_005018_42139_c100713652400000001823152404301534_s1_p0')])
- self.assertEqual(len(AlignmentSet(bam1)), len(aln))
-
- # cmpH5
- cmp1 = upstreamdata.getBamAndCmpH5()[1]
- cmp2 = ("/pbi/dept/secondary/siv/testdata/"
- "genomic_consensus-unittest/bam_c4p6_tests/"
- "ecoli_c4p6.cmp.h5")
- aln = AlignmentSet(cmp1, cmp2)
- self.assertEqual(len(set(aln.readGroupTable['ID'])),
- len(aln.readGroupTable['ID']))
- self.assertEqual(len(set(aln.readGroupTable['ID'])),
- len(set(aln.index.MovieID)))
- self.assertEqual(len(set(aln.readGroupTable['ID'])), 2)
- self.assertEqual(len(aln), 57147)
- aln.filters.addRequirement(movie=[(
- '=',
- 'm140905_042212_sidney_c100564852550000001823085912221377_s1_X0')])
- len1 = len(AlignmentSet(cmp1))
- self.assertEqual(len1, len(aln))
-
- aln.filters.removeRequirement('movie')
- self.assertEqual(len(aln), 57147)
-
def test_exceptions(self):
try:
raise InvalidDataSetIOError("Wrong!")
@@ -2504,75 +1913,6 @@ class TestDataSet(unittest.TestCase):
aln = AlignmentSet(data.getXml(8))
self.assertEqual(aln.createdAt, '2015-08-05T10:25:18')
- def test_divideKeys_keysToRanges(self):
- keys = [0, 1, 2, 3, 5, 8, 50]
- res = divideKeys(keys, 0)
- self.assertEqual(res, [])
- res = keysToRanges(res)
- self.assertEqual(res, [])
-
- res = divideKeys(keys, 1)
- self.assertEqual(res, [[0, 1, 2, 3, 5, 8, 50]])
- res = keysToRanges(res)
- self.assertEqual(res, [[0, 50]])
-
- res = divideKeys(keys, 2)
- self.assertEqual(res, [[0, 1, 2], [3, 5, 8, 50]])
- res = keysToRanges(res)
- self.assertEqual(res, [[0, 2], [3, 50]])
-
- res = divideKeys(keys, 3)
- self.assertEqual(res, [[0, 1], [2, 3], [5, 8, 50]])
- res = keysToRanges(res)
- self.assertEqual(res, [[0, 1], [2, 3], [5, 50]])
-
- res = divideKeys(keys, 7)
- self.assertEqual(res, [[0], [1], [2], [3], [5], [8], [50]])
- res = keysToRanges(res)
- self.assertEqual(res, [[0, 0], [1, 1], [2, 2], [3, 3],
- [5, 5], [8, 8], [50, 50]])
-
- res = divideKeys(keys, 8)
- self.assertEqual(res, [[0], [1], [2], [3], [5], [8], [50]])
- res = keysToRanges(res)
- self.assertEqual(res, [[0, 0], [1, 1], [2, 2], [3, 3],
- [5, 5], [8, 8], [50, 50]])
-
-
- keys = [0, 1, 2, 2, 3, 5, 8, 50, 50]
- res = divideKeys(keys, 0)
- self.assertEqual(res, [])
- res = keysToRanges(res)
- self.assertEqual(res, [])
-
- res = divideKeys(keys, 1)
- self.assertEqual(res, [[0, 1, 2, 2, 3, 5, 8, 50, 50]])
- res = keysToRanges(res)
- self.assertEqual(res, [[0, 50]])
-
- res = divideKeys(keys, 2)
- self.assertEqual(res, [[0, 1, 2, 2], [3, 5, 8, 50, 50]])
- res = keysToRanges(res)
- self.assertEqual(res, [[0, 2], [3, 50]])
-
- res = divideKeys(keys, 3)
- self.assertEqual(res, [[0, 1, 2], [2, 3, 5], [8, 50, 50]])
- res = keysToRanges(res)
- self.assertEqual(res, [[0, 2], [2, 5], [8, 50]])
-
- res = divideKeys(keys, 9)
- self.assertEqual(res, [[0], [1], [2], [2], [3], [5], [8], [50], [50]])
- res = keysToRanges(res)
- self.assertEqual(res, [[0, 0], [1, 1], [2, 2], [2, 2], [3, 3],
- [5, 5], [8, 8], [50, 50], [50, 50]])
-
- res = divideKeys(keys, 10)
- self.assertEqual(res, [[0], [1], [2], [2], [3], [5], [8], [50], [50]])
- res = keysToRanges(res)
- self.assertEqual(res, [[0, 0], [1, 1], [2, 2], [2, 2], [3, 3],
- [5, 5], [8, 8], [50, 50], [50, 50]])
-
-
@unittest.skipIf(not _internal_data(),
"Internal data not available")
def test_load_sts_from_extres(self):
@@ -2666,8 +2006,10 @@ class TestDataSet(unittest.TestCase):
ss3 = ss + ss2
self.assertEqual(ss3.metadata.summaryStats.readLenDist.bins,
[b1 + b2 for b1, b2 in
- zip(ss.metadata.summaryStats.readLenDist.bins,
- ss2.metadata.summaryStats.readLenDist.bins)])
+ itertools.izip_longest(
+ ss.metadata.summaryStats.readLenDist.bins,
+ ss2.metadata.summaryStats.readLenDist.bins,
+ fillvalue=0)])
ss4 = SubreadSet(outXml, outXml2)
# one full one partial
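The zip -> itertools.izip_longest change above matters when the two merged readLenDist histograms carry different numbers of bins: zip would silently truncate the longer histogram, whereas izip_longest pads the shorter one with zeros so every bin contributes to the sum. A minimal sketch of the expected bin-wise merge (Python 2 itertools, as used elsewhere in this test suite; the bin values are made up for illustration):

    import itertools

    bins_a = [0, 10, 9, 8]
    bins_b = [1, 2]                  # shorter histogram, e.g. a partial dataset
    merged = [a + b for a, b in
              itertools.izip_longest(bins_a, bins_b, fillvalue=0)]
    assert merged == [1, 12, 9, 8]   # trailing bins are kept, not dropped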
@@ -2684,8 +2026,10 @@ class TestDataSet(unittest.TestCase):
ss3 = ss + ss2
self.assertEqual(ss3.metadata.summaryStats.readLenDist.bins,
[b1 + b2 for b1, b2 in
- zip(ss.metadata.summaryStats.readLenDist.bins,
- ss2.metadata.summaryStats.readLenDist.bins)])
+ itertools.izip_longest(
+ ss.metadata.summaryStats.readLenDist.bins,
+ ss2.metadata.summaryStats.readLenDist.bins,
+ fillvalue=0)])
ss4 = SubreadSet(outXml, outXml2)
# two full
@@ -2751,29 +2095,3 @@ class TestDataSet(unittest.TestCase):
sset.write(resXml, validate=False)
with self.assertRaises(InvalidDataSetIOError):
sset = SubreadSet(resXml)
-
-
- @unittest.skip("Too expensive")
- def test_huge_zmw_split(self):
- human = ('/pbi/dept/secondary/siv/testdata/SA3-DS/'
- 'human/JCV_85x_v030/jcv_85x_v030.subreadset.xml')
- sset = SubreadSet(human)
- ssets = sset.split(zmws=True, maxChunks=5)
-
- def test_get_dataset_uuid(self):
- ds = SubreadSet(upstreamdata.getUnalignedBam(), strict=True)
- ds_file = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
- ds.write(ds_file)
- uuid = getDataSetUuid(ds_file)
- self.assertEqual(uuid, ds.uuid)
- with open(ds_file, "w") as out:
- out.write("hello world!")
- uuid = getDataSetUuid(ds_file)
- self.assertEqual(uuid, None)
-
- def test_get_dataset_metatype(self):
- ds = SubreadSet(upstreamdata.getUnalignedBam(), strict=True)
- ds_file = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
- ds.write(ds_file)
- meta_type = getDataSetMetaType(ds_file)
- self.assertEqual(meta_type, "PacBio.DataSet.SubreadSet")
diff --git a/tests/test_pbdataset_filters.py b/tests/test_pbdataset_filters.py
new file mode 100644
index 0000000..afbbb02
--- /dev/null
+++ b/tests/test_pbdataset_filters.py
@@ -0,0 +1,373 @@
+
+
+import logging
+import tempfile
+import unittest
+from unittest.case import SkipTest
+
+import numpy as np
+
+from pbcore.io import DataSet, SubreadSet, ReferenceSet, AlignmentSet
+from pbcore.io.dataset.DataSetMembers import Filters
+import pbcore.data.datasets as data
+import pbcore.data as upstreamdata
+
+from utils import _pbtestdata, _check_constools, _internal_data
+
+log = logging.getLogger(__name__)
+
+
+class TestDataSetFilters(unittest.TestCase):
+ """Unit and integrationt tests for the DataSet class and \
+ associated module functions"""
+
+ def test_addFilters(self):
+ ds1 = DataSet()
+ filt = Filters()
+ filt.addRequirement(rq=[('>', '0.85')])
+ ds1.addFilters(filt)
+ self.assertEquals(str(ds1.filters), '( rq > 0.85 )')
+ # Or added from a source XML
+ ds2 = DataSet(data.getXml(16))
+ self.assertTrue(str(ds2.filters).startswith(
+ '( rname = E.faecalis'))
+
+ def test_setFilters(self):
+ ds1 = DataSet()
+ filt = Filters()
+ filt.addRequirement(rq=[('>', '0.85')])
+ ds1.addFilters(filt)
+ self.assertEquals(str(ds1.filters), '( rq > 0.85 )')
+ # Or added from a source XML
+ ds2 = DataSet()
+ ds2.filters = ds1.filters
+ self.assertEquals(str(ds2.filters), '( rq > 0.85 )')
+
+ def test_add_double_bound_filters(self):
+ ds1 = AlignmentSet(data.getXml(8))
+ ds1.filters.addRequirement(rq=[('>', '0.85'),
+ ('<', '0.99')])
+ self.assertEquals(str(ds1.filters), '( rq > 0.85 ) OR ( rq < 0.99 )')
+
+ ds1 = AlignmentSet(data.getXml(8))
+ self.assertEquals(str(ds1.filters), '')
+ ds1.filters.addFilter(rq=[('>', '0.85'),
+ ('<', '0.99')])
+ self.assertEquals(str(ds1.filters), '( rq > 0.85 AND rq < 0.99 )')
+
+ ds1.filters.addFilter(length=[('>', '1000')])
+ self.assertEquals(str(ds1.filters),
+ '( rq > 0.85 AND rq < 0.99 ) OR ( length > 1000 )')
+
+ ds1.filters.removeFilter(0)
+ self.assertEquals(str(ds1.filters),
+ '( length > 1000 )')
+
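The assertions above encode the two composition rules for Filters: addRequirement() fans multiple (operator, value) pairs for the same field out into separate, OR-combined filters, while addFilter() keeps all of its requirements AND-combined inside one filter, and removeFilter(0) drops the first OR branch. A short sketch of the difference, using only calls and outputs exercised by this test (data is the pbcore.data.datasets module imported at the top of this file):

    ds = AlignmentSet(data.getXml(8))            # its filter string starts empty
    ds.filters.addRequirement(rq=[('>', '0.85'), ('<', '0.99')])
    # str(ds.filters) -> '( rq > 0.85 ) OR ( rq < 0.99 )'

    ds = AlignmentSet(data.getXml(8))
    ds.filters.addFilter(rq=[('>', '0.85'), ('<', '0.99')])
    ds.filters.addFilter(length=[('>', '1000')])
    # str(ds.filters) -> '( rq > 0.85 AND rq < 0.99 ) OR ( length > 1000 )'
    ds.filters.removeFilter(0)
    # str(ds.filters) -> '( length > 1000 )'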
+ def test_n_subreads_filter(self):
+ ds2 = AlignmentSet(data.getXml(8))
+ ds2.filters.addRequirement(n_subreads=[('>', '4')])
+ self.assertEqual(len(list(ds2.records)), 87)
+ self.assertEqual(len(ds2), 87)
+
+ def test_filter(self):
+ ds2 = AlignmentSet(data.getXml(8))
+ ds2.filters.addRequirement(rname=[('=', 'E.faecalis.1')])
+ self.assertEqual(len(list(ds2.records)), 20)
+ ds2.disableFilters()
+ self.assertEqual(len(list(ds2.records)), 92)
+ ds2.enableFilters()
+ self.assertEqual(len(list(ds2.records)), 20)
+
+ def test_context_filters(self):
+ ss = SubreadSet(upstreamdata.getUnalignedBam())
+ self.assertEqual(set(ss.index.contextFlag), {0, 1, 2, 3})
+ self.assertEqual(
+ [len(np.flatnonzero(ss.index.contextFlag == cx))
+ for cx in sorted(set(ss.index.contextFlag))],
+ [15, 33, 32, 37])
+ self.assertEqual(len(ss.index), 117)
+
+ # no adapters/barcodes
+ ss.filters.addRequirement(cx=[('=', 0)])
+ self.assertEqual(len(ss.index), 15)
+ ss.filters.removeRequirement('cx')
+ self.assertEqual(len(ss.index), 117)
+
+ # no adapters/barcodes
+ ss.filters.addRequirement(cx=[('=', 'NO_LOCAL_CONTEXT')])
+ self.assertEqual(len(ss.index), 15)
+ ss.filters.removeRequirement('cx')
+ self.assertEqual(len(ss.index), 117)
+
+ # some adapters/barcodes
+ ss.filters.addRequirement(cx=[('!=', 0)])
+ self.assertEqual(len(ss.index), 102)
+ ss.filters.removeRequirement('cx')
+ self.assertEqual(len(ss.index), 117)
+
+ # adapter before
+ ss.filters.addRequirement(cx=[('&', 1)])
+ self.assertEqual(len(ss.index), 70)
+ ss.filters.removeRequirement('cx')
+ self.assertEqual(len(ss.index), 117)
+
+ # adapter before
+ ss.filters.addRequirement(cx=[('&', 'ADAPTER_BEFORE')])
+ self.assertEqual(len(ss.index), 70)
+ ss.filters.removeRequirement('cx')
+ self.assertEqual(len(ss.index), 117)
+
+ # adapter after
+ ss.filters.addRequirement(cx=[('&', 2)])
+ self.assertEqual(len(ss.index), 69)
+ ss.filters.removeRequirement('cx')
+ self.assertEqual(len(ss.index), 117)
+
+ # adapter before or after
+ ss.filters.addRequirement(cx=[('&', 3)])
+ self.assertEqual(len(ss.index), 102)
+ ss.filters.removeRequirement('cx')
+ self.assertEqual(len(ss.index), 117)
+
+ # adapter before or after
+ ss.filters.addRequirement(cx=[('&', 'ADAPTER_BEFORE | ADAPTER_AFTER')])
+ self.assertEqual(len(ss.index), 102)
+ ss.filters.removeRequirement('cx')
+ self.assertEqual(len(ss.index), 117)
+
+ # adapter before or after but not both
+ ss.filters.addRequirement(cx=[('!=', 0)])
+ ss.filters.addRequirement(cx=[('~', 1),
+ ('~', 2)])
+ self.assertEqual(len(ss.index), 65)
+ ss.filters.removeRequirement('cx')
+ self.assertEqual(len(ss.index), 117)
+
+ # adapter before or after
+ ss.filters.addRequirement(cx=[('&', 1),
+ ('&', 2)])
+ self.assertEqual(len(ss.index), 102)
+ ss.filters.removeRequirement('cx')
+ self.assertEqual(len(ss.index), 117)
+
+ # adapter before and after
+ ss.filters.addRequirement(cx=[('&', 1)])
+ ss.filters.addRequirement(cx=[('&', 2)])
+ self.assertEqual(len(ss.index), 37)
+ ss.filters.removeRequirement('cx')
+ self.assertEqual(len(ss.index), 117)
+
+ # adapter before but not after
+ ss.filters.addRequirement(cx=[('&', 1)])
+ ss.filters.addRequirement(cx=[('~', 2)])
+ self.assertEqual(len(ss.index), 33)
+ ss.filters.removeRequirement('cx')
+ self.assertEqual(len(ss.index), 117)
+
+ # no adapter before
+ ss.filters.addRequirement(cx=[('~', 1)])
+ self.assertEqual(len(ss.index), 47)
+ ss.filters.removeRequirement('cx')
+ self.assertEqual(len(ss.index), 117)
+
+ # no adapter before or after
+ ss.filters.addRequirement(cx=[('~', 1)])
+ ss.filters.addRequirement(cx=[('~', 2)])
+ self.assertEqual(len(ss.index), 15)
+ ss.filters.removeRequirement('cx')
+ self.assertEqual(len(ss.index), 117)
+
+ # no adapter before or after
+ ss.filters.addRequirement(cx=[('~', 3)])
+ self.assertEqual(len(ss.index), 15)
+ ss.filters.removeRequirement('cx')
+ self.assertEqual(len(ss.index), 117)
+
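The cx requirements above are bitwise tests against each read's local-context flag: '=' matches the exact value, '&' keeps reads with the given bit(s) set, and '~' keeps reads with those bits clear; symbolic names such as ADAPTER_BEFORE may stand in for the integers. A sketch of the equivalent numpy masks over ss.index.contextFlag, assuming only the semantics implied by the counts asserted above:

    cx = ss.index.contextFlag          # values 0..3 in this dataset, 117 reads

    no_context      = cx == 0          # cx=[('=', 0)]  -> 15 reads
    adapter_before  = (cx & 1) != 0    # cx=[('&', 1)]  -> 70 reads
    adapter_after   = (cx & 2) != 0    # cx=[('&', 2)]  -> 69 reads
    no_adapt_before = (cx & 1) == 0    # cx=[('~', 1)]  -> 47 reads
    before_or_after = (cx & 3) != 0    # cx=[('&', 3)]  -> 102 reads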
+ def test_filter_reference_contigs(self):
+ ds2 = ReferenceSet(data.getRef())
+ self.assertEqual(len(list(ds2.refNames)), 59)
+ self.assertEqual(len(list(ds2.records)), len(ds2.index))
+ filt = Filters()
+ filt.addRequirement(id=[('==', 'E.faecalis.1')])
+ ds2.addFilters(filt)
+ self.assertEqual(str(ds2.filters),
+ "( id == E.faecalis.1 )")
+ self.assertEqual(len(ds2.refNames), 1)
+ self.assertEqual(len(list(ds2.records)), 1)
+ self.assertEqual(len(list(ds2.records)), len(ds2.index))
+ ds2.disableFilters()
+ self.assertEqual(len(list(ds2.refNames)), 59)
+ self.assertEqual(len(list(ds2.records)), 59)
+ self.assertEqual(len(list(ds2.records)), len(ds2.index))
+ ds2.enableFilters()
+ self.assertEqual(len(list(ds2.refNames)), 1)
+ self.assertEqual(len(list(ds2.records)), 1)
+ self.assertEqual(len(list(ds2.records)), len(ds2.index))
+
+ @unittest.skipIf(not _internal_data(),
+ "Internal data not available")
+ def test_qname_filter_scaling(self):
+ # unaligned bam
+ bam0 = ("/pbi/dept/secondary/siv/testdata/"
+ "SA3-DS/ecoli/2590956/0003/"
+ "Analysis_Results/m140913_222218_42240_c10069"
+ "9952400000001823139203261564_s1_p0.all.subreadset.xml")
+ bam1 = ("/pbi/dept/secondary/siv/testdata/"
+ "SA3-DS/ecoli/2590953/0001/"
+ "Analysis_Results/m140913_005018_42139_c10071"
+ "3652400000001823152404301534_s1_p0.all.subreadset.xml")
+ sset = SubreadSet(bam0, bam1)
+ self.assertEqual(len(sset), 178570)
+ size = 10
+ qn = [r.qName for r in sset[:size]]
+ good_qn = [('=', name) for name in qn]
+ sset.filters.addRequirement(qname=good_qn)
+ self.assertEqual(size, sum(1 for _ in sset))
+ self.assertEqual(size, len(sset))
+
+
+ sset = SubreadSet(data.getXml(10))
+ self.assertEqual(len(sset), 92)
+ size = 10
+ qn = [r.qName for r in sset[:size]]
+ good_qn = [('=', name) for name in qn]
+ sset.filters.addRequirement(qname=good_qn)
+ self.assertEqual(size, sum(1 for _ in sset))
+ self.assertEqual(size, len(sset))
+
+
+ @unittest.skipIf(not _internal_data(),
+ "Internal data not available")
+ def test_movie_filter(self):
+ # unaligned bam
+ bam0 = ("/pbi/dept/secondary/siv/testdata/"
+ "SA3-DS/ecoli/2590956/0003/"
+ "Analysis_Results/m140913_222218_42240_c10069"
+ "9952400000001823139203261564_s1_p0.all.subreadset.xml")
+ bam1 = ("/pbi/dept/secondary/siv/testdata/"
+ "SA3-DS/ecoli/2590953/0001/"
+ "Analysis_Results/m140913_005018_42139_c10071"
+ "3652400000001823152404301534_s1_p0.all.subreadset.xml")
+ aln = SubreadSet(bam0, bam1)
+ self.assertEqual(len(set(aln.readGroupTable['ID'])),
+ len(aln.readGroupTable['ID']))
+ self.assertEqual(len(set(aln.readGroupTable['ID'])), 2)
+ self.assertEqual(len(set(aln.readGroupTable['ID'])),
+ len(set(aln.index.qId)))
+ self.assertEqual(len(aln), 178570)
+ aln.filters.addRequirement(movie=[(
+ '=',
+ 'm140913_005018_42139_c100713652400000001823152404301534_s1_p0')])
+ self.assertEqual(len(SubreadSet(bam1)), len(aln))
+
+ # aligned bam
+ #bam0 = ("/pbi/dept/secondary/siv/testdata/"
+ # "SA3-DS/ecoli/2590956/0003/Alignment_Results/"
+ # "m140913_222218_42240_c1006999524000000018231"
+ # "39203261564_s1_p0.all.alignmentset.xml")
+ bam0 = upstreamdata.getBamAndCmpH5()[0]
+ bam1 = ("/pbi/dept/secondary/siv/testdata/"
+ "SA3-DS/ecoli/2590953/0001/Alignment_Results/"
+ "m140913_005018_42139_c1007136524000000018231"
+ "52404301534_s1_p0.all.alignmentset.xml")
+ aln = AlignmentSet(bam0, bam1)
+ self.assertEqual(len(set(aln.readGroupTable['ID'])),
+ len(aln.readGroupTable['ID']))
+ self.assertEqual(len(set(aln.readGroupTable['ID'])), 2)
+ self.assertEqual(len(set(aln.readGroupTable['ID'])),
+ len(set(aln.index.qId)))
+ self.assertEqual(len(aln), 103144)
+ aln.filters.addRequirement(movie=[(
+ '=',
+ 'm140913_005018_42139_c100713652400000001823152404301534_s1_p0')])
+ self.assertEqual(len(AlignmentSet(bam1)), len(aln))
+
+ # cmpH5
+ cmp1 = upstreamdata.getBamAndCmpH5()[1]
+ cmp2 = ("/pbi/dept/secondary/siv/testdata/"
+ "genomic_consensus-unittest/bam_c4p6_tests/"
+ "ecoli_c4p6.cmp.h5")
+ aln = AlignmentSet(cmp1, cmp2)
+ self.assertEqual(len(set(aln.readGroupTable['ID'])),
+ len(aln.readGroupTable['ID']))
+ self.assertEqual(len(set(aln.readGroupTable['ID'])),
+ len(set(aln.index.MovieID)))
+ self.assertEqual(len(set(aln.readGroupTable['ID'])), 2)
+ self.assertEqual(len(aln), 57147)
+ aln.filters.addRequirement(movie=[(
+ '=',
+ 'm140905_042212_sidney_c100564852550000001823085912221377_s1_X0')])
+ len1 = len(AlignmentSet(cmp1))
+ self.assertEqual(len1, len(aln))
+
+ aln.filters.removeRequirement('movie')
+ self.assertEqual(len(aln), 57147)
+
+ def test_accuracy_filter(self):
+ aln = AlignmentSet(data.getXml(12))
+ self.assertEqual(len(list(aln)), 177)
+ aln.filters.addRequirement(accuracy=[('>', '.85')])
+ self.assertEqual(len(list(aln)), 174)
+
+ def test_membership_filter(self):
+ aln = AlignmentSet(data.getXml(12))
+ self.assertEqual(len(list(aln)), 177)
+ hns = np.unique(aln.index.holeNumber)[:1]
+ aln.filters.addRequirement(zm=[('in', hns)])
+ self.assertEqual(len(list(aln)), 5)
+
+ aln = AlignmentSet(data.getXml(12))
+ self.assertEqual(len(list(aln)), 177)
+ hns = np.unique(aln.index.holeNumber)
+ aln.filters.addRequirement(zm=[('in', hns)])
+ self.assertEqual(len(list(aln)), 177)
+
+ aln = AlignmentSet(data.getXml(12))
+ self.assertEqual(len(list(aln)), 177)
+ hns = np.unique(aln.index.holeNumber)
+ hns = [n for _ in range(10000) for n in hns]
+ hns = np.array(hns)
+ aln.filters.addRequirement(zm=[('in', hns)])
+ self.assertEqual(len(list(aln)), 177)
+
+ aln = AlignmentSet(data.getXml(12))
+ self.assertEqual(len(list(aln)), 177)
+ hns = np.unique(aln.index.holeNumber)[:1]
+ hns = list(hns)
+ aln.filters.addRequirement(zm=[('in', hns)])
+ self.assertEqual(len(list(aln)), 5)
+
+ aln = AlignmentSet(data.getXml(12))
+ self.assertEqual(len(list(aln)), 177)
+ hns = np.unique(aln.index.holeNumber)[:1]
+ hns = set(hns)
+ aln.filters.addRequirement(zm=[('in', hns)])
+ self.assertEqual(len(list(aln)), 5)
+
+ aln = AlignmentSet(data.getXml(12))
+ self.assertEqual(len(list(aln)), 177)
+ qnames = [r.qName for r in aln[:10]]
+ aln.filters.addRequirement(qname=[('in', qnames)])
+ self.assertEqual(len(list(aln)), 10)
+
+ fn = tempfile.NamedTemporaryFile(suffix="alignmentset.xml").name
+ aln = AlignmentSet(data.getXml(12))
+ self.assertEqual(len(list(aln)), 177)
+ hns = np.unique(aln.index.holeNumber)[:1]
+ aln.filters.addRequirement(zm=[('in', hns)])
+ aln.write(fn)
+ aln.close()
+ aln2 = AlignmentSet(fn)
+ self.assertEqual(len(list(aln2)), 5)
+
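Every membership case above follows one pattern: an ('in', collection) requirement keeps exactly the records whose field value appears in the collection, whether it is handed a numpy array, list, or set, and the requirement survives a write/reload round trip. A compact sketch of that usage, with the record counts taken from the assertions above:

    aln = AlignmentSet(data.getXml(12))
    one_zmw = np.unique(aln.index.holeNumber)[:1]
    aln.filters.addRequirement(zm=[('in', one_zmw)])
    # len(list(aln)) -> 5, i.e. only the reads from that single ZMW

    aln = AlignmentSet(data.getXml(12))
    wanted = [r.qName for r in aln[:10]]
    aln.filters.addRequirement(qname=[('in', wanted)])
    # len(list(aln)) -> 10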
+ def test_contigset_filter(self):
+ ref = ReferenceSet(data.getXml(9))
+ self.assertEqual(len(list(ref)), 59)
+ ref.filters.addRequirement(length=[('>', '1450')])
+ self.assertEqual(len(list(ref)), 34)
+
+ def test_cmp_alignmentset_filters(self):
+ aln = AlignmentSet(upstreamdata.getBamAndCmpH5()[1], strict=True)
+ self.assertEqual(len(aln), 112)
+ aln.filters.addRequirement(length=[('>=', 1000)])
+ self.assertEqual(len(aln), 12)
diff --git a/tests/test_pbdataset_metadata.py b/tests/test_pbdataset_metadata.py
new file mode 100644
index 0000000..e20f7c1
--- /dev/null
+++ b/tests/test_pbdataset_metadata.py
@@ -0,0 +1,114 @@
+
+import logging
+import unittest
+import tempfile
+import copy
+
+from pbcore.io import SubreadSet, AlignmentSet
+from pbcore.io.dataset.DataSetErrors import InvalidDataSetIOError
+from pbcore.io.dataset.DataSetMembers import CollectionMetadata
+import pbcore.data.datasets as data
+from pbcore.io.dataset.DataSetValidator import validateFile
+
+from utils import _pbtestdata, _check_constools, _internal_data
+
+log = logging.getLogger(__name__)
+
+class TestDataSet(unittest.TestCase):
+ """Unit and integrationt tests for the DataSet class and \
+ associated module functions"""
+
+ def test_existing(self):
+ ds = SubreadSet(data.getSubreadSet(), skipMissing=True)
+ self.assertEqual(ds.metadata.bioSamples[0].name,
+ 'consectetur purus')
+
+ self.assertTrue(ds.metadata.collections[0].getV('children',
+ 'Automation'))
+ self.assertTrue(ds.metadata.collections[0].automation)
+ ds.metadata.collections[
+ 0].automation.automationParameters.addParameter('foo', 'bar')
+ self.assertEqual(
+ ds.metadata.collections[
+ 0].automation.automationParameters['foo'].value,
+ 'bar')
+ self.assertEqual(
+ ds.metadata.collections[
+ 0].automation.automationParameters.parameterNames,
+ [None, 'foo'])
+
+
+ def test_de_novo(self):
+ ofn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
+ log.info(ofn)
+ ss = SubreadSet(data.getXml(10))
+ col = CollectionMetadata()
+ self.assertFalse(ss.metadata.collections)
+
+ ss.metadata.collections.append(col)
+ self.assertTrue(ss.metadata.collections)
+
+ col.cellIndex = 1
+ self.assertEqual(ss.metadata.collections[0].cellIndex, 1)
+
+ col.instrumentName = "foo"
+ self.assertEqual(ss.metadata.collections[0].instrumentName, "foo")
+
+ col.context = 'bar'
+ self.assertEqual(ss.metadata.collections[0].context, "bar")
+
+ ss.metadata.collections[0].runDetails.name = 'foo'
+ self.assertEqual('foo', ss.metadata.collections[0].runDetails.name)
+
+ ss.metadata.collections[0].wellSample.name = 'bar'
+ self.assertEqual('bar', ss.metadata.collections[0].wellSample.name)
+
+ ss.metadata.collections[0].wellSample.wellName = 'baz'
+ self.assertEqual('baz', ss.metadata.collections[0].wellSample.wellName)
+
+ ss.metadata.collections[0].wellSample.concentration = 'baz'
+ self.assertEqual('baz',
+ ss.metadata.collections[0].wellSample.concentration)
+ ss.write(ofn, validate=False)
+
+ @unittest.skipIf(not _internal_data(),
+ "Internal data not available")
+ def test_loadMetadata(self):
+ aln = AlignmentSet(data.getXml(no=8))
+ self.assertFalse(aln.metadata.collections)
+ aln.loadMetadata('/pbi/dept/secondary/siv/testdata/'
+ 'SA3-Sequel/lambda/roche_SAT/'
+ 'm54013_151205_032353.run.metadata.xml')
+ self.assertTrue(aln.metadata.collections)
+ sset_fn = ('/pbi/dept/secondary/siv/testdata/'
+ 'SA3-Sequel/lambda/roche_SAT/'
+ 'm54013_151205_032353.subreadset.xml')
+ sset = SubreadSet(sset_fn)
+ orig_metadata = copy.deepcopy(sset.metadata)
+ sset.metadata.collections = None
+ self.assertFalse(sset.metadata.collections)
+ sset.loadMetadata('/pbi/dept/secondary/siv/testdata/'
+ 'SA3-Sequel/lambda/roche_SAT/'
+ 'm54013_151205_032353.run.metadata.xml')
+ stack = zip(sset.metadata, orig_metadata)
+ fn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
+ sset.write(fn)
+ validateFile(fn)
+ validateFile(sset_fn)
+ self.assertEqual(sset.metadata, orig_metadata)
+
+
+ # load the wrong thing...
+ sset_fn = ('/pbi/dept/secondary/siv/testdata/'
+ 'SA3-Sequel/lambda/roche_SAT/'
+ 'm54013_151205_032353.subreadset.xml')
+ sset = SubreadSet(sset_fn)
+ orig_metadata = copy.deepcopy(sset.metadata)
+ sset.metadata.collections = None
+ self.assertFalse(sset.metadata.collections)
+ with self.assertRaises(InvalidDataSetIOError):
+ sset.loadMetadata('/pbi/dept/secondary/siv/testdata/'
+ 'SA3-Sequel/lambda/roche_SAT/'
+ 'm54013_151205_032353.sts.xml')
+
+
diff --git a/tests/test_pbdataset_split.py b/tests/test_pbdataset_split.py
new file mode 100644
index 0000000..9ca3b26
--- /dev/null
+++ b/tests/test_pbdataset_split.py
@@ -0,0 +1,506 @@
+
+import os
+import logging
+import tempfile
+import unittest
+from unittest.case import SkipTest
+
+import numpy as np
+
+from pbcore.io import openIndexedAlignmentFile
+from pbcore.io import (DataSet, SubreadSet, ReferenceSet, AlignmentSet,
+ openDataSet, HdfSubreadSet,
+ openDataFile)
+import pbcore.data.datasets as data
+import pbcore.data as upstreamdata
+
+from utils import _pbtestdata, _check_constools, _internal_data
+
+log = logging.getLogger(__name__)
+
+class TestDataSetSplit(unittest.TestCase):
+ """Unit and integrationt tests for the DataSet class and \
+ associated module functions"""
+
+ def test_split(self):
+ ds1 = openDataSet(data.getXml(12))
+ self.assertTrue(ds1.numExternalResources > 1)
+ dss = ds1.split()
+ self.assertTrue(len(dss) == ds1.numExternalResources)
+ self.assertEqual(sum(ds.numRecords for ds in dss), ds1.numRecords)
+ self.assertEqual(sum(ds.totalLength for ds in dss), ds1.totalLength)
+ self.assertEqual(sum(len(ds) for ds in dss), len(ds1))
+ dss = ds1.split(chunks=1)
+ self.assertTrue(len(dss) == 1)
+ self.assertEqual(sum(ds.numRecords for ds in dss), ds1.numRecords)
+ self.assertEqual(sum(ds.totalLength for ds in dss), ds1.totalLength)
+ self.assertEqual(sum(len(ds) for ds in dss), len(ds1))
+ dss = ds1.split(chunks=2)
+ self.assertTrue(len(dss) == 2)
+ self.assertEqual(sum(ds.numRecords for ds in dss), ds1.numRecords)
+ self.assertEqual(sum(ds.totalLength for ds in dss), ds1.totalLength)
+ self.assertEqual(sum(len(ds) for ds in dss), len(ds1))
+ dss = ds1.split(chunks=2, ignoreSubDatasets=True)
+ self.assertTrue(len(dss) == 2)
+ self.assertEqual(sum(ds.numRecords for ds in dss), ds1.numRecords)
+ self.assertEqual(sum(ds.totalLength for ds in dss), ds1.totalLength)
+ self.assertEqual(sum(len(ds) for ds in dss), len(ds1))
+ self.assertFalse(dss[0].uuid == dss[1].uuid)
+ self.assertTrue(dss[0].name == dss[1].name)
+ # Lets try merging and splitting on subdatasets
+ ds1 = openDataSet(data.getXml(8))
+ self.assertEquals(ds1.totalLength, 123588)
+ ds1tl = ds1.totalLength
+ ds2 = openDataSet(data.getXml(11))
+ self.assertEquals(ds2.totalLength, 117086)
+ ds2tl = ds2.totalLength
+ dss = ds1 + ds2
+ self.assertTrue(dss.totalLength == (ds1tl + ds2tl))
+ ds1, ds2 = sorted(dss.split(2, ignoreSubDatasets=False),
+ key=lambda x: x.totalLength,
+ reverse=True)
+ self.assertTrue(ds1.totalLength == ds1tl)
+ self.assertTrue(ds2.totalLength == ds2tl)
+
+ def test_split_zmws(self):
+ N_RECORDS = 117
+ test_file = upstreamdata.getUnalignedBam()
+ ds1 = openDataFile(test_file)
+ self.assertEqual(len([r for r in ds1]), N_RECORDS)
+ self.assertEqual(len(ds1), N_RECORDS)
+ dss = ds1.split(chunks=1, zmws=True)
+ self.assertEqual(len(dss), 1)
+ self.assertEqual(sum([len([r for r in ds_]) for ds_ in dss]),
+ N_RECORDS)
+ self.assertEqual(sum([len(ds_) for ds_ in dss]),
+ N_RECORDS)
+
+ # We have a lower limit on the number of zmws, now
+ dss = ds1.split(chunks=12, zmws=True)
+ self.assertEqual(len(dss), 2)
+ self.assertEqual(sum([len([r for r in ds_]) for ds_ in dss]),
+ N_RECORDS)
+ self.assertEqual(sum([len(ds_) for ds_ in dss]),
+ N_RECORDS)
+ self.assertEqual(
+ dss[0].zmwRanges,
+ [('m140905_042212_sidney_c100564852550000001823085912221377_s1_X0',
+ 1650, 32328)])
+ self.assertEqual(
+ dss[-1].zmwRanges,
+ [('m140905_042212_sidney_c100564852550000001823085912221377_s1_X0',
+ 32560, 54396)])
+ ranges = sorted([c.zmwRanges[0][1:] for c in dss])
+ interspans = []
+ last = None
+ for rg in ranges:
+ if last is not None:
+ interspans.append((last, rg[0]))
+ self.assertFalse(last == rg[0])
+ last = rg[1]
+ for rg in interspans:
+ self.assertEqual(len(np.nonzero(np.logical_and(
+ ds1.index.holeNumber < rg[1],
+ ds1.index.holeNumber > rg[0]))[0]), 0)
+
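The tail of test_split_zmws (and of test_large_split_zmws below) verifies that the chunks partition the ZMW space: once each chunk's (start, end) hole-number range is sorted, no hole number from the parent dataset may fall strictly between one chunk's end and the next chunk's start. The same invariant, condensed, with ds1 and dss as above:

    ranges = sorted(c.zmwRanges[0][1:] for c in dss)   # (start, end) per chunk
    for (_, prev_end), (next_start, _) in zip(ranges, ranges[1:]):
        in_gap = np.logical_and(ds1.index.holeNumber > prev_end,
                                ds1.index.holeNumber < next_start)
        assert not in_gap.any()                        # no ZMWs lost in a gap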
+ #@unittest.skipUnless(os.path.isdir("/pbi/dept/secondary/siv/testdata"),
+ # "Missing testadata directory")
+ @unittest.skip("Too expensive")
+ def test_large_split_zmws(self):
+ N_RECORDS = 959539
+ test_file = ("/pbi/dept/secondary/siv/testdata/SA3-DS/lambda/"
+ "2372215/0007/Analysis_Results/m150404_101626_42"
+ "267_c100807920800000001823174110291514_s1_p0.al"
+ "l.subreadset.xml")
+ ds1 = openDataFile(test_file)
+ self.assertEqual(len(ds1), N_RECORDS)
+ dss = ds1.split(chunks=1, zmws=True)
+ self.assertEqual(len(dss), 1)
+ self.assertEqual(sum([len(ds_) for ds_ in dss]),
+ N_RECORDS)
+ dss = ds1.split(chunks=12, zmws=True)
+ self.assertEqual(len(dss), 12)
+ self.assertEqual(sum([len(ds_) for ds_ in dss]),
+ N_RECORDS)
+ self.assertEqual(
+ dss[0].zmwRanges,
+ [('m150404_101626_42267_c100807920800000001823174110291514_s1_p0',
+ 7, 14007)])
+ self.assertEqual(
+ dss[-1].zmwRanges,
+ [('m150404_101626_42267_c100807920800000001823174110291514_s1_p0',
+ 149876, 163475)])
+ ranges = sorted([c.zmwRanges[0][1:] for c in dss])
+ interspans = []
+ last = None
+ for rg in ranges:
+ if last is not None:
+ interspans.append((last, rg[0]))
+ self.assertFalse(last == rg[0])
+ last = rg[1]
+ for rg in interspans:
+ self.assertEqual(len(np.nonzero(np.logical_and(
+ ds1.index.holeNumber < rg[1],
+ ds1.index.holeNumber > rg[0]))[0]), 0)
+
+
+ @unittest.skipUnless(os.path.isdir("/pbi/dept/secondary/siv/testdata"),
+ "Missing testadata directory")
+ def test_multi_movie_split_zmws(self):
+ N_RECORDS = 1745161
+ test_file_1 = ("/pbi/dept/secondary/siv/testdata/SA3-DS/lambda/"
+ "2372215/0007/Analysis_Results/m150404_101626_42"
+ "267_c100807920800000001823174110291514_s1_p0.al"
+ "l.subreadset.xml")
+ test_file_2 = ("/pbi/dept/secondary/siv/testdata/SA3-DS/lambda/"
+ "2590980/0008/Analysis_Results/m141115_075238_et"
+ "han_c100699872550000001823139203261572_s1_p0.al"
+ "l.subreadset.xml")
+ ds1 = SubreadSet(test_file_1, test_file_2)
+ # used to get total:
+ #self.assertEqual(sum(1 for _ in ds1), N_RECORDS)
+ self.assertEqual(len(ds1), N_RECORDS)
+ dss = ds1.split(chunks=1, zmws=True)
+ self.assertEqual(len(dss), 1)
+ self.assertEqual(sum([len(ds_) for ds_ in dss]),
+ N_RECORDS)
+
+ dss = ds1.split(chunks=12, zmws=True)
+ self.assertEqual(len(dss), 12)
+ self.assertEqual(sum([len(ds_) for ds_ in dss]),
+ N_RECORDS)
+ self.assertEqual(
+ dss[0].zmwRanges,
+ [('m150404_101626_42267_c100807920800000001823174110291514_s1_p0',
+ 7, 22098)])
+ self.assertEqual(
+ dss[-1].zmwRanges,
+ [('m141115_075238_ethan_c100699872550000001823139203261572_s1_p0',
+ 127814, 163468)])
+
+
+ @SkipTest
+ def test_split_by_contigs_presplit(self):
+ # Consumes too much memory for Jenkins
+
+ # Test to make sure the result of a split by contigs has an appropriate
+ # number of records (make sure filters are appropriately aggressive)
+ ds2 = DataSet(data.getXml(15))
+ bams = ds2.externalResources.resourceIds
+ self.assertEqual(len(bams), 2)
+ refwindows = ds2.refWindows
+ self.assertEqual(refwindows, [(0, 0, 224992)])
+ res1 = openIndexedAlignmentFile(bams[0][7:])
+ res2 = openIndexedAlignmentFile(bams[1][7:])
+ def count(iterable):
+ count = 0
+ for _ in iterable:
+ count += 1
+ return count
+ self.assertEqual(count(res1.readsInRange(*refwindows[0])), 1409)
+ self.assertEqual(count(res2.readsInRange(*refwindows[0])), 1375)
+ self.assertEqual(count(ds2.readsInRange(*refwindows[0])), 2784)
+ self.assertEqual(count(ds2.records), 2784)
+ ds2.disableFilters()
+ self.assertEqual(count(ds2.records), 53552)
+ self.assertEqual(ds2.countRecords(), 53552)
+
+ def test_split_by_contigs_with_split_and_maxChunks(self):
+ # test to make sure the refWindows work when chunks == # refs
+ ds3 = AlignmentSet(data.getBam())
+ dss = ds3.split(contigs=True)
+ self.assertEqual(len(dss), 12)
+ refWindows = sorted(reduce(lambda x, y: x + y,
+ [ds.refWindows for ds in dss]))
+ # not all references have something mapped to them, refWindows doesn't
+ # care...
+ self.assertNotEqual(refWindows, sorted(ds3.refWindows))
+ self.assertEqual(refWindows,
+ [('B.vulgatus.4', 0, 1449), ('B.vulgatus.5', 0, 1449),
+ ('C.beijerinckii.13', 0, 1433), ('C.beijerinckii.14', 0, 1433),
+ ('C.beijerinckii.9', 0, 1433), ('E.coli.6', 0, 1463),
+ ('E.faecalis.1', 0, 1482), ('E.faecalis.2', 0, 1482),
+ ('R.sphaeroides.1', 0, 1386), ('S.epidermidis.2', 0, 1472),
+ ('S.epidermidis.3', 0, 1472), ('S.epidermidis.4', 0, 1472)])
+ old_refWindows = refWindows
+ random_few = [('C.beijerinckii.13', 0, 1433),
+ ('B.vulgatus.4', 0, 1449),
+ ('E.faecalis.1', 0, 1482)]
+
+ dss = ds3.split(contigs=True, maxChunks=1)
+ self.assertEqual(len(dss), 1)
+ refWindows = sorted(reduce(lambda x, y: x + y,
+ [ds.refWindows for ds in dss]))
+ self.assertEqual(refWindows, old_refWindows)
+
+ dss = ds3.split(contigs=True, maxChunks=24)
+ # This isn't expected if num refs >= 100, as map check isn't made
+ # for now (too expensive)
+ # There are only 12 refs represented in this set, however...
+ self.assertEqual(len(dss), 12)
+ refWindows = sorted(reduce(lambda x, y: x + y,
+ [ds.refWindows for ds in dss]))
+
+ for ref in random_few:
+ found = False
+ for window in refWindows:
+ if ref == window:
+ found = True
+ if not found:
+ log.debug(ref)
+ self.assertTrue(found)
+
+ # test with maxchunks but no breaking contigs
+ dss = ds3.split(contigs=True, maxChunks=36)
+ self.assertEqual(len(dss), 12)
+ refWindows = sorted(reduce(lambda x, y: x + y,
+ [ds.refWindows for ds in dss]))
+ for ref in random_few:
+ found = False
+ for window in refWindows:
+ if ref == window:
+ found = True
+ self.assertTrue(found)
+
+ # test with maxchunks and breaking contigs is allowed (triggers
+ # targetsize, may result in fewer chunks)
+ dss = ds3.split(contigs=True, maxChunks=36, breakContigs=True)
+ self.assertEqual(len(dss), 2)
+ refWindows = sorted(reduce(lambda x, y: x + y,
+ [ds.refWindows for ds in dss]))
+ for ref in random_few:
+ found = False
+ for window in refWindows:
+ if ref == window:
+ found = True
+ self.assertTrue(found)
+
+ # test with previous setup and smaller targetSize, resulting in more
+ # chunks
+ dss = ds3.split(contigs=True, maxChunks=36, breakContigs=True,
+ targetSize=10)
+ self.assertEqual(len(dss), 9)
+ refWindows = sorted(reduce(lambda x, y: x + y,
+ [ds.refWindows for ds in dss]))
+ for ref in random_few:
+ found = False
+ for window in refWindows:
+ if ref == window:
+ found = True
+ self.assertTrue(found)
+
+ # test with byRecords and fewer chunks than atoms
+ dss = ds3.split(contigs=True, chunks=3, byRecords=True)
+ self.assertEqual(len(dss), 3)
+ refWindows = sorted(reduce(lambda x, y: x + y,
+ [ds.refWindows for ds in dss]))
+ for ref in random_few:
+ found = False
+ for window in refWindows:
+ if ref == window:
+ found = True
+ self.assertTrue(found)
+
+ # test with byRecords and more chunks than atoms
+ orf = random_few
+ random_few = [('C.beijerinckii.13', 0, 747),
+ ('B.vulgatus.4', 0, 1449),
+ ('E.faecalis.1', 0, 742)]
+ dss = ds3.split(contigs=True, chunks=16, byRecords=True)
+ self.assertEqual(len(dss), 16)
+ refWindows = sorted(reduce(lambda x, y: x + y,
+ [ds.refWindows for ds in dss]))
+ for ref in random_few:
+ found = False
+ for window in refWindows:
+ if ref == window:
+ found = True
+ self.assertTrue(found)
+
+ # test with byRecords and updateCounts
+ random_few = orf
+ dss = ds3.split(contigs=True, chunks=3, byRecords=True,
+ updateCounts=True)
+ self.assertEqual(len(dss), 3)
+ sizes = sorted([dset.numRecords for dset in dss])
+ self.assertListEqual(sizes, [30, 31, 31])
+ refWindows = sorted(reduce(lambda x, y: x + y,
+ [ds.refWindows for ds in dss]))
+ for ref in random_few:
+ found = False
+ for window in refWindows:
+ if ref == window:
+ found = True
+ self.assertTrue(found)
+
+ # test with byRefLength and updateCounts
+ random_few = orf
+ dss = ds3.split(contigs=True, chunks=3, updateCounts=True)
+ self.assertEqual(len(dss), 3)
+ sizes = sorted([dset.numRecords for dset in dss])
+ self.assertListEqual(sizes, [20, 24, 48])
+ refWindows = sorted(reduce(lambda x, y: x + y,
+ [ds.refWindows for ds in dss]))
+ for ref in random_few:
+ found = False
+ for window in refWindows:
+ if ref == window:
+ found = True
+ self.assertTrue(found)
+
+ def test_split_by_contigs_with_split(self):
+ # test to make sure the refWindows work when chunks == # refs
+ ds3 = AlignmentSet(data.getBam())
+ dss = ds3.split(contigs=True)
+ self.assertEqual(len(dss), 12)
+ refWindows = sorted(reduce(lambda x, y: x + y,
+ [ds.refWindows for ds in dss]))
+ # not all references have something mapped to them, refWindows doesn't
+ # care...
+ self.assertNotEqual(refWindows, sorted(ds3.refWindows))
+ random_few = [('C.beijerinckii.13', 0, 1433),
+ ('B.vulgatus.4', 0, 1449),
+ ('E.faecalis.1', 0, 1482)]
+ for reference in random_few:
+ found = False
+ for ref in refWindows:
+ if ref == reference:
+ found = True
+ self.assertTrue(found)
+ old_refWindows = refWindows
+
+ dss = ds3.split(contigs=True, chunks=1)
+ self.assertEqual(len(dss), 1)
+ refWindows = sorted(reduce(lambda x, y: x + y,
+ [ds.refWindows for ds in dss]))
+ self.assertEqual(refWindows, old_refWindows)
+
+ dss = ds3.split(contigs=True, chunks=24)
+ self.assertEqual(len(dss), 24)
+ refWindows = sorted(reduce(lambda x, y: x + y,
+ [ds.refWindows for ds in dss]))
+
+ random_few = [('E.faecalis.2', 0, 741),
+ ('E.faecalis.2', 741, 1482)]
+ for ref in random_few:
+ found = False
+ for window in refWindows:
+ if ref == window:
+ found = True
+ if not found:
+ log.debug(ref)
+ self.assertTrue(found)
+
+ dss = ds3.split(contigs=True, chunks=36)
+ self.assertEqual(len(dss), 36)
+ refWindows = sorted(reduce(lambda x, y: x + y,
+ [ds.refWindows for ds in dss]))
+ random_few = [('E.faecalis.2', 0, 494),
+ ('E.faecalis.2', 494, 988),
+ ('E.faecalis.2', 988, 1482)]
+ for ref in random_few:
+ found = False
+ for window in refWindows:
+ if ref == window:
+ found = True
+ self.assertTrue(found)
+
+ def test_refWindows(self):
+ ds = AlignmentSet(data.getBam())
+ dss = ds.split(chunks=2, contigs=True)
+ self.assertEqual(len(dss), 2)
+ log.debug(dss[0].filters)
+ log.debug(dss[1].filters)
+ self.assertTrue(
+ '( rname = E.faecalis.2 '
+ in str(dss[0].filters)
+ or
+ '( rname = E.faecalis.2 '
+ in str(dss[1].filters))
+ ds = AlignmentSet(data.getBam())
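+ # each index across the parallel requirement lists forms one AND clause;
+ # the clauses are then OR'd together (see the asserted filter string below)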
+ ds.filters.addRequirement(rname=[('=', 'E.faecalis.2'),
+ ('=', 'E.faecalis.2')],
+ tStart=[('<', '99'),
+ ('<', '299')],
+ tEnd=[('>', '0'),
+ ('>', '100')])
+ self.assertEqual(str(ds.filters),
+ '( rname = E.faecalis.2 AND tstart '
+ '< 99 AND tend > 0 ) OR ( rname = '
+ 'E.faecalis.2 AND tstart < 299 AND tend > 100 )')
+ self.assertEqual(ds.refWindows, [('E.faecalis.2', 0, 99),
+ ('E.faecalis.2', 100, 299)])
+
+
+ @unittest.skip("Too expensive")
+ def test_huge_zmw_split(self):
+ human = ('/pbi/dept/secondary/siv/testdata/SA3-DS/'
+ 'human/JCV_85x_v030/jcv_85x_v030.subreadset.xml')
+ sset = SubreadSet(human)
+ ssets = sset.split(zmws=True, maxChunks=5)
+
+ @unittest.skipIf(not _internal_data(),
+ "Internal data not found, skipping")
+ def test_subreadset_split_metadata_element_name(self):
+ fn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
+ log.debug(fn)
+ sset = SubreadSet("/pbi/dept/secondary/siv/testdata/"
+ "SA3-Sequel/phi29/315/3150101/"
+ "r54008_20160219_002905/1_A01/"
+ "m54008_160219_003234.subreadset.xml")
+ chunks = sset.split(chunks=5, zmws=False, ignoreSubDatasets=True)
+ chunks[0].write(fn)
+
+ def test_contigset_split(self):
+ ref = ReferenceSet(data.getXml(9))
+ exp_n_contigs = len(ref)
+ refs = ref.split(10)
+ self.assertEqual(len(refs), 10)
+ obs_n_contigs = 0
+ for r in refs:
+ obs_n_contigs += sum(1 for _ in r)
+ self.assertEqual(obs_n_contigs, exp_n_contigs)
+
+
+ def test_split_hdfsubreadset(self):
+ hdfds = HdfSubreadSet(*upstreamdata.getBaxH5_v23())
+ self.assertEqual(len(hdfds.toExternalFiles()), 3)
+ hdfdss = hdfds.split(chunks=2, ignoreSubDatasets=True)
+ self.assertEqual(len(hdfdss), 2)
+ self.assertEqual(len(hdfdss[0].toExternalFiles()), 2)
+ self.assertEqual(len(hdfdss[1].toExternalFiles()), 1)
+
+ @unittest.skipIf(not _internal_data(),
+ "Internal data not found, skipping")
+ def test_barcode_split_cornercases(self):
+ fn = ('/pbi/dept/secondary/siv/testdata/'
+ 'pblaa-unittest/Sequel/Phi29/m54008_160219_003234'
+ '.tiny.subreadset.xml')
+ sset = SubreadSet(fn, skipMissing=True)
+ ssets = sset.split(chunks=3, barcodes=True)
+ self.assertEqual([str(ss.filters) for ss in ssets],
+ ["( bc = [0, 0] )",
+ "( bc = [1, 1] )",
+ "( bc = [2, 2] )"])
+ sset = SubreadSet(fn, skipMissing=True)
+ self.assertEqual(len(sset), 15133)
+ sset.filters = None
+ self.assertEqual(str(sset.filters), "")
+ sset.updateCounts()
+ self.assertEqual(len(sset), 2667562)
+
+ sset.filters.addRequirement(bc=[('=', '[2, 2]')])
+ self.assertEqual(str(sset.filters), "( bc = [2, 2] )")
+ sset.updateCounts()
+ self.assertEqual(len(sset), 4710)
+
+ sset.filters = None
+ self.assertEqual(str(sset.filters), "")
+ sset.updateCounts()
+ self.assertEqual(len(sset), 2667562)
+
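+ # the same barcode filter written without the space should select the
+ # same 4710 records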
+ sset.filters.addRequirement(bc=[('=', '[2,2]')])
+ self.assertEqual(str(sset.filters), "( bc = [2,2] )")
+ sset.updateCounts()
+ self.assertEqual(len(sset), 4710)
+
diff --git a/tests/test_pbdataset_subtypes.py b/tests/test_pbdataset_subtypes.py
index 71282a7..9d74aeb 100644
--- a/tests/test_pbdataset_subtypes.py
+++ b/tests/test_pbdataset_subtypes.py
@@ -5,22 +5,20 @@ import unittest
import tempfile
import os
import itertools
-import numpy as np
-
-import pysam
+import xml.etree.ElementTree as ET
from pbcore.util.Process import backticks
from pbcore.io.dataset.utils import (consolidateBams, _infixFname,
- BamtoolsVersion)
-from pbcore.io import (DataSet, SubreadSet, ConsensusReadSet,
+ BamtoolsVersion, consolidateXml)
+from pbcore.io import (SubreadSet, ConsensusReadSet,
ReferenceSet, ContigSet, AlignmentSet, BarcodeSet,
FastaReader, FastaWriter, IndexedFastaReader,
HdfSubreadSet, ConsensusAlignmentSet,
- openDataFile, FastaWriter, FastqReader, openDataSet)
+ openDataFile, FastqReader,
+ GmapReferenceSet)
import pbcore.data as upstreamData
import pbcore.data.datasets as data
from pbcore.io.dataset.DataSetValidator import validateXml
-import xml.etree.ElementTree as ET
log = logging.getLogger(__name__)
@@ -49,7 +47,6 @@ class TestDataSet(unittest.TestCase):
"""Unit and integrationt tests for the DataSet class and \
associated module functions"""
-
def test_subread_build(self):
ds1 = SubreadSet(data.getXml(no=5), skipMissing=True)
ds2 = SubreadSet(data.getXml(no=5), skipMissing=True)
@@ -67,6 +64,38 @@ class TestDataSet(unittest.TestCase):
self.assertEquals(type(ds4._metadata).__name__, 'SubreadSetMetadata')
self.assertEquals(len(ds4.metadata.collections), 1)
+ def test_subreadset_metadata_element_name(self):
+ # without touching the element:
+ sset = SubreadSet(data.getXml(10))
+ log.debug(data.getXml(10))
+ fn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
+ log.debug(fn)
+ sset.write(fn)
+ f = ET.parse(fn)
+ self.assertEqual(len(f.getroot().findall(
+ '{http://pacificbiosciences.com/PacBioDatasets.xsd}'
+ 'SubreadSetMetadata')),
+ 0)
+ self.assertEqual(len(f.getroot().findall(
+ '{http://pacificbiosciences.com/PacBioDatasets.xsd}'
+ 'DataSetMetadata')),
+ 1)
+
+ # with touching the element:
+ sset = SubreadSet(data.getXml(10))
+ sset.metadata.description = 'foo'
+ fn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
+ sset.write(fn, validate=False)
+ f = ET.parse(fn)
+ self.assertEqual(len(f.getroot().findall(
+ '{http://pacificbiosciences.com/PacBioDatasets.xsd}'
+ 'SubreadSetMetadata')),
+ 0)
+ self.assertEqual(len(f.getroot().findall(
+ '{http://pacificbiosciences.com/PacBioDatasets.xsd}'
+ 'DataSetMetadata')),
+ 1)
+
def test_valid_referencesets(self):
validateXml(ET.parse(data.getXml(9)).getroot(), skipResources=True)
@@ -134,17 +163,6 @@ class TestDataSet(unittest.TestCase):
for name in names:
self.assertTrue(ds[name].id == name)
- def test_contigset_split(self):
- ref = ReferenceSet(data.getXml(9))
- exp_n_contigs = len(ref)
- refs = ref.split(10)
- self.assertEqual(len(refs), 10)
- obs_n_contigs = 0
- for r in refs:
- obs_n_contigs += sum(1 for _ in r)
- self.assertEqual(obs_n_contigs, exp_n_contigs)
-
-
def test_contigset_len(self):
ref = ReferenceSet(data.getXml(9))
exp_n_contigs = len(ref)
@@ -155,6 +173,30 @@ class TestDataSet(unittest.TestCase):
obs_n_contigs += len(r)
self.assertEqual(obs_n_contigs, exp_n_contigs)
+ @unittest.skipIf(not _internal_data(),
+ "Internal data not found, skipping")
+ def test_gmapreferenceset_len(self):
+ fas = ('/pbi/dept/secondary/siv/testdata/isoseq/'
+ 'lexigoen-ground-truth/reference/SIRV_150601a.fasta')
+ ref = GmapReferenceSet(fas)
+ fn = '/pbi/dept/secondary/siv/testdata/isoseq/gmap_db'
+ ver = '2014-12-19'
+ name = 'SIRV'
+ ref.gmap = fn
+ self.assertEqual(ref.gmap.resourceId, fn)
+ ref.gmap.version = ver
+ self.assertEqual(ref.gmap.version, ver)
+ ref.gmap.name = name
+ self.assertEqual(ref.gmap.name, name)
+ fn = tempfile.NamedTemporaryFile(suffix=".gmapreferenceset.xml").name
+ # TODO: Turn validation back on when xsd sync happens
+ ref.write(fn, validate=False)
+ log.debug(fn)
+ gref = GmapReferenceSet(fn)
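+ # the attribute lookups below only need to succeed (not raise) after the
+ # round trip through XML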
+ gref.gmap.resourceId
+ gref.gmap.version
+ gref.gmap.name
+ gref.gmap.timeStampedName
def test_ccsread_build(self):
ds1 = ConsensusReadSet(data.getXml(2), strict=False, skipMissing=True)
@@ -255,6 +297,31 @@ class TestDataSet(unittest.TestCase):
@unittest.skipIf(not _check_constools(),
"bamtools or pbindex not found, skipping")
+ def test_pbmerge_indexing(self):
+ log.debug("Test through API")
+ aln = AlignmentSet(data.getXml(12))
+ self.assertEqual(len(aln.toExternalFiles()), 2)
+ outdir = tempfile.mkdtemp(suffix="dataset-unittest")
+ outfn = os.path.join(outdir, 'merged.bam')
+ log.info(outfn)
+ consolidateXml(aln, outfn, cleanup=False)
+ self.assertTrue(os.path.exists(outfn))
+ self.assertTrue(os.path.exists(outfn + '.pbi'))
+ cons = AlignmentSet(outfn)
+ self.assertEqual(len(aln), len(cons))
+ orig_stats = os.stat(outfn + '.pbi')
+ cons.externalResources[0].pbi = None
+ self.assertEqual(None, cons.externalResources[0].pbi)
+ cons.induceIndices()
+ self.assertEqual(outfn + '.pbi', cons.externalResources[0].pbi)
+ self.assertEqual(orig_stats, os.stat(cons.externalResources[0].pbi))
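+ # with force=True the .pbi is expected to be rebuilt, so its stat changes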
+ cons.externalResources[0].pbi = None
+ self.assertEqual(None, cons.externalResources[0].pbi)
+ cons.induceIndices(force=True)
+ self.assertNotEqual(orig_stats, os.stat(cons.externalResources[0].pbi))
+
+ @unittest.skipIf(not _check_constools(),
+ "bamtools or pbindex not found, skipping")
def test_alignmentset_consolidate(self):
log.debug("Test methods directly")
aln = AlignmentSet(data.getXml(12))
@@ -399,18 +466,6 @@ class TestDataSet(unittest.TestCase):
self.assertEqual(aln.externalResources[0].reference, reference)
- def test_accuracy_filter(self):
- aln = AlignmentSet(data.getXml(12))
- self.assertEqual(len(list(aln)), 177)
- aln.filters.addRequirement(accuracy=[('>', '.85')])
- self.assertEqual(len(list(aln)), 174)
-
- def test_contigset_filter(self):
- ref = ReferenceSet(data.getXml(9))
- self.assertEqual(len(list(ref)), 59)
- ref.filters.addRequirement(length=[('>', '1450')])
- self.assertEqual(len(list(ref)), 34)
-
@unittest.skipIf(not _check_constools() or not _internal_data(),
"bamtools, pbindex or data not found, skipping")
def test_alignmentset_partial_consolidate(self):
@@ -585,14 +640,6 @@ class TestDataSet(unittest.TestCase):
self.assertEqual(len([rec for rec in ds_new]), 1,
"failed on %d" % i)
- def test_split_hdfsubreadset(self):
- hdfds = HdfSubreadSet(*upstreamData.getBaxH5_v23())
- self.assertEqual(len(hdfds.toExternalFiles()), 3)
- hdfdss = hdfds.split(chunks=2, ignoreSubDatasets=True)
- self.assertEqual(len(hdfdss), 2)
- self.assertEqual(len(hdfdss[0].toExternalFiles()), 2)
- self.assertEqual(len(hdfdss[1].toExternalFiles()), 1)
-
@unittest.skipIf(not _internal_data(),
"Internal data not found, skipping")
def test_len_fastq(self):
@@ -812,25 +859,32 @@ class TestDataSet(unittest.TestCase):
self.assertEqual(sset.numRecords, 59)
def test_alignment_reference(self):
+ rfn = data.getXml(9)
rs1 = ReferenceSet(data.getXml(9))
fasta_res = rs1.externalResources[0]
fasta_file = urlparse(fasta_res.resourceId).path
ds1 = AlignmentSet(data.getXml(8),
- referenceFastaFname=rs1)
+ referenceFastaFname=rs1)
aln_ref = None
for aln in ds1:
aln_ref = aln.reference()
break
self.assertTrue(aln_ref is not None)
+ self.assertEqual(ds1.externalResources[0].reference, fasta_file)
+ self.assertEqual(ds1.resourceReaders()[0].referenceFasta.filename,
+ fasta_file)
ds1 = AlignmentSet(data.getXml(8),
- referenceFastaFname=fasta_file)
+ referenceFastaFname=fasta_file)
aln_ref = None
for aln in ds1:
aln_ref = aln.reference()
break
self.assertTrue(aln_ref is not None)
+ self.assertEqual(ds1.externalResources[0].reference, fasta_file)
+ self.assertEqual(ds1.resourceReaders()[0].referenceFasta.filename,
+ fasta_file)
ds1 = AlignmentSet(data.getXml(8))
ds1.addReference(fasta_file)
@@ -839,6 +893,41 @@ class TestDataSet(unittest.TestCase):
aln_ref = aln.reference()
break
self.assertTrue(aln_ref is not None)
+ self.assertEqual(ds1.externalResources[0].reference, fasta_file)
+ self.assertEqual(ds1.resourceReaders()[0].referenceFasta.filename,
+ fasta_file)
+
+ fofn_out = tempfile.NamedTemporaryFile(suffix=".fofn").name
+ log.debug(fofn_out)
+ with open(fofn_out, 'w') as f:
+ f.write(data.getXml(8))
+ f.write('\n')
+ f.write(data.getXml(11))
+ f.write('\n')
+ ds1 = AlignmentSet(fofn_out,
+ referenceFastaFname=fasta_file)
+ aln_ref = None
+ for aln in ds1:
+ aln_ref = aln.reference()
+ break
+ self.assertTrue(aln_ref is not None)
+ self.assertEqual(ds1.externalResources[0].reference, fasta_file)
+ self.assertEqual(ds1.resourceReaders()[0].referenceFasta.filename,
+ fasta_file)
+
+ # Re-enable when referenceset is acceptable reference
+ #ds1 = AlignmentSet(data.getXml(8),
+ # referenceFastaFname=rfn)
+ #aln_ref = None
+ #for aln in ds1:
+ # aln_ref = aln.reference()
+ # break
+ #self.assertTrue(aln_ref is not None)
+ #self.assertTrue(isinstance(aln_ref, basestring))
+ #self.assertEqual(
+ # ds1.externalResources[0]._getSubExtResByMetaType(
+ # 'PacBio.ReferenceFile.ReferenceFastaFile').uniqueId,
+ # rs1.uuid)
def test_nested_external_resources(self):
log.debug("Testing nested externalResources in AlignmentSets")
@@ -931,40 +1020,6 @@ class TestDataSet(unittest.TestCase):
ds_out = tempfile.NamedTemporaryFile(suffix=".barcodeset.xml").name
ds.write(ds_out)
- @unittest.skipIf(not _internal_data(),
- "Internal data not found, skipping")
- def test_barcode_split_cornercases(self):
- fn = ('/pbi/dept/secondary/siv/testdata/'
- 'pblaa-unittest/Sequel/Phi29/m54008_160219_003234'
- '.tiny.subreadset.xml')
- sset = SubreadSet(fn)
- ssets = sset.split(chunks=3, barcodes=True)
- self.assertEqual([str(ss.filters) for ss in ssets],
- ["( bc = [0, 0] )",
- "( bc = [1, 1] )",
- "( bc = [2, 2] )"])
- sset = SubreadSet(fn)
- self.assertEqual(len(sset), 15133)
- sset.filters = None
- self.assertEqual(str(sset.filters), "")
- sset.updateCounts()
- self.assertEqual(len(sset), 2667562)
-
- sset.filters.addRequirement(bc=[('=', '[2, 2]')])
- self.assertEqual(str(sset.filters), "( bc = [2, 2] )")
- sset.updateCounts()
- self.assertEqual(len(sset), 4710)
-
- sset.filters = None
- self.assertEqual(str(sset.filters), "")
- sset.updateCounts()
- self.assertEqual(len(sset), 2667562)
-
- sset.filters.addRequirement(bc=[('=', '[2,2]')])
- self.assertEqual(str(sset.filters), "( bc = [2,2] )")
- sset.updateCounts()
- self.assertEqual(len(sset), 4710)
-
def test_merged_contigset(self):
fn = tempfile.NamedTemporaryFile(suffix=".contigset.xml").name
with ContigSet(upstreamData.getLambdaFasta(),
diff --git a/tests/test_pbdataset_utils.py b/tests/test_pbdataset_utils.py
new file mode 100644
index 0000000..e19e02c
--- /dev/null
+++ b/tests/test_pbdataset_utils.py
@@ -0,0 +1,183 @@
+
+import logging
+import tempfile
+import unittest
+
+from pbcore.io.dataset.DataSetMetaTypes import dsIdToSuffix
+from pbcore.io import (DataSetMetaTypes, divideKeys, keysToRanges,
+ SubreadSet, getDataSetUuid, getDataSetMetaType)
+from pbcore.io.dataset.utils import (xy_to_hn, hn_to_xy, sampleHolesUniformly,
+ sampleUniformly, quadratic_expand,
+ prodround)
+
+import pbcore.data as upstreamdata
+
+from utils import _pbtestdata, _check_constools, _internal_data
+
+log = logging.getLogger(__name__)
+
+class TestDataSetUtils(unittest.TestCase):
+ """Unit and integrationt tests for the DataSet class and \
+ associated module functions"""
+
+ def test_hn_xy_converters(self):
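+ # hole numbers and (x, y) chip coordinates should round-trip exactly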
+ for x in range(64, 1024, 10):
+ for y in range(64, 1144, 10):
+ hn = xy_to_hn(x, y)
+ ox, oy = hn_to_xy(hn)
+ self.assertEqual(x, ox)
+ self.assertEqual(y, oy)
+
+ def test_quadratic_expand(self):
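+ # quadratic_expand appears to yield the Cartesian product of the input
+ # lists, with the first list varying fastest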
+ i = [[0, 1, 2]]
+ e = [[0], [1], [2]]
+ o = quadratic_expand(i)
+ self.assertEqual(e, o)
+
+ i = [[0, 1, 2], [3, 4, 5]]
+ e = [[0, 3], [1, 3], [2, 3],
+ [0, 4], [1, 4], [2, 4],
+ [0, 5], [1, 5], [2, 5]]
+ o = quadratic_expand(i)
+ self.assertEqual(e, o)
+
+ def test_prodround(self):
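+ # prodround appears to round each value so that the product of the result
+ # approximates the given target (8, 9 and 12 below)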
+ i = [1.1, 2.5, 3.8]
+ e = [1, 2, 4]
+ o = prodround(i, 8)
+ self.assertEqual(e, o)
+ e = [1, 3, 3]
+ o = prodround(i, 9)
+ self.assertEqual(e, o)
+ e = [1, 3, 4]
+ o = prodround(i, 12)
+ self.assertEqual(e, o)
+
+ def test_sampleUniformly(self):
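+ # sampleUniformly appears to spread the requested number of points evenly
+ # within the per-axis (min, max) bounds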
+ hns = sampleUniformly(4, [(0, 1), (0, 1)])
+ self.assertEqual(hns, [[0, 0], [1, 0], [0, 1], [1, 1]])
+
+ hns = sampleUniformly(4, [(0, 4), (0, 4)])
+ self.assertEqual(hns, [[1, 1], [3, 1], [1, 3], [3, 3]])
+
+ hns = sampleUniformly(4, [(0, 4), (0, 8)])
+ self.assertEqual(hns, [[1, 3], [3, 3], [1, 5], [3, 5]])
+
+ hns = sampleUniformly(3, [(0, 4), (0, 8)])
+ self.assertEqual(hns, [[2, 2], [2, 4], [2, 6]])
+
+ def test_sampleHolesUniformly(self):
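+ # sampling 100 holes from the full chip should match uniform (x, y)
+ # sampling converted to hole numbers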
+ ncols = 1144
+ nrows = 1024
+ bounds = [(64, ncols), (64, nrows)]
+ expected = [xy_to_hn(x, y) for x,y in sampleUniformly(100, bounds)]
+ all_xy = quadratic_expand([range(64, ncols), range(64, nrows)])
+ all_holes = [xy_to_hn(x, y) for x, y in all_xy]
+ samples = sampleHolesUniformly(100, all_holes)
+ self.assertEqual(expected, samples)
+
+ @unittest.skipIf(not _internal_data(),
+ "Internal data not available")
+ def test_sampleHolesUniformly_realdata(self):
+ path = ('/pbi/dept/secondary/siv/testdata/SA3-Sequel/'
+ 'lambda/315/3150128/r54008_20160308_001811/'
+ '2_B01/m54008_160308_053311.subreads.bam')
+ ds = SubreadSet(path, strict=True)
+ samples = sampleHolesUniformly(100, ds.index.holeNumber)
+ self.assertEqual(len(samples), 100)
+
+ def test_get_dataset_uuid(self):
+ ds = SubreadSet(upstreamdata.getUnalignedBam(), strict=True)
+ ds_file = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
+ ds.write(ds_file)
+ uuid = getDataSetUuid(ds_file)
+ self.assertEqual(uuid, ds.uuid)
+ with open(ds_file, "w") as out:
+ out.write("hello world!")
+ uuid = getDataSetUuid(ds_file)
+ self.assertEqual(uuid, None)
+
+ def test_get_dataset_metatype(self):
+ ds = SubreadSet(upstreamdata.getUnalignedBam(), strict=True)
+ ds_file = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
+ ds.write(ds_file)
+ meta_type = getDataSetMetaType(ds_file)
+ self.assertEqual(meta_type, "PacBio.DataSet.SubreadSet")
+
+ def test_dsIdToSuffix(self):
+ suffixes = ['subreadset.xml', 'hdfsubreadset.xml', 'alignmentset.xml',
+ 'barcodeset.xml', 'consensusreadset.xml',
+ 'consensusalignmentset.xml',
+ 'referenceset.xml', 'contigset.xml']
+ for dsId, exp in zip(DataSetMetaTypes.ALL, suffixes):
+ self.assertEqual(dsIdToSuffix(dsId), exp)
+
+ def test_divideKeys_keysToRanges(self):
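+ # divideKeys appears to split the sorted keys into n roughly equal chunks;
+ # keysToRanges collapses each chunk to its [min, max] span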
+ keys = [0, 1, 2, 3, 5, 8, 50]
+ res = divideKeys(keys, 0)
+ self.assertEqual(res, [])
+ res = keysToRanges(res)
+ self.assertEqual(res, [])
+
+ res = divideKeys(keys, 1)
+ self.assertEqual(res, [[0, 1, 2, 3, 5, 8, 50]])
+ res = keysToRanges(res)
+ self.assertEqual(res, [[0, 50]])
+
+ res = divideKeys(keys, 2)
+ self.assertEqual(res, [[0, 1, 2], [3, 5, 8, 50]])
+ res = keysToRanges(res)
+ self.assertEqual(res, [[0, 2], [3, 50]])
+
+ res = divideKeys(keys, 3)
+ self.assertEqual(res, [[0, 1], [2, 3], [5, 8, 50]])
+ res = keysToRanges(res)
+ self.assertEqual(res, [[0, 1], [2, 3], [5, 50]])
+
+ res = divideKeys(keys, 7)
+ self.assertEqual(res, [[0], [1], [2], [3], [5], [8], [50]])
+ res = keysToRanges(res)
+ self.assertEqual(res, [[0, 0], [1, 1], [2, 2], [3, 3],
+ [5, 5], [8, 8], [50, 50]])
+
+ res = divideKeys(keys, 8)
+ self.assertEqual(res, [[0], [1], [2], [3], [5], [8], [50]])
+ res = keysToRanges(res)
+ self.assertEqual(res, [[0, 0], [1, 1], [2, 2], [3, 3],
+ [5, 5], [8, 8], [50, 50]])
+
+
+ keys = [0, 1, 2, 2, 3, 5, 8, 50, 50]
+ res = divideKeys(keys, 0)
+ self.assertEqual(res, [])
+ res = keysToRanges(res)
+ self.assertEqual(res, [])
+
+ res = divideKeys(keys, 1)
+ self.assertEqual(res, [[0, 1, 2, 2, 3, 5, 8, 50, 50]])
+ res = keysToRanges(res)
+ self.assertEqual(res, [[0, 50]])
+
+ res = divideKeys(keys, 2)
+ self.assertEqual(res, [[0, 1, 2, 2], [3, 5, 8, 50, 50]])
+ res = keysToRanges(res)
+ self.assertEqual(res, [[0, 2], [3, 50]])
+
+ res = divideKeys(keys, 3)
+ self.assertEqual(res, [[0, 1, 2], [2, 3, 5], [8, 50, 50]])
+ res = keysToRanges(res)
+ self.assertEqual(res, [[0, 2], [2, 5], [8, 50]])
+
+ res = divideKeys(keys, 9)
+ self.assertEqual(res, [[0], [1], [2], [2], [3], [5], [8], [50], [50]])
+ res = keysToRanges(res)
+ self.assertEqual(res, [[0, 0], [1, 1], [2, 2], [2, 2], [3, 3],
+ [5, 5], [8, 8], [50, 50], [50, 50]])
+
+ res = divideKeys(keys, 10)
+ self.assertEqual(res, [[0], [1], [2], [2], [3], [5], [8], [50], [50]])
+ res = keysToRanges(res)
+ self.assertEqual(res, [[0, 0], [1, 1], [2, 2], [2, 2], [3, 3],
+ [5, 5], [8, 8], [50, 50], [50, 50]])
+
+
diff --git a/tests/utils.py b/tests/utils.py
new file mode 100644
index 0000000..4b0d2da
--- /dev/null
+++ b/tests/utils.py
@@ -0,0 +1,40 @@
+
+import logging
+import os
+from pbcore.util.Process import backticks
+from pbcore.io.dataset.utils import BamtoolsVersion
+
+log = logging.getLogger(__name__)
+
+def _pbtestdata():
+ try:
+ import pbtestdata
+ return True
+ except ImportError:
+ return False
+
+def _check_constools():
+ if not BamtoolsVersion().good:
+ log.warn("Bamtools not found or out of date")
+ return False
+
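+ # a bare invocation of each tool is expected to exit with status 1 (usage
+ # message); any other status is treated as missing or broken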
+ cmd = "pbindex"
+ o, r, m = backticks(cmd)
+ if r != 1:
+ return False
+
+ cmd = "samtools"
+ o, r, m = backticks(cmd)
+ if r != 1:
+ return False
+
+ cmd = "pbmerge"
+ o, r, m = backticks(cmd)
+ if r != 1:
+ return False
+ return True
+
+def _internal_data():
+ if os.path.exists("/pbi/dept/secondary/siv/testdata"):
+ return True
+ return False
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/python-pbcore.git