[med-svn] [python-pbcore] 01/06: Imported Upstream version 1.2.10+dfsg
Afif Elghraoui
afif at moszumanska.debian.org
Sun Oct 9 18:38:06 UTC 2016
This is an automated email from the git hooks/post-receive script.
afif pushed a commit to branch master
in repository python-pbcore.
commit a06a1f09e29c192af60fc19fce873086e8343c43
Author: Afif Elghraoui <afif at ghraoui.name>
Date: Sat Oct 8 14:54:03 2016 -0700
Imported Upstream version 1.2.10+dfsg
---
CHANGELOG.org | 5 +
pbcore/__init__.py | 2 +-
pbcore/chemistry/resources/mapping.xml | 71 ++++++-
pbcore/io/align/BamAlignment.py | 2 +-
pbcore/io/align/PacBioBamIndex.py | 17 ++
pbcore/io/dataset/DataSetErrors.py | 17 ++
pbcore/io/dataset/DataSetIO.py | 313 +++++++++++++++++++++--------
pbcore/io/dataset/DataSetMembers.py | 185 ++++++++++++------
pbcore/io/dataset/DataSetReader.py | 44 ++++-
pbcore/io/dataset/utils.py | 81 +++++++-
requirements.txt | 2 +-
tests/test_pbcore_io_AlnFileReaders.py | 19 ++
tests/test_pbdataset.py | 348 ++++++++++++++++++++++++++++++++-
tests/test_pbdataset_subtypes.py | 129 +++++++++++-
14 files changed, 1073 insertions(+), 162 deletions(-)
diff --git a/CHANGELOG.org b/CHANGELOG.org
index bf187aa..daccd48 100644
--- a/CHANGELOG.org
+++ b/CHANGELOG.org
@@ -1,3 +1,8 @@
+* Version 1.2.10 (> SMRTanalysis 3.1)
+ - Update to pysam 0.9.0
+ - Recognize new and prospective future part numbers for the 3.1 timeframe
+ - Misc fixes for speeding up dataset and cmp.h5 access
+
* Version 1.2.9
- Rename pulseFeature accessors to "baseFeature...", since they
reflect features accompanying basecalls as opposed to pulse calls;
diff --git a/pbcore/__init__.py b/pbcore/__init__.py
index bfe805f..7375688 100644
--- a/pbcore/__init__.py
+++ b/pbcore/__init__.py
@@ -28,4 +28,4 @@
# POSSIBILITY OF SUCH DAMAGE.
#################################################################################
-__VERSION__ = "1.2.9"
+__VERSION__ = "1.2.10"
diff --git a/pbcore/chemistry/resources/mapping.xml b/pbcore/chemistry/resources/mapping.xml
index c40b10a..0f7e95b 100644
--- a/pbcore/chemistry/resources/mapping.xml
+++ b/pbcore/chemistry/resources/mapping.xml
@@ -182,9 +182,78 @@
<SoftwareVersion>3.0</SoftwareVersion>
</Mapping>
<Mapping>
- <SequencingChemistry>S/P1-C1</SequencingChemistry>
+ <SequencingChemistry>S/P1-C1/beta</SequencingChemistry>
<BindingKit>100-619-300</BindingKit>
<SequencingKit>100-620-000</SequencingKit>
<SoftwareVersion>3.1</SoftwareVersion>
</Mapping>
+
+ <!-- 1.0/1.1: a 1/1 compatible new SK condition -->
+ <Mapping>
+ <SequencingChemistry>S/P1-C1</SequencingChemistry>
+ <BindingKit>100-619-300</BindingKit>
+ <SequencingKit>100-867-300</SequencingKit>
+ <SoftwareVersion>3.1</SoftwareVersion>
+ </Mapping>
+
+ <!-- 1.1/1.1: a 1/1 compatible new BK and SK condition -->
+ <Mapping>
+ <SequencingChemistry>S/P1-C1</SequencingChemistry>
+ <BindingKit>100-867-500</BindingKit>
+ <SequencingKit>100-867-300</SequencingKit>
+ <SoftwareVersion>3.1</SoftwareVersion>
+ </Mapping>
+
+ <!-- 2.0/2.0: new kits... 1/1 compatible... -->
+ <Mapping>
+ <SequencingChemistry>S/P2-C2/prospective-compatible</SequencingChemistry>
+ <BindingKit>100-864-400</BindingKit>
+ <SequencingKit>100-864-100</SequencingKit>
+ <SoftwareVersion>3.1</SoftwareVersion>
+ </Mapping>
+
+ <!-- 2.1/2.1, ibid -->
+ <Mapping>
+ <SequencingChemistry>S/P2-C2/prospective-compatible</SequencingChemistry>
+ <BindingKit>100-864-600</BindingKit>
+ <SequencingKit>100-864-200</SequencingKit>
+ <SoftwareVersion>3.1</SoftwareVersion>
+ </Mapping>
+
+ <!-- 3.2 basecaller support -->
+ <Mapping>
+ <SequencingChemistry>S/P1-C1/beta</SequencingChemistry>
+ <BindingKit>100-619-300</BindingKit>
+ <SequencingKit>100-620-000</SequencingKit>
+ <SoftwareVersion>3.2</SoftwareVersion>
+ </Mapping>
+
+ <Mapping>
+ <SequencingChemistry>S/P1-C1</SequencingChemistry>
+ <BindingKit>100-619-300</BindingKit>
+ <SequencingKit>100-867-300</SequencingKit>
+ <SoftwareVersion>3.2</SoftwareVersion>
+ </Mapping>
+
+ <Mapping>
+ <SequencingChemistry>S/P1-C1</SequencingChemistry>
+ <BindingKit>100-867-500</BindingKit>
+ <SequencingKit>100-867-300</SequencingKit>
+ <SoftwareVersion>3.2</SoftwareVersion>
+ </Mapping>
+
+ <Mapping>
+ <SequencingChemistry>S/P2-C2/prospective-compatible</SequencingChemistry>
+ <BindingKit>100-864-400</BindingKit>
+ <SequencingKit>100-864-100</SequencingKit>
+ <SoftwareVersion>3.2</SoftwareVersion>
+ </Mapping>
+
+ <Mapping>
+ <SequencingChemistry>S/P2-C2/prospective-compatible</SequencingChemistry>
+ <BindingKit>100-864-600</BindingKit>
+ <SequencingKit>100-864-200</SequencingKit>
+ <SoftwareVersion>3.2</SoftwareVersion>
+ </Mapping>
+
</MappingTable>
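
Each new <Mapping> entry above ties a (BindingKit, SequencingKit, SoftwareVersion) triple to a SequencingChemistry name, which is how pbcore labels the chemistry for a movie. A minimal sketch of that lookup, assuming the decodeTriple helper exported by pbcore.chemistry (treat the helper name as an assumption; the part numbers are copied from the entries above):

    # Illustrative lookup of the new 3.1/3.2 triples; decodeTriple is assumed
    # to be the pbcore.chemistry entry point for this table.
    from pbcore.chemistry import decodeTriple

    # 1.1/1.1 condition added for the 3.1 timeframe:
    print(decodeTriple("100-867-500", "100-867-300", "3.1"))
    # -> "S/P1-C1"

    # prospective 2.0/2.0 kits under the 3.2 basecaller:
    print(decodeTriple("100-864-400", "100-864-100", "3.2"))
    # -> "S/P2-C2/prospective-compatible"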
diff --git a/pbcore/io/align/BamAlignment.py b/pbcore/io/align/BamAlignment.py
index 8428d0f..104e0b8 100644
--- a/pbcore/io/align/BamAlignment.py
+++ b/pbcore/io/align/BamAlignment.py
@@ -302,7 +302,7 @@ class BamAlignment(AlignmentRecordMixin):
@property
@requiresMapping
def identity(self):
- if self.hasPbi is not None:
+ if self.hasPbi:
# Fast (has pbi)
if self.readLength == 0:
return 0.
diff --git a/pbcore/io/align/PacBioBamIndex.py b/pbcore/io/align/PacBioBamIndex.py
index ea7d724..89ae316 100644
--- a/pbcore/io/align/PacBioBamIndex.py
+++ b/pbcore/io/align/PacBioBamIndex.py
@@ -74,6 +74,10 @@ class PacBioBamIndex(object):
return (self.pbiFlags & PBI_FLAGS_MAPPED)
@property
+ def hasCoordinateSortedInfo(self):
+ return (self.pbiFlags & PBI_FLAGS_COORDINATE_SORTED)
+
+ @property
def hasBarcodeInfo(self):
return (self.pbiFlags & PBI_FLAGS_BARCODE)
@@ -99,6 +103,11 @@ class PacBioBamIndex(object):
("nMM" , "u4"),
("mapQV" , "u1") ]
+ COORDINATE_SORTED_DTYPE = [
+ ("tId" , "u4"),
+ ("beginRow" , "u4"),
+ ("endRow" , "u4")]
+
BARCODE_INDEX_DTYPE = [
("bcForward" , "i2"),
("bcReverse" , "i2"),
@@ -135,6 +144,14 @@ class PacBioBamIndex(object):
tbl.nIns = tbl.aEnd - tbl.aStart - tbl.nM - tbl.nMM
tbl.nDel = tbl.tEnd - tbl.tStart - tbl.nM - tbl.nMM
+ # TODO: do something with these:
+ # TODO: remove nReads check when the rest of this code can handle empty
+ # mapped bam files (columns are missing, flags don't reflect that)
+ if self.hasCoordinateSortedInfo and self.nReads:
+ ntId = int(peek("u4", 1))
+ for columnName, columnType in COORDINATE_SORTED_DTYPE:
+ peek(columnType, ntId)
+
if self.hasBarcodeInfo:
for columnName, columnType in BARCODE_INDEX_DTYPE:
tbl[columnName] = peek(columnType, self.nReads)
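
hasCoordinateSortedInfo tests a new bit in the pbi header flags; when it is set (and the file is non-empty), the loader peeks past the per-reference (tId, beginRow, endRow) block so the later sections stay aligned. A small sketch of the bit test; the numeric flag values here are assumptions for illustration and the authoritative constants live in PacBioBamIndex.py:

    # Sketch of the pbi flags word; values below are illustrative assumptions.
    PBI_FLAGS_MAPPED = 1
    PBI_FLAGS_COORDINATE_SORTED = 2
    PBI_FLAGS_BARCODE = 4

    def describe(pbiFlags):
        """Report which optional pbi sections a flags word advertises."""
        return {"mapped": bool(pbiFlags & PBI_FLAGS_MAPPED),
                "coordinateSorted": bool(pbiFlags & PBI_FLAGS_COORDINATE_SORTED),
                "barcode": bool(pbiFlags & PBI_FLAGS_BARCODE)}

    print(describe(3))
    # {'mapped': True, 'coordinateSorted': True, 'barcode': False}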
diff --git a/pbcore/io/dataset/DataSetErrors.py b/pbcore/io/dataset/DataSetErrors.py
new file mode 100644
index 0000000..4695df9
--- /dev/null
+++ b/pbcore/io/dataset/DataSetErrors.py
@@ -0,0 +1,17 @@
+
+class InvalidDataSetIOError(Exception):
+ """The base class for all DataSetIO related custom exceptions
+ """
+
+
+class ResourceMismatchError(InvalidDataSetIOError):
+
+ def __init__(self, responses):
+ super(ResourceMismatchError, self).__init__()
+ self.responses = responses
+
+ def __str__(self):
+ return "Resources responded differently: " + ', '.join(
+ map(str, self.responses))
+
+
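Moving these exceptions into their own module lets DataSetReader.py raise InvalidDataSetIOError without a circular import of DataSetIO.py (see the new import further down). An illustrative way to catch them, mimicking what _unifyResponses does:

    # Illustrative only; the import path matches the new module.
    from pbcore.io.dataset.DataSetErrors import (InvalidDataSetIOError,
                                                 ResourceMismatchError)

    def unify(responses):
        """Mimic DataSet._unifyResponses: all resources must agree."""
        if len(set(map(str, responses))) > 1:
            raise ResourceMismatchError(responses)
        return responses[0]

    try:
        unify(["P6-C4", "S/P1-C1"])
    except ResourceMismatchError as e:
        print(e)   # Resources responded differently: P6-C4, S/P1-C1
    except InvalidDataSetIOError:
        pass       # base class for any other dataset IO error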
diff --git a/pbcore/io/dataset/DataSetIO.py b/pbcore/io/dataset/DataSetIO.py
index a5edaef..cbde652 100755
--- a/pbcore/io/dataset/DataSetIO.py
+++ b/pbcore/io/dataset/DataSetIO.py
@@ -8,6 +8,7 @@ import copy
import os, sys
import re
import errno
+import uuid
import logging
import itertools
import xml.dom.minidom
@@ -26,7 +27,8 @@ from pbcore.io import (BaxH5Reader, FastaReader, IndexedFastaReader,
from pbcore.io.align._BamSupport import UnavailableFeature
from pbcore.io.dataset.DataSetReader import (parseStats, populateDataSet,
resolveLocation, xmlRootType,
- wrapNewResource, openFofnFile)
+ wrapNewResource, openFofnFile,
+ parseMetadata)
from pbcore.io.dataset.DataSetWriter import toXml
from pbcore.io.dataset.DataSetValidator import validateString
from pbcore.io.dataset.DataSetMembers import (DataSetMetadata,
@@ -36,7 +38,10 @@ from pbcore.io.dataset.DataSetMembers import (DataSetMetadata,
ExternalResources,
ExternalResource, Filters)
from pbcore.io.dataset.utils import (consolidateBams, _infixFname, _pbindexBam,
- _indexBam, _indexFasta)
+ _indexBam, _indexFasta, _fileCopy,
+ _swapPath, which, consolidateXml)
+from pbcore.io.dataset.DataSetErrors import (InvalidDataSetIOError,
+ ResourceMismatchError)
log = logging.getLogger(__name__)
@@ -111,13 +116,38 @@ def isDataSet(xmlfile):
except Exception:
return False
-def openDataSet(*files, **kwargs):
- """Factory function for DataSet types as suggested by the first file"""
+def getDataSetUuid(xmlfile):
+ """
+ Quickly retrieve the uuid from the root element of a dataset XML file,
+ using a streaming parser to avoid loading the entire dataset into memory.
+ Returns None if the parsing fails.
+ """
+ try:
+ import xml.etree.cElementTree as ET
+ for event, element in ET.iterparse(xmlfile, events=("start",)):
+ return element.get("UniqueId")
+ except Exception:
+ return None
+
+
+def getDataSetMetaType(xmlfile):
+ """
+ Quickly retrieve the MetaType from the root element of a dataset XML file,
+ using a streaming parser to avoid loading the entire dataset into memory.
+ Returns None if the parsing fails.
+ """
try:
- tbrType = _typeDataSet(files[0])
- return tbrType(*files, **kwargs)
+ import xml.etree.cElementTree as ET
+ for event, element in ET.iterparse(xmlfile, events=("start",)):
+ return element.get("MetaType")
except Exception:
- raise TypeError("openDataSet requires that the first file is a DS")
+ return None
+
+
+def openDataSet(*files, **kwargs):
+ """Factory function for DataSet types as suggested by the first file"""
+ tbrType = _typeDataSet(files[0])
+ return tbrType(*files, **kwargs)
def openDataFile(*files, **kwargs):
"""Factory function for DataSet types determined by the first data file"""
@@ -194,6 +224,12 @@ def _flatten(lol, times=1):
lol = np.concatenate(lol)
return lol
+def _ranges_in_list(alist):
+ """Takes a sorted list, finds the boundaries of runs of each value"""
+ unique, indices, counts = np.unique(np.array(alist), return_index=True,
+ return_counts=True)
+ return {u: (i, i + c) for u, i, c in zip(unique, indices, counts)}
+
def divideKeys(keys, chunks):
if chunks < 1:
return []
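
_ranges_in_list replaces repeated np.flatnonzero scans over the sorted RefGroupID column with one np.unique pass: because the input is sorted, each value occupies a single contiguous run, so (first index, first index + count) is its half-open row range. A worked example:

    # Worked example of the helper added above.
    import numpy as np

    def _ranges_in_list(alist):
        """Takes a sorted list, finds the boundaries of runs of each value"""
        unique, indices, counts = np.unique(np.array(alist), return_index=True,
                                            return_counts=True)
        return {u: (i, i + c) for u, i, c in zip(unique, indices, counts)}

    refGroupIds = [1, 1, 1, 2, 2, 5, 5, 5, 5]   # sorted, as after sort_order
    print(_ranges_in_list(refGroupIds))
    # -> {1: (0, 3), 2: (3, 5), 5: (5, 9)} (numpy ints); the caller stores
    #    StartRow = start and EndRow = end - 1 to keep the range inclusive.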
@@ -281,6 +317,24 @@ def _pathChanger(osPathFunc, metaTypeFunc, resource):
resource.resourceId = currentPath
metaTypeFunc(resource)
+def _copier(dest, resource, subfolder=None):
+ """Apply these two functions to the resource or ResourceId"""
+ if subfolder is None:
+ subfolder = [uuid.uuid4()]
+ resId = resource.resourceId
+ currentPath = urlparse(resId)
+ if currentPath.scheme == 'file' or not currentPath.scheme:
+ currentPath = currentPath.path
+ try:
+ currentPath = _fileCopy(dest, currentPath, uuid=resource.uniqueId)
+ subfolder[0] = resource.uniqueId
+ except AttributeError:
+ if subfolder:
+ currentPath = _fileCopy(dest, currentPath, uuid=subfolder[0])
+ else:
+ raise
+ resource.resourceId = currentPath
+
class DataSet(object):
"""The record containing the DataSet information, with possible type
@@ -425,10 +479,13 @@ class DataSet(object):
break
if not found:
allowed = self._metaTypeMapping().keys()
+ extension = fname.split('.')[-1]
raise IOError(errno.EIO,
"Cannot create {c} with resource of type "
- "{t}, only {a}".format(c=dsType, t=fname,
- a=allowed))
+ "'{t}' ({f}), only {a}".format(c=dsType,
+ t=extension,
+ f=fname,
+ a=allowed))
# State tracking:
self._cachedFilters = []
@@ -520,27 +577,28 @@ class DataSet(object):
# dataset for the first time
firstIn = True if len(self.externalResources) == 0 else False
+ if copyOnMerge:
+ result = self.copy()
+ else:
+ result = self
+
# Block on filters?
if (not firstIn and
not self.filters.testCompatibility(other.filters)):
log.warning("Filter incompatibility has blocked the merging "
"of two datasets")
return None
- else:
- self.addFilters(other.filters, underConstruction=True)
+ elif firstIn:
+ result.addFilters(other.filters, underConstruction=True)
# reset the filters, just in case
- self._cachedFilters = []
+ result._cachedFilters = []
# block on object metadata?
- self._checkObjMetadata(other.objMetadata)
+ result._checkObjMetadata(other.objMetadata)
# There is probably a cleaner way to do this:
- self.objMetadata.update(other.objMetadata)
- if copyOnMerge:
- result = self.copy()
- else:
- result = self
+ result.objMetadata.update(other.objMetadata)
# If this dataset has no subsets representing it, add self as a
# subdataset to the result
@@ -683,6 +741,22 @@ class DataSet(object):
self.objMetadata['UniqueId'] = newId
return newId
+ def copyTo(self, dest, relative=False):
+ """Doesn't resolve resource name collisions"""
+ ofn = None
+ dest = os.path.abspath(dest)
+ if not os.path.isdir(dest):
+ ofn = dest
+ dest = os.path.split(dest)[0]
+ # unfortunately file indices must have the same name as the file they
+ # index, so we carry around some state to store the most recent uuid
+ # seen. Good thing we do a depth first traversal!
+ state = [self.uuid]
+ resFunc = partial(_copier, dest, subfolder=state)
+ self._modResources(resFunc)
+ if not ofn is None:
+ self.write(ofn, relPaths=relative)
+
def copy(self, asType=None):
"""Deep copy the representation of this DataSet
@@ -958,15 +1032,16 @@ class DataSet(object):
results.append(newCopy)
return results
- def write(self, outFile, validate=True, modPaths=False,
- relPaths=False, pretty=True):
+ def write(self, outFile, validate=True, modPaths=None,
+ relPaths=None, pretty=True):
"""Write to disk as an XML file
Args:
:outFile: The filename of the xml file to be created
:validate: T/F (True) validate the ExternalResource ResourceIds
- :relPaths: T/F (False) make the ExternalResource ResourceIds
- relative instead of absolute filenames
+ :relPaths: T/F (None/no change) make the ExternalResource
+ ResourceIds relative instead of absolute filenames
+ :modPaths: DEPRECATED (T/F) allow paths to be modified
Doctest:
>>> import pbcore.data.datasets as data
@@ -980,8 +1055,17 @@ class DataSet(object):
>>> ds1 == ds2
True
"""
+ if not modPaths is None:
+ log.info("modPaths as a write argument is deprecated. Paths "
+ "aren't modified unless relPaths is explicitly set "
+ "to True or False. Will be removed in future versions.")
+ # make sure we keep the same effect for now, in case someone has
+ # something odd like modPaths=False, relPaths=True
+ if not modPaths:
+ relPaths = None
+
# fix paths if validate:
- if validate and modPaths:
+ if validate and not relPaths is None:
if relPaths:
self.makePathsRelative(os.path.dirname(outFile))
else:
@@ -1052,6 +1136,25 @@ class DataSet(object):
else:
self.metadata.append(statsMetadata)
+ def readsByName(self, query):
+ reads = _flatten([rr.readsByName(query)
+ for rr in self.resourceReaders()])
+ return sorted(reads, key=lambda a: a.readStart)
+
+ def loadMetadata(self, filename):
+ """Load pipeline metadata from a <moviename>.run.metadata.xml file.
+
+ Args:
+ :filename: the filename of a <moviename>.run.metadata.xml file
+
+ """
+ if isinstance(filename, basestring):
+ metadata = parseMetadata(str(filename))
+ else:
+ metadata = filename
+ self.addMetadata(metadata)
+ self.updateCounts()
+
def processFilters(self):
"""Generate a list of functions to apply to a read, all of which return
T/F. Each function is an OR filter, so any() true passes the read.
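
readsByName gathers matching reads from every resource reader and returns them in one sorted list, and loadMetadata merges the DataSetMetadata out of an instrument run.metadata.xml file into the dataset. A short usage sketch; the file names and movie name are placeholders, and the query syntax follows the new tests:

    # Usage sketch; paths and the movie name are placeholders.
    from pbcore.io import SubreadSet

    sset = SubreadSet("movie.subreadset.xml")

    # attach run-level metadata produced by the instrument:
    sset.loadMetadata("movie.run.metadata.xml")

    # fetch all subreads from one ZMW of one movie:
    for read in sset.readsByName("m54013_151205_032353/7/*"):
        print(read.holeNumber, read.readName)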
@@ -1303,7 +1406,6 @@ class DataSet(object):
"""
self.metadata.totalLength = -1
self.metadata.numRecords = -1
- self._countsUpdated = True
def addExternalResources(self, newExtResources, updateCount=True):
"""Add additional ExternalResource objects, ensuring no duplicate
@@ -1685,6 +1787,15 @@ class DataSet(object):
responses = self._pollResources(lambda x: x.baseFeaturesAvailable())
return self._unifyResponses(responses)
+ def hasPulseFeature(self, featureName):
+ responses = self._pollResources(
+ lambda x: x.hasPulseFeature(featureName))
+ return self._unifyResponses(responses)
+
+ def pulseFeaturesAvailable(self):
+ responses = self._pollResources(lambda x: x.pulseFeaturesAvailable())
+ return self._unifyResponses(responses)
+
@property
def sequencingChemistry(self):
responses = self._pollResources(lambda x: x.sequencingChemistry)
@@ -1753,20 +1864,26 @@ class DataSet(object):
def __getitem__(self, index):
"""Should respect filters for free, as _indexMap should only be
populated by filtered reads. Only pbi filters considered, however."""
+ if self._indexMap is None:
+ _ = self.index
if isinstance(index, int):
- if self._indexMap is None:
- _ = self.index
# support negatives
if index < 0:
index = len(self.index) + index
rrNo, recNo = self._indexMap[index]
return self.resourceReaders()[rrNo][recNo]
elif isinstance(index, slice):
- raise NotImplementedError()
+ indexTuples = self._indexMap[index]
+ return [self.resourceReaders()[ind[0]][ind[1]] for ind in
+ indexTuples]
elif isinstance(index, list):
indexTuples = [self._indexMap[ind] for ind in index]
return [self.resourceReaders()[ind[0]][ind[1]] for ind in
indexTuples]
+ elif isinstance(index, np.ndarray):
+ indexTuples = self._indexMap[index]
+ return [self.resourceReaders()[ind[0]][ind[1]] for ind in
+ indexTuples]
elif isinstance(index, str):
if 'id' in self.index.dtype.names:
row = np.nonzero(self.index.id == index)[0][0]
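
__getitem__ now materializes the index up front and accepts slices and numpy index arrays in addition to ints, lists of ints, and id strings, all resolved through _indexMap to the owning resource reader. A quick sketch of the new access patterns (the input XML is a placeholder):

    # Sketch of the indexing styles now supported; the XML is a placeholder.
    import numpy as np
    from pbcore.io import AlignmentSet

    aln = AlignmentSet("in.alignmentset.xml")

    rec = aln[0]              # single record (negative indices still work)
    some = aln[10:20]         # slice -> list of records
    picked = aln[[0, 5, 7]]   # list of row numbers
    near = aln[np.flatnonzero(aln.index.tStart < 1000)]   # numpy index array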
@@ -1775,22 +1892,6 @@ class DataSet(object):
raise NotImplementedError()
-class InvalidDataSetIOError(Exception):
- """The base class for all DataSetIO related custom exceptions
- """
-
-
-class ResourceMismatchError(InvalidDataSetIOError):
-
- def __init__(self, responses):
- super(ResourceMismatchError, self).__init__()
- self.responses = responses
-
- def __str__(self):
- return "Resources responded differently: " + ', '.join(
- map(str, self.responses))
-
-
class ReadSet(DataSet):
"""Base type for read sets, should probably never be used as a concrete
class"""
@@ -1809,7 +1910,8 @@ class ReadSet(DataSet):
if not res.bai:
newInds.append(_indexBam(fname))
self.close()
- res.addIndices(newInds)
+ if newInds:
+ res.addIndices(newInds)
self._populateMetaTypes()
self.updateCounts()
@@ -1852,7 +1954,15 @@ class ReadSet(DataSet):
if refFile:
if not refFile in sharedRefs:
log.debug("Using reference: {r}".format(r=refFile))
- sharedRefs[refFile] = IndexedFastaReader(refFile)
+ try:
+ sharedRefs[refFile] = IndexedFastaReader(refFile)
+ except IOError:
+ if not self._strict:
+ log.warn("Problem opening reference with"
+ "IndexedFastaReader")
+ sharedRefs[refFile] = None
+ else:
+ raise
location = urlparse(extRes.resourceId).path
resource = None
try:
@@ -2230,30 +2340,61 @@ class ReadSet(DataSet):
:numFiles: The number of data files to be produced.
"""
- if numFiles > 1:
- assert len(self.resourceReaders()) == len(self.toExternalFiles())
- resSizes = [[i, size[0], size[1]]
- for i, size in enumerate(self._resourceSizes())]
- chunks = self._chunkList(resSizes, numFiles, lambda x: x[1])
- resLists = []
- for chunk in chunks:
- thisResList = []
- for i in chunk:
- thisResList.append(self.toExternalFiles()[i[0]])
- resLists.append(thisResList)
- fnames = [_infixFname(dataFile, str(i)) for i in range(numFiles)]
- for resList, fname in zip(resLists, fnames):
- consolidateBams(resList, fname, filterDset=self, useTmp=useTmp)
+ references = [er.reference for er in self.externalResources if
+ er.reference]
+ if which('pbmerge'):
+ log.debug("Using pbmerge to consolidate")
+ dsets = self.split(zmws=True, chunks=numFiles)
+ if numFiles > 1:
+ fnames = [_infixFname(dataFile, str(i))
+ for i in range(numFiles)]
+ else:
+ fnames = [dataFile]
+ for chunk, fname in zip(dsets, fnames):
+ consolidateXml(chunk, fname, useTmp=useTmp)
log.debug("Replacing resources")
self.externalResources = ExternalResources()
self.addExternalResources(fnames)
+ self.induceIndices()
else:
- consolidateBams(self.toExternalFiles(), dataFile, filterDset=self,
- useTmp=useTmp)
- # TODO: add as subdatasets
- log.debug("Replacing resources")
- self.externalResources = ExternalResources()
- self.addExternalResources([dataFile])
+ if numFiles > 1:
+ assert (len(self.resourceReaders()) ==
+ len(self.toExternalFiles()))
+ resSizes = [[i, size[0], size[1]]
+ for i, size in enumerate(self._resourceSizes())]
+ chunks = self._chunkList(resSizes, numFiles, lambda x: x[1])
+ resLists = []
+ for chunk in chunks:
+ thisResList = []
+ for i in chunk:
+ thisResList.append(self.toExternalFiles()[i[0]])
+ resLists.append(thisResList)
+ fnames = [_infixFname(dataFile, str(i))
+ for i in range(numFiles)]
+ for resList, fname in zip(resLists, fnames):
+ consolidateBams(resList, fname, filterDset=self,
+ useTmp=useTmp)
+ log.debug("Replacing resources")
+ self.externalResources = ExternalResources()
+ self.addExternalResources(fnames)
+ else:
+ consolidateBams(self.toExternalFiles(), dataFile,
+ filterDset=self, useTmp=useTmp)
+ # TODO: remove subdatasets?
+ log.debug("Replacing resources")
+ self.externalResources = ExternalResources()
+ self.addExternalResources([dataFile])
+ # make sure reference gets passed through:
+ if references:
+ refCounts = dict(Counter(references))
+ if len(refCounts) > 1:
+ log.warn("Consolidating AlignmentSets with "
+ "different references, but BamReaders "
+ "can only have one. References will be "
+ "lost")
+ else:
+ for extres in self.externalResources:
+ extres.reference = refCounts.keys()[0]
# reset the indexmap especially, as it is out of date:
self._index = None
self._indexMap = None
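
consolidate now prefers pbmerge when it is on PATH: the set is split by ZMW into numFiles chunks, each chunk is merged via consolidateXml, and a shared reference (if all resources agree on one) is carried over; otherwise the older bamtools-based consolidateBams path is used. A hedged usage sketch, with the input path as a placeholder and the keyword name taken from the docstring above:

    # Usage sketch; pbmerge on PATH enables the fast path.
    from pbcore.io import AlignmentSet

    aln = AlignmentSet("in.alignmentset.xml")
    # merge all resources into a single BAM (or pass numFiles=2 to get
    # consolidated0.bam and consolidated1.bam via _infixFname):
    aln.consolidate("consolidated.bam")
    aln.write("consolidated.alignmentset.xml")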
@@ -2261,7 +2402,6 @@ class ReadSet(DataSet):
self._populateMetaTypes()
def updateCounts(self):
- self._countsUpdated = True
if self._skipCounts:
log.debug("SkipCounts is true, skipping updateCounts()")
self.metadata.totalLength = -1
@@ -2273,6 +2413,7 @@ class ReadSet(DataSet):
numRecords, totalLength = self._length
self.metadata.totalLength = totalLength
self.metadata.numRecords = numRecords
+ self._countsUpdated = True
except (IOError, UnavailableFeature):
if not self._strict:
log.debug("File problem, metadata not populated")
@@ -2331,7 +2472,6 @@ class HdfSubreadSet(ReadSet):
def updateCounts(self):
"""Overriding here so we don't have to assertIndexed"""
- self._countsUpdated = True
if self._skipCounts:
log.debug("SkipCounts is true, skipping updateCounts()")
self.metadata.totalLength = -1
@@ -2342,6 +2482,7 @@ class HdfSubreadSet(ReadSet):
numRecords, totalLength = self._length
self.metadata.totalLength = totalLength
self.metadata.numRecords = numRecords
+ self._countsUpdated = True
except (IOError, UnavailableFeature):
if not self._strict:
log.debug("File problem, metadata not populated")
@@ -2516,7 +2657,7 @@ class AlignmentSet(ReadSet):
"""
recArrays = []
- log.debug("Processing resource pbis")
+ log.debug("Processing resource indices")
_indexMap = []
for rrNum, rr in enumerate(self.resourceReaders()):
indices = rr.index
@@ -2560,11 +2701,13 @@ class AlignmentSet(ReadSet):
'tEnd',))
tbr = tbr[sort_order]
self._indexMap = self._indexMap[sort_order]
+ ranges = _ranges_in_list(tbr.RefGroupID)
for ref in self.referenceInfoTable:
- hits = np.flatnonzero(tbr.RefGroupID == ref.ID)
- if len(hits) != 0:
- ref.StartRow = hits[0]
- ref.EndRow = hits[-1]
+ bounds = ranges.get(ref.ID)
+ if bounds:
+ ref.StartRow = bounds[0]
+ # we want the ranges to be inclusive:
+ ref.EndRow = bounds[1] - 1
# and fix the naming scheme while we're at it
ref.Name = self._cleanCmpName(ref.FullName)
return tbr
@@ -3721,11 +3864,21 @@ class ContigSet(DataSet):
self._populateContigMetadata()
def _populateContigMetadata(self):
+ log.debug("Adding contig metadata...")
+ numrec = 0
+ totlen = 0
for contig in self.contigs:
self._metadata.addContig(contig)
+ numrec += 1
+ totlen += len(contig)
+ if not self._countsUpdated:
+ log.debug("Counts updated: numrec={n} totlen={l}".format(n=numrec,
+ l=totlen))
+ self.numRecords = numrec
+ self.totalLength = totlen
+ self._countsUpdated = True
def updateCounts(self):
- self._countsUpdated = True
if self._skipCounts:
if not self.metadata.totalLength:
self.metadata.totalLength = -1
@@ -3733,14 +3886,17 @@ class ContigSet(DataSet):
self.metadata.numRecords = -1
return
if not self.isIndexed:
- log.info("Cannot updateCounts without an index file")
- self.metadata.totalLength = 0
- self.metadata.numRecords = 0
+ if (not self.totalLength and not self.numRecords and not
+ self._countsUpdated):
+ log.info("Cannot updateCounts without an index file")
+ self.metadata.totalLength = 0
+ self.metadata.numRecords = 0
return
try:
log.debug('Updating counts')
self.metadata.totalLength = sum(self.index.length)
self.metadata.numRecords = len(self.index)
+ self._countsUpdated = True
except (IOError, UnavailableFeature, TypeError):
# IOError for missing files
# UnavailableFeature for files without companion files
@@ -3906,8 +4062,9 @@ class ContigSet(DataSet):
# index file
names = []
for contig in self.contigs:
- if (self.noFiltering
- or self._filters.testParam('id', contig.id, str)):
+ if self.noFiltering:
+ names.append(contig.id)
+ elif self._filters.testParam('id', contig.id, str):
names.append(contig.id)
return sorted(list(set(names)))
diff --git a/pbcore/io/dataset/DataSetMembers.py b/pbcore/io/dataset/DataSetMembers.py
index 05303a7..99090ae 100755
--- a/pbcore/io/dataset/DataSetMembers.py
+++ b/pbcore/io/dataset/DataSetMembers.py
@@ -205,20 +205,32 @@ class RecordWrapper(object):
**repr_d)
return rep
+ def __eq__(self, other):
+ """Does not take child order into account!!!!"""
+ if (sorted([c.record['tag'] for c in self]) !=
+ sorted([c.record['tag'] for c in other])):
+ return False
+ if self.__class__.__name__ != other.__class__.__name__:
+ return False
+ for attrib in ['metaname', 'namespace', 'metavalue', 'metadata']:
+ if getattr(self, attrib) != getattr(other, attrib):
+ return False
+ return True
+
def pop(self, index):
return self.record['children'].pop(index)
def merge(self, other):
pass
- def getMemberV(self, tag, container='text'):
+ def getMemberV(self, tag, container='text', default=None, asType=str):
"""Generic accessor for the contents of the children of this element,
without having to interface with them directly"""
try:
- return self.record['children'][self.index(str(tag))][
- str(container)]
- except KeyError:
- return None
+ return asType(self.record['children'][self.index(str(tag))][
+ str(container)])
+ except (KeyError, ValueError):
+ return default
def setMemberV(self, tag, value, container='text'):
"""Generic accessor for the contents of the children of this element,
@@ -420,8 +432,8 @@ class Filters(RecordWrapper):
if len(self) == 0:
return True
return all([sFilt == oFilt for sFilt, oFilt in zip(
- sorted(list(self)),
- sorted(list(other)))])
+ sorted(list(self), key=lambda x: x.metaname),
+ sorted(list(other), key=lambda x: x.metaname))])
def __nonzero__(self):
for filt in self:
@@ -557,6 +569,8 @@ class Filters(RecordWrapper):
def _bamTypeMap(self):
return {'rname': str,
'length': int,
+ 'qstart': int,
+ 'qend': int,
'qname': str,
'movie': str,
'zm': int,
@@ -622,6 +636,7 @@ class Filters(RecordWrapper):
# renamed:
accMap['movie'] = (lambda x: x.MovieID)
accMap['qname'] = (lambda x: x.MovieID)
+ accMap['zm'] = (lambda x: x.HoleNumber)
elif readType == 'fasta':
accMap = {'id': (lambda x: x.id),
'length': (lambda x: x.length.astype(int)),
@@ -654,6 +669,29 @@ class Filters(RecordWrapper):
operator = mapOp(req.operator)
reqResultsForRecords &= operator(
accMap[param](indexRecords), value)
+ elif param == 'qname':
+ movie, hole, span = value.split('/')
+ operator = mapOp(req.operator)
+
+ value = movieMap[movie]
+ reqResultsForRecords = operator(
+ accMap[param](indexRecords), value)
+
+ param = 'zm'
+ value = typeMap[param](hole)
+ reqResultsForRecords &= operator(
+ accMap[param](indexRecords), value)
+
+ param = 'qstart'
+ value = typeMap[param](span.split('_')[0])
+ reqResultsForRecords &= operator(
+ accMap[param](indexRecords), value)
+
+ param = 'qend'
+ value = typeMap[param](span.split('_')[1])
+ reqResultsForRecords &= operator(
+ accMap[param](indexRecords), value)
+
else:
operator = mapOp(req.operator)
reqResultsForRecords = operator(
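
A qname filter value of the form movie/holeNumber/qStart_qEnd is decomposed into four pbi comparisons that are ANDed together: the movie name is translated through movieMap to its numeric id, the hole number becomes a zm test, and the span becomes qstart/qend tests. A small sketch of that decomposition (movieMap here is a hypothetical movie-name-to-id mapping):

    # Sketch of the qname decomposition; movieMap is hypothetical and stands
    # in for the movie-name -> numeric-id map built inside Filters.
    def split_qname(value, movieMap):
        movie, hole, span = value.split('/')
        qstart, qend = span.split('_')
        return {'movie': movieMap[movie],
                'zm': int(hole),
                'qstart': int(qstart),
                'qend': int(qend)}

    movieMap = {'m140913_222218_42240_c1_s1_p0': 0}
    print(split_qname('m140913_222218_42240_c1_s1_p0/8/100_250', movieMap))
    # {'movie': 0, 'zm': 8, 'qstart': 100, 'qend': 250}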
@@ -977,6 +1015,10 @@ class ExternalResource(RecordWrapper):
return True
return False
+ @property
+ def uniqueId(self):
+ return self.getV('attrib', 'UniqueId')
+
def merge(self, other):
if self.metaType:
if self.metaType != other.metaType:
@@ -1052,6 +1094,14 @@ class ExternalResource(RecordWrapper):
self._setSubResByMetaType('PacBio.SubreadFile.ScrapsBamFile', value)
@property
+ def control(self):
+ return self._getSubResByMetaType('PacBio.SubreadFile.Control.SubreadBamFile')
+
+ @control.setter
+ def control(self, value):
+ self._setSubResByMetaType('PacBio.SubreadFile.Control.SubreadBamFile', value)
+
+ @property
def barcodes(self):
return self._getSubResByMetaType("PacBio.DataSet.BarcodeSet")
@@ -1076,8 +1126,11 @@ class ExternalResource(RecordWrapper):
return res.resourceId
def _setSubResByMetaType(self, mType, value):
- tmp = ExternalResource()
- tmp.resourceId = value
+ if not isinstance(value, ExternalResource):
+ tmp = ExternalResource()
+ tmp.resourceId = value
+ else:
+ tmp = value
tmp.metaType = mType
tmp.timeStampedName = getTimeStampedName(mType)
resources = self.externalResources
@@ -1122,11 +1175,11 @@ class ExternalResource(RecordWrapper):
fileIndices = FileIndices(fileIndices[0])
else:
fileIndices = FileIndices()
+ self.append(fileIndices)
for index in list(indices):
temp = FileIndex()
temp.resourceId = index
fileIndices.append(temp)
- self.append(fileIndices)
class FileIndices(RecordWrapper):
@@ -1203,10 +1256,7 @@ class DataSetMetadata(RecordWrapper):
def numRecords(self):
"""Return the number of records in a DataSet using helper functions
defined in the base class"""
- try:
- return int(self.getMemberV('NumRecords'))
- except ValueError:
- return 0
+ return self.getMemberV('NumRecords', default=0, asType=int)
@numRecords.setter
def numRecords(self, value):
@@ -1218,10 +1268,7 @@ class DataSetMetadata(RecordWrapper):
"""Return the TotalLength property of this dataset.
TODO: update the value from the actual external reference on
ValueError"""
- try:
- return int(self.getMemberV('TotalLength'))
- except ValueError:
- return 0
+ return self.getMemberV('TotalLength', default=0, asType=int)
@totalLength.setter
def totalLength(self, value):
@@ -1320,10 +1367,7 @@ class ContigSetMetadata(DataSetMetadata):
@property
def organism(self):
- try:
- return self.getMemberV('Organism')
- except ValueError:
- return None
+ return self.getMemberV('Organism')
@organism.setter
def organism(self, value):
@@ -1331,10 +1375,7 @@ class ContigSetMetadata(DataSetMetadata):
@property
def ploidy(self):
- try:
- return self.getMemberV('Ploidy')
- except ValueError:
- return None
+ return self.getMemberV('Ploidy')
@ploidy.setter
def ploidy(self, value):
@@ -1381,10 +1422,7 @@ class BarcodeSetMetadata(DataSetMetadata):
@property
def barcodeConstruction(self):
- try:
- return self.getMemberV('BarcodeConstruction')
- except ValueError:
- return None
+ return self.getMemberV('BarcodeConstruction')
@barcodeConstruction.setter
def barcodeConstruction(self, value):
@@ -1550,32 +1588,35 @@ class StatsMetadata(RecordWrapper):
"""The metadata from the machine sts.xml"""
def merge(self, other):
- self.shortInsertFraction = (self.shortInsertFraction *
- self.prodDist.bins[1] +
- other.shortInsertFraction *
- other.prodDist.bins[1])/(
- self.prodDist.bins[1]
- + other.prodDist.bins[1])
- self.adapterDimerFraction = (self.adapterDimerFraction *
- self.prodDist.bins[1] +
- other.adapterDimerFraction *
- other.prodDist.bins[1])/(
- self.prodDist.bins[1]
- + other.prodDist.bins[1])
+ if other.shortInsertFraction and other.prodDist:
+ self.shortInsertFraction = (self.shortInsertFraction *
+ self.prodDist.bins[1] +
+ other.shortInsertFraction *
+ other.prodDist.bins[1])/(
+ self.prodDist.bins[1]
+ + other.prodDist.bins[1])
+ if other.adapterDimerFraction and other.prodDist:
+ self.adapterDimerFraction = (self.adapterDimerFraction *
+ self.prodDist.bins[1] +
+ other.adapterDimerFraction *
+ other.prodDist.bins[1])/(
+ self.prodDist.bins[1]
+ + other.prodDist.bins[1])
self.numSequencingZmws += other.numSequencingZmws
for dist in DISTLIST:
selfDist = getattr(self, dist[0].lower() + dist[1:])
otherDist = getattr(other, dist[0].lower() + dist[1:])
- try:
- selfDist.merge(otherDist)
- except ZeroBinWidthError as e:
- removed = self.removeChildren(dist)
- self.append(otherDist)
- except BinMismatchError:
- self.append(otherDist)
- except ValueError:
+ if not selfDist:
if otherDist:
self.append(otherDist)
+ else:
+ try:
+ selfDist.merge(otherDist)
+ except ZeroBinWidthError as e:
+ removed = self.removeChildren(dist)
+ self.append(otherDist)
+ except BinMismatchError:
+ self.append(otherDist)
@property
def prodDist(self):
@@ -1609,10 +1650,20 @@ class StatsMetadata(RecordWrapper):
'InsertReadQualDist'))
@property
+ def insertReadLenDists(self):
+ return [ContinuousDistribution(child) for child in
+ self.findChildren('InsertReadLenDist')]
+
+ @property
def insertReadLenDist(self):
return ContinuousDistribution(self.getV('children',
'InsertReadLenDist'))
@property
+ def insertReadQualDists(self):
+ return [ContinuousDistribution(child) for child in
+ self.findChildren('InsertReadQualDist')]
+
+ @property
def controlReadQualDist(self):
return ContinuousDistribution(self.getV('children',
'ControlReadQualDist'))
@@ -1633,7 +1684,7 @@ class StatsMetadata(RecordWrapper):
@property
def adapterDimerFraction(self):
- return float(self.getMemberV('AdapterDimerFraction'))
+ return self.getMemberV('AdapterDimerFraction', asType=float)
@adapterDimerFraction.setter
def adapterDimerFraction(self, value):
@@ -1641,7 +1692,7 @@ class StatsMetadata(RecordWrapper):
@property
def numSequencingZmws(self):
- return float(self.getMemberV('NumSequencingZmws'))
+ return self.getMemberV('NumSequencingZmws', asType=float)
@numSequencingZmws.setter
def numSequencingZmws(self, value):
@@ -1649,7 +1700,7 @@ class StatsMetadata(RecordWrapper):
@property
def shortInsertFraction(self):
- return float(self.getMemberV('ShortInsertFraction'))
+ return self.getMemberV('ShortInsertFraction', asType=float)
@shortInsertFraction.setter
def shortInsertFraction(self, value):
@@ -1697,7 +1748,7 @@ class ContinuousDistribution(RecordWrapper):
@property
def numBins(self):
- return int(self.getMemberV('NumBins'))
+ return self.getMemberV('NumBins', asType=int)
@numBins.setter
def numBins(self, value):
@@ -1705,27 +1756,27 @@ class ContinuousDistribution(RecordWrapper):
@property
def sampleSize(self):
- return int(self.getMemberV('SampleSize'))
+ return self.getMemberV('SampleSize', asType=int)
@property
def sampleMean(self):
- return float(self.getMemberV('SampleMean'))
+ return self.getMemberV('SampleMean', asType=float)
@property
def sampleMed(self):
- return float(self.getMemberV('SampleMed'))
+ return self.getMemberV('SampleMed', asType=float)
@property
def sampleStd(self):
- return float(self.getMemberV('SampleStd'))
+ return self.getMemberV('SampleStd', asType=float)
@property
def sample95thPct(self):
- return float(self.getMemberV('Sample95thPct'))
+ return self.getMemberV('Sample95thPct', asType=float)
@property
def binWidth(self):
- return float(self.getMemberV('BinWidth'))
+ return self.getMemberV('BinWidth', asType=float)
@binWidth.setter
def binWidth(self, value):
@@ -1733,15 +1784,15 @@ class ContinuousDistribution(RecordWrapper):
@property
def minOutlierValue(self):
- return float(self.getMemberV('MinOutlierValue'))
+ return self.getMemberV('MinOutlierValue', asType=float)
@property
def maxOutlierValue(self):
- return float(self.getMemberV('MaxOutlierValue'))
+ return self.getMemberV('MaxOutlierValue', asType=float)
@property
def minBinValue(self):
- return float(self.getMemberV('MinBinValue'))
+ return self.getMemberV('MinBinValue', asType=float)
@minBinValue.setter
def minBinValue(self, value):
@@ -1749,7 +1800,7 @@ class ContinuousDistribution(RecordWrapper):
@property
def maxBinValue(self):
- return float(self.getMemberV('MaxBinValue'))
+ return self.getMemberV('MaxBinValue', asType=float)
@maxBinValue.setter
def maxBinValue(self, value):
@@ -1838,7 +1889,7 @@ class DiscreteDistribution(RecordWrapper):
@property
def numBins(self):
- return int(self.getMemberV('NumBins'))
+ return self.getMemberV('NumBins', asType=int)
@property
def bins(self):
@@ -1876,6 +1927,10 @@ class RunDetailsMetadata(RecordWrapper):
def name(self):
return self.getMemberV('Name')
+ @name.setter
+ def name(self, value):
+ return self.setMemberV('Name', value)
+
class WellSampleMetadata(RecordWrapper):
diff --git a/pbcore/io/dataset/DataSetReader.py b/pbcore/io/dataset/DataSetReader.py
index c33ddb1..1ba4c27 100755
--- a/pbcore/io/dataset/DataSetReader.py
+++ b/pbcore/io/dataset/DataSetReader.py
@@ -11,6 +11,7 @@ from pbcore.io.dataset.DataSetMembers import (ExternalResource,
Filters, AutomationParameters,
StatsMetadata, DISTLIST)
from pbcore.io.dataset.DataSetWriter import _eleFromDictList
+from pbcore.io.dataset.DataSetErrors import InvalidDataSetIOError
log = logging.getLogger(__name__)
@@ -98,10 +99,22 @@ def _addUnknownFile(dset, path):
else:
return _addGenericFile(dset, path)
+SUB_RESOURCES = ['.scraps.bam', '.control.subreads.bam']
+FILE_INDICES = ['.fai', '.pbi', '.bai', '.metadata.xml',
+ '.index', '.contig.index', '.sa']
+
# TODO needs namespace
def _addGenericFile(dset, path):
"""Create and populate an Element object, put it in an available members
dictionary, return"""
+ # filter out resource file types that aren't top level:
+ # if we want to exclude scraps as well:
+ for ext in SUB_RESOURCES + FILE_INDICES:
+ if path.endswith(ext):
+ log.debug('Sub resource file {f} given as regular file, '
+ 'will be treated '
+ 'as a sub resource file instead'.format(f=path))
+ return
extRes = wrapNewResource(path)
extRess = ExternalResources()
extRess.append(extRes)
@@ -110,9 +123,8 @@ def _addGenericFile(dset, path):
# TODO needs namespace
def wrapNewResource(path):
- possible_indices = ['.fai', '.pbi', '.bai', '.metadata.xml',
- '.index', '.contig.index', '.sa']
- for ext in possible_indices:
+ # filter out non-resource file types:
+ for ext in FILE_INDICES:
if path.endswith(ext):
log.debug('Index file {f} given as regular file, will be treated '
' as an index file instead'.format(f=path))
@@ -120,10 +132,18 @@ def wrapNewResource(path):
extRes = ExternalResource()
path = resolveLocation(path)
extRes.resourceId = path
- index_files = [path + ext for ext in possible_indices if
+ index_files = [path + ext for ext in FILE_INDICES if
os.path.exists(path + ext)]
if index_files:
extRes.addIndices(index_files)
+
+ # Check for sub resources:
+ for ext in SUB_RESOURCES:
+ filen = '.'.join(path.split('.')[:-2]) + ext
+ # don't want to add e.g. scraps to scraps:
+ if os.path.exists(filen) and path.endswith('subreads.bam'):
+ subres = wrapNewResource(filen)
+ setattr(extRes, ext.split('.')[1], subres)
return extRes
@@ -255,3 +275,19 @@ def parseStats(filename):
stats.pruneChildrenTo(whitelist)
return stats
+def parseMetadata(filename):
+ url = urlparse(filename)
+ fileLocation = url.path.strip()
+ if url.netloc:
+ fileLocation = url.netloc
+ tree = ET.parse(fileLocation)
+ dsm_tag = (".//{http://pacificbiosciences.com/PacBioDatasets.xsd}"
+ "DataSetMetadata")
+ try:
+ metadata = _parseXmlDataSetMetadata(tree.getroot().find(dsm_tag))
+ except AttributeError:
+ # the tag wasn't found, we're trying to do something with None
+ raise InvalidDataSetIOError("Unable to parse metadata from "
+ "{f}".format(f=filename))
+ return metadata
+
diff --git a/pbcore/io/dataset/utils.py b/pbcore/io/dataset/utils.py
index 6881555..d9b352b 100755
--- a/pbcore/io/dataset/utils.py
+++ b/pbcore/io/dataset/utils.py
@@ -11,6 +11,50 @@ from pbcore.util.Process import backticks
log = logging.getLogger(__name__)
+def which(exe):
+ if os.path.exists(exe) and os.access(exe, os.X_OK):
+ return exe
+ path = os.getenv('PATH')
+ for this_path in path.split(os.path.pathsep):
+ this_path = os.path.join(this_path, exe)
+ if os.path.exists(this_path) and os.access(this_path, os.X_OK):
+ return this_path
+ return None
+
+def consolidateXml(indset, outbam, useTmp=True):
+ tmpout = tempfile.mkdtemp(suffix="consolidate-xml")
+ tmp_xml = os.path.join(tmpout,
+ "orig.{t}.xml".format(
+ t=indset.__class__.__name__.lower()))
+
+ final_free_space = disk_free(os.path.split(outbam)[0])
+ projected_size = sum(file_size(infn)
+ for infn in indset.toExternalFiles())
+ log.debug("Projected size: {p}".format(p=projected_size))
+ log.debug("In place free space: {f}".format(f=final_free_space))
+ # give it a 5% buffer
+ if final_free_space < (projected_size * 1.05):
+ raise RuntimeError("No space available to consolidate")
+ if useTmp:
+ tmp_free_space = disk_free(tmpout)
+ log.debug("Tmp free space (need ~2x): {f}".format(f=tmp_free_space))
+ # need 2x for tmp in and out, plus 10% buffer
+ if tmp_free_space > (projected_size * 2.1):
+ log.debug("Using tmp storage: " + tmpout)
+ indset.copyTo(tmp_xml)
+ origOutBam = outbam
+ outbam = os.path.join(tmpout, "outfile.bam")
+ else:
+ log.debug("Using in place storage")
+ indset.write(tmp_xml)
+ useTmp = False
+ _pbmergeXML(tmp_xml, outbam)
+ if useTmp:
+ shutil.copy(outbam, origOutBam)
+ # cleanup:
+ shutil.rmtree(tmpout)
+ return outbam
+
def consolidateBams(inFiles, outFile, filterDset=None, useTmp=True):
"""Take a list of infiles, an outFile to produce, and optionally a dataset
(filters) to provide the definition and content of filtrations."""
@@ -127,11 +171,11 @@ def _sortBam(fname):
shutil.move(tmpname, fname)
def _indexBam(fname):
- pysam.index(fname)
+ pysam.samtools.index(fname, catch_stdout=False)
return fname + ".bai"
def _indexFasta(fname):
- pysam.faidx(fname)
+ pysam.samtools.faidx(fname, catch_stdout=False)
return fname + ".fai"
def _mergeBams(inFiles, outFile):
@@ -146,6 +190,15 @@ def _mergeBams(inFiles, outFile):
else:
shutil.copy(inFiles[0], outFile)
+def _pbmergeXML(indset, outbam):
+ cmd = "pbmerge -o {o} {i} ".format(i=indset,
+ o=outbam)
+ log.info(cmd)
+ o, r, m = backticks(cmd)
+ if r != 0:
+ raise RuntimeError(m)
+ return outbam
+
def _filterBam(inFile, outFile, filterDset):
BamtoolsVersion().check()
tmpout = tempfile.mkdtemp(suffix="consolidation-filtration")
@@ -162,6 +215,30 @@ def _infixFname(fname, infix):
prefix, extension = os.path.splitext(fname)
return prefix + str(infix) + extension
+def _earlyInfixFname(fname, infix):
+ path, name = os.path.split(fname)
+ tokens = name.split('.')
+ tokens.insert(1, str(infix))
+ return os.path.join(path, '.'.join(tokens))
+
+def _swapPath(dest, infile):
+ return os.path.join(dest, os.path.split(infile)[1])
+
+def _fileCopy(dest, infile, uuid=None):
+ fn = _swapPath(dest, infile)
+ if os.path.exists(fn):
+ if uuid is None:
+ raise IOError("File name exists in destination: "
+ "{f}".format(f=infile))
+ subdir = os.path.join(dest, uuid)
+ if not os.path.exists(subdir):
+ os.mkdir(subdir)
+ fn = _swapPath(subdir, fn)
+ shutil.copy(infile, fn)
+ assert os.path.exists(fn)
+ return fn
+
+
def _emitFilterScript(filterDset, filtScriptName):
"""Use the filter script feature of bamtools. Use with specific filters if
all that are needed are available, otherwise filter by readname (easy but
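
_fileCopy is what makes copyTo collision-safe: when the bare file name already exists in dest, the copy is redirected into a per-dataset uuid subdirectory (copyTo threads the uuid through so an index file lands next to the file it indexes). A worked example of the collision path, using the helpers as added above:

    # Worked example; directory and file names are temporary placeholders.
    import os, shutil, tempfile

    def _swapPath(dest, infile):
        return os.path.join(dest, os.path.split(infile)[1])

    def _fileCopy(dest, infile, uuid=None):
        fn = _swapPath(dest, infile)
        if os.path.exists(fn):
            if uuid is None:
                raise IOError("File name exists in destination: "
                              "{f}".format(f=infile))
            subdir = os.path.join(dest, uuid)
            if not os.path.exists(subdir):
                os.mkdir(subdir)
            fn = _swapPath(subdir, fn)
        shutil.copy(infile, fn)
        return fn

    dest = tempfile.mkdtemp()
    src = tempfile.NamedTemporaryFile(suffix=".bam", delete=False).name
    print(_fileCopy(dest, src, uuid="0123-abcd"))  # dest/<name>.bam
    print(_fileCopy(dest, src, uuid="0123-abcd"))  # dest/0123-abcd/<name>.bam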
diff --git a/requirements.txt b/requirements.txt
index 357619b..0182647 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
cython
numpy >= 1.7.1
h5py >= 2.0.1
-pysam == 0.8.4
+pysam >= 0.9.0
diff --git a/tests/test_pbcore_io_AlnFileReaders.py b/tests/test_pbcore_io_AlnFileReaders.py
index df0e005..2a128d7 100644
--- a/tests/test_pbcore_io_AlnFileReaders.py
+++ b/tests/test_pbcore_io_AlnFileReaders.py
@@ -7,6 +7,7 @@ from nose.tools import (nottest,
from nose import SkipTest
import tempfile
+import shutil
import pysam
import numpy as np
import bisect
@@ -426,12 +427,30 @@ class TestIndexedBam(_IndexedAlnFileReaderTests):
EQ(len(bam), 0)
def test_alignment_identity(self):
+ """
+ Check that the values of the 'identity' property are consistent
+ between IndexedBamReader (numpy array) and BamAlignment (float)
+ """
fn = data.getBamAndCmpH5()[0]
with IndexedBamReader(fn) as bam_in:
i1 = bam_in.identity
i2 = np.array([ rec.identity for rec in bam_in ])
EQ((i2 == i1).all(), True)
+ def test_alignment_identity_unindexed(self):
+ """
+ Check that the value of the 'identity' property is the same whether
+ or not the .pbi index was used to calculate it.
+ """
+ fn1 = data.getBamAndCmpH5()[0]
+ fn2 = tempfile.NamedTemporaryFile(suffix=".bam").name
+ shutil.copyfile(fn1, fn2)
+ with IndexedBamReader(fn1) as bam_pbi:
+ with BamReader(fn2) as bam_noindex:
+ i1 = np.array([ rec.identity for rec in bam_pbi ])
+ i2 = np.array([ rec.identity for rec in bam_noindex ])
+ EQ((i2 == i1).all(), True)
+
class TestCCSBam(object):
def __init__(self):
diff --git a/tests/test_pbdataset.py b/tests/test_pbdataset.py
index 13ea083..dd3577b 100644
--- a/tests/test_pbdataset.py
+++ b/tests/test_pbdataset.py
@@ -8,6 +8,7 @@ import tempfile
import numpy as np
import unittest
import shutil
+import copy
from random import shuffle
from unittest.case import SkipTest
@@ -17,7 +18,8 @@ from pbcore.io.dataset.utils import BamtoolsVersion
from pbcore.io import (DataSet, SubreadSet, ReferenceSet, AlignmentSet,
openDataSet, DataSetMetaTypes, HdfSubreadSet,
ConsensusReadSet, ConsensusAlignmentSet, openDataFile,
- divideKeys, keysToRanges)
+ divideKeys, keysToRanges, getDataSetUuid,
+ getDataSetMetaType)
from pbcore.io.dataset.DataSetIO import _dsIdToSuffix, InvalidDataSetIOError
from pbcore.io.dataset.DataSetMembers import ExternalResource, Filters
from pbcore.io.dataset.DataSetValidator import validateFile
@@ -80,7 +82,6 @@ class TestDataSet(unittest.TestCase):
self.assertEquals(ds1.numExternalResources, 2)
ds1 = DataSet(data.getFofn())
self.assertEquals(ds1.numExternalResources, 2)
- # New! Use the correct constructor:
self.assertEquals(type(SubreadSet(data.getSubreadSet(),
skipMissing=True)).__name__,
'SubreadSet')
@@ -149,6 +150,49 @@ class TestDataSet(unittest.TestCase):
self.assertEqual(merged.subdatasets[1].toExternalFiles(),
AlignmentSet(data.getXml(11)).toExternalFiles())
+ # No filters, 3 files:
+ ds1 = AlignmentSet(data.getXml(8))
+ self.assertEqual(len(ds1.subdatasets), 0)
+ ds2 = AlignmentSet(data.getXml(11))
+ self.assertEqual(len(ds2.subdatasets), 0)
+ ds3 = AlignmentSet(data.getXml(11))
+ self.assertEqual(len(ds3.subdatasets), 0)
+ ds3.externalResources[0].resourceId = "/blah.bam"
+ ds4 = ds1 + ds2 + ds3
+ self.assertEqual(len(ds4.externalResources), 3)
+ self.assertEqual(len(ds4.subdatasets), 3)
+
+ # Filters, 3 files:
+ ds1 = AlignmentSet(data.getXml(8))
+ self.assertEqual(len(ds1.subdatasets), 0)
+ ds1.filters.addRequirement(rq=[('>', 0.8)])
+ ds2 = AlignmentSet(data.getXml(11))
+ self.assertEqual(len(ds2.subdatasets), 0)
+ ds2.filters.addRequirement(rq=[('>', 0.8)])
+ ds3 = AlignmentSet(data.getXml(11))
+ self.assertEqual(len(ds3.subdatasets), 0)
+ ds3.externalResources[0].resourceId = "/blah.bam"
+ ds3.filters.addRequirement(rq=[('>', 0.8)])
+ ds4 = ds1 + ds2 + ds3
+ self.assertEqual(len(ds4.externalResources), 3)
+ self.assertEqual(len(ds4.subdatasets), 3)
+ self.assertEqual(str(ds4.filters), '( rq > 0.8 )')
+ for sss in ds4.subdatasets:
+ self.assertEqual(str(sss.filters), '( rq > 0.8 )')
+ with self.assertRaises(TypeError):
+ # mismatched Filters, 3 files:
+ ds1 = AlignmentSet(data.getXml(8))
+ self.assertEqual(len(ds1.subdatasets), 0)
+ ds1.filters.addRequirement(rq=[('>', 0.8)])
+ ds2 = AlignmentSet(data.getXml(11))
+ self.assertEqual(len(ds2.subdatasets), 0)
+ ds2.filters.addRequirement(rq=[('>', 0.7)])
+ ds3 = AlignmentSet(data.getXml(11))
+ self.assertEqual(len(ds3.subdatasets), 0)
+ ds3.externalResources[0].resourceId = "/blah.bam"
+ ds3.filters.addRequirement(rq=[('>', 0.8)])
+ ds4 = ds1 + ds2 + ds3
+
def test_empty_metatype(self):
inBam = data.getBam()
d = DataSet(inBam)
@@ -526,6 +570,42 @@ class TestDataSet(unittest.TestCase):
@unittest.skipIf(not _internal_data(),
"Internal data not available")
+ def test_scraps_detection(self):
+ path = ('/pbi/dept/secondary/siv/testdata/SA3-Sequel/'
+ 'lambda/315/3150128/r54008_20160308_001811/'
+ '2_B01/m54008_160308_053311.')
+ subreads = path + 'subreads.bam'
+ control = path + 'control.subreads.bam'
+ controlscraps = path + 'control.scraps.bam'
+ scraps = path + 'scraps.bam'
+ subreadspbi = subreads + '.pbi'
+ scrapspbi = scraps + '.pbi'
+
+ filesets = [[subreads],
+ [subreads, scraps],
+ [subreads, subreadspbi],
+ [subreads, scrapspbi]]
+
+ for files in filesets:
+ sset = SubreadSet(*files, strict=True)
+ self.assertEqual(len(sset.externalResources), 1)
+ self.assertEqual(sset.externalResources[0].resourceId, subreads)
+ self.assertEqual(sset.externalResources[0].scraps, scraps)
+ self.assertEqual(sset.externalResources[0].control, control)
+ self.assertEqual(
+ sset.externalResources[0].externalResources[0].resourceId,
+ scraps)
+ self.assertEqual(
+ sset.externalResources[0].externalResources[1].resourceId,
+ control)
+ self.assertEqual(
+ sset.externalResources[
+ 0].externalResources[1].externalResources[0].resourceId,
+ controlscraps)
+
+
+ @unittest.skipIf(not _internal_data(),
+ "Internal data not available")
def test_referenceInfoTableMerging(self):
log.info("Testing refIds, etc. after merging")
bam1 = ("/pbi/dept/secondary/siv/testdata/SA3-RS/ecoli/"
@@ -650,8 +730,9 @@ class TestDataSet(unittest.TestCase):
ds1.index.holeNumber < rg[1],
ds1.index.holeNumber > rg[0]))[0]), 0)
- @unittest.skipUnless(os.path.isdir("/pbi/dept/secondary/siv/testdata"),
- "Missing testadata directory")
+ #@unittest.skipUnless(os.path.isdir("/pbi/dept/secondary/siv/testdata"),
+ # "Missing testadata directory")
+ @unittest.skip("Too expensive")
def test_large_split_zmws(self):
N_RECORDS = 959539
test_file = ("/pbi/dept/secondary/siv/testdata/SA3-DS/lambda/"
@@ -727,6 +808,40 @@ class TestDataSet(unittest.TestCase):
@unittest.skipUnless(os.path.isdir("/pbi/dept/secondary/siv/testdata"),
"Missing testadata directory")
+ def test_multi_movie_readsByName(self):
+ N_RECORDS = 1745161
+ test_file_1 = ("/pbi/dept/secondary/siv/testdata/SA3-DS/lambda/"
+ "2372215/0007/Analysis_Results/m150404_101626_42"
+ "267_c100807920800000001823174110291514_s1_p0.al"
+ "l.subreadset.xml")
+ test_file_2 = ("/pbi/dept/secondary/siv/testdata/SA3-DS/lambda/"
+ "2590980/0008/Analysis_Results/m141115_075238_et"
+ "han_c100699872550000001823139203261572_s1_p0.al"
+ "l.subreadset.xml")
+ ds1 = SubreadSet(test_file_1, test_file_2)
+ self.assertEqual(len(ds1), N_RECORDS)
+ queries = [('m150404_101626_42267_c1008079208000'
+ '00001823174110291514_s1_p0/7/*', 2),
+ ('m141115_075238_ethan_c1006998725500'
+ '00001823139203261572_s1_p0/9/*', 39),
+ ]
+ for query, count in queries:
+ reads = ds1.readsByName(query)
+ self.assertEqual(len(reads), count)
+ parts = query.split('/')
+ movie = parts[0]
+ hn = int(parts[1])
+ if len(parts) > 2:
+ qrange = parts[2]
+ for read in reads:
+ self.assertEqual(read.movieName, movie)
+ self.assertEqual(read.holeNumber, hn)
+ # TODO: test qrange/ccs
+
+
+ #@unittest.skipUnless(os.path.isdir("/pbi/dept/secondary/siv/testdata"),
+ # "Missing testadata directory")
+ @unittest.skip("Too expensive")
def test_large_pbi(self):
pbiFn = ('/pbi/dept/secondary/siv/testdata/SA3-DS/lambda/simulated'
'/100Gb/alnsubreads/pbalchemy_100Gb_Seq_sim1_p0.'
@@ -842,6 +957,85 @@ class TestDataSet(unittest.TestCase):
ds2._metadata.totalLength += 100000
self.assertEquals(ds2._metadata.totalLength, 200000)
+ @unittest.skipIf(not _internal_data(),
+ "Internal data not available")
+ def test_loadMetadata(self):
+ aln = AlignmentSet(data.getXml(no=8))
+ self.assertFalse(aln.metadata.collections)
+ aln.loadMetadata('/pbi/dept/secondary/siv/testdata/'
+ 'SA3-Sequel/lambda/roche_SAT/'
+ 'm54013_151205_032353.run.metadata.xml')
+ self.assertTrue(aln.metadata.collections)
+ sset_fn = ('/pbi/dept/secondary/siv/testdata/'
+ 'SA3-Sequel/lambda/roche_SAT/'
+ 'm54013_151205_032353.subreadset.xml')
+ sset = SubreadSet(sset_fn)
+ orig_metadata = copy.deepcopy(sset.metadata)
+ sset.metadata.collections = None
+ self.assertFalse(sset.metadata.collections)
+ sset.loadMetadata('/pbi/dept/secondary/siv/testdata/'
+ 'SA3-Sequel/lambda/roche_SAT/'
+ 'm54013_151205_032353.run.metadata.xml')
+ stack = zip(sset.metadata, orig_metadata)
+ fn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
+ sset.write(fn)
+ validateFile(fn)
+ validateFile(sset_fn)
+ self.assertEqual(sset.metadata, orig_metadata)
+
+
+ # load the wrong thing...
+ sset_fn = ('/pbi/dept/secondary/siv/testdata/'
+ 'SA3-Sequel/lambda/roche_SAT/'
+ 'm54013_151205_032353.subreadset.xml')
+ sset = SubreadSet(sset_fn)
+ orig_metadata = copy.deepcopy(sset.metadata)
+ sset.metadata.collections = None
+ self.assertFalse(sset.metadata.collections)
+ with self.assertRaises(InvalidDataSetIOError):
+ sset.loadMetadata('/pbi/dept/secondary/siv/testdata/'
+ 'SA3-Sequel/lambda/roche_SAT/'
+ 'm54013_151205_032353.sts.xml')
+
+
+ def test_copyTo(self):
+ aln = AlignmentSet(data.getXml(no=8), strict=True)
+ explen = len(aln)
+ fn = tempfile.NamedTemporaryFile(suffix=".alignmentset.xml").name
+ aln.copyTo(fn)
+ aln.close()
+ del aln
+ aln = AlignmentSet(fn, strict=True)
+ self.assertEqual(explen, len(aln))
+
+ outdir = tempfile.mkdtemp(suffix="dataset-unittest")
+ aln.copyTo(outdir)
+ fn = os.path.join(outdir, "test.alignmentset.xml")
+ aln.write(fn)
+ aln.close()
+ del aln
+ aln = AlignmentSet(fn, strict=True)
+ self.assertEqual(explen, len(aln))
+
+ # do it twice to same dir to induce collisions
+ aln = AlignmentSet(data.getXml(no=8), strict=True)
+ outdir = tempfile.mkdtemp(suffix="dataset-unittest")
+ aln.copyTo(outdir)
+ fn = os.path.join(outdir, "test.alignmentset.xml")
+ aln.write(fn)
+
+ aln = AlignmentSet(data.getXml(no=8), strict=True)
+ aln.copyTo(outdir)
+ fn2 = os.path.join(outdir, "test2.alignmentset.xml")
+ aln.write(fn2)
+
+ aln = AlignmentSet(fn, strict=True)
+ aln2 = AlignmentSet(fn2, strict=True)
+ self.assertEqual(explen, len(aln))
+ self.assertEqual(explen, len(aln2))
+ self.assertNotEqual(sorted(aln.toExternalFiles()),
+ sorted(aln2.toExternalFiles()))
+
def test_addExternalResources(self):
ds = DataSet()
er1 = ExternalResource()
@@ -979,7 +1173,7 @@ class TestDataSet(unittest.TestCase):
def test_reads_in_range_order(self):
log.debug("Testing with one file")
testFile = ("/pbi/dept/secondary/siv/testdata/SA3-DS/lambda/"
- "2372215/0007/Alignment_Results/m150404_101"
+ "2372215/0007_tiny/Alignment_Results/m150404_101"
"626_42267_c1008079208000000018231741102915"
"14_s1_p0.1.alignmentset.xml")
aln = AlignmentSet(testFile)
@@ -991,11 +1185,11 @@ class TestDataSet(unittest.TestCase):
for r1, r2 in itertools.izip(reads1, reads2):
self.assertEqual(r1, r2)
num += 1
- self.assertTrue(num > 2000)
+ self.assertEqual(num, 28)
log.debug("Testing with three files")
testFile = ("/pbi/dept/secondary/siv/testdata/SA3-DS/lambda/"
- "2372215/0007/Alignment_Results/m150404_101"
+ "2372215/0007_tiny/Alignment_Results/m150404_101"
"626_42267_c1008079208000000018231741102915"
"14_s1_p0.all.alignmentset.xml")
aln = AlignmentSet(testFile)
@@ -1007,7 +1201,7 @@ class TestDataSet(unittest.TestCase):
for r1, r2 in itertools.izip(reads1, reads2):
self.assertEqual(r1, r2)
num += 1
- self.assertTrue(num > 2000)
+ self.assertEqual(num, 105)
@unittest.skipUnless(os.path.isdir("/pbi/dept/secondary/siv/testdata"),
"Missing testadata directory")
@@ -2203,6 +2397,38 @@ class TestDataSet(unittest.TestCase):
@unittest.skipIf(not _internal_data(),
"Internal data not available")
+ def test_qname_filter_scaling(self):
+ # unaligned bam
+ bam0 = ("/pbi/dept/secondary/siv/testdata/"
+ "SA3-DS/ecoli/2590956/0003/"
+ "Analysis_Results/m140913_222218_42240_c10069"
+ "9952400000001823139203261564_s1_p0.all.subreadset.xml")
+ bam1 = ("/pbi/dept/secondary/siv/testdata/"
+ "SA3-DS/ecoli/2590953/0001/"
+ "Analysis_Results/m140913_005018_42139_c10071"
+ "3652400000001823152404301534_s1_p0.all.subreadset.xml")
+ sset = SubreadSet(bam0, bam1)
+ self.assertEqual(len(sset), 178570)
+ size = 10
+ qn = [r.qName for r in sset[:size]]
+ good_qn = [('=', name) for name in qn]
+ sset.filters.addRequirement(qname=good_qn)
+ self.assertEqual(size, sum(1 for _ in sset))
+ self.assertEqual(size, len(sset))
+
+
+ sset = SubreadSet(data.getXml(10))
+ self.assertEqual(len(sset), 92)
+ size = 10
+ qn = [r.qName for r in sset[:size]]
+ good_qn = [('=', name) for name in qn]
+ sset.filters.addRequirement(qname=good_qn)
+ self.assertEqual(size, sum(1 for _ in sset))
+ self.assertEqual(size, len(sset))
+
+
+ @unittest.skipIf(not _internal_data(),
+ "Internal data not available")
def test_movie_filter(self):
# unaligned bam
bam0 = ("/pbi/dept/secondary/siv/testdata/"
@@ -2390,6 +2616,95 @@ class TestDataSet(unittest.TestCase):
zip(ss.metadata.summaryStats.readLenDist.bins,
ss2.metadata.summaryStats.readLenDist.bins)])
+ # smoke tests
+ ss3.metadata.summaryStats.insertReadLenDists
+ ss3.metadata.summaryStats.insertReadQualDists
+
+ @unittest.skipIf(not _internal_data(),
+ "Internal data not available")
+ def test_reduced_sts_merging(self):
+ # don't have a subreadset.xml with loaded sts.xml in testdata,
+ # fabricate one here:
+
+ full = ('/pbi/dept/secondary/siv/testdata/'
+ 'pbreports-unittest/data/sts_xml/'
+ '3120134-r54009_20160323_173308-'
+ '1_A01-Bug30772/m54009_160323_'
+ '173323.sts.xml')
+ partial = ('/pbi/dept/secondary/siv/testdata/'
+ 'pbreports-unittest/data/sts_xml/'
+ '32246/m54026_160402_062929.sts.xml')
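+
+ # merge datasets carrying 'partial' (reduced) and 'full' sts.xml
+ # stats in every combination; the merged readLenDist bins should
+ # always be the element-wise sum of the inputs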
+
+ # two partial
+ ss = SubreadSet(data.getXml(10))
+ ss.externalResources[0].sts = partial
+ outdir = tempfile.mkdtemp(suffix="dataset-unittest")
+ outXml = os.path.join(outdir, 'tempfile.xml')
+ outXml2 = os.path.join(outdir, 'tempfile2.xml')
+ ss.write(outXml)
+ ss.write(outXml2)
+ ss = SubreadSet(outXml)
+ ss2 = SubreadSet(outXml2)
+ ss3 = ss + ss2
+ self.assertEqual(ss3.metadata.summaryStats.readLenDist.bins,
+ [b1 + b2 for b1, b2 in
+ zip(ss.metadata.summaryStats.readLenDist.bins,
+ ss2.metadata.summaryStats.readLenDist.bins)])
+ ss4 = SubreadSet(outXml, outXml2)
+
+ # one partial one full
+ outdir = tempfile.mkdtemp(suffix="dataset-unittest")
+ outXml = os.path.join(outdir, 'tempfile.xml')
+ outXml2 = os.path.join(outdir, 'tempfile2.xml')
+ ss = SubreadSet(data.getXml(10))
+ ss.externalResources[0].sts = partial
+ ss.write(outXml)
+ ss.externalResources[0].sts = full
+ ss.write(outXml2)
+ ss = SubreadSet(outXml)
+ ss2 = SubreadSet(outXml2)
+ ss3 = ss + ss2
+ self.assertEqual(ss3.metadata.summaryStats.readLenDist.bins,
+ [b1 + b2 for b1, b2 in
+ zip(ss.metadata.summaryStats.readLenDist.bins,
+ ss2.metadata.summaryStats.readLenDist.bins)])
+ ss4 = SubreadSet(outXml, outXml2)
+
+ # one full one partial
+ outdir = tempfile.mkdtemp(suffix="dataset-unittest")
+ outXml = os.path.join(outdir, 'tempfile.xml')
+ outXml2 = os.path.join(outdir, 'tempfile2.xml')
+ ss = SubreadSet(data.getXml(10))
+ ss.externalResources[0].sts = full
+ ss.write(outXml)
+ ss.externalResources[0].sts = partial
+ ss.write(outXml2)
+ ss = SubreadSet(outXml)
+ ss2 = SubreadSet(outXml2)
+ ss3 = ss + ss2
+ self.assertEqual(ss3.metadata.summaryStats.readLenDist.bins,
+ [b1 + b2 for b1, b2 in
+ zip(ss.metadata.summaryStats.readLenDist.bins,
+ ss2.metadata.summaryStats.readLenDist.bins)])
+ ss4 = SubreadSet(outXml, outXml2)
+
+ # two full
+ outdir = tempfile.mkdtemp(suffix="dataset-unittest")
+ outXml = os.path.join(outdir, 'tempfile.xml')
+ outXml2 = os.path.join(outdir, 'tempfile2.xml')
+ ss = SubreadSet(data.getXml(10))
+ ss.externalResources[0].sts = full
+ ss.write(outXml)
+ ss.write(outXml2)
+ ss = SubreadSet(outXml)
+ ss2 = SubreadSet(outXml2)
+ ss3 = ss + ss2
+ self.assertEqual(ss3.metadata.summaryStats.readLenDist.bins,
+ [b1 + b2 for b1, b2 in
+ zip(ss.metadata.summaryStats.readLenDist.bins,
+ ss2.metadata.summaryStats.readLenDist.bins)])
+ ss4 = SubreadSet(outXml, outXml2)
+
@unittest.skipIf(not _internal_data(),
"Internal data not available")
def test_missing_extres(self):
@@ -2445,3 +2760,20 @@ class TestDataSet(unittest.TestCase):
sset = SubreadSet(human)
ssets = sset.split(zmws=True, maxChunks=5)
+ def test_get_dataset_uuid(self):
+ ds = SubreadSet(upstreamdata.getUnalignedBam(), strict=True)
+ ds_file = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
+ ds.write(ds_file)
+ uuid = getDataSetUuid(ds_file)
+ self.assertEqual(uuid, ds.uuid)
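+ # clobber the XML with junk; the uuid should no longer be readable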
+ with open(ds_file, "w") as out:
+ out.write("hello world!")
+ uuid = getDataSetUuid(ds_file)
+ self.assertEqual(uuid, None)
+
+ def test_get_dataset_metatype(self):
+ ds = SubreadSet(upstreamdata.getUnalignedBam(), strict=True)
+ ds_file = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
+ ds.write(ds_file)
+ meta_type = getDataSetMetaType(ds_file)
+ self.assertEqual(meta_type, "PacBio.DataSet.SubreadSet")
diff --git a/tests/test_pbdataset_subtypes.py b/tests/test_pbdataset_subtypes.py
index 7741b36..71282a7 100644
--- a/tests/test_pbdataset_subtypes.py
+++ b/tests/test_pbdataset_subtypes.py
@@ -16,7 +16,7 @@ from pbcore.io import (DataSet, SubreadSet, ConsensusReadSet,
ReferenceSet, ContigSet, AlignmentSet, BarcodeSet,
FastaReader, FastaWriter, IndexedFastaReader,
HdfSubreadSet, ConsensusAlignmentSet,
- openDataFile, FastaWriter, FastqReader)
+ openDataFile, FastaWriter, FastqReader, openDataSet)
import pbcore.data as upstreamData
import pbcore.data.datasets as data
from pbcore.io.dataset.DataSetValidator import validateXml
@@ -362,6 +362,43 @@ class TestDataSet(unittest.TestCase):
self.assertEqual(read1, read2)
self.assertEqual(len(list(aln)), len(list(nonCons)))
+ log.debug("Test with one reference")
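+ # consolidating should collapse the two external resources into a
+ # single merged BAM while preserving the reference attribute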
+ aln = AlignmentSet(data.getXml(12))
+ reference = upstreamData.getFasta()
+ aln.externalResources[0].reference = reference
+ nonCons = aln.copy()
+ self.assertEqual(len(aln.toExternalFiles()), 2)
+ outdir = tempfile.mkdtemp(suffix="dataset-unittest")
+ outfn = os.path.join(outdir, 'merged.bam')
+ aln.consolidate(outfn)
+ self.assertTrue(os.path.exists(outfn))
+ self.assertEqual(len(aln.toExternalFiles()), 1)
+ #nonCons = AlignmentSet(data.getXml(12))
+ self.assertEqual(len(nonCons.toExternalFiles()), 2)
+ for read1, read2 in zip(sorted(list(aln)), sorted(list(nonCons))):
+ self.assertEqual(read1, read2)
+ self.assertEqual(len(aln), len(nonCons))
+ self.assertEqual(aln.externalResources[0].reference, reference)
+
+ log.debug("Test with two references")
+ aln = AlignmentSet(data.getXml(12))
+ reference = upstreamData.getFasta()
+ for extRes in aln.externalResources:
+ extRes.reference = reference
+ self.assertEqual(len(aln.toExternalFiles()), 2)
+ outdir = tempfile.mkdtemp(suffix="dataset-unittest")
+ outfn = os.path.join(outdir, 'merged.bam')
+ aln.consolidate(outfn)
+ self.assertTrue(os.path.exists(outfn))
+ self.assertEqual(len(aln.toExternalFiles()), 1)
+ #nonCons = AlignmentSet(data.getXml(12))
+ self.assertEqual(len(nonCons.toExternalFiles()), 2)
+ for read1, read2 in zip(sorted(list(aln)), sorted(list(nonCons))):
+ self.assertEqual(read1, read2)
+ self.assertEqual(len(aln), len(nonCons))
+ self.assertEqual(aln.externalResources[0].reference, reference)
+
+
def test_accuracy_filter(self):
aln = AlignmentSet(data.getXml(12))
self.assertEqual(len(list(aln)), 177)
@@ -517,6 +554,7 @@ class TestDataSet(unittest.TestCase):
for name, seq in zip(exp_names, exp_double_seqs):
self.assertEqual(obs_file.get_contig(name).sequence[:], seq)
+
def test_contigset_consolidate_genomic_consensus(self):
"""
Verify that the contigs output by GenomicConsensus (e.g. quiver) can
@@ -602,6 +640,86 @@ class TestDataSet(unittest.TestCase):
self.assertEqual(cset_l, sum(1 for _ in cfq))
+ @unittest.skipIf(not _internal_data(),
+ "Internal data not found, skipping")
+ def test_empty_fastq_consolidate(self):
+ fn = ('/pbi/dept/secondary/siv/testdata/SA3-RS/'
+ 'lambda/2590980/0008/Analysis_Results/'
+ 'm141115_075238_ethan_c100699872550000001'
+ '823139203261572_s1_p0.1.subreads.fastq')
+ fq1_out = tempfile.NamedTemporaryFile(suffix="1.fastq").name
+ fq2_out = tempfile.NamedTemporaryFile(suffix="2.fastq").name
+ cfq_out = tempfile.NamedTemporaryFile(suffix=".fastq").name
+
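+ # consolidate ContigSets built from fastq slices where either or
+ # both inputs may be empty; record counts must be preserved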
+ # Two full
+ with open(fq1_out, 'w') as fqh:
+ with open(fn, 'r') as fih:
+ for line in itertools.islice(fih, 240):
+ fqh.write(line)
+ with open(fq2_out, 'w') as fqh:
+ with open(fn, 'r') as fih:
+ for line in itertools.islice(fih, 240, 480):
+ fqh.write(line)
+ cset = ContigSet(fq1_out, fq2_out)
+ cset_l = sum(1 for _ in cset)
+ self.assertEqual(cset_l, 120)
+ cset.consolidate(cfq_out)
+ cset_l = sum(1 for _ in cset)
+ cfq = FastqReader(cfq_out)
+ self.assertEqual(cset_l, 120)
+ self.assertEqual(cset_l, sum(1 for _ in cfq))
+
+ # one full one empty
+ with open(fq1_out, 'w') as fqh:
+ with open(fn, 'r') as fih:
+ for line in itertools.islice(fih, 240):
+ fqh.write(line)
+ with open(fq2_out, 'w') as fqh:
+ with open(fn, 'r') as fih:
+ fqh.write("")
+ cset = ContigSet(fq1_out, fq2_out)
+ cset_l = sum(1 for _ in cset)
+ self.assertEqual(cset_l, 60)
+ cset.consolidate(cfq_out)
+ cset_l = sum(1 for _ in cset)
+ cfq = FastqReader(cfq_out)
+ self.assertEqual(cset_l, 60)
+ self.assertEqual(cset_l, sum(1 for _ in cfq))
+
+ # one empty one full
+ with open(fq1_out, 'w') as fqh:
+ with open(fn, 'r') as fih:
+ fqh.write("")
+ with open(fq2_out, 'w') as fqh:
+ with open(fn, 'r') as fih:
+ for line in itertools.islice(fih, 240):
+ fqh.write(line)
+ cset = ContigSet(fq1_out, fq2_out)
+ cset_l = sum(1 for _ in cset)
+ self.assertEqual(cset_l, 60)
+ cset.consolidate(cfq_out)
+ cset_l = sum(1 for _ in cset)
+ cfq = FastqReader(cfq_out)
+ self.assertEqual(cset_l, 60)
+ self.assertEqual(cset_l, sum(1 for _ in cfq))
+
+ # both empty
+ with open(fq1_out, 'w') as fqh:
+ with open(fn, 'r') as fih:
+ fqh.write("")
+ with open(fq2_out, 'w') as fqh:
+ with open(fn, 'r') as fih:
+ fqh.write("")
+ cset = ContigSet(fq1_out, fq2_out)
+ cset_l = sum(1 for _ in cset)
+ self.assertEqual(cset_l, 0)
+ cset.consolidate(cfq_out)
+ cset_l = sum(1 for _ in cset)
+ cfq = FastqReader(cfq_out)
+ self.assertEqual(cset_l, 0)
+ self.assertEqual(cset_l, sum(1 for _ in cfq))
+
+
def test_len(self):
# AlignmentSet
aln = AlignmentSet(data.getXml(8), strict=True)
@@ -862,6 +980,15 @@ class TestDataSet(unittest.TestCase):
self.assertEqual(len(list(cset)), 49)
self.assertEqual(len(cset), 49)
+ def test_getitem(self):
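+ # indexing into each dataset type should return a (truthy) record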
+ types = [AlignmentSet(data.getXml(8)),
+ ReferenceSet(data.getXml(9)),
+ SubreadSet(data.getXml(10)),
+ ]
+ for ds in types:
+ self.assertTrue(ds[0])
+
+
def test_incorrect_len_getitem(self):
types = [AlignmentSet(data.getXml(8)),
ReferenceSet(data.getXml(9)),
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/python-pbcore.git