[med-svn] [pbgenomicconsensus] 01/03: Imported Upstream version 1.1.0
Afif Elghraoui
afif-guest at moszumanska.debian.org
Wed Nov 25 21:07:38 UTC 2015
This is an automated email from the git hooks/post-receive script.
afif-guest pushed a commit to branch master
in repository pbgenomicconsensus.
commit 2e74dad9f58be74db8e92ad6935331b845bb2af5
Author: Afif Elghraoui <afif at ghraoui.name>
Date: Tue Nov 24 23:44:29 2015 -0800
Imported Upstream version 1.1.0
---
CHANGELOG | 3 +++
GenomicConsensus/ResultCollector.py | 2 +-
GenomicConsensus/__init__.py | 2 +-
GenomicConsensus/io/utils.py | 17 +++++------------
GenomicConsensus/main.py | 27 ++++++++++++++-------------
GenomicConsensus/options.py | 9 +++++++++
GenomicConsensus/quiver/model.py | 2 +-
GenomicConsensus/quiver/quiver.py | 2 +-
GenomicConsensus/reference.py | 30 +++++++++++++++++++++---------
9 files changed, 56 insertions(+), 38 deletions(-)
diff --git a/CHANGELOG b/CHANGELOG
index 8ee9baa..b6eed56 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,3 +1,6 @@
+Version 1.1.0
+ * Working support for DataSet read and reference files
+
Version 1.0.0
* Working support for BAM files adhering to our BAM spec (version 3.0b6)
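
As a rough, illustrative sketch of what the DataSet support means in practice (not part of the commit; the file names are placeholders), the pbcore.io classes the diff below moves to can be exercised like this:

    from pbcore.io import AlignmentSet, ReferenceSet

    # An AlignmentSet accepts dataset XML as well as plain BAM or cmp.h5 inputs.
    alns = AlignmentSet("aligned_reads.alignmentset.xml")

    # A ReferenceSet accepts dataset XML or a FASTA; reference.loadFromFile()
    # further down insists on an index via assertIndexed().
    refs = ReferenceSet("reference.referenceset.xml")
    refs.assertIndexed()
    for contig in refs.contigs:
        print(contig.id)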
diff --git a/GenomicConsensus/ResultCollector.py b/GenomicConsensus/ResultCollector.py
index e6d18e9..41f26ba 100644
--- a/GenomicConsensus/ResultCollector.py
+++ b/GenomicConsensus/ResultCollector.py
@@ -139,7 +139,7 @@ class ResultCollector(object):
if (s == 0) and (e == refEntry.length):
spanName = refName
else:
- spanName = refName + ":%d-%d" % (s, e)
+ spanName = refName + "_%d_%d" % (s, e)
cssName = consensus.consensusContigName(spanName,
options.algorithm)
# Gather just the chunks pertaining to this span
diff --git a/GenomicConsensus/__init__.py b/GenomicConsensus/__init__.py
index 92a44d5..abed5b6 100644
--- a/GenomicConsensus/__init__.py
+++ b/GenomicConsensus/__init__.py
@@ -30,4 +30,4 @@
# Author: David Alexander
-__VERSION__ = "1.0.0"
+__VERSION__ = "1.1.0"
diff --git a/GenomicConsensus/io/utils.py b/GenomicConsensus/io/utils.py
index 10b5d85..8e9928e 100644
--- a/GenomicConsensus/io/utils.py
+++ b/GenomicConsensus/io/utils.py
@@ -33,7 +33,7 @@
__all__ = ["loadCmpH5", "loadBam"]
import h5py, os.path
-from pbcore.io import CmpH5Reader, BamReader
+from pbcore.io import AlignmentSet
def loadCmpH5(filename, referenceFname, disableChunkCache=False):
@@ -41,17 +41,10 @@ def loadCmpH5(filename, referenceFname, disableChunkCache=False):
Get a CmpH5Reader object, disabling the chunk cache if requested.
"""
filename = os.path.abspath(os.path.expanduser(filename))
- if not disableChunkCache:
- file = h5py.File(filename, "r")
- else:
- propfaid = h5py.h5p.create(h5py.h5p.FILE_ACCESS)
- propfaid.set_cache(0, 0, 0, 0)
- fid = h5py.h5f.open(filename,
- flags=h5py.h5f.ACC_RDONLY,
- fapl=propfaid)
- file = h5py.File(fid)
- return CmpH5Reader(file)
+ return AlignmentSet(filename)
def loadBam(filename, referenceFname):
filename = os.path.abspath(os.path.expanduser(filename))
- return BamReader(filename, referenceFname)
+ aln = AlignmentSet(filename)
+ aln.addReference(referenceFname)
+ return aln
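
For context, a hedged sketch (placeholder paths, not code from the commit) of what the new loadBam() amounts to: wrap the BAM in an AlignmentSet and attach the reference so contig names resolve.

    from pbcore.io import AlignmentSet

    with AlignmentSet("aligned_reads.bam") as aln:   # placeholder path
        aln.addReference("reference.fasta")          # same call loadBam() now makes
        print(aln.refNames)                          # reference contig names are now resolvable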
diff --git a/GenomicConsensus/main.py b/GenomicConsensus/main.py
index 8e2432f..09b2d8b 100644
--- a/GenomicConsensus/main.py
+++ b/GenomicConsensus/main.py
@@ -36,7 +36,8 @@ from __future__ import absolute_import
import argparse, atexit, cProfile, gc, glob, h5py, logging, multiprocessing
import os, pstats, random, shutil, tempfile, time, threading, Queue, traceback
-from pbcore.io import CmpH5Reader, BamReader
+from pbcore.io import AlignmentSet
+
from GenomicConsensus import reference
from GenomicConsensus.options import (options,
parseOptions,
@@ -139,11 +140,7 @@ class ToolRunner(object):
store it as self._inCmpH5.
"""
fname = options.inputFilename
- if options.usingBam:
- self._inCmpH5 = BamReader(fname, options.referenceFilename)
- else:
- logging.debug("Before open on main process, # hdf5 objects open: %d" % h5py.h5f.get_obj_count())
- self._inCmpH5 = CmpH5Reader(fname)
+ self._inCmpH5 = AlignmentSet(fname)
def _loadReference(self, cmpH5):
logging.info("Loading reference")
@@ -165,6 +162,8 @@ class ToolRunner(object):
else:
options.referenceWindows = map(reference.stringToWindow,
options.referenceWindowsAsString.split(","))
+ if options.referenceWindowsFromAlignment:
+ options.referenceWindows = cmpH5.refWindows
def _checkFileCompatibility(self, cmpH5):
if not cmpH5.isSorted:
@@ -173,11 +172,13 @@ class ToolRunner(object):
die("Input CmpH5 file must be nonempty.")
def _shouldDisableChunkCache(self, cmpH5):
- if isinstance(cmpH5, CmpH5Reader):
- threshold = options.autoDisableHdf5ChunkCache
- return datasetCountExceedsThreshold(cmpH5, threshold)
- else:
- return False
+ #if isinstance(cmpH5, CmpH5Reader):
+ #if cmpH5.isCmpH5:
+ # threshold = options.autoDisableHdf5ChunkCache
+ # return datasetCountExceedsThreshold(cmpH5, threshold)
+ #else:
+ # return False
+ return False
def _configureAlgorithm(self, options, cmpH5):
assert self._algorithm != None
@@ -276,7 +277,7 @@ class ToolRunner(object):
options.fancyChunking = False
# Peek at the bam file to build tables
- with BamReader(options.inputFilename) as peekCmpH5:
+ with AlignmentSet(options.inputFilename) as peekCmpH5:
logging.info("Peeking at BAM file %s" % options.inputFilename)
logging.info("Input BAM data: numAlnHits=%d" % len(peekCmpH5))
resolveOptions(peekCmpH5)
@@ -289,7 +290,7 @@ class ToolRunner(object):
# whether the selected algorithm parameters (Quiver) are
# compatible with the data. But we then have to close the
# file, and let the "real" open happen after the fork.
- with CmpH5Reader(options.inputFilename) as peekCmpH5:
+ with AlignmentSet(options.inputFilename) as peekCmpH5:
logging.info("Peeking at CmpH5 file %s" % options.inputFilename)
logging.info("Input CmpH5 data: numAlnHits=%d" % len(peekCmpH5))
resolveOptions(peekCmpH5)
diff --git a/GenomicConsensus/options.py b/GenomicConsensus/options.py
index 7e71607..e063575 100644
--- a/GenomicConsensus/options.py
+++ b/GenomicConsensus/options.py
@@ -167,6 +167,15 @@ def parseOptions():
"(default: entire reference).",
default=None)
+ readSelection.add_argument(
+ "--alignmentSetRefWindows",
+ action="store_true",
+ dest="referenceWindowsFromAlignment",
+ help="The window (or multiple comma-delimited windows) of the reference to " + \
+ "be processed, in the format refGroup:refStart-refEnd " + \
+ "will be pulled from the alignment file.",
+ default=False)
+
def slurpWindowFile(fname):
return ",".join(map(str.strip, open(fname).readlines()))
diff --git a/GenomicConsensus/quiver/model.py b/GenomicConsensus/quiver/model.py
index 7eedfdd..0c9c234 100644
--- a/GenomicConsensus/quiver/model.py
+++ b/GenomicConsensus/quiver/model.py
@@ -136,7 +136,7 @@ class Model(object):
MappedRead.
"""
assert aln.referenceSpan > 0
- name = str(aln.rowNumber)
+ name = aln.readName
chemistry = chemOrUnknown(aln)
read = cc.Read(cls.extractFeatures(aln), name, chemistry)
return cc.MappedRead(read,
diff --git a/GenomicConsensus/quiver/quiver.py b/GenomicConsensus/quiver/quiver.py
index 6e2f674..2aca757 100644
--- a/GenomicConsensus/quiver/quiver.py
+++ b/GenomicConsensus/quiver/quiver.py
@@ -108,7 +108,7 @@ def consensusAndVariantsForWindow(cmpH5, refWindow, referenceContig,
logging.debug("%s: Reads being used: %s" %
(reference.windowToString(subWin),
- " ".join([str(hit.rowNumber) for hit in alns])))
+ " ".join([str(hit.readName) for hit in alns])))
css = U.consensusForAlignments(subWin,
intRefSeq,
diff --git a/GenomicConsensus/reference.py b/GenomicConsensus/reference.py
index 38e3e74..62bc2ef 100644
--- a/GenomicConsensus/reference.py
+++ b/GenomicConsensus/reference.py
@@ -34,7 +34,7 @@ from __future__ import absolute_import
import logging, re, numpy as np
from collections import OrderedDict
-from pbcore.io import FastaTable, splitFastaHeader
+from pbcore.io import ReferenceSet
from .windows import holes, kCoveredIntervals, enumerateIntervals
from .utils import die, nub
@@ -122,13 +122,14 @@ def loadFromFile(filename_, cmpH5):
# Load contigs
assert not isLoaded()
try:
- f = FastaTable(filename_)
+ f = ReferenceSet(filename_)
+ f.assertIndexed()
except IOError as e:
die(e)
- cmpContigNames = set(splitFastaHeader(x)[0] for x in cmpH5.referenceInfoTable.FullName)
+ cmpContigNames = set(cmpH5.refNames)
- for fastaRecord in f:
+ for fastaRecord in f.contigs:
refName = fastaRecord.id
if refName in cmpContigNames:
cmpH5RefEntry = cmpH5.referenceInfo(refName)
@@ -209,11 +210,22 @@ def fancyEnumerateChunks(cmpH5, refId, referenceStride,
Enumerate chunks, creating chunks with hasCoverage=False for
coverage cutouts.
"""
- startRow = cmpH5.referenceInfo(refId).StartRow
- endRow = cmpH5.referenceInfo(refId).EndRow
- goodMapQVs = (cmpH5.MapQV[startRow:endRow] >= minMapQV)
- tStart = cmpH5.tStart[startRow:endRow][goodMapQVs].view(np.int32)
- tEnd = cmpH5.tEnd [startRow:endRow][goodMapQVs].view(np.int32)
+
+ # The abstraction frays here, unless we want to push all of this into
+ # pbdataset. With the fairly specific nature of this query, a composition
+ # of standard accessors is probably ok:
+ tStart = []
+ tEnd = []
+ for reader in cmpH5.resourceReaders(refId):
+ startRow = reader.referenceInfo(refId).StartRow
+ endRow = reader.referenceInfo(refId).EndRow
+ goodMapQVs = (reader.MapQV[startRow:endRow] >= minMapQV)
+ tStart.extend(
+ reader.tStart[startRow:endRow][goodMapQVs].view(np.int32))
+ tEnd.extend(reader.tEnd[startRow:endRow][goodMapQVs].view(np.int32))
+ # Sort the intervals (not sure if it matters, but might as well be
+ # consistent with the inputs:
+ tStart, tEnd = map(list, zip(*sorted(zip(tStart, tEnd))))
for span in enumerateSpans(refId, referenceWindows):
_, spanStart, spanEnd = span
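
A tiny standalone illustration (made-up coordinates) of the paired-sort idiom used in fancyEnumerateChunks above, which keeps tStart and tEnd aligned while ordering by start:

    tStart = [300, 100, 200]
    tEnd = [350, 150, 250]
    tStart, tEnd = map(list, zip(*sorted(zip(tStart, tEnd))))
    # tStart is now [100, 200, 300] and tEnd is [150, 250, 350]; pairs stay matched.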
--