[med-svn] [Git][med-team/umis][upstream] New upstream version 1.0.6
Steffen Möller
gitlab at salsa.debian.org
Sat Jan 18 22:21:31 GMT 2020
Steffen Möller pushed to branch upstream at Debian Med / umis
Commits:
1279f98c by Steffen Moeller at 2020-01-18T22:48:39+01:00
New upstream version 1.0.6
- - - - -
3 changed files:
- HISTORY.md
- setup.py
- umis/umis.py
Changes:
=====================================
HISTORY.md
=====================================
@@ -1,4 +1,13 @@
-## 1.0.4 (in progress)
+## 1.0.6
+- Fix for the python3 fix.
+
+## 1.0.5
+- Fix for cb_filter with python3.
+
+## 1.0.4
+- Enable cb_histogram to be used on samples without UMIs.
+- Enable filtering of cells during `demultiplex_cells`.
+- Fix incorrect pandas.read_csv call with header=-1.
## 1.0.3
- Python 3 support
=====================================
setup.py
=====================================
@@ -8,7 +8,7 @@ def read(fname):
setup(
name='umis',
- version='1.0.3',
+ version='1.0.6',
description='Package for estimating UMI counts in Transcript Tag Counting data.',
packages=find_packages(),
install_requires=['click', 'pysam>=0.8.3', 'pandas', 'regex', 'scipy', 'toolz'],
=====================================
umis/umis.py
=====================================
@@ -24,7 +24,7 @@ import numpy as np
import scipy.io, scipy.sparse
import click
-VERSION = "1.0.3"
+VERSION = "1.0.6"
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
@@ -35,7 +35,10 @@ BARCODEINFO = {"sample": BarcodeInfo(bamtag="XS", readprefix="SAMPLE"),
"molecular": BarcodeInfo(bamtag="RX", readprefix="UMI")}
def open_gzipsafe(f):
- return gzip.open(f) if f.endswith(".gz") else open(f)
+ if is_python3():
+ return gzip.open(f, mode="rt") if f.endswith(".gz") else open(f)
+ else:
+ return gzip.open(f) if f.endswith(".gz") else open(f)
def safe_makedir(dname):
"""Make a directory if it doesn't exist, handling concurrent race conditions.
@@ -75,7 +78,7 @@ def read_fastq(filename):
if filename == "-":
filename_fh = sys.stdin
elif filename.endswith('gz'):
- if is_python3:
+ if is_python3():
filename_fh = gzip.open(filename, mode='rt')
else:
filename_fh = BufferedReader(gzip.open(filename, mode='rt'))
@@ -485,7 +488,7 @@ def tagcount(sam, out, genemap, output_evidence_table, positional, minevidence,
cb_hist = None
filter_cb = False
if cb_histogram:
- cb_hist = pd.read_csv(cb_histogram, index_col=0, header=-1, squeeze=True, sep="\t")
+ cb_hist = pd.read_csv(cb_histogram, index_col=0, header=None, squeeze=True, sep="\t")
total_num_cbs = cb_hist.shape[0]
cb_hist = cb_hist[cb_hist > cb_cutoff]
logger.info('Keeping {} out of {} cellular barcodes.'.format(cb_hist.shape[0], total_num_cbs))
@@ -712,9 +715,9 @@ def tagcount(sam, out, genemap, output_evidence_table, positional, minevidence,
'read gene mapping information in stead of the mapping '
'target nane. Useful if e.g. reads have been mapped to '
'genome in stead of transcriptome.'))
-@click.option('--umi_matrix', required=False,
+@click.option('--umi_matrix', required=False,
help=('Save a sparse matrix of counts without UMI deduping to this file.'))
-def fasttagcount(sam, out, genemap, positional, minevidence, cb_histogram,
+def fasttagcount(sam, out, genemap, positional, minevidence, cb_histogram,
cb_cutoff, subsample, parse_tags, gene_tags, umi_matrix):
''' Count up evidence for tagged molecules, this implementation assumes the
alignment file is coordinate sorted
@@ -758,7 +761,7 @@ def fasttagcount(sam, out, genemap, positional, minevidence, cb_histogram,
cb_hist = None
filter_cb = False
if cb_histogram:
- cb_hist = pd.read_csv(cb_histogram, index_col=0, header=-1, squeeze=True, sep="\t")
+ cb_hist = pd.read_csv(cb_histogram, index_col=0, header=None, squeeze=True, sep="\t")
total_num_cbs = cb_hist.shape[0]
cb_hist = cb_hist[cb_hist > cb_cutoff]
logger.info('Keeping {} out of {} cellular barcodes.'.format(cb_hist.shape[0], total_num_cbs))
@@ -971,9 +974,9 @@ def cb_histogram(fastq, umi_histogram):
for read in read_fastq(fastq):
match = parser_re.search(read).groupdict()
cb = match['CB']
- umi = match['MB']
cb_counter[cb] += 1
if umi_histogram:
+ umi = match['MB']
umi_counter[(cb, umi)] += 1
for bc, count in cb_counter.most_common():
@@ -1054,9 +1057,9 @@ def cb_filter(fastq, bc1, bc2, bc3, cores, nedit):
''' Filters reads with non-matching barcodes
Expects formatted fastq files.
'''
-
with open_gzipsafe(bc1) as bc1_fh:
bc1 = set(cb.strip() for cb in bc1_fh)
+
if bc2:
with open_gzipsafe(bc2) as bc2_fh:
bc2 = set(cb.strip() for cb in bc2_fh)
@@ -1312,7 +1315,10 @@ def is_python3():
@click.option('--out_dir', default=".")
@click.option('--readnumber', default="")
@click.option('--prefix', default="")
-def demultiplex_cells(fastq, out_dir, readnumber, prefix=""):
+@click.option('--cb_histogram', default=None)
+@click.option('--cb_cutoff', default=0)
+def demultiplex_cells(fastq, out_dir, readnumber, prefix, cb_histogram,
+ cb_cutoff):
''' Demultiplex a fastqtransformed FASTQ file into a FASTQ file for
each cell.
'''
@@ -1321,7 +1327,9 @@ def demultiplex_cells(fastq, out_dir, readnumber, prefix=""):
parser_re = re.compile(re_string)
readstring = "" if not readnumber else "_R{}".format(readnumber)
filestring = "{prefix}{sample}{readstring}.fq"
-
+ cb_set = set()
+ if cb_histogram:
+ cb_set = get_cb_depth_set(cb_histogram, cb_cutoff)
sample_set = set()
batch = collections.defaultdict(list)
parsed = 0
@@ -1330,6 +1338,8 @@ def demultiplex_cells(fastq, out_dir, readnumber, prefix=""):
parsed += 1
match = parser_re.search(read).groupdict()
sample = match['CB']
+ if cb_set and sample not in cb_set:
+ continue
sample_set.add(sample)
batch[sample].append(read)
# write in batches to avoid opening up file handles repeatedly
View it on GitLab: https://salsa.debian.org/med-team/umis/commit/1279f98c26934b5800b87f5e3c549d24a85660ee
--
View it on GitLab: https://salsa.debian.org/med-team/umis/commit/1279f98c26934b5800b87f5e3c549d24a85660ee
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20200118/d5fbf9b6/attachment-0001.html>
More information about the debian-med-commit
mailing list