[med-svn] [Git][med-team/umis][upstream] New upstream version 1.0.6

Steffen Möller gitlab at salsa.debian.org
Sat Jan 18 22:21:31 GMT 2020



Steffen Möller pushed to branch upstream at Debian Med / umis


Commits:
1279f98c by Steffen Moeller at 2020-01-18T22:48:39+01:00
New upstream version 1.0.6
- - - - -


3 changed files:

- HISTORY.md
- setup.py
- umis/umis.py


Changes:

=====================================
HISTORY.md
=====================================
@@ -1,4 +1,13 @@
-## 1.0.4 (in progress)
+## 1.0.6
+- Fix for the python3 fix.
+
+## 1.0.5
+- Fix for cb_filter with python3.
+
+## 1.0.4
+- Enable cb_histogram to be used on samples without UMIs.
+- Enable filtering of cells during `demultiplex_cells`.
+- Fix incorrect pandas.read_csv call with header=-1.
 
 ## 1.0.3 
 - Python 3 support


=====================================
setup.py
=====================================
@@ -8,7 +8,7 @@ def read(fname):
 
 setup(
         name='umis',
-        version='1.0.3',
+        version='1.0.6',
         description='Package for estimating UMI counts in Transcript Tag Counting data.',
         packages=find_packages(),
         install_requires=['click', 'pysam>=0.8.3', 'pandas', 'regex', 'scipy', 'toolz'],


=====================================
umis/umis.py
=====================================
@@ -24,7 +24,7 @@ import numpy as np
 import scipy.io, scipy.sparse
 import click
 
-VERSION = "1.0.3"
+VERSION = "1.0.6"
 
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
@@ -35,7 +35,10 @@ BARCODEINFO = {"sample": BarcodeInfo(bamtag="XS", readprefix="SAMPLE"),
                "molecular": BarcodeInfo(bamtag="RX", readprefix="UMI")}
 
 def open_gzipsafe(f):
-    return gzip.open(f) if f.endswith(".gz") else open(f)
+    if is_python3():
+        return gzip.open(f, mode="rt") if f.endswith(".gz") else open(f)
+    else:
+        return gzip.open(f) if f.endswith(".gz") else open(f)
 
 def safe_makedir(dname):
     """Make a directory if it doesn't exist, handling concurrent race conditions.
@@ -75,7 +78,7 @@ def read_fastq(filename):
     if filename == "-":
         filename_fh = sys.stdin
     elif filename.endswith('gz'):
-        if is_python3:
+        if is_python3():
             filename_fh = gzip.open(filename, mode='rt')
         else:
             filename_fh = BufferedReader(gzip.open(filename, mode='rt'))
@@ -485,7 +488,7 @@ def tagcount(sam, out, genemap, output_evidence_table, positional, minevidence,
     cb_hist = None
     filter_cb = False
     if cb_histogram:
-        cb_hist = pd.read_csv(cb_histogram, index_col=0, header=-1, squeeze=True, sep="\t")
+        cb_hist = pd.read_csv(cb_histogram, index_col=0, header=None, squeeze=True, sep="\t")
         total_num_cbs = cb_hist.shape[0]
         cb_hist = cb_hist[cb_hist > cb_cutoff]
         logger.info('Keeping {} out of {} cellular barcodes.'.format(cb_hist.shape[0], total_num_cbs))
@@ -712,9 +715,9 @@ def tagcount(sam, out, genemap, output_evidence_table, positional, minevidence,
                       'read gene mapping information in stead of the mapping '
                       'target nane. Useful if e.g. reads have been mapped to '
                       'genome in stead of transcriptome.'))
-@click.option('--umi_matrix', required=False, 
+@click.option('--umi_matrix', required=False,
               help=('Save a sparse matrix of counts without UMI deduping to this file.'))
-def fasttagcount(sam, out, genemap, positional, minevidence, cb_histogram, 
+def fasttagcount(sam, out, genemap, positional, minevidence, cb_histogram,
                  cb_cutoff, subsample, parse_tags, gene_tags, umi_matrix):
     ''' Count up evidence for tagged molecules, this implementation assumes the
     alignment file is coordinate sorted
@@ -758,7 +761,7 @@ def fasttagcount(sam, out, genemap, positional, minevidence, cb_histogram,
     cb_hist = None
     filter_cb = False
     if cb_histogram:
-        cb_hist = pd.read_csv(cb_histogram, index_col=0, header=-1, squeeze=True, sep="\t")
+        cb_hist = pd.read_csv(cb_histogram, index_col=0, header=None, squeeze=True, sep="\t")
         total_num_cbs = cb_hist.shape[0]
         cb_hist = cb_hist[cb_hist > cb_cutoff]
         logger.info('Keeping {} out of {} cellular barcodes.'.format(cb_hist.shape[0], total_num_cbs))
@@ -971,9 +974,9 @@ def cb_histogram(fastq, umi_histogram):
     for read in read_fastq(fastq):
         match = parser_re.search(read).groupdict()
         cb = match['CB']
-        umi = match['MB']
         cb_counter[cb] += 1
         if umi_histogram:
+            umi = match['MB']
             umi_counter[(cb, umi)] += 1
 
     for bc, count in cb_counter.most_common():
@@ -1054,9 +1057,9 @@ def cb_filter(fastq, bc1, bc2, bc3, cores, nedit):
     ''' Filters reads with non-matching barcodes
     Expects formatted fastq files.
     '''
-
     with open_gzipsafe(bc1) as bc1_fh:
         bc1 = set(cb.strip() for cb in bc1_fh)
+
     if bc2:
         with open_gzipsafe(bc2) as bc2_fh:
             bc2 = set(cb.strip() for cb in bc2_fh)
@@ -1312,7 +1315,10 @@ def is_python3():
 @click.option('--out_dir', default=".")
 @click.option('--readnumber', default="")
 @click.option('--prefix', default="")
-def demultiplex_cells(fastq, out_dir, readnumber, prefix=""):
+@click.option('--cb_histogram', default=None)
+@click.option('--cb_cutoff', default=0)
+def demultiplex_cells(fastq, out_dir, readnumber, prefix, cb_histogram,
+                      cb_cutoff):
     ''' Demultiplex a fastqtransformed FASTQ file into a FASTQ file for
     each cell.
     '''
@@ -1321,7 +1327,9 @@ def demultiplex_cells(fastq, out_dir, readnumber, prefix=""):
     parser_re = re.compile(re_string)
     readstring = "" if not readnumber else "_R{}".format(readnumber)
     filestring = "{prefix}{sample}{readstring}.fq"
-
+    cb_set = set()
+    if cb_histogram:
+        cb_set = get_cb_depth_set(cb_histogram, cb_cutoff)
     sample_set = set()
     batch = collections.defaultdict(list)
     parsed = 0
@@ -1330,6 +1338,8 @@ def demultiplex_cells(fastq, out_dir, readnumber, prefix=""):
         parsed += 1
         match = parser_re.search(read).groupdict()
         sample = match['CB']
+        if cb_set and sample not in cb_set:
+            continue
         sample_set.add(sample)
         batch[sample].append(read)
         # write in batches to avoid opening up file handles repeatedly



View it on GitLab: https://salsa.debian.org/med-team/umis/commit/1279f98c26934b5800b87f5e3c549d24a85660ee

-- 
View it on GitLab: https://salsa.debian.org/med-team/umis/commit/1279f98c26934b5800b87f5e3c549d24a85660ee
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20200118/d5fbf9b6/attachment-0001.html>


More information about the debian-med-commit mailing list