[med-svn] [Git][med-team/pyranges][upstream] New upstream version 0.0.85+ds
Andreas Tille
gitlab at salsa.debian.org
Mon Nov 2 09:52:34 GMT 2020
Andreas Tille pushed to branch upstream at Debian Med / pyranges
Commits:
823b99cc by Andreas Tille at 2020-11-02T08:39:35+01:00
New upstream version 0.0.85+ds
- - - - -
13 changed files:
- CHANGELOG.txt
- + pyranges/helpers.py
- pyranges/methods/init.py
- pyranges/methods/join.py
- pyranges/multithreaded.py
- pyranges/pyranges.py
- pyranges/readers.py
- pyranges/statistics.py
- pyranges/version.py
- tests/test_binary.py
- + tests/test_change_chromosome_custom.py
- tests/test_io.py
- tests/test_unary.py
Changes:
=====================================
CHANGELOG.txt
=====================================
@@ -1,3 +1,21 @@
+# 0.0.85 (17.09.20)
+- fix error when parsing gtf-files with whitespace in value-tags
+
+# 0.0.84 (18.08.20)
+- add option to report overlap in join
+
+# 0.0.83 (18.08.20)
+- hotfix
+
+# 0.0.82 (18.08.20)
+- fix error introduced in 0.0.80
+
+# 0.0.81 (13.08.20)
+- fix Fisher's implementation
+
+# 0.0.80 (10.08.20)
+- fix reassigning chromosomes in apply
+
# 0.0.79 (08.06.20)
- fix bug in features.introns where the gene_id column was overwritten (issue #134)
=====================================
pyranges/helpers.py
=====================================
@@ -0,0 +1,44 @@
+
+def get_chromosomes_from_dict(dfs):
+
+ keys = list(dfs.keys())
+ if isinstance(keys[0], tuple):
+ chromosomes = [k[0] for k in keys]
+ else:
+ chromosomes = keys
+
+ return chromosomes
+
+
+
+def get_strands_from_dict(dfs):
+
+ keys = list(dfs.keys())
+ if isinstance(keys[0], tuple):
+ strands = [k[1] for k in keys]
+ else:
+ strands = keys
+
+ return strands
+
+
+
+def get_key_from_df(df):
+
+ chromosome = df.Chromosome.head(1).iloc[0]
+ if "Strand" in df:
+ strand = df.Strand.head(1).iloc[0]
+ return chromosome, strand
+
+ return chromosome
+
+
+def single_value_key(df):
+
+ if "Strand" in df:
+ return len(df[["Chromosome", "Strand"]].drop_duplicates(["Chromosome", "Strand"])) == 1
+ else:
+ return len(df.Chromosome.drop_duplicates()) == 1
+
+
+
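For context, the new helpers reduce a dataframe (or a keyed dict of dataframes) to the chromosome/strand keys used internally. A minimal sketch of their behaviour, illustrative only and not part of the commit:

    import pandas as pd
    from pyranges.helpers import get_chromosomes_from_dict, get_key_from_df, single_value_key

    # Toy stranded frame with a single (Chromosome, Strand) combination.
    df = pd.DataFrame({"Chromosome": ["chr1", "chr1"],
                       "Start": [1, 20], "End": [10, 30],
                       "Strand": ["+", "+"]})

    get_key_from_df(df)                             # ('chr1', '+') -- tuple key because Strand is present
    single_value_key(df)                            # True -- only one Chromosome/Strand pair
    get_chromosomes_from_dict({("chr1", "+"): df})  # ['chr1']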
=====================================
pyranges/methods/init.py
=====================================
@@ -6,6 +6,7 @@ from natsort import natsorted
from pyranges.statistics import StatisticsMethods
from pyranges.genomicfeatures import GenomicFeaturesMethods
from pyranges import PyRanges
+from pyranges.helpers import single_value_key, get_key_from_df
def set_dtypes(df, int64):
@@ -166,40 +167,32 @@ def _init(self,
else:
empty_removed = {k: v.copy() for k, v in df.items() if not v.empty}
- if empty_removed:
- first_key, first_df = list(empty_removed.items())[0]
- # from pydbg import dbg;
- # dbg(first_df)
- stranded = "Strand" in first_df
-
- all_strands_valid = True
- if stranded:
- all_strands_valid = all([
- len(set(df.Strand.drop_duplicates()) -
- set(["+", "-"])) == 0 for df in empty_removed.values()
- ])
-
- assert all(c in first_df for c in "Chromosome Start End".split(
- )), "Columns Chromosome, Start and End must be in the dataframe!"
-
- # if not has strand key, but is stranded, need to add strand key
- has_strand_key = isinstance(first_key, tuple)
- if not has_strand_key and stranded and all_strands_valid:
- new_dfs = {}
- for k, v in empty_removed.items():
- for s, sdf in v.groupby("Strand"):
- new_dfs[k, s] = sdf
- empty_removed = new_dfs
-
- # need to merge strand keys if not strands valid anymore
- elif has_strand_key and (not all_strands_valid or not stranded):
- new_dfs = {}
- cs = set([k[0] for k in empty_removed.keys()])
- for c in natsorted(cs):
- dfs = [empty_removed.get((c, s)) for s in "+-"]
- new_dfs[c] = pd.concat(
- [df for df in dfs if not df is None]).reset_index(drop=True)
- empty_removed = new_dfs
+
+ _single_value_key = True
+ _key_same = True
+ _strand_valid = True
+ _has_strand = True
+ for key, df in empty_removed.items():
+
+ _key = get_key_from_df(df)
+ _single_value_key = single_value_key(df) and _single_value_key
+ _key_same = (_key == key) and _key_same
+
+ if isinstance(_key, tuple):
+ _strand_valid = _strand_valid and (_key[1] in ["+", "-"])
+ else:
+ _has_strand = False
+
+
+ if not all([_single_value_key, _key_same, _strand_valid]):
+ df = pd.concat(empty_removed.values()).reset_index(drop=True)
+
+ if _has_strand and _strand_valid:
+ empty_removed = df.groupby(["Chromosome", "Strand"])
+ else:
+ empty_removed = df.groupby("Chromosome")
+
+ empty_removed = {k: v for (k, v) in empty_removed}
self.__dict__["dfs"] = empty_removed
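The final dict comprehension relies on the fact that iterating a pandas groupby yields (key, subframe) pairs, so regrouping by Chromosome (and Strand, when all strands are valid) rebuilds the keyed dict from scratch. A rough sketch of that regrouping step, illustrative only:

    import pandas as pd

    df = pd.DataFrame({"Chromosome": ["chr1", "chr1", "chr2"],
                       "Start": [1, 5, 3], "End": [4, 9, 8],
                       "Strand": ["+", "-", "+"]})

    # Mirror of the new _init fallback: concatenate, then regroup into a
    # dict keyed by (Chromosome, Strand).
    dfs = {k: v for k, v in df.groupby(["Chromosome", "Strand"])}
    list(dfs)  # [('chr1', '+'), ('chr1', '-'), ('chr2', '+')]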
=====================================
pyranges/methods/join.py
=====================================
@@ -124,8 +124,6 @@ def _write_both(scdf, ocdf, **kwargs):
else:
suffix = kwargs.get("suffixes", "_a _b".split())[1]
- how = kwargs["how"]
-
scdf, ocdf = _both_dfs(scdf, ocdf, how=how)
nix = pd.Index(range(len(scdf)))
scdf.index = nix
@@ -135,4 +133,7 @@ def _write_both(scdf, ocdf, **kwargs):
df = scdf.join(ocdf, rsuffix=suffix)
+ if kwargs.get("report_overlap"):
+ df["Overlap"] = df[["End", "End"+suffix]].min(axis=1) - df[["Start", "Start"+suffix]].max(axis=1)
+
return df
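The new Overlap column is simply min(End, End_b) - max(Start, Start_b) computed row-wise. A small pandas sketch of that arithmetic, with made-up coordinates and the default "_b" suffix:

    import pandas as pd

    joined = pd.DataFrame({"Start": [10, 100], "End": [50, 150],
                           "Start_b": [40, 90], "End_b": [60, 120]})

    suffix = "_b"
    joined["Overlap"] = (joined[["End", "End" + suffix]].min(axis=1)
                         - joined[["Start", "Start" + suffix]].max(axis=1))
    # Overlap column: [10, 20], i.e. min(50, 60) - max(10, 40) and min(150, 120) - max(100, 90).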
=====================================
pyranges/multithreaded.py
=====================================
@@ -8,6 +8,7 @@ from natsort import natsorted
import os
+from collections import defaultdict
def get_n_args(f):
@@ -104,6 +105,7 @@ def process_results(results, keys):
for k in to_delete:
del results_dict[k]
+
return results_dict
@@ -286,9 +288,6 @@ def pyrange_apply(function, self, other, **kwargs):
df, odf = make_binary_sparse(kwargs, df, odf)
- # dbg(df)
- # dbg(odf)
-
result = call_f(function, nparams, df, odf, kwargs)
results.append(result)
@@ -389,6 +388,7 @@ def pyrange_apply_single(function, self, **kwargs):
results = process_results(results, keys)
+
return results
=====================================
pyranges/pyranges.py
=====================================
@@ -176,6 +176,7 @@ class PyRanges():
_init(self, df, chromosomes, starts, ends, strands, int64, copy_df)
+
def __array_ufunc__(self, *args, **kwargs):
"""Apply unary numpy-function.
@@ -1987,7 +1988,7 @@ class PyRanges():
return natsorted([(k, df) for (k, df) in self.dfs.items()])
- def join(self, other, strandedness=None, how=None, slack=0, suffix="_b", nb_cpu=1):
+ def join(self, other, strandedness=None, how=None, report_overlap=False, slack=0, suffix="_b", nb_cpu=1):
"""Join PyRanges on genomic location.
@@ -2008,6 +2009,10 @@ class PyRanges():
How to handle intervals without overlap. None means only keep overlapping intervals.
"left" keeps all intervals in self, "right" keeps all intervals in other.
+ report_overlap : bool, default False
+
+ Report amount of overlap in base pairs.
+
slack : int, default 0
Lengthen intervals in self before joining.
@@ -2108,7 +2113,7 @@ class PyRanges():
from pyranges.methods.join import _write_both
- kwargs = {"strandedness": strandedness, "how": how, "suffix": suffix, "nb_cpu": nb_cpu}
+ kwargs = {"strandedness": strandedness, "how": how, "report_overlap":report_overlap, "suffix": suffix, "nb_cpu": nb_cpu}
# slack = kwargs.get("slack")
if slack:
self.Start__slack = self.Start
@@ -2137,7 +2142,6 @@ class PyRanges():
kwargs["example_header_self"] = self.head(1).df
dfs = pyrange_apply(_write_both, self, other, **kwargs)
-
gr = PyRanges(dfs)
if slack:
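At the API level the new flag is passed straight through to _write_both via kwargs. A hedged usage sketch with made-up intervals:

    import pandas as pd
    import pyranges as pr

    gr = pr.PyRanges(pd.DataFrame({"Chromosome": ["chr1"], "Start": [10], "End": [50]}))
    gr2 = pr.PyRanges(pd.DataFrame({"Chromosome": ["chr1"], "Start": [40], "End": [60]}))

    # With report_overlap=True the joined intervals carry an extra "Overlap"
    # column holding the overlap length in base pairs (here 10).
    print(gr.join(gr2, report_overlap=True))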
=====================================
pyranges/readers.py
=====================================
@@ -313,7 +313,7 @@ def read_gtf_full(f, as_df=False, nrows=None, skiprows=0, duplicate_attr=False):
names = "Chromosome Source Feature Start End Score Strand Frame Attribute".split(
)
- # names = "Chromosome Start End Score Strand Source Feature Frame Attribute".split()
+
df_iter = pd.read_csv(
f,
sep="\t",
@@ -352,8 +352,14 @@ def to_rows(anno):
raise Exception("Invalid attribute string: {l}. If the file is in GFF3 format, use pr.read_gff3 instead.".format(l=l))
for l in anno:
- l = l.replace('"', '').replace(";", "").split()
- rowdicts.append({k: v for k, v in zip(*([iter(l)] * 2))})
+ rowdicts.append({k: v
+ for k, v in [kv.replace('"', '').split(None, 1)
+ # l[:-1] removes final ";" cheaply
+ for kv in l[:-1].split("; ")]})
+
+ # for l in anno:
+ # l = l.replace('"', '').replace(";", "").split()
+ # rowdicts.append({k: v for k, v in zip(*([iter(l)] * 2))})
return pd.DataFrame.from_dict(rowdicts).set_index(anno.index)
@@ -362,8 +368,10 @@ def to_rows_keep_duplicates(anno):
rowdicts = []
for l in anno:
rowdict = {}
- l = l.replace('"', '').replace(";", "").split()
- for k, v in zip(*([iter(l)] * 2)):
+
+ # l[:-1] removes final ";" cheaply
+ for k, v in (kv.replace('"', '').split(None, 1) for kv in l[:-1].split("; ")):
+
if k not in rowdict:
rowdict[k] = v
elif k in rowdict and isinstance(rowdict[k], list):
@@ -442,8 +450,6 @@ def read_gtf_restricted(f,
def to_rows_gff3(anno):
rowdicts = []
- # anno = anno.str.replace(";$", "")
-
for l in list(anno):
l = (it.split("=") for it in l.split(";"))
rowdicts.append({k: v for k, v in l})
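The rewritten attribute parser splits each record on "; " and each key/value pair on the first run of whitespace only, so quoted values that themselves contain spaces survive -- the 0.0.85 fix. A standalone sketch of the same expression on a made-up attribute string:

    # Hypothetical GTF attribute column entry; line[:-1] drops the trailing ";".
    line = 'gene_id "ENSG01"; gene_name "my gene name";'

    attrs = {k: v
             for k, v in (kv.replace('"', '').split(None, 1)
                          for kv in line[:-1].split("; "))}
    # {'gene_id': 'ENSG01', 'gene_name': 'my gene name'}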
=====================================
pyranges/statistics.py
=====================================
@@ -81,7 +81,7 @@ def fdr(p_vals):
return fdr
-def fisher_exact(n1, d1, n2, d2, pseudocount=0):
+def fisher_exact(tp, fp, fn, tn, pseudocount=0):
"""Fisher's exact for contingency tables.
@@ -91,21 +91,21 @@ def fisher_exact(n1, d1, n2, d2, pseudocount=0):
Parameters
----------
- n1 : array-like of int
+ tp : array-like of int
- Top left square of contingency table.
+ Top left square of contingency table (true positives).
- d1 : array-like of int
+ fp : array-like of int
- Bottom left square of contingency table.
+ Top right square of contingency table (false positives).
- n2 : array-like of int
+ fn : array-like of int
- Top right square of contingency table.
+ Bottom left square of contingency table (false negatives).
- d2 : array-like of int
+ tn : array-like of int
- Bottom right square of contingency table.
+ Bottom right square of contingency table (true negatives).
pseudocount : float, default 0
@@ -116,7 +116,7 @@ def fisher_exact(n1, d1, n2, d2, pseudocount=0):
The odds-ratio is computed thusly:
- ``((n1 + pseudocount) / (d2 + pseudocount)) / ((n2 + pseudocount) / (d1 + pseudocount))``
+ ``((tp + pseudocount) / (fp + pseudocount)) / ((fn + pseudocount) / (tn + pseudocount))``
Returns
-------
@@ -132,19 +132,17 @@ def fisher_exact(n1, d1, n2, d2, pseudocount=0):
Examples
--------
- >>> d = {"TP": [1, 0, 8], "FP": [11, 12, 1], "TN": [9, 10, 2], "FN": [3, 2, 5]}
+ >>> d = {"TP": [12, 0], "FP": [5, 12], "TN": [29, 10], "FN": [2, 2]}
>>> df = pd.DataFrame(d)
>>> df
TP FP TN FN
- 0 1 11 9 3
+ 0 12 5 29 2
1 0 12 10 2
- 2 8 1 2 5
>>> pr.stats.fisher_exact(df.TP, df.FP, df.TN, df.FN)
OR P PLeft PRight
- 0 0.407407 0.002759 0.001380 0.999966
+ 0 0.165517 0.080269 0.044555 0.994525
1 0.000000 0.000067 0.000034 1.000000
- 2 0.800000 0.034965 0.999126 0.024476
"""
@@ -155,14 +153,14 @@ def fisher_exact(n1, d1, n2, d2, pseudocount=0):
print("fisher needs to be installed to use fisher exact. pip install fisher or conda install -c bioconda fisher.")
sys.exit(-1)
- n1 = np.array(n1, dtype=np.uint)
- n2 = np.array(n2, dtype=np.uint)
- d1 = np.array(d1, dtype=np.uint)
- d2 = np.array(d2, dtype=np.uint)
+ tp = np.array(tp, dtype=np.uint)
+ fp = np.array(fp, dtype=np.uint)
+ fn = np.array(fn, dtype=np.uint)
+ tn = np.array(tn, dtype=np.uint)
- left, right, twosided = pvalue_npy(n1, d1, n2, d2)
+ left, right, twosided = pvalue_npy(tp, fp, fn, tn)
- OR = ((n1 + pseudocount) / (d2 + pseudocount)) / ((n2 + pseudocount) / (d1 + pseudocount))
+ OR = ((tp + pseudocount) / (fp + pseudocount)) / ((fn + pseudocount) / (tn + pseudocount))
df = pd.DataFrame({"OR": OR, "P": twosided, "PLeft": left, "PRight": right})
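As a quick check of the renamed parameters: plugging the first row of the updated docstring example into the odds-ratio formula, with the arguments passed there mapping to tp=12, fp=5, fn=29, tn=2 and pseudocount 0, gives OR = (12 / 5) / (29 / 2) = 2.4 / 14.5 ≈ 0.1655, matching the 0.165517 shown above.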
=====================================
pyranges/version.py
=====================================
@@ -1 +1 @@
-__version__ = "0.0.79"
+__version__ = "0.0.85"
=====================================
tests/test_binary.py
=====================================
@@ -162,6 +162,7 @@ def test_set_union(gr, gr2, strandedness):
suppress_health_check=HealthCheck.all())
@given(gr=dfs_min(), gr2=dfs_min()) # pylint: disable=no-value-for-parameter
# @reproduce_failure('4.32.2', b'AXicY2RAA4wQzIgiCAAAgAAF')
+# @reproduce_failure('5.5.4', b'AXicY2RABYyMEAqKGRgAAHMABg==')
def test_overlap(gr, gr2, strandedness):
overlap_command = "bedtools intersect -u {strand} -a {f1} -b {f2}"
=====================================
tests/test_change_chromosome_custom.py
=====================================
@@ -0,0 +1,24 @@
+import pandas as pd
+import pyranges as pr
+
+def test_change_chromosomes():
+
+ df1 = pd.DataFrame({"Chromosome": ["chr1", "chr2"], "Start": [100, 200],
+ "End": [150, 201]})
+ py1 = pr.PyRanges(df1)
+ df2 = pd.DataFrame({"Chromosome": ["1", "2"], "Start": [1000, 2000],
+ "End": [1500, 20010]})
+ py2 = pr.PyRanges(df2)
+
+ def modify_chrom_series(df):
+ df.Chromosome = df.Chromosome.apply(lambda val: val.replace("chr", ""))
+ return df
+ def fix_chrom(regions):
+ return regions.apply(modify_chrom_series)
+
+ print(py1)
+
+ py1 = fix_chrom(py1)
+
+
+ assert py1.chromosomes == ["1", "2"]
=====================================
tests/test_io.py
=====================================
@@ -1,14 +1,11 @@
import pyranges as pr
-# def test_read_bam():
-
-# pr.read_bam("tests/test_data/test_sorted.bam")
-
def test_read_gtf():
gr = pr.read_gtf("tests/test_data/ensembl.gtf", full=True)
- assert len(gr.columns) == 28
+
+ assert len(gr.columns) == 26
df = gr.df
transcript = df.iloc[1]
@@ -19,7 +16,8 @@ def test_read_gtf():
gr = pr.read_gtf("tests/test_data/ensembl.gtf",
full=True, duplicate_attr=True)
- assert len(gr.columns) == 28
+ print(gr.columns)
+ assert len(gr.columns) == 26
df = gr.df
transcript = df.iloc[1]
=====================================
tests/test_unary.py
=====================================
@@ -234,6 +234,7 @@ makewindows_command = "bedtools makewindows -w 10 -b <(sort -k1,1 -k2,2n {})"
deadline=deadline,
suppress_health_check=HealthCheck.all())
@given(gr=dfs_min()) # pylint: disable=no-value-for-parameter
+# @reproduce_failure('5.5.4', b'AXicY2RgYGAEIzgAsRkBAFsABg==')
def test_windows(gr):
with tempfile.TemporaryDirectory() as temp_dir:
View it on GitLab: https://salsa.debian.org/med-team/pyranges/-/commit/823b99cc8d170fdb4cea05cad9dae23c6aa03791
--
You're receiving this email because of your account on salsa.debian.org.