[med-svn] [Git][med-team/pyranges][upstream] New upstream version 0.0.85+ds
Andreas Tille
gitlab at salsa.debian.org
Mon Nov 2 09:52:34 GMT 2020
Andreas Tille pushed to branch upstream at Debian Med / pyranges
Commits:
823b99cc by Andreas Tille at 2020-11-02T08:39:35+01:00
New upstream version 0.0.85+ds
- - - - -
13 changed files:
- CHANGELOG.txt
- + pyranges/helpers.py
- pyranges/methods/init.py
- pyranges/methods/join.py
- pyranges/multithreaded.py
- pyranges/pyranges.py
- pyranges/readers.py
- pyranges/statistics.py
- pyranges/version.py
- tests/test_binary.py
- + tests/test_change_chromosome_custom.py
- tests/test_io.py
- tests/test_unary.py
Changes:
=====================================
CHANGELOG.txt
=====================================
@@ -1,3 +1,21 @@
+# 0.0.85 (17.09.20)
+- fix error when parsing gtf-files with whitespace in value-tags
+
+# 0.0.84 (18.08.20)
+- add option to report overlap in join
+
+# 0.0.83 (18.08.20)
+- hotfix
+
+# 0.0.82 (18.08.20)
+- fix error introduced in 0.0.80
+
+# 0.0.81 (13.08.20)
+- fix Fisher's implementation
+
+# 0.0.80 (10.08.20)
+- fix reassigning chromosomes in apply
+
# 0.0.79 (08.06.20)
- fix bug in features.introns where the gene_id column was overwritten (issue #134)
=====================================
pyranges/helpers.py
=====================================
@@ -0,0 +1,44 @@
+
+def get_chromosomes_from_dict(dfs):
+
+ keys = list(dfs.keys())
+ if isinstance(keys[0], tuple):
+ chromosomes = [k[0] for k in keys]
+ else:
+ chromosomes = keys
+
+ return chromosomes
+
+
+
+def get_strands_from_dict(dfs):
+
+ keys = list(dfs.keys())
+ if isinstance(keys[0], tuple):
+ strands = [k[1] for k in keys]
+ else:
+ strands = keys
+
+ return strands
+
+
+
+def get_key_from_df(df):
+
+ chromosome = df.Chromosome.head(1).iloc[0]
+ if "Strand" in df:
+ strand = df.Strand.head(1).iloc[0]
+ return chromosome, strand
+
+ return chromosome
+
+
+def single_value_key(df):
+
+ if "Strand" in df:
+ return len(df[["Chromosome", "Strand"]].drop_duplicates(["Chromosome", "Strand"])) == 1
+ else:
+ return len(df.Chromosome.drop_duplicates()) == 1
+
+
+
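For context, the new helpers reduce a dataframe (or a keyed dict of dataframes) to the chromosome/strand keys used internally. A minimal sketch of their behaviour, illustrative only and not part of the commit:

    import pandas as pd
    from pyranges.helpers import get_chromosomes_from_dict, get_key_from_df, single_value_key

    # Toy stranded frame with a single (Chromosome, Strand) combination.
    df = pd.DataFrame({"Chromosome": ["chr1", "chr1"],
                       "Start": [1, 20], "End": [10, 30],
                       "Strand": ["+", "+"]})

    get_key_from_df(df)                             # ('chr1', '+') -- tuple key because Strand is present
    single_value_key(df)                            # True -- only one Chromosome/Strand pair
    get_chromosomes_from_dict({("chr1", "+"): df})  # ['chr1']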
=====================================
pyranges/methods/init.py
=====================================
@@ -6,6 +6,7 @@ from natsort import natsorted
from pyranges.statistics import StatisticsMethods
from pyranges.genomicfeatures import GenomicFeaturesMethods
from pyranges import PyRanges
+from pyranges.helpers import single_value_key, get_key_from_df
def set_dtypes(df, int64):
@@ -166,40 +167,32 @@ def _init(self,
else:
empty_removed = {k: v.copy() for k, v in df.items() if not v.empty}
- if empty_removed:
- first_key, first_df = list(empty_removed.items())[0]
- # from pydbg import dbg;
- # dbg(first_df)
- stranded = "Strand" in first_df
-
- all_strands_valid = True
- if stranded:
- all_strands_valid = all([
- len(set(df.Strand.drop_duplicates()) -
- set(["+", "-"])) == 0 for df in empty_removed.values()
- ])
-
- assert all(c in first_df for c in "Chromosome Start End".split(
- )), "Columns Chromosome, Start and End must be in the dataframe!"
-
- # if not has strand key, but is stranded, need to add strand key
- has_strand_key = isinstance(first_key, tuple)
- if not has_strand_key and stranded and all_strands_valid:
- new_dfs = {}
- for k, v in empty_removed.items():
- for s, sdf in v.groupby("Strand"):
- new_dfs[k, s] = sdf
- empty_removed = new_dfs
-
- # need to merge strand keys if not strands valid anymore
- elif has_strand_key and (not all_strands_valid or not stranded):
- new_dfs = {}
- cs = set([k[0] for k in empty_removed.keys()])
- for c in natsorted(cs):
- dfs = [empty_removed.get((c, s)) for s in "+-"]
- new_dfs[c] = pd.concat(
- [df for df in dfs if not df is None]).reset_index(drop=True)
- empty_removed = new_dfs
+
+ _single_value_key = True
+ _key_same = True
+ _strand_valid = True
+ _has_strand = True
+ for key, df in empty_removed.items():
+
+ _key = get_key_from_df(df)
+ _single_value_key = single_value_key(df) and _single_value_key
+ _key_same = (_key == key) and _key_same
+
+ if isinstance(_key, tuple):
+ _strand_valid = _strand_valid and (_key[1] in ["+", "-"])
+ else:
+ _has_strand = False
+
+
+ if not all([_single_value_key, _key_same, _strand_valid]):
+ df = pd.concat(empty_removed.values()).reset_index(drop=True)
+
+ if _has_strand and _strand_valid:
+ empty_removed = df.groupby(["Chromosome", "Strand"])
+ else:
+ empty_removed = df.groupby("Chromosome")
+
+ empty_removed = {k: v for (k, v) in empty_removed}
self.__dict__["dfs"] = empty_removed
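The final dict comprehension relies on the fact that iterating a pandas groupby yields (key, subframe) pairs, so regrouping by Chromosome (and Strand, when all strands are valid) rebuilds the keyed dict from scratch. A rough sketch of that regrouping step, illustrative only:

    import pandas as pd

    df = pd.DataFrame({"Chromosome": ["chr1", "chr1", "chr2"],
                       "Start": [1, 5, 3], "End": [4, 9, 8],
                       "Strand": ["+", "-", "+"]})

    # Mirror of the new _init fallback: concatenate, then regroup into a
    # dict keyed by (Chromosome, Strand).
    dfs = {k: v for k, v in df.groupby(["Chromosome", "Strand"])}
    list(dfs)  # [('chr1', '+'), ('chr1', '-'), ('chr2', '+')]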
=====================================
pyranges/methods/join.py
=====================================
@@ -124,8 +124,6 @@ def _write_both(scdf, ocdf, **kwargs):
else:
suffix = kwargs.get("suffixes", "_a _b".split())[1]
- how = kwargs["how"]
-
scdf, ocdf = _both_dfs(scdf, ocdf, how=how)
nix = pd.Index(range(len(scdf)))
scdf.index = nix
@@ -135,4 +133,7 @@ def _write_both(scdf, ocdf, **kwargs):
df = scdf.join(ocdf, rsuffix=suffix)
+ if kwargs.get("report_overlap"):
+ df["Overlap"] = df[["End", "End"+suffix]].min(axis=1) - df[["Start", "Start"+suffix]].max(axis=1)
+
return df
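The new Overlap column is simply min(End, End_b) - max(Start, Start_b) computed row-wise. A small pandas sketch of that arithmetic, with made-up coordinates and the default "_b" suffix:

    import pandas as pd

    joined = pd.DataFrame({"Start": [10, 100], "End": [50, 150],
                           "Start_b": [40, 90], "End_b": [60, 120]})

    suffix = "_b"
    joined["Overlap"] = (joined[["End", "End" + suffix]].min(axis=1)
                         - joined[["Start", "Start" + suffix]].max(axis=1))
    # Overlap column: [10, 20], i.e. min(50, 60) - max(10, 40) and min(150, 120) - max(100, 90).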
=====================================
pyranges/multithreaded.py
=====================================
@@ -8,6 +8,7 @@ from natsort import natsorted
import os
+from collections import defaultdict
def get_n_args(f):
@@ -104,6 +105,7 @@ def process_results(results, keys):
for k in to_delete:
del results_dict[k]
+
return results_dict
@@ -286,9 +288,6 @@ def pyrange_apply(function, self, other, **kwargs):
df, odf = make_binary_sparse(kwargs, df, odf)
- # dbg(df)
- # dbg(odf)
-
result = call_f(function, nparams, df, odf, kwargs)
results.append(result)
@@ -389,6 +388,7 @@ def pyrange_apply_single(function, self, **kwargs):
results = process_results(results, keys)
+
return results
=====================================
pyranges/pyranges.py
=====================================
@@ -176,6 +176,7 @@ class PyRanges():
_init(self, df, chromosomes, starts, ends, strands, int64, copy_df)
+
def __array_ufunc__(self, *args, **kwargs):
"""Apply unary numpy-function.
@@ -1987,7 +1988,7 @@ class PyRanges():
return natsorted([(k, df) for (k, df) in self.dfs.items()])
- def join(self, other, strandedness=None, how=None, slack=0, suffix="_b", nb_cpu=1):
+ def join(self, other, strandedness=None, how=None, report_overlap=False, slack=0, suffix="_b", nb_cpu=1):
"""Join PyRanges on genomic location.
@@ -2008,6 +2009,10 @@ class PyRanges():
How to handle intervals without overlap. None means only keep overlapping intervals.
"left" keeps all intervals in self, "right" keeps all intervals in other.
+ report_overlap : bool, default False
+
+ Report amount of overlap in base pairs.
+
slack : int, default 0
Lengthen intervals in self before joining.
@@ -2108,7 +2113,7 @@ class PyRanges():
from pyranges.methods.join import _write_both
- kwargs = {"strandedness": strandedness, "how": how, "suffix": suffix, "nb_cpu": nb_cpu}
+ kwargs = {"strandedness": strandedness, "how": how, "report_overlap":report_overlap, "suffix": suffix, "nb_cpu": nb_cpu}
# slack = kwargs.get("slack")
if slack:
self.Start__slack = self.Start
@@ -2137,7 +2142,6 @@ class PyRanges():
kwargs["example_header_self"] = self.head(1).df
dfs = pyrange_apply(_write_both, self, other, **kwargs)
-
gr = PyRanges(dfs)
if slack:
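At the API level the new flag is passed straight through to _write_both via kwargs. A hedged usage sketch with made-up intervals:

    import pandas as pd
    import pyranges as pr

    gr = pr.PyRanges(pd.DataFrame({"Chromosome": ["chr1"], "Start": [10], "End": [50]}))
    gr2 = pr.PyRanges(pd.DataFrame({"Chromosome": ["chr1"], "Start": [40], "End": [60]}))

    # With report_overlap=True the joined intervals carry an extra "Overlap"
    # column holding the overlap length in base pairs (here 10).
    print(gr.join(gr2, report_overlap=True))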
=====================================
pyranges/readers.py
=====================================
@@ -313,7 +313,7 @@ def read_gtf_full(f, as_df=False, nrows=None, skiprows=0, duplicate_attr=False):
names = "Chromosome Source Feature Start End Score Strand Frame Attribute".split(
)
- # names = "Chromosome Start End Score Strand Source Feature Frame Attribute".split()
+
df_iter = pd.read_csv(
f,
sep="\t",
@@ -352,8 +352,14 @@ def to_rows(anno):
raise Exception("Invalid attribute string: {l}. If the file is in GFF3 format, use pr.read_gff3 instead.".format(l=l))
for l in anno:
- l = l.replace('"', '').replace(";", "").split()
- rowdicts.append({k: v for k, v in zip(*([iter(l)] * 2))})
+ rowdicts.append({k: v
+ for k, v in [kv.replace('"', '').split(None, 1)
+ # l[:-1] removes final ";" cheaply
+ for kv in l[:-1].split("; ")]})
+
+ # for l in anno:
+ # l = l.replace('"', '').replace(";", "").split()
+ # rowdicts.append({k: v for k, v in zip(*([iter(l)] * 2))})
return pd.DataFrame.from_dict(rowdicts).set_index(anno.index)
@@ -362,8 +368,10 @@ def to_rows_keep_duplicates(anno):
rowdicts = []
for l in anno:
rowdict = {}
- l = l.replace('"', '').replace(";", "").split()
- for k, v in zip(*([iter(l)] * 2)):
+
+ # l[:-1] removes final ";" cheaply
+ for k, v in (kv.replace('"', '').split(None, 1) for kv in l[:-1].split("; ")):
+
if k not in rowdict:
rowdict[k] = v
elif k in rowdict and isinstance(rowdict[k], list):
@@ -442,8 +450,6 @@ def read_gtf_restricted(f,
def to_rows_gff3(anno):
rowdicts = []
- # anno = anno.str.replace(";$", "")
-
for l in list(anno):
l = (it.split("=") for it in l.split(";"))
rowdicts.append({k: v for k, v in l})
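The rewritten attribute parser splits each record on "; " and each key/value pair on the first run of whitespace only, so quoted values that themselves contain spaces survive -- the 0.0.85 fix. A standalone sketch of the same expression on a made-up attribute string:

    # Hypothetical GTF attribute column entry; line[:-1] drops the trailing ";".
    line = 'gene_id "ENSG01"; gene_name "my gene name";'

    attrs = {k: v
             for k, v in (kv.replace('"', '').split(None, 1)
                          for kv in line[:-1].split("; "))}
    # {'gene_id': 'ENSG01', 'gene_name': 'my gene name'}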
=====================================
pyranges/statistics.py
=====================================
@@ -81,7 +81,7 @@ def fdr(p_vals):
return fdr
-def fisher_exact(n1, d1, n2, d2, pseudocount=0):
+def fisher_exact(tp, fp, fn, tn, pseudocount=0):
"""Fisher's exact for contingency tables.
@@ -91,21 +91,21 @@ def fisher_exact(n1, d1, n2, d2, pseudocount=0):
Parameters
----------
- n1 : array-like of int
+ tp : array-like of int
- Top left square of contingency table.
+ Top left square of contingency table (true positives).
- d1 : array-like of int
+ fp : array-like of int
- Bottom left square of contingency table.
+ Top right square of contingency table (false positives).
- n2 : array-like of int
+ fn : array-like of int
- Top right square of contingency table.
+ Bottom left square of contingency table (false negatives).
- d2 : array-like of int
+ tn : array-like of int
- Bottom right square of contingency table.
+ Bottom right square of contingency table (true negatives).
pseudocount : float, default 0
@@ -116,7 +116,7 @@ def fisher_exact(n1, d1, n2, d2, pseudocount=0):
The odds-ratio is computed thusly:
- ``((n1 + pseudocount) / (d2 + pseudocount)) / ((n2 + pseudocount) / (d1 + pseudocount))``
+ ``((tp + pseudocount) / (fp + pseudocount)) / ((fn + pseudocount) / (tn + pseudocount))``
Returns
-------
@@ -132,19 +132,17 @@ def fisher_exact(n1, d1, n2, d2, pseudocount=0):
Examples
--------
- >>> d = {"TP": [1, 0, 8], "FP": [11, 12, 1], "TN": [9, 10, 2], "FN": [3, 2, 5]}
+ >>> d = {"TP": [12, 0], "FP": [5, 12], "TN": [29, 10], "FN": [2, 2]}
>>> df = pd.DataFrame(d)
>>> df
TP FP TN FN
- 0 1 11 9 3
+ 0 12 5 29 2
1 0 12 10 2
- 2 8 1 2 5
>>> pr.stats.fisher_exact(df.TP, df.FP, df.TN, df.FN)
OR P PLeft PRight
- 0 0.407407 0.002759 0.001380 0.999966
+ 0 0.165517 0.080269 0.044555 0.994525
1 0.000000 0.000067 0.000034 1.000000
- 2 0.800000 0.034965 0.999126 0.024476
"""
@@ -155,14 +153,14 @@ def fisher_exact(n1, d1, n2, d2, pseudocount=0):
print("fisher needs to be installed to use fisher exact. pip install fisher or conda install -c bioconda fisher.")
sys.exit(-1)
- n1 = np.array(n1, dtype=np.uint)
- n2 = np.array(n2, dtype=np.uint)
- d1 = np.array(d1, dtype=np.uint)
- d2 = np.array(d2, dtype=np.uint)
+ tp = np.array(tp, dtype=np.uint)
+ fp = np.array(fp, dtype=np.uint)
+ fn = np.array(fn, dtype=np.uint)
+ tn = np.array(tn, dtype=np.uint)
- left, right, twosided = pvalue_npy(n1, d1, n2, d2)
+ left, right, twosided = pvalue_npy(tp, fp, fn, tn)
- OR = ((n1 + pseudocount) / (d2 + pseudocount)) / ((n2 + pseudocount) / (d1 + pseudocount))
+ OR = ((tp + pseudocount) / (fp + pseudocount)) / ((fn + pseudocount) / (tn + pseudocount))
df = pd.DataFrame({"OR": OR, "P": twosided, "PLeft": left, "PRight": right})
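As a quick check of the renamed parameters: plugging the first row of the updated docstring example into the odds-ratio formula, with the arguments passed there mapping to tp=12, fp=5, fn=29, tn=2 and pseudocount 0, gives OR = (12 / 5) / (29 / 2) = 2.4 / 14.5 ≈ 0.1655, matching the 0.165517 shown above.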
=====================================
pyranges/version.py
=====================================
@@ -1 +1 @@
-__version__ = "0.0.79"
+__version__ = "0.0.85"
=====================================
tests/test_binary.py
=====================================
@@ -162,6 +162,7 @@ def test_set_union(gr, gr2, strandedness):
suppress_health_check=HealthCheck.all())
@given(gr=dfs_min(), gr2=dfs_min()) # pylint: disable=no-value-for-parameter
# @reproduce_failure('4.32.2', b'AXicY2RAA4wQzIgiCAAAgAAF')
+# @reproduce_failure('5.5.4', b'AXicY2RABYyMEAqKGRgAAHMABg==')
def test_overlap(gr, gr2, strandedness):
overlap_command = "bedtools intersect -u {strand} -a {f1} -b {f2}"
=====================================
tests/test_change_chromosome_custom.py
=====================================
@@ -0,0 +1,24 @@
+import pandas as pd
+import pyranges as pr
+
+def test_change_chromosomes():
+
+ df1 = pd.DataFrame({"Chromosome": ["chr1", "chr2"], "Start": [100, 200],
+ "End": [150, 201]})
+ py1 = pr.PyRanges(df1)
+ df2 = pd.DataFrame({"Chromosome": ["1", "2"], "Start": [1000, 2000],
+ "End": [1500, 20010]})
+ py2 = pr.PyRanges(df2)
+
+ def modify_chrom_series(df):
+ df.Chromosome = df.Chromosome.apply(lambda val: val.replace("chr", ""))
+ return df
+ def fix_chrom(regions):
+ return regions.apply(modify_chrom_series)
+
+ print(py1)
+
+ py1 = fix_chrom(py1)
+
+
+ assert py1.chromosomes == ["1", "2"]
=====================================
tests/test_io.py
=====================================
@@ -1,14 +1,11 @@
import pyranges as pr
-# def test_read_bam():
-
-# pr.read_bam("tests/test_data/test_sorted.bam")
-
def test_read_gtf():
gr = pr.read_gtf("tests/test_data/ensembl.gtf", full=True)
- assert len(gr.columns) == 28
+
+ assert len(gr.columns) == 26
df = gr.df
transcript = df.iloc[1]
@@ -19,7 +16,8 @@ def test_read_gtf():
gr = pr.read_gtf("tests/test_data/ensembl.gtf",
full=True, duplicate_attr=True)
- assert len(gr.columns) == 28
+ print(gr.columns)
+ assert len(gr.columns) == 26
df = gr.df
transcript = df.iloc[1]
=====================================
tests/test_unary.py
=====================================
@@ -234,6 +234,7 @@ makewindows_command = "bedtools makewindows -w 10 -b <(sort -k1,1 -k2,2n {})"
deadline=deadline,
suppress_health_check=HealthCheck.all())
@given(gr=dfs_min()) # pylint: disable=no-value-for-parameter
+# @reproduce_failure('5.5.4', b'AXicY2RgYGAEIzgAsRkBAFsABg==')
def test_windows(gr):
with tempfile.TemporaryDirectory() as temp_dir:
View it on GitLab: https://salsa.debian.org/med-team/pyranges/-/commit/823b99cc8d170fdb4cea05cad9dae23c6aa03791
--
You're receiving this email because of your account on salsa.debian.org.