[med-svn] [Git][med-team/python-gtfparse][master] 8 commits: New upstream version 1.3.0+ds
Andreas Tille (@tille)
gitlab at salsa.debian.org
Fri Jan 27 14:36:44 GMT 2023
Andreas Tille pushed to branch master at Debian Med / python-gtfparse
Commits:
22228862 by Mohammed Bilal at 2022-12-04T10:54:30+00:00
New upstream version 1.3.0+ds
- - - - -
ca8a1796 by Andreas Tille at 2023-01-27T14:46:45+01:00
Section: python
- - - - -
5241722c by Andreas Tille at 2023-01-27T14:46:59+01:00
New upstream version 2.0.1+ds
- - - - -
e41bc463 by Andreas Tille at 2023-01-27T14:46:59+01:00
routine-update: New upstream version
- - - - -
21441a18 by Andreas Tille at 2023-01-27T14:46:59+01:00
Update upstream source from tag 'upstream/2.0.1+ds'
Update to upstream version '2.0.1+ds'
with Debian dir 8a7dd8cf39f0ab0d86c6dce75c09a4eaa920c86c
- - - - -
ede6ba07 by Andreas Tille at 2023-01-27T14:47:00+01:00
routine-update: Standards-Version: 4.6.2
- - - - -
4b5bede7 by Andreas Tille at 2023-01-27T14:51:24+01:00
Upstream has applied patch
- - - - -
302000b9 by Andreas Tille at 2023-01-27T15:36:10+01:00
TODO: New dependency: https://github.com/pola-rs/polars
- - - - -
20 changed files:
- PKG-INFO
- debian/changelog
- debian/control
- − debian/patches/series
- − debian/patches/switch-to-pytest.patch
- gtfparse.egg-info/PKG-INFO
- gtfparse.egg-info/requires.txt
- gtfparse/__init__.py
- gtfparse/attribute_parsing.py
- gtfparse/create_missing_features.py
- gtfparse/read_gtf.py
- − gtfparse/required_columns.py
- gtfparse/version.py
- + requirements.txt
- setup.py
- test/test_create_missing_features.py
- test/test_ensembl_gtf.py
- test/test_expand_attributes.py
- test/test_multiple_values_for_tag_attribute.py
- test/test_parse_gtf_lines.py
Changes:
=====================================
PKG-INFO
=====================================
@@ -1,6 +1,6 @@
Metadata-Version: 2.1
Name: gtfparse
-Version: 1.3.0
+Version: 2.0.1
Summary: GTF Parsing
Home-page: https://github.com/openvax/gtfparse
Author: Alex Rubinsteyn
=====================================
debian/changelog
=====================================
@@ -1,3 +1,12 @@
+python-gtfparse (2.0.1+ds-1) UNRELEASED; urgency=medium
+
+ * Team upload.
+ * Section: python
+ * Standards-Version: 4.6.2 (routine-update)
+ TODO: New dependency: https://github.com/pola-rs/polars
+
+ -- Andreas Tille <tille at debian.org> Fri, 27 Jan 2023 14:46:27 +0100
+
python-gtfparse (1.3.0+ds-1) unstable; urgency=medium
* Team upload.
=====================================
debian/control
=====================================
@@ -4,7 +4,7 @@ Priority: optional
Maintainer: Debian Med Packaging Team <debian-med-packaging at lists.alioth.debian.org>
Uploaders: Steffen Moeller <moeller at debian.org>
Build-Depends: debhelper-compat (= 13), dh-python, python3-setuptools, python3-all, python3-six, python3-pandas, python3-pytest
-Standards-Version: 4.6.1
+Standards-Version: 4.6.2
Homepage: https://github.com/openvax/gtfparse
Vcs-Browser: https://salsa.debian.org/med-team/python-gtfparse
Vcs-Git: https://salsa.debian.org/med-team/python-gtfparse.git
@@ -13,6 +13,7 @@ Rules-Requires-Root: no
Package: python3-gtfparse
Architecture: all
+Section: python
Depends: ${python3:Depends}, ${misc:Depends}
Description: parser for gene transfer format (aka GFF2)
You find a gene in the genome? Or a feature about it?
=====================================
debian/patches/series deleted
=====================================
@@ -1 +0,0 @@
-switch-to-pytest.patch
=====================================
debian/patches/switch-to-pytest.patch deleted
=====================================
@@ -1,141 +0,0 @@
-Description: Switch to pytest from nose since the latter is now deprecated
-Author: Mohammed Bilal <mdbilal at disroot.org>
-Bug-Debian: https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=1018506
-Forwarded: https://github.com/openvax/gtfparse/issues/29
-Last-Update: 2022-12-04
----
-This patch header follows DEP-3: http://dep.debian.net/deps/dep3/
---- a/test/test_expand_attributes.py
-+++ b/test/test_expand_attributes.py
-@@ -1,5 +1,4 @@
- from gtfparse import expand_attribute_strings
--from nose.tools import eq_
-
- def test_attributes_in_quotes():
- attributes = [
-@@ -7,10 +6,10 @@
- "gene_id \"ENSG002\"; tag \"wolfpuppy\"; version \"2\";"
- ]
- parsed_dict = expand_attribute_strings(attributes)
-- eq_(list(sorted(parsed_dict.keys())), ["gene_id", "tag", "version"])
-- eq_(parsed_dict["gene_id"], ["ENSG001", "ENSG002"])
-- eq_(parsed_dict["tag"], ["bogotron", "wolfpuppy"])
-- eq_(parsed_dict["version"], ["1", "2"])
-+ assert list(sorted(parsed_dict.keys())) == ["gene_id", "tag", "version"]
-+ assert parsed_dict["gene_id"] == ["ENSG001", "ENSG002"]
-+ assert parsed_dict["tag"] == ["bogotron", "wolfpuppy"]
-+ assert parsed_dict["version"] == ["1", "2"]
-
-
- def test_attributes_without_quotes():
-@@ -19,10 +18,10 @@
- "gene_id ENSG002; tag wolfpuppy; version 2"
- ]
- parsed_dict = expand_attribute_strings(attributes)
-- eq_(list(sorted(parsed_dict.keys())), ["gene_id", "tag", "version"])
-- eq_(parsed_dict["gene_id"], ["ENSG001", "ENSG002"])
-- eq_(parsed_dict["tag"], ["bogotron", "wolfpuppy"])
-- eq_(parsed_dict["version"], ["1", "2"])
-+ assert list(sorted(parsed_dict.keys())) == ["gene_id", "tag", "version"]
-+ assert parsed_dict["gene_id"] == ["ENSG001", "ENSG002"]
-+ assert parsed_dict["tag"] == ["bogotron", "wolfpuppy"]
-+ assert parsed_dict["version"] == ["1", "2"]
-
-
- def test_optional_attributes():
-@@ -32,6 +31,6 @@
- "gene_id ENSG003; sometimes-present wolfpuppy;",
- ]
- parsed_dict = expand_attribute_strings(attributes)
-- eq_(list(sorted(parsed_dict.keys())), ["gene_id", "sometimes-present"])
-- eq_(parsed_dict["gene_id"], ["ENSG001", "ENSG002", "ENSG003"])
-- eq_(parsed_dict["sometimes-present"], ["bogotron", "", "wolfpuppy"])
-+ assert list(sorted(parsed_dict.keys())) == ["gene_id", "sometimes-present"]
-+ assert parsed_dict["gene_id"] == ["ENSG001", "ENSG002", "ENSG003"]
-+ assert parsed_dict["sometimes-present"] == ["bogotron", "", "wolfpuppy"]
---- a/test/test_multiple_values_for_tag_attribute.py
-+++ b/test/test_multiple_values_for_tag_attribute.py
-@@ -1,6 +1,5 @@
- from six import StringIO
- from gtfparse import parse_gtf_and_expand_attributes
--from nose.tools import eq_
-
- # failing example from https://github.com/openvax/gtfparse/issues/2
- GTF_TEXT = (
-@@ -15,18 +14,18 @@
- def test_parse_tag_attributes():
- parsed = parse_gtf_and_expand_attributes(StringIO(GTF_TEXT))
- tag_column = parsed["tag"]
-- eq_(len(tag_column), 1)
-+ assert len(tag_column) == 1
- tags = tag_column[0]
-- eq_(tags, 'cds_end_NF,mRNA_end_NF')
-+ assert tags == 'cds_end_NF,mRNA_end_NF'
-
- def test_parse_tag_attributes_with_usecols():
- parsed = parse_gtf_and_expand_attributes(
- StringIO(GTF_TEXT),
- restrict_attribute_columns=["tag"])
- tag_column = parsed["tag"]
-- eq_(len(tag_column), 1)
-+ assert len(tag_column) == 1
- tags = tag_column[0]
-- eq_(tags, 'cds_end_NF,mRNA_end_NF')
-+ assert tags == 'cds_end_NF,mRNA_end_NF'
-
- def test_parse_tag_attributes_with_usecols_other_column():
- parsed = parse_gtf_and_expand_attributes(
---- a/test/test_parse_gtf_lines.py
-+++ b/test/test_parse_gtf_lines.py
-@@ -1,5 +1,4 @@
- import numpy as np
--from nose.tools import eq_, assert_raises
- from gtfparse import (
- parse_gtf,
- parse_gtf_and_expand_attributes,
-@@ -7,6 +6,7 @@
- ParsingError
- )
- from six import StringIO
-+from pytest import raises as assert_raises
-
- gtf_text = """
- # sample GTF data copied from:
-@@ -28,27 +28,27 @@
- "transcript_source",
- ]
- # convert to list since Py3's dictionary keys are a distinct collection type
-- eq_(list(parsed_dict.keys()), expected_columns)
-- eq_(list(parsed_dict["seqname"]), ["1", "1"])
-+ assert list(parsed_dict.keys()) == expected_columns
-+ assert list(parsed_dict["seqname"]) == ["1", "1"]
- # convert to list for comparison since numerical columns may be NumPy arrays
-- eq_(list(parsed_dict["start"]), [11869, 11869])
-- eq_(list(parsed_dict["end"]), [14409, 14409])
-+ assert list(parsed_dict["start"]) == [11869, 11869]
-+ assert list(parsed_dict["end"]) == [14409, 14409]
- # can't compare NaN with equality
- scores = list(parsed_dict["score"])
- assert np.isnan(scores).all(), "Unexpected scores: %s" % scores
-- eq_(list(parsed_dict["gene_id"]), ["ENSG00000223972", "ENSG00000223972"])
-- eq_(list(parsed_dict["transcript_id"]), ["", "ENST00000456328"])
-+ assert list(parsed_dict["gene_id"]) == ["ENSG00000223972", "ENSG00000223972"]
-+ assert list(parsed_dict["transcript_id"]) == ["", "ENST00000456328"]
-
-
- def test_parse_gtf_lines_without_expand_attributes():
- parsed_dict = parse_gtf(StringIO(gtf_text))
-
- # convert to list since Py3's dictionary keys are a distinct collection type
-- eq_(list(parsed_dict.keys()), REQUIRED_COLUMNS)
-- eq_(list(parsed_dict["seqname"]), ["1", "1"])
-+ assert list(parsed_dict.keys()) == REQUIRED_COLUMNS
-+ assert list(parsed_dict["seqname"]) == ["1", "1"]
- # convert to list for comparison since numerical columns may be NumPy arrays
-- eq_(list(parsed_dict["start"]), [11869, 11869])
-- eq_(list(parsed_dict["end"]), [14409, 14409])
-+ assert list(parsed_dict["start"]) == [11869, 11869]
-+ assert list(parsed_dict["end"]) == [14409, 14409]
- # can't compare NaN with equality
- scores = list(parsed_dict["score"])
- assert np.isnan(scores).all(), "Unexpected scores: %s" % scores
=====================================
gtfparse.egg-info/PKG-INFO
=====================================
@@ -1,6 +1,6 @@
Metadata-Version: 2.1
Name: gtfparse
-Version: 1.3.0
+Version: 2.0.1
Summary: GTF Parsing
Home-page: https://github.com/openvax/gtfparse
Author: Alex Rubinsteyn
=====================================
gtfparse.egg-info/requires.txt
=====================================
@@ -1,2 +1 @@
-numpy>=1.7
-pandas>=0.15
+polars
=====================================
gtfparse/__init__.py
=====================================
@@ -12,17 +12,25 @@
from .attribute_parsing import expand_attribute_strings
from .create_missing_features import create_missing_features
-from .required_columns import REQUIRED_COLUMNS
from .parsing_error import ParsingError
-from .read_gtf import read_gtf, parse_gtf, parse_gtf_and_expand_attributes
+from .read_gtf import (
+ read_gtf,
+ parse_gtf,
+ parse_gtf_pandas,
+ parse_gtf_and_expand_attributes,
+ REQUIRED_COLUMNS,
+)
+
__all__ = [
"expand_attribute_strings",
"create_missing_features",
- "parse_gtf",
+
"parse_gtf_and_expand_attributes",
"REQUIRED_COLUMNS",
"ParsingError",
"read_gtf",
+ "parse_gtf",
+ "parse_gtf_pandas",
]
=====================================
gtfparse/attribute_parsing.py
=====================================
@@ -18,9 +18,10 @@ logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
+
def expand_attribute_strings(
attribute_strings,
- quote_char='\"',
+ quote_char="'",
missing_value="",
usecols=None):
"""
@@ -64,10 +65,11 @@ def expand_attribute_strings(
# using a local dictionary, hence the two dictionaries below
# and pair of try/except blocks in the loop.
column_interned_strings = {}
- value_interned_strings = {}
- for (i, attribute_string) in enumerate(attribute_strings):
- for kv in attribute_string.split(";"):
+ for (i, kv_strings) in enumerate(attribute_strings):
+ if type(kv_strings) is str:
+ kv_strings = kv_strings.split(";")
+ for kv in kv_strings:
# We're slicing the first two elements out of split() because
# Ensembl release 79 added values like:
# transcript_support_level "1 (assigned to previous version 5)";
@@ -88,28 +90,25 @@ def expand_attribute_strings(
if usecols is not None and column_name not in usecols:
continue
+ if value[0] == quote_char:
+ value = value.replace(quote_char, "")
+
try:
column = extra_columns[column_name]
+ # if an attribute is used repeatedly then
+ # keep track of all its values in a list
+ old_value = column[i]
+ if old_value is missing_value:
+ column[i] = value
+ else:
+ column[i] = "%s,%s" % (old_value, value)
except KeyError:
column = [missing_value] * n
+ column[i] = value
extra_columns[column_name] = column
column_order.append(column_name)
- value = value.replace(quote_char, "") if value.startswith(quote_char) else value
-
- try:
- value = value_interned_strings[value]
- except KeyError:
- value = intern(str(value))
- value_interned_strings[value] = value
- # if an attribute is used repeatedly then
- # keep track of all its values in a list
- old_value = column[i]
- if old_value is missing_value:
- column[i] = value
- else:
- column[i] = "%s,%s" % (old_value, value)
logging.info("Extracted GTF attributes: %s" % column_order)
return OrderedDict(
=====================================
gtfparse/create_missing_features.py
=====================================
@@ -55,7 +55,7 @@ def create_missing_features(
extra_dataframes = []
existing_features = set(dataframe["feature"])
- existing_columns = set(dataframe.keys())
+ existing_columns = set(dataframe.columns)
for (feature_name, groupby_key) in unique_keys.items():
if feature_name in existing_features:
=====================================
gtfparse/read_gtf.py
=====================================
@@ -12,118 +12,159 @@
import logging
from os.path import exists
+from io import StringIO
+import gzip
-from sys import intern
-import numpy as np
-import pandas as pd
+import polars
from .attribute_parsing import expand_attribute_strings
from .parsing_error import ParsingError
-from .required_columns import REQUIRED_COLUMNS
+
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
-def parse_gtf(
+"""
+Columns of a GTF file:
+
+ seqname - name of the chromosome or scaffold; chromosome names
+ without a 'chr' in Ensembl (but sometimes with a 'chr'
+ elsewhere)
+ source - name of the program that generated this feature, or
+ the data source (database or project name)
+ feature - feature type name.
+ Features currently in Ensembl GTFs:
+ gene
+ transcript
+ exon
+ CDS
+ Selenocysteine
+ start_codon
+ stop_codon
+ UTR
+ Older Ensembl releases may be missing some of these features.
+ start - start position of the feature, with sequence numbering
+ starting at 1.
+ end - end position of the feature, with sequence numbering
+ starting at 1.
+ score - a floating point value indicating the score of a feature
+ strand - defined as + (forward) or - (reverse).
+ frame - one of '0', '1' or '2'. Frame indicates the number of base pairs
+ before you encounter a full codon. '0' indicates the feature
+ begins with a whole codon. '1' indicates there is an extra
+ base (the 3rd base of the prior codon) at the start of this feature.
+ '2' indicates there are two extra bases (2nd and 3rd base of the
+ prior exon) before the first codon. All values are given with
+ relation to the 5' end.
+ attribute - a semicolon-separated list of tag-value pairs (separated by a space),
+ providing additional information about each feature. A key can be
+ repeated multiple times.
+
+(from ftp://ftp.ensembl.org/pub/release-75/gtf/homo_sapiens/README)
+"""
+
+REQUIRED_COLUMNS = [
+ "seqname",
+ "source",
+ "feature",
+ "start",
+ "end",
+ "score",
+ "strand",
+ "frame",
+ "attribute",
+]
+
+
+def parse_with_polars_lazy(
filepath_or_buffer,
- chunksize=1024 * 1024,
+ split_attributes=True,
features=None,
- intern_columns=["seqname", "source", "strand", "frame"],
fix_quotes_columns=["attribute"]):
- """
- Parameters
- ----------
-
- filepath_or_buffer : str or buffer object
-
- chunksize : int
-
- features : set or None
- Drop entries which aren't one of these features
-
- intern_columns : list
- These columns are short strings which should be interned
+ # use a global string cache so that all strings get intern'd into
+ # a single numbering system
+ polars.toggle_string_cache(True)
+ kwargs = dict(
+ has_header=False,
+ sep="\t",
+ comment_char="#",
+ null_values=".",
+ dtypes={
+ "seqname": polars.Categorical,
+ "source": polars.Categorical,
+
+ "start": polars.Int64,
+ "end": polars.Int64,
+ "score": polars.Float32,
+
+ "feature": polars.Categorical,
+ "strand": polars.Categorical,
+ "frame": polars.UInt32,
+ })
+ try:
+ if type(filepath_or_buffer) is StringIO:
+ df = polars.read_csv(
+ filepath_or_buffer,
+ new_columns=REQUIRED_COLUMNS,
+ **kwargs).lazy()
+ elif filepath_or_buffer.endswith(".gz") or filepath_or_buffer.endswith(".gzip"):
+ with gzip.open(filepath_or_buffer) as f:
+ df = polars.read_csv(
+ f,
+ new_columns=REQUIRED_COLUMNS,
+ **kwargs).lazy()
+ else:
+ df = polars.scan_csv(
+ filepath_or_buffer,
+ with_column_names=lambda cols: REQUIRED_COLUMNS,
+ **kwargs).lazy()
+ except polars.ShapeError:
+ raise ParsingError("Wrong number of columns")
+
+ df = df.with_columns([
+ polars.col("frame").fill_null(0),
+ polars.col("attribute").str.replace_all('"', "'")
+ ])
+
+ for fix_quotes_column in fix_quotes_columns:
+ # Catch mistaken semicolons by replacing "xyz;" with "xyz"
+ # Required to do this since the Ensembl GTF for Ensembl
+ # release 78 has mistakes such as:
+ # gene_name = "PRAMEF6;" transcript_name = "PRAMEF6;-201"
+ df = df.with_columns([
+ polars.col(fix_quotes_column).str.replace(';\"', '\"').str.replace(";-", "-")
+ ])
- fix_quotes_columns : list
- Most commonly the 'attribute' column which had broken quotes on
- some Ensembl release GTF files.
- """
if features is not None:
- features = set(features)
+ features = sorted(set(features))
+ df = df.filter(polars.col("feature").is_in(features))
- dataframes = []
- def parse_frame(s):
- if s == ".":
- return 0
- else:
- return int(s)
-
- # GTF columns:
- # 1) seqname: str ("1", "X", "chrX", etc...)
- # 2) source : str
- # Different versions of GTF use second column as of:
- # (a) gene biotype
- # (b) transcript biotype
- # (c) the annotation source
- # See: https://www.biostars.org/p/120306/#120321
- # 3) feature : str ("gene", "transcript", &c)
- # 4) start : int
- # 5) end : int
- # 6) score : float or "."
- # 7) strand : "+", "-", or "."
- # 8) frame : 0, 1, 2 or "."
- # 9) attribute : key-value pairs separated by semicolons
- # (see more complete description in docstring at top of file)
-
- chunk_iterator = pd.read_csv(
- filepath_or_buffer,
- sep="\t",
- comment="#",
- names=REQUIRED_COLUMNS,
- skipinitialspace=True,
- skip_blank_lines=True,
- on_bad_lines="error",
- chunksize=chunksize,
- engine="c",
- dtype={
- "start": np.int64,
- "end": np.int64,
- "score": np.float32,
- "seqname": str,
- },
- na_values=".",
- converters={"frame": parse_frame})
- dataframes = []
- try:
- for df in chunk_iterator:
- for intern_column in intern_columns:
- df[intern_column] = [intern(str(s)) for s in df[intern_column]]
-
- # compare feature strings after interning
- if features is not None:
- df = df[df["feature"].isin(features)]
-
- for fix_quotes_column in fix_quotes_columns:
- # Catch mistaken semicolons by replacing "xyz;" with "xyz"
- # Required to do this since the Ensembl GTF for Ensembl
- # release 78 has mistakes such as:
- # gene_name = "PRAMEF6;" transcript_name = "PRAMEF6;-201"
- df[fix_quotes_column] = [
- s.replace(';\"', '\"').replace(";-", "-")
- for s in df[fix_quotes_column]
- ]
- dataframes.append(df)
- except Exception as e:
- raise ParsingError(str(e))
- df = pd.concat(dataframes)
+ if split_attributes:
+ df = df.with_columns([
+ polars.col("attribute").str.split(";").alias("attribute_split")
+ ])
return df
+def parse_gtf(
+ filepath_or_buffer,
+ split_attributes=True,
+ features=None,
+ fix_quotes_columns=["attribute"]):
+ df_lazy = parse_with_polars_lazy(
+ filepath_or_buffer=filepath_or_buffer,
+ split_attributes=split_attributes,
+ features=features,
+ fix_quotes_columns=fix_quotes_columns)
+ return df_lazy.collect()
+
+def parse_gtf_pandas(*args, **kwargs):
+ return parse_gtf(*args, **kwargs).to_pandas()
+
def parse_gtf_and_expand_attributes(
filepath_or_buffer,
- chunksize=1024 * 1024,
restrict_attribute_columns=None,
features=None):
"""
@@ -140,21 +181,27 @@ def parse_gtf_and_expand_attributes(
chunksize : int
restrict_attribute_columns : list/set of str or None
- If given, then only usese attribute columns.
+ If given, then only use these attribute columns.
features : set or None
Ignore entries which don't correspond to one of the supplied features
"""
- result = parse_gtf(
- filepath_or_buffer,
- chunksize=chunksize,
- features=features)
- attribute_values = result["attribute"]
- del result["attribute"]
- for column_name, values in expand_attribute_strings(
- attribute_values, usecols=restrict_attribute_columns).items():
- result[column_name] = values
- return result
+ df = parse_gtf(
+ filepath_or_buffer=filepath_or_buffer,
+ features=features,
+ split_attributes=True)
+ if type(restrict_attribute_columns) is str:
+ restrict_attribute_columns = {restrict_attribute_columns}
+ elif restrict_attribute_columns:
+ restrict_attribute_columns = set(restrict_attribute_columns)
+ df.drop_in_place("attribute")
+ attribute_pairs = df.drop_in_place("attribute_split")
+ return df.with_columns([
+ polars.Series(k, vs)
+ for (k, vs) in
+ expand_attribute_strings(attribute_pairs).items()
+ if restrict_attribute_columns is None or k in restrict_attribute_columns
+ ])
def read_gtf(
@@ -164,7 +211,7 @@ def read_gtf(
column_converters={},
usecols=None,
features=None,
- chunksize=1024 * 1024):
+ result_type='polars'):
"""
Parse a GTF into a dictionary mapping column names to sequences of values.
@@ -196,7 +243,9 @@ def read_gtf(
features : set of str or None
Drop rows which aren't one of the features in the supplied set
- chunksize : int
+ result_type : One of 'polars', 'pandas', or 'dict'
+ Default behavior is to return a Polars DataFrame, but will convert to
+ Pandas DataFrame or dictionary if specified.
"""
if type(filepath_or_buffer) is str and not exists(filepath_or_buffer):
raise ValueError("GTF file does not exist: %s" % filepath_or_buffer)
@@ -204,18 +253,17 @@ def read_gtf(
if expand_attribute_column:
result_df = parse_gtf_and_expand_attributes(
filepath_or_buffer,
- chunksize=chunksize,
restrict_attribute_columns=usecols,
features=features)
else:
result_df = parse_gtf(result_df, features=features)
- for column_name, column_type in list(column_converters.items()):
- result_df[column_name] = [
- column_type(string_value) if len(string_value) > 0 else None
- for string_value
- in result_df[column_name]
+ result_df = result_df.with_columns(
+ [
+ polars.col(column_name).apply(lambda x: column_type(x) if len(x) > 0 else None)
+ for column_name, column_type in column_converters.items()
]
+ )
# Hackishly infer whether the values in the 'source' column of this GTF
# are actually representing a biotype by checking for the most common
@@ -230,14 +278,20 @@ def read_gtf(
# gene_biotype)
if "gene_biotype" not in column_names:
logging.info("Using column 'source' to replace missing 'gene_biotype'")
- result_df["gene_biotype"] = result_df["source"]
+ result_df = result_df.with_column(polars.col("source").alias("gene_biotype"))
if "transcript_biotype" not in column_names:
logging.info("Using column 'source' to replace missing 'transcript_biotype'")
- result_df["transcript_biotype"] = result_df["source"]
+ result_df = result_df.with_column(polars.col("source").alias("transcript_biotype"))
if usecols is not None:
column_names = set(result_df.columns)
valid_columns = [c for c in usecols if c in column_names]
- result_df = result_df[valid_columns]
-
- return result_df
+ result_df = result_df.select(valid_columns)
+
+ if result_type == "pandas":
+ result = result_df.to_pandas()
+ elif result_type == "polars":
+ result = result_df
+ elif result_type == "dict":
+ result = result_df.to_dict()
+ return result
=====================================
gtfparse/required_columns.py deleted
=====================================
@@ -1,62 +0,0 @@
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""
-Columns of a GTF file:
-
- seqname - name of the chromosome or scaffold; chromosome names
- without a 'chr' in Ensembl (but sometimes with a 'chr'
- elsewhere)
- source - name of the program that generated this feature, or
- the data source (database or project name)
- feature - feature type name.
- Features currently in Ensembl GTFs:
- gene
- transcript
- exon
- CDS
- Selenocysteine
- start_codon
- stop_codon
- UTR
- Older Ensembl releases may be missing some of these features.
- start - start position of the feature, with sequence numbering
- starting at 1.
- end - end position of the feature, with sequence numbering
- starting at 1.
- score - a floating point value indicating the score of a feature
- strand - defined as + (forward) or - (reverse).
- frame - one of '0', '1' or '2'. Frame indicates the number of base pairs
- before you encounter a full codon. '0' indicates the feature
- begins with a whole codon. '1' indicates there is an extra
- base (the 3rd base of the prior codon) at the start of this feature.
- '2' indicates there are two extra bases (2nd and 3rd base of the
- prior exon) before the first codon. All values are given with
- relation to the 5' end.
- attribute - a semicolon-separated list of tag-value pairs (separated by a space),
- providing additional information about each feature. A key can be
- repeated multiple times.
-
-(from ftp://ftp.ensembl.org/pub/release-75/gtf/homo_sapiens/README)
-"""
-
-REQUIRED_COLUMNS = [
- "seqname",
- "source",
- "feature",
- "start",
- "end",
- "score",
- "strand",
- "frame",
- "attribute",
-]
=====================================
gtfparse/version.py
=====================================
@@ -1 +1 @@
-__version__ = "1.3.0"
\ No newline at end of file
+__version__ = "2.0.1"
=====================================
requirements.txt
=====================================
@@ -0,0 +1 @@
+polars
=====================================
setup.py
=====================================
@@ -10,15 +10,19 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-from __future__ import print_function
import os
import re
from setuptools import setup, find_packages
-readme_filename = "README.md"
+
+
+package_name = "gtfparse"
current_directory = os.path.dirname(__file__)
+readme_filename = 'README.md'
readme_path = os.path.join(current_directory, readme_filename)
+github_url = "https://github.com/openvax/%s" % package_name
+
readme_markdown = ""
try:
@@ -34,15 +38,19 @@ with open('gtfparse/version.py', 'r') as f:
f.read(),
re.MULTILINE).group(1)
+with open("requirements.txt") as f:
+ requirements = [l.strip() for l in f]
+
+
if __name__ == '__main__':
setup(
- name='gtfparse',
+ name=package_name,
packages=find_packages(),
version=version,
description="GTF Parsing",
long_description=readme_markdown,
long_description_content_type='text/markdown',
- url="https://github.com/openvax/gtfparse",
+ url=github_url,
author="Alex Rubinsteyn",
license="http://www.apache.org/licenses/LICENSE-2.0.html",
classifiers=[
@@ -54,8 +62,8 @@ if __name__ == '__main__':
'Programming Language :: Python',
'Topic :: Scientific/Engineering :: Bio-Informatics',
],
- install_requires=[
- 'numpy>=1.7',
- 'pandas>=0.15',
- ],
+ install_requires=requirements,
+ package_data={
+ package_name: ['../requirements.txt'],
+ },
)
=====================================
test/test_create_missing_features.py
=====================================
@@ -1,5 +1,5 @@
from gtfparse import create_missing_features, parse_gtf_and_expand_attributes
-from six import StringIO
+from io import StringIO
# two lines from the Ensembl 54 human GTF containing only a stop_codon and
# exon features, but from which gene and transcript information could be
@@ -18,7 +18,7 @@ GTF_TEXT = "\n".join([
])
-GTF_DATAFRAME = parse_gtf_and_expand_attributes(StringIO(GTF_TEXT))
+GTF_DATAFRAME = parse_gtf_and_expand_attributes(StringIO(GTF_TEXT)).to_pandas()
def test_create_missing_features_identity():
df_should_be_same = create_missing_features(GTF_DATAFRAME, {})
=====================================
test/test_ensembl_gtf.py
=====================================
@@ -1,6 +1,6 @@
from data import data_path
from gtfparse import read_gtf
-from nose.tools import eq_
+
ENSEMBL_GTF_PATH = data_path("ensembl_grch37.head.gtf")
@@ -18,7 +18,7 @@ EXPECTED_FEATURES = set([
def test_ensembl_gtf_columns():
df = read_gtf(ENSEMBL_GTF_PATH)
features = set(df["feature"])
- eq_(features, EXPECTED_FEATURES)
+ assert features == EXPECTED_FEATURES
# first 1000 lines of GTF only contained these genes
EXPECTED_GENE_NAMES = {
=====================================
test/test_expand_attributes.py
=====================================
@@ -1,16 +1,15 @@
from gtfparse import expand_attribute_strings
-from nose.tools import eq_
def test_attributes_in_quotes():
attributes = [
"gene_id \"ENSG001\"; tag \"bogotron\"; version \"1\";",
"gene_id \"ENSG002\"; tag \"wolfpuppy\"; version \"2\";"
]
- parsed_dict = expand_attribute_strings(attributes)
- eq_(list(sorted(parsed_dict.keys())), ["gene_id", "tag", "version"])
- eq_(parsed_dict["gene_id"], ["ENSG001", "ENSG002"])
- eq_(parsed_dict["tag"], ["bogotron", "wolfpuppy"])
- eq_(parsed_dict["version"], ["1", "2"])
+ parsed_dict = expand_attribute_strings(attributes, quote_char='"')
+ assert list(sorted(parsed_dict.keys())), ["gene_id", "tag", "version"]
+ assert parsed_dict["gene_id"] == ["ENSG001", "ENSG002"]
+ assert parsed_dict["tag"] == ["bogotron", "wolfpuppy"]
+ assert parsed_dict["version"] == ["1", "2"]
def test_attributes_without_quotes():
@@ -19,10 +18,10 @@ def test_attributes_without_quotes():
"gene_id ENSG002; tag wolfpuppy; version 2"
]
parsed_dict = expand_attribute_strings(attributes)
- eq_(list(sorted(parsed_dict.keys())), ["gene_id", "tag", "version"])
- eq_(parsed_dict["gene_id"], ["ENSG001", "ENSG002"])
- eq_(parsed_dict["tag"], ["bogotron", "wolfpuppy"])
- eq_(parsed_dict["version"], ["1", "2"])
+ assert list(sorted(parsed_dict.keys())) == ["gene_id", "tag", "version"]
+ assert parsed_dict["gene_id"] == ["ENSG001", "ENSG002"]
+ assert parsed_dict["tag"] == ["bogotron", "wolfpuppy"]
+ assert parsed_dict["version"] == ["1", "2"]
def test_optional_attributes():
@@ -32,6 +31,6 @@ def test_optional_attributes():
"gene_id ENSG003; sometimes-present wolfpuppy;",
]
parsed_dict = expand_attribute_strings(attributes)
- eq_(list(sorted(parsed_dict.keys())), ["gene_id", "sometimes-present"])
- eq_(parsed_dict["gene_id"], ["ENSG001", "ENSG002", "ENSG003"])
- eq_(parsed_dict["sometimes-present"], ["bogotron", "", "wolfpuppy"])
+ assert list(sorted(parsed_dict.keys())) == ["gene_id", "sometimes-present"]
+ assert parsed_dict["gene_id"] == ["ENSG001", "ENSG002", "ENSG003"]
+ assert parsed_dict["sometimes-present"] == ["bogotron", "", "wolfpuppy"]
=====================================
test/test_multiple_values_for_tag_attribute.py
=====================================
@@ -1,6 +1,5 @@
-from six import StringIO
+from io import StringIO
from gtfparse import parse_gtf_and_expand_attributes
-from nose.tools import eq_
# failing example from https://github.com/openvax/gtfparse/issues/2
GTF_TEXT = (
@@ -15,23 +14,22 @@ GTF_TEXT = (
def test_parse_tag_attributes():
parsed = parse_gtf_and_expand_attributes(StringIO(GTF_TEXT))
tag_column = parsed["tag"]
- eq_(len(tag_column), 1)
+ assert len(tag_column) == 1
tags = tag_column[0]
- eq_(tags, 'cds_end_NF,mRNA_end_NF')
+ assert tags == 'cds_end_NF,mRNA_end_NF'
def test_parse_tag_attributes_with_usecols():
parsed = parse_gtf_and_expand_attributes(
StringIO(GTF_TEXT),
restrict_attribute_columns=["tag"])
tag_column = parsed["tag"]
- eq_(len(tag_column), 1)
+ assert len(tag_column) == 1
tags = tag_column[0]
- eq_(tags, 'cds_end_NF,mRNA_end_NF')
+ assert tags == 'cds_end_NF,mRNA_end_NF'
def test_parse_tag_attributes_with_usecols_other_column():
parsed = parse_gtf_and_expand_attributes(
StringIO(GTF_TEXT),
restrict_attribute_columns=["exon_id"])
- tag_column = parsed.get("tag")
- assert tag_column is None, "Expected 'tag' to get dropped but got %s" % (parsed,)
+ assert "tag" not in parsed, "Expected 'tag' to get dropped but got %s" % (parsed,)
=====================================
test/test_parse_gtf_lines.py
=====================================
@@ -1,12 +1,11 @@
-import numpy as np
-from nose.tools import eq_, assert_raises
+from pytest import raises
from gtfparse import (
parse_gtf,
parse_gtf_and_expand_attributes,
REQUIRED_COLUMNS,
ParsingError
)
-from six import StringIO
+from io import StringIO
gtf_text = """
# sample GTF data copied from:
@@ -16,7 +15,9 @@ gtf_text = """
"""
def test_parse_gtf_lines_with_expand_attributes():
- parsed_dict = parse_gtf_and_expand_attributes(StringIO(gtf_text))
+ df = parse_gtf_and_expand_attributes(StringIO(gtf_text))
+
+
# excluding 'attribute' column from required names
expected_columns = REQUIRED_COLUMNS[:8] + [
"gene_id",
@@ -28,40 +29,31 @@ def test_parse_gtf_lines_with_expand_attributes():
"transcript_source",
]
# convert to list since Py3's dictionary keys are a distinct collection type
- eq_(list(parsed_dict.keys()), expected_columns)
- eq_(list(parsed_dict["seqname"]), ["1", "1"])
+ assert list(df.columns) == expected_columns
+ assert list(df["seqname"]) == ["1", "1"]
# convert to list for comparison since numerical columns may be NumPy arrays
- eq_(list(parsed_dict["start"]), [11869, 11869])
- eq_(list(parsed_dict["end"]), [14409, 14409])
- # can't compare NaN with equality
- scores = list(parsed_dict["score"])
- assert np.isnan(scores).all(), "Unexpected scores: %s" % scores
- eq_(list(parsed_dict["gene_id"]), ["ENSG00000223972", "ENSG00000223972"])
- eq_(list(parsed_dict["transcript_id"]), ["", "ENST00000456328"])
+ assert list(df["start"]) == [11869, 11869]
+ assert list(df["end"]) == [14409, 14409]
+
+ assert df["score"].is_null().all(), "Unexpected scores: %s" % (df["score"],)
+ assert list(df["gene_id"]) == ["ENSG00000223972", "ENSG00000223972"]
+ assert list(df["transcript_id"]) == ["", "ENST00000456328"]
def test_parse_gtf_lines_without_expand_attributes():
- parsed_dict = parse_gtf(StringIO(gtf_text))
+ df = parse_gtf(StringIO(gtf_text), split_attributes=False)
# convert to list since Py3's dictionary keys are a distinct collection type
- eq_(list(parsed_dict.keys()), REQUIRED_COLUMNS)
- eq_(list(parsed_dict["seqname"]), ["1", "1"])
+ assert list(df.columns) == REQUIRED_COLUMNS
+ assert list(df["seqname"]) == ["1", "1"]
# convert to list for comparison since numerical columns may be NumPy arrays
- eq_(list(parsed_dict["start"]), [11869, 11869])
- eq_(list(parsed_dict["end"]), [14409, 14409])
- # can't compare NaN with equality
- scores = list(parsed_dict["score"])
- assert np.isnan(scores).all(), "Unexpected scores: %s" % scores
- assert len(parsed_dict["attribute"]) == 2
-
-def test_parse_gtf_lines_error_too_many_fields():
- bad_gtf_text = gtf_text.replace(" ", "\t")
- # pylint: disable=no-value-for-parameter
- with assert_raises(ParsingError):
- parse_gtf(StringIO(bad_gtf_text))
+ assert list(df["start"]) == [11869, 11869]
+ assert list(df["end"]) == [14409, 14409]
+ assert df["score"].is_null().all(), "Unexpected scores: %s" % (df["score"],)
+ assert len(df["attribute"]) == 2
def test_parse_gtf_lines_error_too_few_fields():
bad_gtf_text = gtf_text.replace("\t", " ")
# pylint: disable=no-value-for-parameter
- with assert_raises(ParsingError):
+ with raises(ParsingError):
parse_gtf(StringIO(bad_gtf_text))
View it on GitLab: https://salsa.debian.org/med-team/python-gtfparse/-/compare/690e1e33c2561e0f5b6333fd2d78094c69df70e3...302000b997aaffe77d9a14b1a6691f92c2d94abb
--
View it on GitLab: https://salsa.debian.org/med-team/python-gtfparse/-/compare/690e1e33c2561e0f5b6333fd2d78094c69df70e3...302000b997aaffe77d9a14b1a6691f92c2d94abb
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20230127/5ab32054/attachment-0001.htm>
More information about the debian-med-commit mailing list