[med-svn] [Git][med-team/python-gffutils][upstream] New upstream version 0.12
Étienne Mollier (@emollier)
gitlab at salsa.debian.org
Thu Jul 20 00:04:37 BST 2023
Étienne Mollier pushed to branch upstream at Debian Med / python-gffutils
Commits:
e65119c5 by Étienne Mollier at 2023-07-20T00:52:49+02:00
New upstream version 0.12
- - - - -
14 changed files:
- PKG-INFO
- README.rst
- gffutils.egg-info/PKG-INFO
- gffutils.egg-info/SOURCES.txt
- gffutils/interface.py
- gffutils/parser.py
- + gffutils/test/conftest.py
- − gffutils/test/data/issue181.gff
- gffutils/test/parser_test.py
- gffutils/test/performance_evaluation.py
- gffutils/test/test.py → gffutils/test/test_1.py
- gffutils/test/test_issues.py
- gffutils/version.py
- setup.py
Changes:
=====================================
PKG-INFO
=====================================
@@ -1,6 +1,6 @@
Metadata-Version: 2.1
Name: gffutils
-Version: 0.11.1
+Version: 0.12
Summary: Work with GFF and GTF files in a flexible database framework
Home-page: https://github.com/daler/gffutils
Author: Ryan Dale
@@ -19,3 +19,16 @@ Classifier: Programming Language :: Python :: 3.5
Classifier: Programming Language :: Python :: 3.6
Classifier: Topic :: Software Development :: Libraries :: Python Modules
License-File: LICENSE
+
+gffutils
+========
+
+``gffutils`` is a Python package for working with and manipulating the GFF and
+GTF format files typically used for genomic annotations.
+
+Files are loaded into a sqlite3 database, allowing much more complex
+manipulation of hierarchical features (e.g., genes, transcripts, and exons)
+than is possible with plain-text methods alone.
+
+See documentation at https://daler.github.io/gffutils, and GitHub repo at
+https://github.com/daler/gffutils.
=====================================
README.rst
=====================================
@@ -1,19 +1,12 @@
-
-.. image:: https://travis-ci.org/daler/gffutils.png?branch=master
- :target: https://travis-ci.org/daler/gffutils
-
-.. image:: https://badge.fury.io/py/gffutils.svg
- :target: http://badge.fury.io/py/gffutils
-
-.. image:: https://pypip.in/d/gffutils/badge.png
- :target: https://pypi.python.org/pypi/gffutils
-
-
+gffutils
+========
``gffutils`` is a Python package for working with and manipulating the GFF and
-GTF format files typically used for genomic annotations. Files are loaded into
-a sqlite3 database, allowing much more complex manipulation of hierarchical
-features (e.g., genes, transcripts, and exons) than is possible with plain-text
-methods alone.
+GTF format files typically used for genomic annotations.
+
+Files are loaded into a sqlite3 database, allowing much more complex
+manipulation of hierarchical features (e.g., genes, transcripts, and exons)
+than is possible with plain-text methods alone.
-See documentation at **http://daler.github.io/gffutils**.
+See documentation at https://daler.github.io/gffutils, and GitHub repo at
+https://github.com/daler/gffutils.
=====================================
gffutils.egg-info/PKG-INFO
=====================================
@@ -1,6 +1,6 @@
Metadata-Version: 2.1
Name: gffutils
-Version: 0.11.1
+Version: 0.12
Summary: Work with GFF and GTF files in a flexible database framework
Home-page: https://github.com/daler/gffutils
Author: Ryan Dale
@@ -19,3 +19,16 @@ Classifier: Programming Language :: Python :: 3.5
Classifier: Programming Language :: Python :: 3.6
Classifier: Topic :: Software Development :: Libraries :: Python Modules
License-File: LICENSE
+
+gffutils
+========
+
+``gffutils`` is a Python package for working with and manipulating the GFF and
+GTF format files typically used for genomic annotations.
+
+Files are loaded into a sqlite3 database, allowing much more complex
+manipulation of hierarchical features (e.g., genes, transcripts, and exons)
+than is possible with plain-text methods alone.
+
+See documentation at https://daler.github.io/gffutils, and GitHub repo at
+https://github.com/daler/gffutils.
=====================================
gffutils.egg-info/SOURCES.txt
=====================================
@@ -31,13 +31,14 @@ gffutils/scripts/gffutils-cli
gffutils/scripts/gffutils-flybase-convert.py
gffutils/test/__init__.py
gffutils/test/attr_test_cases.py
+gffutils/test/conftest.py
gffutils/test/expected.py
gffutils/test/feature_test.py
gffutils/test/helpers_test.py
gffutils/test/parser_test.py
gffutils/test/performance_evaluation.py
gffutils/test/synth_test_base.py
-gffutils/test/test.py
+gffutils/test/test_1.py
gffutils/test/test_biopython_integration.py
gffutils/test/test_issues.py
gffutils/test/test_iterators.py
@@ -70,7 +71,6 @@ gffutils/test/data/hybrid1.gff3
gffutils/test/data/intro_docs_example.gff
gffutils/test/data/issue167.gff
gffutils/test/data/issue174.gtf
-gffutils/test/data/issue181.gff
gffutils/test/data/issue_197.gff
gffutils/test/data/jgi_gff2.txt
gffutils/test/data/keep-order-test.gtf
=====================================
gffutils/interface.py
=====================================
@@ -102,7 +102,7 @@ class FeatureDB(object):
keep_order=False,
pragmas=constants.default_pragmas,
sort_attribute_values=False,
- text_factory=sqlite3.OptimizedUnicode,
+ text_factory=str
):
"""
Connect to a database created by :func:`gffutils.create_db`.
@@ -117,8 +117,7 @@ class FeatureDB(object):
text_factory : callable
Optionally set the way sqlite3 handles strings. Default is
- sqlite3.OptimizedUnicode, which returns ascii when possible,
- unicode otherwise
+ str
default_encoding : str
@@ -895,7 +894,14 @@ class FeatureDB(object):
if d['start'] > d['end']:
return None
- return self._feature_returner(**d)
+ new_feature = self._feature_returner(**d)
+
+ # concat list of ID to create uniq IDs because feature with
+ # multiple values for their ID are no longer permitted since v0.11
+ if "ID" in new_feature.attributes and len(new_feature.attributes["ID"]) > 1:
+ new_id = '-'.join(new_feature.attributes["ID"])
+ new_feature.attributes["ID"] = [new_id]
+ return new_feature
# If not provided, use a no-op function instead.
if not attribute_func:
@@ -1267,6 +1273,128 @@ class FeatureDB(object):
):
yield intron
+ def create_splice_sites(
+ self,
+ exon_featuretype="exon",
+ grandparent_featuretype="gene",
+ parent_featuretype=None,
+ merge_attributes=True,
+ numeric_sort=False,
+ ):
+ """
+ Create splice sites from existing annotations.
+
+
+ Parameters
+ ----------
+ exon_featuretype : string
+ Feature type to use in order to infer splice sites. Typically `"exon"`.
+
+ grandparent_featuretype : string
+ If `grandparent_featuretype` is not None, then group exons by
+ children of this featuretype. If `grandparent_featuretype` is
+ "gene" (default), then splice sites will be created for all first-level
+ children of genes. This may include mRNA, rRNA, ncRNA, etc. If
+ you only want to infer splice sites from one of these featuretypes
+ (e.g., mRNA), then use the `parent_featuretype` kwarg which is
+ mutually exclusive with `grandparent_featuretype`.
+
+ parent_featuretype : string
+ If `parent_featuretype` is not None, then only use this featuretype
+ to infer splice sites. Use this if you only want a subset of
+ featuretypes to have splice sites (e.g., "mRNA" only, and not ncRNA or
+ rRNA). Mutually exclusive with `grandparent_featuretype`.
+
+ merge_attributes : bool
+ Whether or not to merge attributes from all exons. If False then no
+ attributes will be created for the splice sites.
+
+ numeric_sort : bool
+ If True, then merged attributes that can be cast to float will be
+ sorted by their numeric values (but will still be returned as
+ string). This is useful, for example, when creating splice sites between
+ exons and the exons have exon_number attributes as an integer.
+ Using numeric_sort=True will ensure that the returned exons have
+ merged exon_number attribute of ['9', '10'] (numerically sorted)
+ rather than ['10', '9'] (alphabetically sorted).
+
+ Returns
+ -------
+ A generator object that yields :class:`Feature` objects representing
+ new splice sites
+
+ Notes
+ -----
+ The returned generator can be passed directly to the
+ :meth:`FeatureDB.update` method to permanently add them to the
+ database, e.g., ::
+
+ db.update(db.create_splice_sites())
+
+ """
+ if (grandparent_featuretype and parent_featuretype) or (
+ grandparent_featuretype is None and parent_featuretype is None
+ ):
+ raise ValueError(
+ "exactly one of `grandparent_featuretype` or "
+ "`parent_featuretype` should be provided"
+ )
+
+ if grandparent_featuretype:
+
+ def child_gen():
+ for gene in self.features_of_type(grandparent_featuretype):
+ for child in self.children(gene, level=1):
+ yield child
+
+ elif parent_featuretype:
+
+ def child_gen():
+ for child in self.features_of_type(parent_featuretype):
+ yield child
+
+ # Two splice features need to be created for each interleave
+ for side in ["left", "right"]:
+ for child in child_gen():
+ exons = self.children(
+ child, level=1, featuretype=exon_featuretype, order_by="start"
+ )
+
+ # get strand
+ strand = child.strand
+
+ new_featuretype = "splice_site"
+ if side == "left":
+ if strand == "+":
+ new_featuretype = "five_prime_cis_splice_site"
+ elif strand == "-":
+ new_featuretype = "three_prime_cis_splice_site"
+
+ if side == "right":
+ if strand == "+":
+ new_featuretype = "three_prime_cis_splice_site"
+ elif strand == "-":
+ new_featuretype = "five_prime_cis_splice_site"
+
+ for splice_site in self.interfeatures(
+ exons,
+ new_featuretype=new_featuretype,
+ merge_attributes=merge_attributes,
+ numeric_sort=numeric_sort,
+ dialect=self.dialect,
+ ):
+
+ if side == "left":
+ splice_site.end = splice_site.start + 1
+ if side == "right":
+ splice_site.start = splice_site.end - 1
+
+ # make ID uniq by adding suffix
+ splice_site.attributes["ID"] = [new_featuretype + "_" + splice_site.attributes["ID"][0]]
+
+ yield splice_site
+
+
def _old_merge(self, features, ignore_strand=False):
"""
DEPRECATED, only retained here for backwards compatibility. Please use
=====================================
gffutils/parser.py
=====================================
@@ -341,16 +341,29 @@ def _split_keyvals(keyval_str, dialect=None):
val = val[1:-1]
dialect["quoted GFF2 values"] = True
if val:
+
# TODO: if there are extra commas for a value, just use empty
# strings
# quals[key].extend([v for v in val.split(',') if v])
- vals = val.split(",")
- if (len(vals) > 1) and dialect["repeated keys"]:
- raise AttributeStringError(
- "Internally inconsistent attributes formatting: "
- "some have repeated keys, some do not."
- )
- quals[key].extend(vals)
+
+ # See issue #198, where commas within a description can incorrectly
+ # cause the dialect inference to conclude that there are not
+ # repeated keys.
+ #
+ # More description in PR #208.
+ if dialect["repeated keys"]:
+ quals[key].append(val)
+ else:
+ vals = val.split(",")
+
+ # If anything starts with a leading space, then we infer that
+ # it was part of a description or some other typographical
+ # interpretation, not a character to split multiple vals on --
+ # and append the original val rather than the split vals.
+ if any([i[0] == " " for i in vals if i]):
+ quals[key].append(val)
+ else:
+ quals[key].extend(vals)
# keep track of the order of keys
dialect["order"].append(key)
=====================================
gffutils/test/conftest.py
=====================================
@@ -0,0 +1 @@
+collect_ignore=["data"]
=====================================
gffutils/test/data/issue181.gff deleted
=====================================
@@ -1,14 +0,0 @@
-#OriSeqID=Chr1 Accession=GWHACDB00000001.1
-GWHACDB00000001.1 . gene 9738 14906 1 + . ID=Hc.01G000010;Accession=GWHGACDB000001.1;Augustus_transcriptSupport_percentage=100;Augustus_intronSupport=4/4;Source=augustus000002;Integrity=complete;;transl_table=1
-GWHACDB00000001.1 . mRNA 9738 14906 1 + . ID=Hc.01G000010.t1;Accession=GWHTACDB000001.1;Parent=Hc.01G000010;Parent_Accession=GWHGACDB000001.1;IntronSupport=4/4;Integrity=complete;;transl_table=1
-GWHACDB00000001.1 . exon 9738 11686 . + . ID=Hc.01G000010.t1.exon1;Parent=Hc.01G000010.t1;Parent_Accession=GWHTACDB000001.1;;transl_table=1
-GWHACDB00000001.1 . CDS 9738 11686 1 + 0 ID=Hc.01G000010.t1.CDS1;Parent=Hc.01G000010.t1;Parent_Accession=GWHTACDB000001.1;Protein_Accession=GWHPACDB000001.1;;transl_table=1
-GWHACDB00000001.1 . exon 11897 12671 . + . ID=Hc.01G000010.t1.exon2;Parent=Hc.01G000010.t1;Parent_Accession=GWHTACDB000001.1;;transl_table=1
-GWHACDB00000001.1 . CDS 11897 12671 1 + 1 ID=Hc.01G000010.t1.CDS2;Parent=Hc.01G000010.t1;Parent_Accession=GWHTACDB000001.1;Protein_Accession=GWHPACDB000001.1;;transl_table=1
-GWHACDB00000001.1 . exon 13090 13368 . + . ID=Hc.01G000010.t1.exon3;Parent=Hc.01G000010.t1;Parent_Accession=GWHTACDB000001.1;;transl_table=1
-GWHACDB00000001.1 . CDS 13090 13368 1 + 0 ID=Hc.01G000010.t1.CDS3;Parent=Hc.01G000010.t1;Parent_Accession=GWHTACDB000001.1;Protein_Accession=GWHPACDB000001.1;;transl_table=1
-GWHACDB00000001.1 . exon 13650 14396 . + . ID=Hc.01G000010.t1.exon4;Parent=Hc.01G000010.t1;Parent_Accession=GWHTACDB000001.1;;transl_table=1
-GWHACDB00000001.1 . CDS 13650 14396 1 + 0 ID=Hc.01G000010.t1.CDS4;Parent=Hc.01G000010.t1;Parent_Accession=GWHTACDB000001.1;Protein_Accession=GWHPACDB000001.1;;transl_table=1
-GWHACDB00000001.1 . exon 14601 14906 . + . ID=Hc.01G000010.t1.exon5;Parent=Hc.01G000010.t1;Parent_Accession=GWHTACDB000001.1;;transl_table=1
-GWHACDB00000001.1 . CDS 14601 14708 1 + 0 ID=Hc.01G000010.t1.CDS5;Parent=Hc.01G000010.t1;Parent_Accession=GWHTACDB000001.1;Protein_Accession=GWHPACDB000001.1;;transl_table=1
-GWHACDB00000001.1 . three_prime_UTR 14709 14906 . + . ID=Hc.01G000010.t1.3UTR1;Parent=Hc.01G000010.t1;Parent_Accession=GWHTACDB000001.1;;transl_table=1
=====================================
gffutils/test/parser_test.py
=====================================
@@ -1,10 +1,11 @@
import tempfile
-from nose.tools import assert_raises
from gffutils import parser, create, feature, iterators, constants, helpers, exceptions
from gffutils import example_filename, create_db
from . import attr_test_cases
from textwrap import dedent
+import pytest
+
TEST_FILENAMES = [
example_filename(i)
for i in [
@@ -42,15 +43,8 @@ def test_directives():
assert db.directives == ["directive1 example"], db.directives
-def test_split_attrs():
- # nosetests generator for all the test cases in attr_test_cases. (note no
- # docstring for this test function so that nosetests -v will print the test
- # cases)
- for (attr_str, attr_dict, acceptable_reconstruction) in attr_test_cases.attrs:
- yield attrs_OK, attr_str, attr_dict, acceptable_reconstruction
-
-
-def attrs_OK(attr_str, attr_dict, acceptable_reconstruction=None):
+@pytest.mark.parametrize("item", attr_test_cases.attrs)
+def test_attrs_OK(item):
"""
Given an attribute string and a dictionary of what you expect, test the
attribute splitting and reconstruction (invariant roundtrip).
@@ -59,7 +53,9 @@ def attrs_OK(attr_str, attr_dict, acceptable_reconstruction=None):
(see attr_test_cases.py for details); `acceptable_reconstruction` handles
those.
"""
+ attr_str, attr_dict, acceptable_reconstruction = item
result, dialect = parser._split_keyvals(attr_str)
+ result = dict(result)
assert result == attr_dict, result
reconstructed = parser._reconstruct(result, dialect, keep_order=True)
@@ -88,10 +84,10 @@ def test_empty_recontruct():
reconstructing attributes with incomplete information returns empty string
"""
assert parser._reconstruct(None, constants.dialect) == ""
- assert_raises(
- exceptions.AttributeStringError, parser._reconstruct, dict(ID="asdf"), None
- )
- assert_raises(exceptions.AttributeStringError, parser._reconstruct, None, None)
+ with pytest.raises(exceptions.AttributeStringError):
+ parser._reconstruct(dict(ID="asdf"), None)
+ with pytest.raises(exceptions.AttributeStringError):
+ parser._reconstruct(None, None)
def test_empty_split_keyvals():
=====================================
gffutils/test/performance_evaluation.py
=====================================
@@ -1,8 +1,8 @@
"""
-Performance testing. Run them with https://github.com/mahmoudimus/nose-timer:
+Performance testing. Run them with https://pypi.org/project/pytest-timer/:
```
-nosetests --nocapture -a slow --with-timer
+pytest --capture=no -m slow --with-timer
```
WARNING: These tests can take about 1.5 hours to run!
@@ -15,7 +15,7 @@ import random
import unittest
import os
-from nose.plugins import attrib
+import pytest
import gffutils
@@ -187,7 +187,7 @@ class PerformanceTestFeatureDB(object):
)
-@attrib.attr("slow")
+@pytest.mark.slow
class TestPerformanceOnSacCer(PerformanceTestFeatureDB, unittest.TestCase):
"""
Test frequent scenarios on medium size genome of yeast.
@@ -205,7 +205,7 @@ class TestPerformanceOnSacCer(PerformanceTestFeatureDB, unittest.TestCase):
)
-@attrib.attr("slow")
+@pytest.mark.slow
class TestPerformanceOnMouse(PerformanceTestFeatureDB, unittest.TestCase):
"""
Test frequent scenarios on large genome of mouse.
=====================================
gffutils/test/test.py → gffutils/test/test_1.py
=====================================
@@ -1,7 +1,7 @@
import warnings
from textwrap import dedent
from . import expected
-from gffutils import example_filename, create, parser, feature
+from gffutils import example_filename, create, feature
import gffutils
import gffutils.helpers as helpers
import gffutils.gffwriter as gffwriter
@@ -13,8 +13,6 @@ import six
import shutil
import threading
import tempfile
-from textwrap import dedent
-from nose.tools import assert_raises
from six.moves import SimpleHTTPServer
if sys.version_info.major == 3:
@@ -24,11 +22,10 @@ else:
import multiprocessing
import json
-import tempfile
-import shutil
-import glob
import difflib
+import pytest
+
testdbfn_gtf = ":memory:"
testdbfn_gff = ":memory:"
@@ -631,17 +628,16 @@ def test_feature_merge():
x = db["fake"]
y = db["fake_1"]
- assert_raises(
- ValueError,
- gffutils.create_db,
- gtfdata,
- ":memory:",
- from_string=True,
- merge_strategy="merge",
- id_spec="gene_id",
- force_merge_fields=["start"],
- keep_order=True,
- )
+ with pytest.raises(ValueError):
+ gffutils.create_db(
+ gtfdata,
+ ":memory:",
+ from_string=True,
+ merge_strategy="merge",
+ id_spec="gene_id",
+ force_merge_fields=["start"],
+ keep_order=True,
+ )
# test that warnings are raised because of strand and frame
with warnings.catch_warnings(record=True) as w:
@@ -750,7 +746,8 @@ def test_empty_files():
fn = tempfile.NamedTemporaryFile(delete=False).name
a = open(fn, "w")
a.close()
- assert_raises(gffutils.exceptions.EmptyInputError, gffutils.create_db, fn, fn + ".db")
+ with pytest.raises(gffutils.exceptions.EmptyInputError):
+ gffutils.create_db(fn, fn + ".db")
def test_false_function():
@@ -937,17 +934,17 @@ def test_iterator_update():
[(i.start, i.stop) for i in db.features_of_type("exon")]
)
+def clean_tempdir():
+ tempfile.tempdir = tempdir
+ if os.path.exists(tempdir):
+ shutil.rmtree(tempdir)
+ os.makedirs(tempdir)
-def test_tempfiles():
+# specify a writeable temp dir for testing
+tempdir = "/tmp/gffutils-test"
- # specifiy a writeable temp dir for testing
- tempdir = "/tmp/gffutils-test"
+def test_tempfiles():
- def clean_tempdir():
- tempfile.tempdir = tempdir
- if os.path.exists(tempdir):
- shutil.rmtree(tempdir)
- os.makedirs(tempdir)
clean_tempdir()
@@ -995,6 +992,10 @@ def test_tempfiles():
assert len(filelist) == 1, filelist
assert filelist[0].endswith(".GFFtmp")
+@pytest.mark.skip(reason="Unclear if still needed; currently failing")
+def test_parallel_db():
+ # DISABLING in v0.12
+
# Test n parallel instances of gffutils across PROCESSES processes.
#
# Note that travis-ci doesn't like it when you use multiple cores, so the
@@ -1013,6 +1014,7 @@ def test_tempfiles():
res = pool.map(make_db, range(n))
finally:
pool.close()
+
assert sorted(list(res)) == list(range(n))
filelist = os.listdir(tempdir)
assert len(filelist) == n, len(filelist)
@@ -1107,23 +1109,21 @@ def test_deprecation_handler():
return
# TODO: when infer_gene_extent actually gets deprecated, test here.
- assert_raises(
- ValueError,
- gffutils.create_db,
- gffutils.example_filename("FBgn0031208.gtf"),
- ":memory:",
- infer_gene_extent=False,
- )
+ with pytest.raises(ValueError):
+ gffutils.create_db(
+ gffutils.example_filename("FBgn0031208.gtf"),
+ ":memory:",
+ infer_gene_extent=False,
+ )
def test_nonsense_kwarg():
- assert_raises(
- TypeError,
- gffutils.create_db,
- gffutils.example_filename("FBgn0031208.gtf"),
- ":memory:",
- asdf=True,
- )
+ with pytest.raises(TypeError):
+ gffutils.create_db(
+ gffutils.example_filename("FBgn0031208.gtf"),
+ ":memory:",
+ asdf=True,
+ )
def test_infer_gene_extent():
@@ -1237,6 +1237,36 @@ def test_db_unquoting():
assert db["f"]["Note"] == [","]
+def test_create_splice_sites():
+ fn = gffutils.example_filename("gff_example1.gff3")
+ db = gffutils.create_db(fn, ":memory:")
+ db = db.update(db.create_splice_sites())
+ observed = "\n".join(str(feature) for feature in db.all_features())
+ expected = dedent("""\
+ chr1 ensGene gene 4763287 4775820 . - . Name=ENSMUSG00000033845;ID=ENSMUSG00000033845;Alias=ENSMUSG00000033845;gid=ENSMUSG00000033845
+ chr1 ensGene mRNA 4764517 4775779 . - . Name=ENSMUST00000045689;Parent=ENSMUSG00000033845;ID=ENSMUST00000045689;Alias=ENSMUSG00000033845;gid=ENSMUSG00000033845
+ chr1 ensGene CDS 4775654 4775758 . - 0 Name=ENSMUST00000045689.cds0;Parent=ENSMUST00000045689;ID=ENSMUST00000045689.cds0;gid=ENSMUSG00000033845
+ chr1 ensGene CDS 4772761 4772814 . - 0 Name=ENSMUST00000045689.cds1;Parent=ENSMUST00000045689;ID=ENSMUST00000045689.cds1;gid=ENSMUSG00000033845
+ chr1 ensGene exon 4775654 4775779 . - . Name=ENSMUST00000045689.exon0;Parent=ENSMUST00000045689;ID=ENSMUST00000045689.exon0;gid=ENSMUSG00000033845
+ chr1 ensGene exon 4772649 4772814 . - . Name=ENSMUST00000045689.exon1;Parent=ENSMUST00000045689;ID=ENSMUST00000045689.exon1;gid=ENSMUSG00000033845
+ chr1 ensGene exon 4767606 4767729 . - . Name=ENSMUST00000045689.exon2;Parent=ENSMUST00000045689;ID=ENSMUST00000045689.exon2;gid=ENSMUSG00000033845
+ chr1 ensGene exon 4764517 4764597 . - . Name=ENSMUST00000045689.exon3;Parent=ENSMUST00000045689;ID=ENSMUST00000045689.exon3;gid=ENSMUSG00000033845
+ chr1 ensGene five_prime_UTR 4775759 4775779 . - . Name=ENSMUST00000045689.utr0;Parent=ENSMUST00000045689;ID=ENSMUST00000045689.utr0;gid=ENSMUSG00000033845
+ chr1 ensGene three_prime_UTR 4772649 4772760 . - . Name=ENSMUST00000045689.utr1;Parent=ENSMUST00000045689;ID=ENSMUST00000045689.utr1;gid=ENSMUSG00000033845
+ chr1 ensGene three_prime_UTR 4767606 4767729 . - . Name=ENSMUST00000045689.utr2;Parent=ENSMUST00000045689;ID=ENSMUST00000045689.utr2;gid=ENSMUSG00000033845
+ chr1 ensGene three_prime_UTR 4764517 4764597 . - . Name=ENSMUST00000045689.utr3;Parent=ENSMUST00000045689;ID=ENSMUST00000045689.utr3;gid=ENSMUSG00000033845
+ chr1 gffutils_derived three_prime_cis_splice_site 4764598 4764599 . - . Name=ENSMUST00000045689.exon2,ENSMUST00000045689.exon3;Parent=ENSMUST00000045689;ID=three_prime_cis_splice_site_ENSMUST00000045689.exon2-ENSMUST00000045689.exon3;gid=ENSMUSG00000033845
+ chr1 gffutils_derived three_prime_cis_splice_site 4767730 4767731 . - . Name=ENSMUST00000045689.exon1,ENSMUST00000045689.exon2;Parent=ENSMUST00000045689;ID=three_prime_cis_splice_site_ENSMUST00000045689.exon1-ENSMUST00000045689.exon2;gid=ENSMUSG00000033845
+ chr1 gffutils_derived three_prime_cis_splice_site 4772815 4772816 . - . Name=ENSMUST00000045689.exon0,ENSMUST00000045689.exon1;Parent=ENSMUST00000045689;ID=three_prime_cis_splice_site_ENSMUST00000045689.exon0-ENSMUST00000045689.exon1;gid=ENSMUSG00000033845
+ chr1 gffutils_derived five_prime_cis_splice_site 4767604 4767605 . - . Name=ENSMUST00000045689.exon2,ENSMUST00000045689.exon3;Parent=ENSMUST00000045689;ID=five_prime_cis_splice_site_ENSMUST00000045689.exon2-ENSMUST00000045689.exon3;gid=ENSMUSG00000033845
+ chr1 gffutils_derived five_prime_cis_splice_site 4772647 4772648 . - . Name=ENSMUST00000045689.exon1,ENSMUST00000045689.exon2;Parent=ENSMUST00000045689;ID=five_prime_cis_splice_site_ENSMUST00000045689.exon1-ENSMUST00000045689.exon2;gid=ENSMUSG00000033845
+ chr1 gffutils_derived five_prime_cis_splice_site 4775652 4775653 . - . Name=ENSMUST00000045689.exon0,ENSMUST00000045689.exon1;Parent=ENSMUST00000045689;ID=five_prime_cis_splice_site_ENSMUST00000045689.exon0-ENSMUST00000045689.exon1;gid=ENSMUSG00000033845""")
+
+ assert observed == expected
+
+
+
+
if __name__ == "__main__":
# this test case fails
# test_attributes_modify()
=====================================
gffutils/test/test_issues.py
=====================================
@@ -10,8 +10,8 @@ from textwrap import dedent
import gffutils
from gffutils import feature
from gffutils import merge_criteria as mc
-from nose.tools import assert_raises
+import pytest
def test_issue_79():
gtf = gffutils.example_filename("keep-order-test.gtf")
@@ -91,8 +91,8 @@ def test_issue_107():
db.interfeatures(db.features_of_type("gene", order_by=("seqid", "start")))
)
assert [str(i) for i in interfeatures] == [
- "chr1\tgffutils_derived\tinter_gene_gene\t6\t9\t.\t.\t.\tID=a,b;",
- "chr2\tgffutils_derived\tinter_gene_gene\t51\t54\t.\t-\t.\tID=c,d;",
+ "chr1\tgffutils_derived\tinter_gene_gene\t6\t9\t.\t.\t.\tID=a-b;",
+ "chr2\tgffutils_derived\tinter_gene_gene\t51\t54\t.\t-\t.\tID=c-d;",
]
@@ -324,9 +324,12 @@ def test_issue_157():
# TypeError: merge() got an unexpected keyword argument 'ignore_strand'
#
# Now changing to ValueError and suggesting a fix.
- assert_raises(ValueError, db.children_bp, gene, child_featuretype='exon', merge=True, ignore_strand=True)
- assert_raises(ValueError, db.children_bp, gene, ignore_strand=True, nonexistent=True)
- assert_raises(TypeError, db.children_bp, gene, nonexistent=True)
+ with pytest.raises(ValueError):
+ db.children_bp(gene, child_featuretype='exon', merge=True, ignore_strand=True)
+ with pytest.raises(ValueError):
+ db.children_bp(gene, ignore_strand=True, nonexistent=True)
+ with pytest.raises(TypeError):
+ db.children_bp(gene, nonexistent=True)
# The way to do it now is the following (we can omit the mc.feature_type
# since we're preselecting for exons anyway):
@@ -385,22 +388,6 @@ def test_issue_174():
assert observed[8] == ['9', '10']
assert observed[9] == ['10', '11']
-
-def test_issue_181():
- db = gffutils.create_db(
- gffutils.example_filename('issue181.gff'),
- ':memory:')
- introns = db.create_introns()
-
- # This now warns that the provided ID key has multiple values.
- assert_raises(ValueError, db.update, introns)
-
- # The fix is to provide a custom intron ID converter.
- def intron_id(f):
- return ','.join(f['ID'])
-
- db.update(introns, id_spec={'intron': [intron_id]})
-
def test_issue_197():
# Previously this would fail with ValueError due to using the stop position
@@ -410,11 +397,18 @@ def test_issue_197():
genes = list(db.features_of_type('gene'))
igss = list( db.interfeatures(genes,new_featuretype='intergenic_space') )
+
+ # Prior to PR #219, multiple IDs could be created by interfeatures, which
+ # in turn was patched here by providing the transform to db.update. With
+ # #219, this ends up being a no-op because ID is a single value by the time
+ # it gets to the transform function.
+ #
+ # However, keeping the test as-is to ensure backward-compatibility.
def transform(f):
f['ID'] = [ '-'.join(f.attributes['ID']) ]
return f
- db = db.update(igss, transform=transform, merge_strategy='error')
+ db = db.update(igss, transform=transform, merge_strategy='error')
obs = list(db.features_of_type('intergenic_space'))
for i in obs:
@@ -427,3 +421,187 @@ def test_issue_197():
'tig00000492\tgffutils_derived\tintergenic_space\t50071\t50071\t.\t-\t.\tID=gene4-gene5',
'tig00000492\tgffutils_derived\tintergenic_space\t50076\t50089\t.\t-\t.\tID=gene5-gene6',
]
+
+def test_issue_198():
+ line = 'NC_000001.11 BestRefSeq gene 14362 29370 . - . gene_id "WASH7P"; transcript_id ""; db_xref "GeneID:653635"; db_xref "HGNC:HGNC:38034"; description "WASP family homolog 7, pseudogene"; gbkey "Gene"; gene "WASH7P"; gene_biotype "transcribed_pseudogene"; gene_synonym "FAM39F"; gene_synonym "WASH5P"; pseudo "true";'
+
+ # Original issue #198 is that this fails with:
+ #
+ # gffutils.exceptions.AttributeStringError: Internally inconsistent
+ # attributes formatting: some have repeated keys, some do not.
+ #
+ # This is because the dialect inference sees the two db_xref keys, and
+ # correctly assumes the dialect uses repeated keys rather than
+ # multiple, comma-separated values -- but there's a comma in the
+ # description.
+ #
+ # So we need to figure out the best way of interpreting a comma in cases
+ # like this. It seems like the best solution is to assume that the presence
+ # of repeated keys always wins.
+ f = feature.feature_from_line(line)
+
+ assert f.attributes['description'] == ['WASP family homolog 7, pseudogene']
+
+ # If we remove one of the db_xref keys, then the parser sees the comma and
+ # figures it's a multivalue key.
+ line = 'NC_000001.11 BestRefSeq gene 14362 29370 . - . gene_id "WASH7P"; transcript_id ""; db_xref "GeneID:653635"; description "WASP family homolog 7, pseudogene"; gbkey "Gene"; gene "WASH7P"; gene_biotype "transcribed_pseudogene"; gene_synonym "FAM39F"; gene_synonym "WASH5P"; pseudo "true";'
+ f = feature.feature_from_line(line)
+
+ # Previous result, note leading space --------------------------->| |
+ # assert f.attributes['description'] == ['WASP family homolog 7', ' pseudogene']
+ assert f.attributes['description'] == ['WASP family homolog 7, pseudogene']
+
+ # But removing that space before "pseudogene" means it's interpreted as
+ # a multivalue attribute
+ line = 'NC_000001.11 BestRefSeq gene 14362 29370 . - . gene_id "WASH7P"; transcript_id ""; db_xref "GeneID:653635"; description "WASP family homolog 7,pseudogene"; gbkey "Gene"; gene "WASH7P"; gene_biotype "transcribed_pseudogene"; gene_synonym "FAM39F"; gene_synonym "WASH5P"; pseudo "true";'
+ f = feature.feature_from_line(line)
+ assert f.attributes['description'] == ['WASP family homolog 7', 'pseudogene']
+
+ # Confirm behavior of corner cases like a trailing comma
+ line = "chr17 RefSeq CDS 6806527 6806553 . + 0 Name=CDS:NC_000083.5:LOC100040603;Parent=XM_001475631.1,"
+ f = feature.feature_from_line(line)
+ assert f.attributes['Parent'] == ['XM_001475631.1', '']
+
+
+def test_issue_207():
+
+ def _check(txt, expected_keys, dialect_trailing_semicolon):
+ db = gffutils.create_db(txt.replace(' ', '\t'), ':memory:', from_string=True)
+ assert [list(f.attributes.keys()) for f in db.all_features()] == expected_keys
+ assert db.dialect['trailing semicolon'] == dialect_trailing_semicolon
+
+ # All lines have trailing semicolon
+ _check(
+ txt=dedent("""\
+ chr1 AUGUSTUS gene 68330 73621 1 - . ID=g1903;
+ chr1 AUGUSTUS mRNA 68330 73621 1 - . ID=g1903.t1;Parent=g1903;
+ chr1 Pfam protein_match 73372 73618 1 - . ID=g1903.t1.d1;Parent=g1903.t1;
+ chr1 Pfam protein_hmm_match 73372 73618 1 - . ID=g1903.t1.d1.1;Parent=g1903.t1.d1;
+ """),
+ expected_keys = [
+ ['ID'],
+ ['ID', 'Parent'],
+ ['ID', 'Parent'],
+ ['ID', 'Parent'],
+ ],
+ dialect_trailing_semicolon=True
+ )
+
+ # First two lines have trailing semicolon. However, the heuristics of
+ # dialect selection, which favor attributes with more values (assuming more
+ # information), decides that this file does NOT have trailing semicolons.
+ _check(
+ txt=dedent("""\
+ chr1 AUGUSTUS gene 68330 73621 1 - . ID=g1903;
+ chr1 AUGUSTUS mRNA 68330 73621 1 - . ID=g1903.t1;Parent=g1903;
+ chr1 Pfam protein_match 73372 73618 1 - . ID=g1903.t1.d1;Parent=g1903.t1
+ chr1 Pfam protein_hmm_match 73372 73618 1 - . ID=g1903.t1.d1.1;Parent=g1903.t1.d1
+ """),
+ expected_keys = [
+ ['ID', ''],
+ ['ID', 'Parent', ''],
+ ['ID', 'Parent'],
+ ['ID', 'Parent'],
+ ],
+ dialect_trailing_semicolon=False,
+ )
+
+ # APPARENTLY INCONSISTENT: The only thing difference here is that the
+ # Parent attribute has been removed, otherwise matches above (first two
+ # have trailing semicolon). But now there are no empty keys.
+ #
+ # This is expected behavior, because there are no attributes with more keys
+ # as above to give higher weight, and to break the tie between with and
+ # without trailing semicolon, falls back to first dialect observed.
+ _check(
+ txt=dedent("""\
+ chr1 AUGUSTUS gene 68330 73621 1 - . ID=g1903;
+ chr1 AUGUSTUS mRNA 68330 73621 1 - . ID=g1903.t1;
+ chr1 Pfam protein_match 73372 73618 1 - . ID=g1903.t1.d1
+ chr1 Pfam protein_hmm_match 73372 73618 1 - . ID=g1903.t1.d1.1
+ """),
+ expected_keys=[
+ ['ID'],
+ ['ID'],
+ ['ID'],
+ ['ID']
+ ],
+ dialect_trailing_semicolon=True,
+ )
+
+ # We can convince the heuristics to think there should be NO trailing
+ # semicolon by giving one more line as evidence. Only difference is from
+ # above is the last line.
+ _check(
+ txt=dedent("""\
+ chr1 AUGUSTUS gene 68330 73621 1 - . ID=g1903;
+ chr1 AUGUSTUS mRNA 68330 73621 1 - . ID=g1903.t1;
+ chr1 Pfam protein_match 73372 73618 1 - . ID=g1903.t1.d1
+ chr1 Pfam protein_hmm_match 73372 73618 1 - . ID=g1903.t1.d1.1
+ chr1 Pfam protein_hmm_match 73372 73618 1 - . ID=g1904.t1.d1.1
+ """),
+ expected_keys=[
+ ['ID', ''],
+ ['ID', ''],
+ ['ID'],
+ ['ID'],
+ ['ID'],
+ ],
+ dialect_trailing_semicolon=False,
+ )
+
+
+ # Again seems inconsistent at first, but heuristics break ties by
+ # preferring first dialect, which here is no trailing semicolon.
+ _check(
+ txt=dedent("""\
+ chr1 AUGUSTUS gene 68330 73621 1 - . ID=g1903
+ chr1 AUGUSTUS mRNA 68330 73621 1 - . ID=g1903.t1
+ chr1 Pfam protein_match 73372 73618 1 - . ID=g1903.t1.d1;
+ chr1 Pfam protein_hmm_match 73372 73618 1 - . ID=g1903.t1.d1.1;
+ """),
+ expected_keys=[
+ ['ID'],
+ ['ID'],
+ ['ID', ''],
+ ['ID', '']
+ ],
+ dialect_trailing_semicolon=False,
+ )
+
+
+def test_issue_213():
+ # GFF header directives seem to be not parsed when building a db from
+ # a file, even though it seems to work fine from a string.
+ data = dedent(
+ """
+ ##gff-version 3
+ . . . . . . . .
+ . . . . . . . .
+ . . . . . . . .
+ . . . . . . . .
+ """
+ )
+
+ # Ensure directives are parsed from DataIterator
+ it = gffutils.iterators.DataIterator(data, from_string=True)
+ assert it.directives == ["gff-version 3"]
+
+
+ # Ensure they're parsed into the db from a string
+ db = gffutils.create_db(data, dbfn=":memory:", from_string=True, verbose=False)
+ assert db.directives == ["gff-version 3"], db.directives
+
+ # Ensure they're parsed into the db from a file
+ tmp = tempfile.NamedTemporaryFile(delete=False).name
+ with open(tmp, "w") as fout:
+ fout.write(data + "\n")
+ db = gffutils.create_db(tmp, ":memory:")
+ assert db.directives == ["gff-version 3"], db.directives
+ assert len(db.directives) == 1
+
+ # Ensure they're parsed into the db from a file, and going to a file (to
+ # exactly replicate example in #213)
+ db = gffutils.create_db(tmp, dbfn='issue_213.db', force=True)
+ assert db.directives == ["gff-version 3"], db.directives
+ assert len(db.directives) == 1
=====================================
gffutils/version.py
=====================================
@@ -1 +1 @@
-version = "0.11.1"
+version = "0.12"
=====================================
setup.py
=====================================
@@ -18,6 +18,7 @@ setup(
package_data = {'gffutils': ['test/data/*']},
description="Work with GFF and GTF files in a flexible "
"database framework",
+ long_description=open("README.rst").read(),
author_email='dalerr@niddk.nih.gov',
url='https://github.com/daler/gffutils',
classifiers=[
View it on GitLab: https://salsa.debian.org/med-team/python-gffutils/-/commit/e65119c5024f80c6128aef6292b2cfe49b0aebc1
--
View it on GitLab: https://salsa.debian.org/med-team/python-gffutils/-/commit/e65119c5024f80c6128aef6292b2cfe49b0aebc1
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20230719/61e35e37/attachment-0001.htm>
More information about the debian-med-commit
mailing list