[med-svn] [Git][med-team/python-gffutils][upstream] New upstream version 0.12

Étienne Mollier (@emollier) gitlab at salsa.debian.org
Thu Jul 20 00:04:37 BST 2023



Étienne Mollier pushed to branch upstream at Debian Med / python-gffutils


Commits:
e65119c5 by Étienne Mollier at 2023-07-20T00:52:49+02:00
New upstream version 0.12
- - - - -


14 changed files:

- PKG-INFO
- README.rst
- gffutils.egg-info/PKG-INFO
- gffutils.egg-info/SOURCES.txt
- gffutils/interface.py
- gffutils/parser.py
- + gffutils/test/conftest.py
- − gffutils/test/data/issue181.gff
- gffutils/test/parser_test.py
- gffutils/test/performance_evaluation.py
- gffutils/test/test.py → gffutils/test/test_1.py
- gffutils/test/test_issues.py
- gffutils/version.py
- setup.py


Changes:

=====================================
PKG-INFO
=====================================
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: gffutils
-Version: 0.11.1
+Version: 0.12
 Summary: Work with GFF and GTF files in a flexible database framework
 Home-page: https://github.com/daler/gffutils
 Author: Ryan Dale
@@ -19,3 +19,16 @@ Classifier: Programming Language :: Python :: 3.5
 Classifier: Programming Language :: Python :: 3.6
 Classifier: Topic :: Software Development :: Libraries :: Python Modules
 License-File: LICENSE
+
+gffutils
+========
+
+``gffutils`` is a Python package for working with and manipulating the GFF and
+GTF format files typically used for genomic annotations.
+
+Files are loaded into a sqlite3 database, allowing much more complex
+manipulation of hierarchical features (e.g., genes, transcripts, and exons)
+than is possible with plain-text methods alone.
+
+See documentation at https://daler.github.io/gffutils, and GitHub repo at
+https://github.com/daler/gffutils.


=====================================
README.rst
=====================================
@@ -1,19 +1,12 @@
-
-.. image:: https://travis-ci.org/daler/gffutils.png?branch=master
-    :target: https://travis-ci.org/daler/gffutils
-
-.. image:: https://badge.fury.io/py/gffutils.svg
-    :target: http://badge.fury.io/py/gffutils
-
-.. image:: https://pypip.in/d/gffutils/badge.png
-    :target: https://pypi.python.org/pypi/gffutils
-
-
+gffutils
+========
 
 ``gffutils`` is a Python package for working with and manipulating the GFF and
-GTF format files typically used for genomic annotations.  Files are loaded into
-a sqlite3 database, allowing much more complex manipulation of hierarchical
-features (e.g., genes, transcripts, and exons) than is possible with plain-text
-methods alone.
+GTF format files typically used for genomic annotations.
+
+Files are loaded into a sqlite3 database, allowing much more complex
+manipulation of hierarchical features (e.g., genes, transcripts, and exons)
+than is possible with plain-text methods alone.
 
-See documentation at **http://daler.github.io/gffutils**.
+See documentation at https://daler.github.io/gffutils, and GitHub repo at
+https://github.com/daler/gffutils.


=====================================
gffutils.egg-info/PKG-INFO
=====================================
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: gffutils
-Version: 0.11.1
+Version: 0.12
 Summary: Work with GFF and GTF files in a flexible database framework
 Home-page: https://github.com/daler/gffutils
 Author: Ryan Dale
@@ -19,3 +19,16 @@ Classifier: Programming Language :: Python :: 3.5
 Classifier: Programming Language :: Python :: 3.6
 Classifier: Topic :: Software Development :: Libraries :: Python Modules
 License-File: LICENSE
+
+gffutils
+========
+
+``gffutils`` is a Python package for working with and manipulating the GFF and
+GTF format files typically used for genomic annotations.
+
+Files are loaded into a sqlite3 database, allowing much more complex
+manipulation of hierarchical features (e.g., genes, transcripts, and exons)
+than is possible with plain-text methods alone.
+
+See documentation at https://daler.github.io/gffutils, and GitHub repo at
+https://github.com/daler/gffutils.


=====================================
gffutils.egg-info/SOURCES.txt
=====================================
@@ -31,13 +31,14 @@ gffutils/scripts/gffutils-cli
 gffutils/scripts/gffutils-flybase-convert.py
 gffutils/test/__init__.py
 gffutils/test/attr_test_cases.py
+gffutils/test/conftest.py
 gffutils/test/expected.py
 gffutils/test/feature_test.py
 gffutils/test/helpers_test.py
 gffutils/test/parser_test.py
 gffutils/test/performance_evaluation.py
 gffutils/test/synth_test_base.py
-gffutils/test/test.py
+gffutils/test/test_1.py
 gffutils/test/test_biopython_integration.py
 gffutils/test/test_issues.py
 gffutils/test/test_iterators.py
@@ -70,7 +71,6 @@ gffutils/test/data/hybrid1.gff3
 gffutils/test/data/intro_docs_example.gff
 gffutils/test/data/issue167.gff
 gffutils/test/data/issue174.gtf
-gffutils/test/data/issue181.gff
 gffutils/test/data/issue_197.gff
 gffutils/test/data/jgi_gff2.txt
 gffutils/test/data/keep-order-test.gtf


=====================================
gffutils/interface.py
=====================================
@@ -102,7 +102,7 @@ class FeatureDB(object):
         keep_order=False,
         pragmas=constants.default_pragmas,
         sort_attribute_values=False,
-        text_factory=sqlite3.OptimizedUnicode,
+        text_factory=str
     ):
         """
         Connect to a database created by :func:`gffutils.create_db`.
@@ -117,8 +117,7 @@ class FeatureDB(object):
         text_factory : callable
 
             Optionally set the way sqlite3 handles strings.  Default is
-            sqlite3.OptimizedUnicode, which returns ascii when possible,
-            unicode otherwise
+            str
 
         default_encoding : str
 
@@ -895,7 +894,14 @@ class FeatureDB(object):
             if d['start'] > d['end']:
                 return None
 
-            return self._feature_returner(**d)
+            new_feature = self._feature_returner(**d)
+
+            # Concatenate the list of IDs to create a unique ID, because features
+            # with multiple values for their ID are no longer permitted since v0.11.
+            if "ID" in new_feature.attributes and len(new_feature.attributes["ID"]) > 1:
+                new_id = '-'.join(new_feature.attributes["ID"])
+                new_feature.attributes["ID"] = [new_id]
+            return new_feature
 
         # If not provided, use a no-op function instead.
         if not attribute_func:
@@ -1267,6 +1273,128 @@ class FeatureDB(object):
             ):
                 yield intron
 
+    def create_splice_sites(
+        self,
+        exon_featuretype="exon",
+        grandparent_featuretype="gene",
+        parent_featuretype=None,
+        merge_attributes=True,
+        numeric_sort=False,
+    ):
+        """
+        Create splice sites from existing annotations.
+
+
+        Parameters
+        ----------
+        exon_featuretype : string
+            Feature type to use in order to infer splice sites.  Typically `"exon"`.
+
+        grandparent_featuretype : string
+            If `grandparent_featuretype` is not None, then group exons by
+            children of this featuretype.  If `grandparent_featuretype` is
+            "gene" (default), then splice sites will be created for all first-level
+            children of genes.  This may include mRNA, rRNA, ncRNA, etc.  If
+            you only want to infer splice sites from one of these featuretypes
+            (e.g., mRNA), then use the `parent_featuretype` kwarg which is
+            mutually exclusive with `grandparent_featuretype`.
+
+        parent_featuretype : string
+            If `parent_featuretype` is not None, then only use this featuretype
+            to infer splice sites.  Use this if you only want a subset of
+            featuretypes to have splice sites (e.g., "mRNA" only, and not ncRNA or
+            rRNA). Mutually exclusive with `grandparent_featuretype`.
+
+        merge_attributes : bool
+            Whether or not to merge attributes from all exons. If False then no
+            attributes will be created for the splice sites.
+
+        numeric_sort : bool
+            If True, then merged attributes that can be cast to float will be
+            sorted by their numeric values (but will still be returned as
+            string). This is useful, for example, when creating splice sites between
+            exons and the exons have exon_number attributes as an integer.
+            Using numeric_sort=True will ensure that the returned exons have
+            merged exon_number attribute of ['9', '10'] (numerically sorted)
+            rather than ['10', '9'] (alphabetically sorted).
+
+        Returns
+        -------
+        A generator object that yields :class:`Feature` objects representing
+        new splice sites
+
+        Notes
+        -----
+        The returned generator can be passed directly to the
+        :meth:`FeatureDB.update` method to permanently add them to the
+        database, e.g., ::
+
+            db.update(db.create_splice_sites())
+
+        """
+        if (grandparent_featuretype and parent_featuretype) or (
+            grandparent_featuretype is None and parent_featuretype is None
+        ):
+            raise ValueError(
+                "exactly one of `grandparent_featuretype` or "
+                "`parent_featuretype` should be provided"
+            )
+
+        if grandparent_featuretype:
+
+            def child_gen():
+                for gene in self.features_of_type(grandparent_featuretype):
+                    for child in self.children(gene, level=1):
+                        yield child
+
+        elif parent_featuretype:
+
+            def child_gen():
+                for child in self.features_of_type(parent_featuretype):
+                    yield child
+
+        # Two splice features need to be created for each interleave
+        for side in ["left", "right"]:
+            for child in child_gen():
+                exons = self.children(
+                    child, level=1, featuretype=exon_featuretype, order_by="start"
+                )
+
+                # get strand
+                strand = child.strand
+
+                new_featuretype = "splice_site"
+                if side == "left":
+                    if strand == "+":
+                        new_featuretype = "five_prime_cis_splice_site"
+                    elif strand == "-":
+                        new_featuretype = "three_prime_cis_splice_site"
+
+                if side == "right":
+                    if strand == "+":
+                        new_featuretype = "three_prime_cis_splice_site"
+                    elif strand == "-":
+                        new_featuretype = "five_prime_cis_splice_site"
+
+                for splice_site in self.interfeatures(
+                    exons,
+                    new_featuretype=new_featuretype,
+                    merge_attributes=merge_attributes,
+                    numeric_sort=numeric_sort,
+                    dialect=self.dialect,
+                ):
+
+                    if side == "left":
+                        splice_site.end = splice_site.start + 1
+                    if side == "right":
+                        splice_site.start = splice_site.end - 1
+
+                    # make the ID unique by adding a suffix
+                    splice_site.attributes["ID"] = [new_featuretype + "_" + splice_site.attributes["ID"][0]]
+
+                    yield splice_site
+
+
     def _old_merge(self, features, ignore_strand=False):
         """
         DEPRECATED, only retained here for backwards compatibility. Please use


=====================================
gffutils/parser.py
=====================================
@@ -341,16 +341,29 @@ def _split_keyvals(keyval_str, dialect=None):
             val = val[1:-1]
             dialect["quoted GFF2 values"] = True
         if val:
+
             # TODO: if there are extra commas for a value, just use empty
             # strings
             # quals[key].extend([v for v in val.split(',') if v])
-            vals = val.split(",")
-            if (len(vals) > 1) and dialect["repeated keys"]:
-                raise AttributeStringError(
-                    "Internally inconsistent attributes formatting: "
-                    "some have repeated keys, some do not."
-                )
-            quals[key].extend(vals)
+
+            # See issue #198, where commas within a description can incorrectly
+            # cause the dialect inference to conclude that there are not
+            # repeated keys.
+            #
+            # More description in PR #208.
+            if dialect["repeated keys"]:
+                quals[key].append(val)
+            else:
+                vals = val.split(",")
+
+                # If anything starts with a leading space, then we infer that
+                # it was part of a description or some other typographical
+                # interpretation, not a character to split multiple vals on --
+                # and append the original val rather than the split vals.
+                if any([i[0] == " " for i in vals if i]):
+                    quals[key].append(val)
+                else:
+                    quals[key].extend(vals)
 
         # keep track of the order of keys
         dialect["order"].append(key)


=====================================
gffutils/test/conftest.py
=====================================
@@ -0,0 +1 @@
+collect_ignore=["data"]


=====================================
gffutils/test/data/issue181.gff deleted
=====================================
@@ -1,14 +0,0 @@
-#OriSeqID=Chr1	Accession=GWHACDB00000001.1
-GWHACDB00000001.1	.	gene	9738	14906	1	+	.	ID=Hc.01G000010;Accession=GWHGACDB000001.1;Augustus_transcriptSupport_percentage=100;Augustus_intronSupport=4/4;Source=augustus000002;Integrity=complete;;transl_table=1
-GWHACDB00000001.1	.	mRNA	9738	14906	1	+	.	ID=Hc.01G000010.t1;Accession=GWHTACDB000001.1;Parent=Hc.01G000010;Parent_Accession=GWHGACDB000001.1;IntronSupport=4/4;Integrity=complete;;transl_table=1
-GWHACDB00000001.1	.	exon	9738	11686	.	+	.	ID=Hc.01G000010.t1.exon1;Parent=Hc.01G000010.t1;Parent_Accession=GWHTACDB000001.1;;transl_table=1
-GWHACDB00000001.1	.	CDS	9738	11686	1	+	0	ID=Hc.01G000010.t1.CDS1;Parent=Hc.01G000010.t1;Parent_Accession=GWHTACDB000001.1;Protein_Accession=GWHPACDB000001.1;;transl_table=1
-GWHACDB00000001.1	.	exon	11897	12671	.	+	.	ID=Hc.01G000010.t1.exon2;Parent=Hc.01G000010.t1;Parent_Accession=GWHTACDB000001.1;;transl_table=1
-GWHACDB00000001.1	.	CDS	11897	12671	1	+	1	ID=Hc.01G000010.t1.CDS2;Parent=Hc.01G000010.t1;Parent_Accession=GWHTACDB000001.1;Protein_Accession=GWHPACDB000001.1;;transl_table=1
-GWHACDB00000001.1	.	exon	13090	13368	.	+	.	ID=Hc.01G000010.t1.exon3;Parent=Hc.01G000010.t1;Parent_Accession=GWHTACDB000001.1;;transl_table=1
-GWHACDB00000001.1	.	CDS	13090	13368	1	+	0	ID=Hc.01G000010.t1.CDS3;Parent=Hc.01G000010.t1;Parent_Accession=GWHTACDB000001.1;Protein_Accession=GWHPACDB000001.1;;transl_table=1
-GWHACDB00000001.1	.	exon	13650	14396	.	+	.	ID=Hc.01G000010.t1.exon4;Parent=Hc.01G000010.t1;Parent_Accession=GWHTACDB000001.1;;transl_table=1
-GWHACDB00000001.1	.	CDS	13650	14396	1	+	0	ID=Hc.01G000010.t1.CDS4;Parent=Hc.01G000010.t1;Parent_Accession=GWHTACDB000001.1;Protein_Accession=GWHPACDB000001.1;;transl_table=1
-GWHACDB00000001.1	.	exon	14601	14906	.	+	.	ID=Hc.01G000010.t1.exon5;Parent=Hc.01G000010.t1;Parent_Accession=GWHTACDB000001.1;;transl_table=1
-GWHACDB00000001.1	.	CDS	14601	14708	1	+	0	ID=Hc.01G000010.t1.CDS5;Parent=Hc.01G000010.t1;Parent_Accession=GWHTACDB000001.1;Protein_Accession=GWHPACDB000001.1;;transl_table=1
-GWHACDB00000001.1	.	three_prime_UTR	14709	14906	.	+	.	ID=Hc.01G000010.t1.3UTR1;Parent=Hc.01G000010.t1;Parent_Accession=GWHTACDB000001.1;;transl_table=1


=====================================
gffutils/test/parser_test.py
=====================================
@@ -1,10 +1,11 @@
 import tempfile
-from nose.tools import assert_raises
 from gffutils import parser, create, feature, iterators, constants, helpers, exceptions
 from gffutils import example_filename, create_db
 from . import attr_test_cases
 from textwrap import dedent
 
+import pytest
+
 TEST_FILENAMES = [
     example_filename(i)
     for i in [
@@ -42,15 +43,8 @@ def test_directives():
     assert db.directives == ["directive1 example"], db.directives
 
 
-def test_split_attrs():
-    # nosetests generator for all the test cases in attr_test_cases.  (note no
-    # docstring for this test function so that nosetests -v will print the test
-    # cases)
-    for (attr_str, attr_dict, acceptable_reconstruction) in attr_test_cases.attrs:
-        yield attrs_OK, attr_str, attr_dict, acceptable_reconstruction
-
-
-def attrs_OK(attr_str, attr_dict, acceptable_reconstruction=None):
+@pytest.mark.parametrize("item", attr_test_cases.attrs)
+def test_attrs_OK(item):
     """
     Given an attribute string and a dictionary of what you expect, test the
     attribute splitting and reconstruction (invariant roundtrip).
@@ -59,7 +53,9 @@ def attrs_OK(attr_str, attr_dict, acceptable_reconstruction=None):
     (see attr_test_cases.py for details); `acceptable_reconstruction` handles
     those.
     """
+    attr_str, attr_dict, acceptable_reconstruction = item
     result, dialect = parser._split_keyvals(attr_str)
+    result = dict(result)
     assert result == attr_dict, result
 
     reconstructed = parser._reconstruct(result, dialect, keep_order=True)
@@ -88,10 +84,10 @@ def test_empty_recontruct():
     reconstructing attributes with incomplete information returns empty string
     """
     assert parser._reconstruct(None, constants.dialect) == ""
-    assert_raises(
-        exceptions.AttributeStringError, parser._reconstruct, dict(ID="asdf"), None
-    )
-    assert_raises(exceptions.AttributeStringError, parser._reconstruct, None, None)
+    with pytest.raises(exceptions.AttributeStringError):
+        parser._reconstruct(dict(ID="asdf"), None)
+    with pytest.raises(exceptions.AttributeStringError):
+        parser._reconstruct(None, None)
 
 
 def test_empty_split_keyvals():


=====================================
gffutils/test/performance_evaluation.py
=====================================
@@ -1,8 +1,8 @@
 """
-Performance testing. Run them with https://github.com/mahmoudimus/nose-timer:
+Performance testing. Run them with https://pypi.org/project/pytest-timer/:
 
 ```
-nosetests --nocapture -a slow --with-timer
+pytest --capture=no -m slow --with-timer
 ```
 
 WARNING: These tests can take about 1.5 hours to run!
@@ -15,7 +15,7 @@ import random
 import unittest
 import os
 
-from nose.plugins import attrib
+import pytest
 
 import gffutils
 
@@ -187,7 +187,7 @@ class PerformanceTestFeatureDB(object):
         )
 
 
-@attrib.attr("slow")
+@pytest.mark.slow
 class TestPerformanceOnSacCer(PerformanceTestFeatureDB, unittest.TestCase):
     """
     Test frequent scenarios on medium size genome of yeast.
@@ -205,7 +205,7 @@ class TestPerformanceOnSacCer(PerformanceTestFeatureDB, unittest.TestCase):
     )
 
 
-@attrib.attr("slow")
+@pytest.mark.slow
 class TestPerformanceOnMouse(PerformanceTestFeatureDB, unittest.TestCase):
     """
     Test frequent scenarios on large genome of mouse.


=====================================
gffutils/test/test.py → gffutils/test/test_1.py
=====================================
@@ -1,7 +1,7 @@
 import warnings
 from textwrap import dedent
 from . import expected
-from gffutils import example_filename, create, parser, feature
+from gffutils import example_filename, create, feature
 import gffutils
 import gffutils.helpers as helpers
 import gffutils.gffwriter as gffwriter
@@ -13,8 +13,6 @@ import six
 import shutil
 import threading
 import tempfile
-from textwrap import dedent
-from nose.tools import assert_raises
 from six.moves import SimpleHTTPServer
 
 if sys.version_info.major == 3:
@@ -24,11 +22,10 @@ else:
 
 import multiprocessing
 import json
-import tempfile
-import shutil
-import glob
 import difflib
 
+import pytest
+
 testdbfn_gtf = ":memory:"
 testdbfn_gff = ":memory:"
 
@@ -631,17 +628,16 @@ def test_feature_merge():
     x = db["fake"]
     y = db["fake_1"]
 
-    assert_raises(
-        ValueError,
-        gffutils.create_db,
-        gtfdata,
-        ":memory:",
-        from_string=True,
-        merge_strategy="merge",
-        id_spec="gene_id",
-        force_merge_fields=["start"],
-        keep_order=True,
-    )
+    with pytest.raises(ValueError):
+        gffutils.create_db(
+            gtfdata,
+            ":memory:",
+            from_string=True,
+            merge_strategy="merge",
+            id_spec="gene_id",
+            force_merge_fields=["start"],
+            keep_order=True,
+            )
 
     # test that warnings are raised because of strand and frame
     with warnings.catch_warnings(record=True) as w:
@@ -750,7 +746,8 @@ def test_empty_files():
     fn = tempfile.NamedTemporaryFile(delete=False).name
     a = open(fn, "w")
     a.close()
-    assert_raises(gffutils.exceptions.EmptyInputError, gffutils.create_db, fn, fn + ".db")
+    with pytest.raises(gffutils.exceptions.EmptyInputError):
+        gffutils.create_db(fn, fn + ".db")
 
 
 def test_false_function():
@@ -937,17 +934,17 @@ def test_iterator_update():
         [(i.start, i.stop) for i in db.features_of_type("exon")]
     )
 
+def clean_tempdir():
+    tempfile.tempdir = tempdir
+    if os.path.exists(tempdir):
+        shutil.rmtree(tempdir)
+    os.makedirs(tempdir)
 
-def test_tempfiles():
+# specify a writeable temp dir for testing
+tempdir = "/tmp/gffutils-test"
 
-    # specifiy a writeable temp dir for testing
-    tempdir = "/tmp/gffutils-test"
+def test_tempfiles():
 
-    def clean_tempdir():
-        tempfile.tempdir = tempdir
-        if os.path.exists(tempdir):
-            shutil.rmtree(tempdir)
-        os.makedirs(tempdir)
 
     clean_tempdir()
 
@@ -995,6 +992,10 @@ def test_tempfiles():
     assert len(filelist) == 1, filelist
     assert filelist[0].endswith(".GFFtmp")
 
+@pytest.mark.skip(reason="Unclear if still needed; currently failing")
+def test_parallel_db():
+    # DISABLING in v0.12
+
     # Test n parallel instances of gffutils across PROCESSES processes.
     #
     # Note that travis-ci doesn't like it when you use multiple cores, so the
@@ -1013,6 +1014,7 @@ def test_tempfiles():
         res = pool.map(make_db, range(n))
     finally:
         pool.close()
+
     assert sorted(list(res)) == list(range(n))
     filelist = os.listdir(tempdir)
     assert len(filelist) == n, len(filelist)
@@ -1107,23 +1109,21 @@ def test_deprecation_handler():
     return
 
     # TODO: when infer_gene_extent actually gets deprecated, test here.
-    assert_raises(
-        ValueError,
-        gffutils.create_db,
-        gffutils.example_filename("FBgn0031208.gtf"),
-        ":memory:",
-        infer_gene_extent=False,
-    )
+    with pytest.raises(ValueError):
+        gffutils.create_db(
+            gffutils.example_filename("FBgn0031208.gtf"),
+            ":memory:",
+            infer_gene_extent=False,
+            )
 
 
 def test_nonsense_kwarg():
-    assert_raises(
-        TypeError,
-        gffutils.create_db,
-        gffutils.example_filename("FBgn0031208.gtf"),
-        ":memory:",
-        asdf=True,
-    )
+    with pytest.raises(TypeError):
+        gffutils.create_db(
+            gffutils.example_filename("FBgn0031208.gtf"),
+            ":memory:",
+            asdf=True,
+            )
 
 
 def test_infer_gene_extent():
@@ -1237,6 +1237,36 @@ def test_db_unquoting():
     assert db["f"]["Note"] == [","]
 
 
+def test_create_splice_sites():
+    fn = gffutils.example_filename("gff_example1.gff3")
+    db = gffutils.create_db(fn, ":memory:")
+    db = db.update(db.create_splice_sites())
+    observed = "\n".join(str(feature) for feature in db.all_features())
+    expected = dedent("""\
+    chr1	ensGene	gene	4763287	4775820	.	-	.	Name=ENSMUSG00000033845;ID=ENSMUSG00000033845;Alias=ENSMUSG00000033845;gid=ENSMUSG00000033845
+    chr1	ensGene	mRNA	4764517	4775779	.	-	.	Name=ENSMUST00000045689;Parent=ENSMUSG00000033845;ID=ENSMUST00000045689;Alias=ENSMUSG00000033845;gid=ENSMUSG00000033845
+    chr1	ensGene	CDS	4775654	4775758	.	-	0	Name=ENSMUST00000045689.cds0;Parent=ENSMUST00000045689;ID=ENSMUST00000045689.cds0;gid=ENSMUSG00000033845
+    chr1	ensGene	CDS	4772761	4772814	.	-	0	Name=ENSMUST00000045689.cds1;Parent=ENSMUST00000045689;ID=ENSMUST00000045689.cds1;gid=ENSMUSG00000033845
+    chr1	ensGene	exon	4775654	4775779	.	-	.	Name=ENSMUST00000045689.exon0;Parent=ENSMUST00000045689;ID=ENSMUST00000045689.exon0;gid=ENSMUSG00000033845
+    chr1	ensGene	exon	4772649	4772814	.	-	.	Name=ENSMUST00000045689.exon1;Parent=ENSMUST00000045689;ID=ENSMUST00000045689.exon1;gid=ENSMUSG00000033845
+    chr1	ensGene	exon	4767606	4767729	.	-	.	Name=ENSMUST00000045689.exon2;Parent=ENSMUST00000045689;ID=ENSMUST00000045689.exon2;gid=ENSMUSG00000033845
+    chr1	ensGene	exon	4764517	4764597	.	-	.	Name=ENSMUST00000045689.exon3;Parent=ENSMUST00000045689;ID=ENSMUST00000045689.exon3;gid=ENSMUSG00000033845
+    chr1	ensGene	five_prime_UTR	4775759	4775779	.	-	.	Name=ENSMUST00000045689.utr0;Parent=ENSMUST00000045689;ID=ENSMUST00000045689.utr0;gid=ENSMUSG00000033845
+    chr1	ensGene	three_prime_UTR	4772649	4772760	.	-	.	Name=ENSMUST00000045689.utr1;Parent=ENSMUST00000045689;ID=ENSMUST00000045689.utr1;gid=ENSMUSG00000033845
+    chr1	ensGene	three_prime_UTR	4767606	4767729	.	-	.	Name=ENSMUST00000045689.utr2;Parent=ENSMUST00000045689;ID=ENSMUST00000045689.utr2;gid=ENSMUSG00000033845
+    chr1	ensGene	three_prime_UTR	4764517	4764597	.	-	.	Name=ENSMUST00000045689.utr3;Parent=ENSMUST00000045689;ID=ENSMUST00000045689.utr3;gid=ENSMUSG00000033845
+    chr1	gffutils_derived	three_prime_cis_splice_site	4764598	4764599	.	-	.	Name=ENSMUST00000045689.exon2,ENSMUST00000045689.exon3;Parent=ENSMUST00000045689;ID=three_prime_cis_splice_site_ENSMUST00000045689.exon2-ENSMUST00000045689.exon3;gid=ENSMUSG00000033845
+    chr1	gffutils_derived	three_prime_cis_splice_site	4767730	4767731	.	-	.	Name=ENSMUST00000045689.exon1,ENSMUST00000045689.exon2;Parent=ENSMUST00000045689;ID=three_prime_cis_splice_site_ENSMUST00000045689.exon1-ENSMUST00000045689.exon2;gid=ENSMUSG00000033845
+    chr1	gffutils_derived	three_prime_cis_splice_site	4772815	4772816	.	-	.	Name=ENSMUST00000045689.exon0,ENSMUST00000045689.exon1;Parent=ENSMUST00000045689;ID=three_prime_cis_splice_site_ENSMUST00000045689.exon0-ENSMUST00000045689.exon1;gid=ENSMUSG00000033845
+    chr1	gffutils_derived	five_prime_cis_splice_site	4767604	4767605	.	-	.	Name=ENSMUST00000045689.exon2,ENSMUST00000045689.exon3;Parent=ENSMUST00000045689;ID=five_prime_cis_splice_site_ENSMUST00000045689.exon2-ENSMUST00000045689.exon3;gid=ENSMUSG00000033845
+    chr1	gffutils_derived	five_prime_cis_splice_site	4772647	4772648	.	-	.	Name=ENSMUST00000045689.exon1,ENSMUST00000045689.exon2;Parent=ENSMUST00000045689;ID=five_prime_cis_splice_site_ENSMUST00000045689.exon1-ENSMUST00000045689.exon2;gid=ENSMUSG00000033845
+    chr1	gffutils_derived	five_prime_cis_splice_site	4775652	4775653	.	-	.	Name=ENSMUST00000045689.exon0,ENSMUST00000045689.exon1;Parent=ENSMUST00000045689;ID=five_prime_cis_splice_site_ENSMUST00000045689.exon0-ENSMUST00000045689.exon1;gid=ENSMUSG00000033845""")
+
+    assert observed == expected
+
+
+
+
 if __name__ == "__main__":
     # this test case fails
     # test_attributes_modify()


=====================================
gffutils/test/test_issues.py
=====================================
@@ -10,8 +10,8 @@ from textwrap import dedent
 import gffutils
 from gffutils import feature
 from gffutils import merge_criteria as mc
-from nose.tools import assert_raises
 
+import pytest
 
 def test_issue_79():
     gtf = gffutils.example_filename("keep-order-test.gtf")
@@ -91,8 +91,8 @@ def test_issue_107():
         db.interfeatures(db.features_of_type("gene", order_by=("seqid", "start")))
     )
     assert [str(i) for i in interfeatures] == [
-        "chr1\tgffutils_derived\tinter_gene_gene\t6\t9\t.\t.\t.\tID=a,b;",
-        "chr2\tgffutils_derived\tinter_gene_gene\t51\t54\t.\t-\t.\tID=c,d;",
+        "chr1\tgffutils_derived\tinter_gene_gene\t6\t9\t.\t.\t.\tID=a-b;",
+        "chr2\tgffutils_derived\tinter_gene_gene\t51\t54\t.\t-\t.\tID=c-d;",
     ]
 
 
@@ -324,9 +324,12 @@ def test_issue_157():
     #   TypeError: merge() got an unexpected keyword argument 'ignore_strand'
     #
     # Now changing to ValueError and suggesting a fix. 
-    assert_raises(ValueError, db.children_bp, gene, child_featuretype='exon', merge=True, ignore_strand=True)
-    assert_raises(ValueError, db.children_bp, gene, ignore_strand=True, nonexistent=True)
-    assert_raises(TypeError, db.children_bp, gene, nonexistent=True)
+    with pytest.raises(ValueError):
+        db.children_bp(gene, child_featuretype='exon', merge=True, ignore_strand=True)
+    with pytest.raises(ValueError):
+        db.children_bp(gene, ignore_strand=True, nonexistent=True)
+    with pytest.raises(TypeError):
+        db.children_bp(gene, nonexistent=True)
 
     # The way to do it now is the following (we can omit the mc.feature_type
     # since we're preselecting for exons anyway):
@@ -385,22 +388,6 @@ def test_issue_174():
     assert observed[8] == ['9', '10'] 
     assert observed[9] == ['10', '11']
 
-
-def test_issue_181():
-    db = gffutils.create_db(
-        gffutils.example_filename('issue181.gff'),
-        ':memory:')
-    introns = db.create_introns()
-
-    # This now warns that the provided ID key has multiple values.
-    assert_raises(ValueError, db.update, introns)
-
-    # The fix is to provide a custom intron ID converter.
-    def intron_id(f):
-        return ','.join(f['ID'])
-
-    db.update(introns, id_spec={'intron': [intron_id]})
-
 def test_issue_197():
 
     # Previously this would fail with ValueError due to using the stop position
@@ -410,11 +397,18 @@ def test_issue_197():
     genes = list(db.features_of_type('gene'))
     igss = list( db.interfeatures(genes,new_featuretype='intergenic_space') )
 
+
+    # Prior to PR #219, multiple IDs could be created by interfeatures, which
+    # in turn was patched here by providing the transform to db.update. With
+    # #219, this ends up being a no-op because ID is a single value by the time
+    # it gets to the transform function.
+    #
+    # However, keeping the test as-is to ensure backward-compatibility.
     def transform(f):
         f['ID'] = [ '-'.join(f.attributes['ID']) ]
         return f
 
-    db = db.update(igss, transform=transform, merge_strategy='error')
+    db = db.update(igss, transform=transform,  merge_strategy='error')
 
     obs = list(db.features_of_type('intergenic_space'))
     for i in obs:
@@ -427,3 +421,187 @@ def test_issue_197():
         'tig00000492\tgffutils_derived\tintergenic_space\t50071\t50071\t.\t-\t.\tID=gene4-gene5',
         'tig00000492\tgffutils_derived\tintergenic_space\t50076\t50089\t.\t-\t.\tID=gene5-gene6',
     ]
+
+def test_issue_198():
+    line = 'NC_000001.11	BestRefSeq	gene	14362	29370	.	-	.	gene_id "WASH7P"; transcript_id ""; db_xref "GeneID:653635"; db_xref "HGNC:HGNC:38034"; description "WASP family homolog 7, pseudogene"; gbkey "Gene"; gene "WASH7P"; gene_biotype "transcribed_pseudogene"; gene_synonym "FAM39F"; gene_synonym "WASH5P"; pseudo "true";'
+
+    # Original issue #198 is that this fails with:
+    #
+    #   gffutils.exceptions.AttributeStringError: Internally inconsistent
+    #   attributes formatting: some have repeated keys, some do not.
+    #
+    # This is because the dialect inference sees the two db_xref keys, and
+    # correctly assumes the dialect uses repeated keys rather than
+    # multiple, comma-separated values -- but there's a comma in the
+    # description.
+    #
+    # So we need to figure out the best way of interpreting a comma in cases
+    # like this. It seems like the best solution is to assume that the presence
+    # of repeated keys always wins.
+    f = feature.feature_from_line(line)
+
+    assert f.attributes['description'] == ['WASP family homolog 7, pseudogene']
+
+    # If we remove one of the db_xref keys, then the parser sees the comma and
+    # figures it's a multivalue key.
+    line = 'NC_000001.11	BestRefSeq	gene	14362	29370	.	-	.	gene_id "WASH7P"; transcript_id ""; db_xref "GeneID:653635"; description "WASP family homolog 7, pseudogene"; gbkey "Gene"; gene "WASH7P"; gene_biotype "transcribed_pseudogene"; gene_synonym "FAM39F"; gene_synonym "WASH5P"; pseudo "true";'
+    f = feature.feature_from_line(line)
+
+    # Previous result, note leading space --------------------------->| |
+    # assert f.attributes['description'] == ['WASP family homolog 7', ' pseudogene']
+    assert f.attributes['description'] == ['WASP family homolog 7, pseudogene']
+
+    # But removing that space before "pseudogene" means it's interpreted as
+    # a multivalue attribute
+    line = 'NC_000001.11	BestRefSeq	gene	14362	29370	.	-	.	gene_id "WASH7P"; transcript_id ""; db_xref "GeneID:653635"; description "WASP family homolog 7,pseudogene"; gbkey "Gene"; gene "WASH7P"; gene_biotype "transcribed_pseudogene"; gene_synonym "FAM39F"; gene_synonym "WASH5P"; pseudo "true";'
+    f = feature.feature_from_line(line)
+    assert f.attributes['description'] == ['WASP family homolog 7', 'pseudogene']
+
+    # Confirm behavior of corner cases like a trailing comma
+    line = "chr17	RefSeq	CDS	6806527	6806553	.	+	0	Name=CDS:NC_000083.5:LOC100040603;Parent=XM_001475631.1,"
+    f = feature.feature_from_line(line)
+    assert f.attributes['Parent'] == ['XM_001475631.1', '']
+
+
+def test_issue_207():
+
+    def _check(txt, expected_keys, dialect_trailing_semicolon):
+        db = gffutils.create_db(txt.replace(' ', '\t'), ':memory:', from_string=True)
+        assert [list(f.attributes.keys()) for f in db.all_features()] == expected_keys
+        assert db.dialect['trailing semicolon'] == dialect_trailing_semicolon
+
+    # All lines have trailing semicolon
+    _check(
+        txt=dedent("""\
+        chr1 AUGUSTUS gene 68330 73621 1 - . ID=g1903;
+        chr1 AUGUSTUS mRNA 68330 73621 1 - . ID=g1903.t1;Parent=g1903;
+        chr1 Pfam protein_match 73372 73618 1 - . ID=g1903.t1.d1;Parent=g1903.t1;
+        chr1 Pfam protein_hmm_match 73372 73618 1 - . ID=g1903.t1.d1.1;Parent=g1903.t1.d1;
+        """),
+        expected_keys = [
+            ['ID'],
+            ['ID', 'Parent'],
+            ['ID', 'Parent'],
+            ['ID', 'Parent'],
+        ],
+        dialect_trailing_semicolon=True
+    )
+
+    # First two lines have a trailing semicolon. However, the heuristics of
+    # dialect selection, which favor attributes with more values (assuming more
+    # information), decide that this file does NOT have trailing semicolons.
+    _check(
+        txt=dedent("""\
+        chr1 AUGUSTUS gene 68330 73621 1 - . ID=g1903;
+        chr1 AUGUSTUS mRNA 68330 73621 1 - . ID=g1903.t1;Parent=g1903;
+        chr1 Pfam protein_match 73372 73618 1 - . ID=g1903.t1.d1;Parent=g1903.t1
+        chr1 Pfam protein_hmm_match 73372 73618 1 - . ID=g1903.t1.d1.1;Parent=g1903.t1.d1
+        """),
+        expected_keys = [
+            ['ID', ''],
+            ['ID', 'Parent', ''],
+            ['ID', 'Parent'],
+            ['ID', 'Parent'],
+        ],
+        dialect_trailing_semicolon=False,
+    )
+
+    # APPARENTLY INCONSISTENT: The only difference here is that the
+    # Parent attribute has been removed; otherwise this matches the case above
+    # (first two lines have a trailing semicolon). But now there are no empty keys.
+    #
+    # This is expected behavior: there are no attributes with more keys
+    # (as above) to give higher weight, so to break the tie between with and
+    # without trailing semicolon, the parser falls back to the first dialect observed.
+    _check(
+        txt=dedent("""\
+        chr1 AUGUSTUS gene 68330 73621 1 - . ID=g1903;
+        chr1 AUGUSTUS mRNA 68330 73621 1 - . ID=g1903.t1;
+        chr1 Pfam protein_match 73372 73618 1 - . ID=g1903.t1.d1
+        chr1 Pfam protein_hmm_match 73372 73618 1 - . ID=g1903.t1.d1.1
+        """),
+        expected_keys=[
+            ['ID'],
+            ['ID'],
+            ['ID'],
+            ['ID']
+        ],
+        dialect_trailing_semicolon=True,
+    )
+
+    # We can convince the heuristics to think there should be NO trailing
+    # semicolon by giving one more line as evidence. The only difference
+    # from above is the last line.
+    _check(
+        txt=dedent("""\
+        chr1 AUGUSTUS gene 68330 73621 1 - . ID=g1903;
+        chr1 AUGUSTUS mRNA 68330 73621 1 - . ID=g1903.t1;
+        chr1 Pfam protein_match 73372 73618 1 - . ID=g1903.t1.d1
+        chr1 Pfam protein_hmm_match 73372 73618 1 - . ID=g1903.t1.d1.1
+        chr1 Pfam protein_hmm_match 73372 73618 1 - . ID=g1904.t1.d1.1
+        """),
+        expected_keys=[
+            ['ID', ''],
+            ['ID', ''],
+            ['ID'],
+            ['ID'],
+            ['ID'],
+        ],
+        dialect_trailing_semicolon=False,
+    )
+
+
+    # Again this seems inconsistent at first, but the heuristics break ties by
+    # preferring the first dialect observed, which here has no trailing semicolon.
+    _check(
+        txt=dedent("""\
+        chr1 AUGUSTUS gene 68330 73621 1 - . ID=g1903
+        chr1 AUGUSTUS mRNA 68330 73621 1 - . ID=g1903.t1
+        chr1 Pfam protein_match 73372 73618 1 - . ID=g1903.t1.d1;
+        chr1 Pfam protein_hmm_match 73372 73618 1 - . ID=g1903.t1.d1.1;
+        """),
+        expected_keys=[
+            ['ID'],
+            ['ID'],
+            ['ID', ''],
+            ['ID', '']
+        ],
+        dialect_trailing_semicolon=False,
+    )
+
+
+def test_issue_213():
+    # GFF header directives seem to be not parsed when building a db from
+    # a file, even though it seems to work fine from a string.
+    data = dedent(
+        """
+    ##gff-version 3
+    .	.	.	.	.	.	.	.
+    .	.	.	.	.	.	.	.
+    .	.	.	.	.	.	.	.
+    .	.	.	.	.	.	.	.
+    """
+    )
+
+    # Ensure directives are parsed from DataIterator
+    it = gffutils.iterators.DataIterator(data, from_string=True)
+    assert it.directives == ["gff-version 3"]
+
+
+    # Ensure they're parsed into the db from a string
+    db = gffutils.create_db(data, dbfn=":memory:", from_string=True, verbose=False)
+    assert db.directives == ["gff-version 3"], db.directives
+
+    # Ensure they're parsed into the db from a file
+    tmp = tempfile.NamedTemporaryFile(delete=False).name
+    with open(tmp, "w") as fout:
+        fout.write(data + "\n")
+    db = gffutils.create_db(tmp, ":memory:")
+    assert db.directives == ["gff-version 3"], db.directives
+    assert len(db.directives) == 1
+
+    # Ensure they're parsed into the db from a file, and going to a file (to
+    # exactly replicate example in #213)
+    db = gffutils.create_db(tmp, dbfn='issue_213.db', force=True)
+    assert db.directives == ["gff-version 3"], db.directives
+    assert len(db.directives) == 1


=====================================
gffutils/version.py
=====================================
@@ -1 +1 @@
-version = "0.11.1"
+version = "0.12"


=====================================
setup.py
=====================================
@@ -18,6 +18,7 @@ setup(
     package_data = {'gffutils': ['test/data/*']},
     description="Work with GFF and GTF files in a flexible "
     "database framework",
+    long_description=open("README.rst").read(),
     author_email='dalerr at niddk.nih.gov',
     url='https://github.com/daler/gffutils',
     classifiers=[



View it on GitLab: https://salsa.debian.org/med-team/python-gffutils/-/commit/e65119c5024f80c6128aef6292b2cfe49b0aebc1

-- 
View it on GitLab: https://salsa.debian.org/med-team/python-gffutils/-/commit/e65119c5024f80c6128aef6292b2cfe49b0aebc1
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20230719/61e35e37/attachment-0001.htm>


More information about the debian-med-commit mailing list