[med-svn] [Git][med-team/python-gffutils][upstream] New upstream version 0.11.1

Thu Oct 13 17:46:55 BST 2022


Nilesh Patra pushed to branch upstream at Debian Med / python-gffutils


Commits:
98709811 by Nilesh Patra at 2022-10-13T22:09:53+05:30
New upstream version 0.11.1
- - - - -


10 changed files:

- PKG-INFO
- gffutils.egg-info/PKG-INFO
- gffutils.egg-info/SOURCES.txt
- gffutils/create.py
- gffutils/interface.py
- + gffutils/test/data/a.py
- + gffutils/test/data/dm6-chr2L.fa.fai
- + gffutils/test/data/issue_197.gff
- gffutils/test/test_issues.py
- gffutils/version.py


Changes:

=====================================
PKG-INFO
=====================================
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: gffutils
-Version: 0.11.0
+Version: 0.11.1
 Summary: Work with GFF and GTF files in a flexible database framework
 Home-page: https://github.com/daler/gffutils
 Author: Ryan Dale


=====================================
gffutils.egg-info/PKG-INFO
=====================================
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: gffutils
-Version: 0.11.0
+Version: 0.11.1
 Summary: Work with GFF and GTF files in a flexible database framework
 Home-page: https://github.com/daler/gffutils
 Author: Ryan Dale


=====================================
gffutils.egg-info/SOURCES.txt
=====================================
@@ -49,10 +49,12 @@ gffutils/test/data/FBgn0031208.gtf
 gffutils/test/data/Saccharomyces_cerevisiae.R64-1-1.83.5000_gene_ids.txt
 gffutils/test/data/Saccharomyces_cerevisiae.R64-1-1.83.5000_transcript_ids.txt
 gffutils/test/data/Saccharomyces_cerevisiae.R64-1-1.83.chromsizes.txt
+gffutils/test/data/a.py
 gffutils/test/data/c_elegans_WS199_ann_gff.txt
 gffutils/test/data/c_elegans_WS199_dna_shortened.fa
 gffutils/test/data/c_elegans_WS199_shortened_gff.txt
 gffutils/test/data/dm6-chr2L.fa
+gffutils/test/data/dm6-chr2L.fa.fai
 gffutils/test/data/dmel-all-no-analysis-r5.49_50k_lines.gff
 gffutils/test/data/download-large-annotation-files.sh
 gffutils/test/data/ensembl_gtf.txt
@@ -69,6 +71,7 @@ gffutils/test/data/intro_docs_example.gff
 gffutils/test/data/issue167.gff
 gffutils/test/data/issue174.gtf
 gffutils/test/data/issue181.gff
+gffutils/test/data/issue_197.gff
 gffutils/test/data/jgi_gff2.txt
 gffutils/test/data/keep-order-test.gtf
 gffutils/test/data/keyval_sep_in_attrs.gff


=====================================
gffutils/create.py
=====================================
@@ -67,7 +67,7 @@ class _DBCreator(object):
         disable_infer_transcripts=False,
         infer_gene_extent=True,
         force_merge_fields=None,
-        text_factory=sqlite3.OptimizedUnicode,
+        text_factory=str,
         pragmas=constants.default_pragmas,
         _keep_tempfiles=False,
         directives=None,
@@ -1111,7 +1111,7 @@ def create_db(
     force_dialect_check=False,
     from_string=False,
     keep_order=False,
-    text_factory=sqlite3.OptimizedUnicode,
+    text_factory=str,
     force_merge_fields=None,
     pragmas=constants.default_pragmas,
     sort_attribute_values=False,
@@ -1321,11 +1321,7 @@ def create_db(
         available, since these fields need to be integers.
 
     text_factory : callable
-        Text factory to use for the sqlite3 database.  See
-        https://docs.python.org/2/library/\
-                sqlite3.html#sqlite3.Connection.text_factory
-        for details. The default sqlite3.OptimizedUnicode will return Unicode
-        objects only for non-ASCII data, and bytestrings otherwise.
+        Text factory to use for the sqlite3 database.
 
     pragmas : dict
         Dictionary of pragmas used when creating the sqlite3 database. See


=====================================
gffutils/interface.py
=====================================
@@ -819,7 +819,8 @@ class FeatureDB(object):
         Providing N features will return N - 1 new features.
 
         This method purposefully does *not* do any merging or sorting of
-        coordinates, so you may want to use :meth:`FeatureDB.merge` first, or
+        coordinates. So nested or overlapping features may not behave as you
+        might expect. You may want to use :meth:`FeatureDB.merge` first, and
         when selecting features use the `order_by` kwarg, e.g.,
         `db.features_of_type('gene', order_by=('seqid', 'start'))`.
 
@@ -866,43 +867,89 @@ class FeatureDB(object):
         -------
         A generator that yields :class:`Feature` objects
         """
+
+        def _init_interfeature(f):
+            """
+            Used to initialize a new interfeature that is ready to be updated
+            in-place.
+            """
+            keys = ['id', 'seqid', 'source', 'featuretype', 'start', 'end',
+                    'score', 'strand', 'frame', 'attributes', 'bin']
+            d = dict(zip(keys, f.astuple()))
+            d['source'] = 'gffutils_derived'
+            return d
+
+        def _prep_for_yield(d):
+            """
+            Finalize the interfeature by adjusting coords, recalculating the
+            bin, and creating a feature using self._feature_returner.
+
+            If start is greater than stop (which happens when trying to get
+            interfeatures for overlapping features), then return None.
+            """
+            d['start'] += 1
+            d['end'] -= 1
+            new_bin = bins.bins(d['start'], d['end'], one=True)
+            d['bin'] = new_bin
+
+            if d['start'] > d['end']:
+                return None
+
+            return self._feature_returner(**d)
+
+        # If not provided, use a no-op function instead.
+        if not attribute_func:
+            def attribute_func(a):
+                return a
+
         for i, f in enumerate(features):
-            # no inter-feature for the first one
+            # First feature: initialize an interfeature and continue to the next.
             if i == 0:
-                interfeature_start = f.stop
+                interfeature = _init_interfeature(f)
+                last_feature = f
+                nfeatures = 1
+                continue
+
+            # Yield the last interfeature (if we saw at least 2 features) and
+            # start a new interfeature on this chrom.
+            if f.chrom != last_feature.chrom:
+                if nfeatures > 1:
+                    new_feature = _prep_for_yield(interfeature)
+                    if new_feature:
+                        yield new_feature
+                interfeature = _init_interfeature(f)
                 last_feature = f
+                nfeatures = 1
                 continue
 
-            interfeature_stop = f.start
+            # Otherwise, we've already seen a feature on this chrom so
+            # this is the second.
+            nfeatures += 1
+
+            # Adjust the interfeature dict in-place with coords...
+            interfeature['start'] = last_feature.stop
+            interfeature['end'] = f.start
+
+            # ...featuretype
             if new_featuretype is None:
-                new_featuretype = "inter_%s_%s" % (
+                interfeature['featuretype'] = "inter_%s_%s" % (
                     last_feature.featuretype,
                     f.featuretype,
                 )
-            if last_feature.strand != f.strand:
-                new_strand = "."
             else:
-                new_strand = f.strand
-
-            if last_feature.chrom != f.chrom:
-                # We've moved to a new chromosome.  For example, if we're
-                # getting intergenic regions from all genes, they will be on
-                # different chromosomes. We still assume sorted features, but
-                # don't complain if they're on different chromosomes -- just
-                # move on.
-                last_feature = f
-                continue
-
-            strand = new_strand
-            chrom = last_feature.chrom
+                interfeature['featuretype'] = new_featuretype
 
-            # Shrink
-            interfeature_start += 1
-            interfeature_stop -= 1
+            # ...strand
+            if last_feature.strand != f.strand:
+                interfeature['strand'] = '.'
+            else:
+                interfeature['strand'] = f.strand
 
+            # and attributes
             if merge_attributes:
                 new_attributes = helpers.merge_attributes(
-                    last_feature.attributes, f.attributes,
+                    attribute_func(last_feature.attributes),
+                    attribute_func(f.attributes),
                     numeric_sort=numeric_sort,
                 )
             else:
@@ -911,31 +958,14 @@ class FeatureDB(object):
             if update_attributes:
                 new_attributes.update(update_attributes)
 
-            new_bin = bins.bins(interfeature_start, interfeature_stop, one=True)
-            _id = None
-            fields = dict(
-                seqid=chrom,
-                source="gffutils_derived",
-                featuretype=new_featuretype,
-                start=interfeature_start,
-                end=interfeature_stop,
-                score=".",
-                strand=strand,
-                frame=".",
-                attributes=new_attributes,
-                bin=new_bin,
-            )
+            interfeature['attributes'] = new_attributes
+
+            # Ready to yield
+            new_feature = _prep_for_yield(interfeature)
+            if new_feature:
+                yield new_feature
+            nfeatures = 1
 
-            if dialect is None:
-                # Support for @classmethod -- if calling from the class, then
-                # self.dialect is not defined, so defer to Feature's default
-                # (which will be constants.dialect, or GFF3).
-                try:
-                    dialect = self.dialect
-                except AttributeError:
-                    dialect = None
-            yield self._feature_returner(**fields)
-            interfeature_start = f.stop
             last_feature = f
 
     def delete(self, features, make_backup=True, **kwargs):


=====================================
gffutils/test/data/a.py
=====================================
@@ -0,0 +1,22 @@
+import gffutils
+
+db = gffutils.create_db('issue_197.gff', ':memory:', merge_strategy='error')
+genes = list(db.features_of_type('gene'))
+
+genes = list(db.merge(genes))
+
+igss = list( db.interfeatures(genes,new_featuretype='intergenic_space') )
+
+def transform(f):
+    f['ID'] = [ '-'.join(f.attributes['ID']) ]
+    return f
+
+print('------')
+for i in igss:
+    print(transform(i))
+print('------')
+
+db = db.update(igss, transform=transform, merge_strategy='error')
+
+for i in db.all_features(order_by=('seqid', 'start')):
+    print(i)


=====================================
gffutils/test/data/dm6-chr2L.fa.fai
=====================================
@@ -0,0 +1 @@
+chr2L	2450	7	50	51


=====================================
gffutils/test/data/issue_197.gff
=====================================
@@ -0,0 +1,39 @@
+tig00000012	EVM	gene	2181975	2182655	.	+	.	ID=ctg012.gene0754;Name=gene0754
+tig00000012	EVM	mRNA	2181975	2182655	.	+	.	ID=ctg012.mRNA0754;Parent=ctg012.gene0754;Name=mRNA0754
+tig00000012	EVM	exon	2181975	2182655	.	+	.	ID=ctg012.mRNA0754.exon01;Parent=ctg012.mRNA0754
+tig00000012	EVM	CDS	2181975	2182655	.	+	0	ID=ctg012.mRNA0754.CDS01;Parent=ctg012.mRNA0754
+tig00000492	EVM	gene	46225	47235	.	-	.	ID=ctg492.gene0001;Name=gene0001
+tig00000492	EVM	mRNA	46225	47235	.	-	.	ID=ctg492.mRNA0001;Parent=ctg492.gene0001;Name=mRNA0001
+tig00000492	EVM	exon	46225	47235	.	-	.	ID=ctg492.mRNA0001.exon01;Parent=ctg492.mRNA0001
+tig00000492	EVM	CDS	46225	47235	.	-	0	ID=ctg492.mRNA0001.CDS01;Parent=ctg492.mRNA0001
+tig00000492	EVM	gene	47351	48256	.	-	.	ID=ctg492.gene0002;Name=gene0002
+tig00000492	EVM	mRNA	47351	48256	.	-	.	ID=ctg492.mRNA0002;Parent=ctg492.gene0002;Name=mRNA0002
+tig00000492	EVM	exon	47351	48256	.	-	.	ID=ctg492.mRNA0002.exon01;Parent=ctg492.mRNA0002
+tig00000492	EVM	CDS	47351	48256	.	-	0	ID=ctg492.mRNA0002.CDS01;Parent=ctg492.mRNA0002
+
+tig00000492	EVM	gene	50000	50009	.	-	.	ID=gene0
+
+# This is a long gene overlapping others. It should not yield an interfeature
+# with the previous gene (since it overlaps) but it also should not prevent
+# subsequent interfeatures. The docstring points out that nested features like
+# this should be merged. When genes are merged, then the next interfeature
+# shouldn't be until 50086 to 50089.
+tig00000492	EVM	gene	50000	50085	.	-	.	ID=gene00
+tig00000492	EVM	gene	50009	50029	.	-	.	ID=gene1
+
+# (no interfeature here since genes are contiguous)
+
+tig00000492	EVM	gene	50030	50032	.	-	.	ID=gene2
+
+# gene3 overlaps with gene2, so should not give interfeature here
+
+tig00000492	EVM	gene	50030	50049	.	-	.	ID=gene3
+tig00000492	EVM	gene	50055	50070	.	-	.	ID=gene4
+
+# interfeature created here should be length 1 (50071 to 50071)
+
+tig00000492	EVM	gene	50072	50075	.	-	.	ID=gene5
+
+# interfeature should be 50076 to 50089
+
+tig00000492	EVM	gene	50090	50100	.	-	.	ID=gene6


=====================================
gffutils/test/test_issues.py
=====================================
@@ -92,7 +92,7 @@ def test_issue_107():
     )
     assert [str(i) for i in interfeatures] == [
         "chr1\tgffutils_derived\tinter_gene_gene\t6\t9\t.\t.\t.\tID=a,b;",
-        "chr2\tgffutils_derived\tinter_gene_gene\t16\t54\t.\t-\t.\tID=c,d;",
+        "chr2\tgffutils_derived\tinter_gene_gene\t51\t54\t.\t-\t.\tID=c,d;",
     ]
 
 
@@ -184,9 +184,10 @@ def test_pr_139():
     inter = list(db.interfeatures(exons))
 
     # previously, the first exon's attributes would show up in subsequent merged features
-    assert exons[0].attributes["Name"][0] not in inter[1].attributes["Name"]
-    assert exons[0].attributes["Name"][0] not in inter[2].attributes["Name"]
-    assert exons[0].attributes["Name"][0] not in inter[3].attributes["Name"]
+    first_name = exons[0].attributes["Name"][0]
+    for i in inter[1:]:
+        if "Name" in i.attributes:
+            assert first_name not in i.attributes["Name"], str(i)
 
 
 def test_pr_144():
@@ -399,3 +400,30 @@ def test_issue_181():
         return ','.join(f['ID'])
 
     db.update(introns, id_spec={'intron': [intron_id]})
+
+def test_issue_197():
+
+    # Previously this would fail with ValueError due to using the stop position
+    # of the last item on the previous chrom as the start position.
+
+    db = gffutils.create_db(gffutils.example_filename('issue_197.gff'), ':memory:', merge_strategy='error')
+    genes = list(db.features_of_type('gene'))
+    igss = list( db.interfeatures(genes,new_featuretype='intergenic_space') )
+
+    def transform(f):
+        f['ID'] = [ '-'.join(f.attributes['ID']) ]
+        return f
+
+    db = db.update(igss, transform=transform, merge_strategy='error')
+
+    obs = list(db.features_of_type('intergenic_space'))
+    for i in obs:
+        print(i)
+
+    assert [str(i) for i in obs] == [
+        'tig00000492\tgffutils_derived\tintergenic_space\t47236\t47350\t.\t-\t.\tID=ctg492.gene0001-ctg492.gene0002;Name=gene0001,gene0002',
+        'tig00000492\tgffutils_derived\tintergenic_space\t48257\t49999\t.\t-\t.\tID=ctg492.gene0002-gene0;Name=gene0002',
+        'tig00000492\tgffutils_derived\tintergenic_space\t50050\t50054\t.\t-\t.\tID=gene3-gene4',
+        'tig00000492\tgffutils_derived\tintergenic_space\t50071\t50071\t.\t-\t.\tID=gene4-gene5',
+        'tig00000492\tgffutils_derived\tintergenic_space\t50076\t50089\t.\t-\t.\tID=gene5-gene6',
+    ]


=====================================
gffutils/version.py
=====================================
@@ -1 +1 @@
-version = "0.11.0"
+version = "0.11.1"



View it on GitLab: https://salsa.debian.org/med-team/python-gffutils/-/commit/98709811a05602518cc414151fa7dccb65fde572

-- 
View it on GitLab: https://salsa.debian.org/med-team/python-gffutils/-/commit/98709811a05602518cc414151fa7dccb65fde572
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20221013/5a3c4170/attachment-0001.htm>