[med-svn] [Git][med-team/python-gffutils][upstream] New upstream version 0.11.1
Nilesh Patra (@nilesh)
gitlab at salsa.debian.org
Thu Oct 13 17:46:55 BST 2022
Nilesh Patra pushed to branch upstream at Debian Med / python-gffutils
Commits:
98709811 by Nilesh Patra at 2022-10-13T22:09:53+05:30
New upstream version 0.11.1
- - - - -
10 changed files:
- PKG-INFO
- gffutils.egg-info/PKG-INFO
- gffutils.egg-info/SOURCES.txt
- gffutils/create.py
- gffutils/interface.py
- + gffutils/test/data/a.py
- + gffutils/test/data/dm6-chr2L.fa.fai
- + gffutils/test/data/issue_197.gff
- gffutils/test/test_issues.py
- gffutils/version.py
Changes:
=====================================
PKG-INFO
=====================================
@@ -1,6 +1,6 @@
Metadata-Version: 2.1
Name: gffutils
-Version: 0.11.0
+Version: 0.11.1
Summary: Work with GFF and GTF files in a flexible database framework
Home-page: https://github.com/daler/gffutils
Author: Ryan Dale
=====================================
gffutils.egg-info/PKG-INFO
=====================================
@@ -1,6 +1,6 @@
Metadata-Version: 2.1
Name: gffutils
-Version: 0.11.0
+Version: 0.11.1
Summary: Work with GFF and GTF files in a flexible database framework
Home-page: https://github.com/daler/gffutils
Author: Ryan Dale
=====================================
gffutils.egg-info/SOURCES.txt
=====================================
@@ -49,10 +49,12 @@ gffutils/test/data/FBgn0031208.gtf
gffutils/test/data/Saccharomyces_cerevisiae.R64-1-1.83.5000_gene_ids.txt
gffutils/test/data/Saccharomyces_cerevisiae.R64-1-1.83.5000_transcript_ids.txt
gffutils/test/data/Saccharomyces_cerevisiae.R64-1-1.83.chromsizes.txt
+gffutils/test/data/a.py
gffutils/test/data/c_elegans_WS199_ann_gff.txt
gffutils/test/data/c_elegans_WS199_dna_shortened.fa
gffutils/test/data/c_elegans_WS199_shortened_gff.txt
gffutils/test/data/dm6-chr2L.fa
+gffutils/test/data/dm6-chr2L.fa.fai
gffutils/test/data/dmel-all-no-analysis-r5.49_50k_lines.gff
gffutils/test/data/download-large-annotation-files.sh
gffutils/test/data/ensembl_gtf.txt
@@ -69,6 +71,7 @@ gffutils/test/data/intro_docs_example.gff
gffutils/test/data/issue167.gff
gffutils/test/data/issue174.gtf
gffutils/test/data/issue181.gff
+gffutils/test/data/issue_197.gff
gffutils/test/data/jgi_gff2.txt
gffutils/test/data/keep-order-test.gtf
gffutils/test/data/keyval_sep_in_attrs.gff
=====================================
gffutils/create.py
=====================================
@@ -67,7 +67,7 @@ class _DBCreator(object):
disable_infer_transcripts=False,
infer_gene_extent=True,
force_merge_fields=None,
- text_factory=sqlite3.OptimizedUnicode,
+ text_factory=str,
pragmas=constants.default_pragmas,
_keep_tempfiles=False,
directives=None,
@@ -1111,7 +1111,7 @@ def create_db(
force_dialect_check=False,
from_string=False,
keep_order=False,
- text_factory=sqlite3.OptimizedUnicode,
+ text_factory=str,
force_merge_fields=None,
pragmas=constants.default_pragmas,
sort_attribute_values=False,
@@ -1321,11 +1321,7 @@ def create_db(
available, since these fields need to be integers.
text_factory : callable
- Text factory to use for the sqlite3 database. See
- https://docs.python.org/2/library/\
- sqlite3.html#sqlite3.Connection.text_factory
- for details. The default sqlite3.OptimizedUnicode will return Unicode
- objects only for non-ASCII data, and bytestrings otherwise.
+ Text factory to use for the sqlite3 database.
pragmas : dict
Dictionary of pragmas used when creating the sqlite3 database. See
=====================================
gffutils/interface.py
=====================================
@@ -819,7 +819,8 @@ class FeatureDB(object):
Providing N features will return N - 1 new features.
This method purposefully does *not* do any merging or sorting of
- coordinates, so you may want to use :meth:`FeatureDB.merge` first, or
+ coordinates. So nested or overlapping features may not behave as you
+ might expect. You may want to use :meth:`FeatureDB.merge` first, and
when selecting features use the `order_by` kwarg, e.g.,
`db.features_of_type('gene', order_by=('seqid', 'start'))`.
@@ -866,43 +867,89 @@ class FeatureDB(object):
-------
A generator that yields :class:`Feature` objects
"""
+
+ def _init_interfeature(f):
+ """
+ Used to initialize a new interfeature that is ready to be updated
+ in-place.
+ """
+ keys = ['id', 'seqid', 'source', 'featuretype', 'start', 'end',
+ 'score', 'strand', 'frame', 'attributes', 'bin']
+ d = dict(zip(keys, f.astuple()))
+ d['source'] = 'gffutils_derived'
+ return d
+
+ def _prep_for_yield(d):
+ """
+ Finalize the interfeature by adjusting coords, recalculating the
+ bin, and creating a feature using self._feature_returner.
+
+ If start is greater than stop (which happens when trying to get
+ interfeatures for overlapping features), then return None.
+ """
+ d['start'] += 1
+ d['end'] -= 1
+ new_bin = bins.bins(d['start'], d['end'], one=True)
+ d['bin'] = new_bin
+
+ if d['start'] > d['end']:
+ return None
+
+ return self._feature_returner(**d)
+
+ # If not provided, use a no-op function instead.
+ if not attribute_func:
+ def attribute_func(a):
+ return a
+
for i, f in enumerate(features):
- # no inter-feature for the first one
+ # First feature: initialize an interfeature and continue to the next.
if i == 0:
- interfeature_start = f.stop
+ interfeature = _init_interfeature(f)
+ last_feature = f
+ nfeatures = 1
+ continue
+
+ # Yield the last interfeature (if we saw at least 2 features) and
+ # start a new interfeature on this chrom.
+ if f.chrom != last_feature.chrom:
+ if nfeatures > 1:
+ new_feature = _prep_for_yield(interfeature)
+ if new_feature:
+ yield new_feature
+ interfeature = _init_interfeature(f)
last_feature = f
+ nfeatures = 1
continue
- interfeature_stop = f.start
+ # Otherwise, we've already seen a feature on this chrom so
+ # this is the second.
+ nfeatures += 1
+
+ # Adjust the interfeature dict in-place with coords...
+ interfeature['start'] = last_feature.stop
+ interfeature['end'] = f.start
+
+ # ...featuretype
if new_featuretype is None:
- new_featuretype = "inter_%s_%s" % (
+ interfeature['featuretype'] = "inter_%s_%s" % (
last_feature.featuretype,
f.featuretype,
)
- if last_feature.strand != f.strand:
- new_strand = "."
else:
- new_strand = f.strand
-
- if last_feature.chrom != f.chrom:
- # We've moved to a new chromosome. For example, if we're
- # getting intergenic regions from all genes, they will be on
- # different chromosomes. We still assume sorted features, but
- # don't complain if they're on different chromosomes -- just
- # move on.
- last_feature = f
- continue
-
- strand = new_strand
- chrom = last_feature.chrom
+ interfeature['featuretype'] = new_featuretype
- # Shrink
- interfeature_start += 1
- interfeature_stop -= 1
+ # ...strand
+ if last_feature.strand != f.strand:
+ interfeature['strand'] = '.'
+ else:
+ interfeature['strand'] = f.strand
+ # and attributes
if merge_attributes:
new_attributes = helpers.merge_attributes(
- last_feature.attributes, f.attributes,
+ attribute_func(last_feature.attributes),
+ attribute_func(f.attributes),
numeric_sort=numeric_sort,
)
else:
@@ -911,31 +958,14 @@ class FeatureDB(object):
if update_attributes:
new_attributes.update(update_attributes)
- new_bin = bins.bins(interfeature_start, interfeature_stop, one=True)
- _id = None
- fields = dict(
- seqid=chrom,
- source="gffutils_derived",
- featuretype=new_featuretype,
- start=interfeature_start,
- end=interfeature_stop,
- score=".",
- strand=strand,
- frame=".",
- attributes=new_attributes,
- bin=new_bin,
- )
+ interfeature['attributes'] = new_attributes
+
+ # Ready to yield
+ new_feature = _prep_for_yield(interfeature)
+ if new_feature:
+ yield new_feature
+ nfeatures = 1
- if dialect is None:
- # Support for @classmethod -- if calling from the class, then
- # self.dialect is not defined, so defer to Feature's default
- # (which will be constants.dialect, or GFF3).
- try:
- dialect = self.dialect
- except AttributeError:
- dialect = None
- yield self._feature_returner(**fields)
- interfeature_start = f.stop
last_feature = f
def delete(self, features, make_backup=True, **kwargs):
=====================================
gffutils/test/data/a.py
=====================================
@@ -0,0 +1,22 @@
+import gffutils
+
+db = gffutils.create_db('issue_197.gff', ':memory:', merge_strategy='error')
+genes = list(db.features_of_type('gene'))
+
+genes = list(db.merge(genes))
+
+igss = list( db.interfeatures(genes,new_featuretype='intergenic_space') )
+
+def transform(f):
+ f['ID'] = [ '-'.join(f.attributes['ID']) ]
+ return f
+
+print('------')
+for i in igss:
+ print(transform(i))
+print('------')
+
+db = db.update(igss, transform=transform, merge_strategy='error')
+
+for i in db.all_features(order_by=('seqid', 'start')):
+ print(i)
=====================================
gffutils/test/data/dm6-chr2L.fa.fai
=====================================
@@ -0,0 +1 @@
+chr2L 2450 7 50 51
=====================================
gffutils/test/data/issue_197.gff
=====================================
@@ -0,0 +1,39 @@
+tig00000012 EVM gene 2181975 2182655 . + . ID=ctg012.gene0754;Name=gene0754
+tig00000012 EVM mRNA 2181975 2182655 . + . ID=ctg012.mRNA0754;Parent=ctg012.gene0754;Name=mRNA0754
+tig00000012 EVM exon 2181975 2182655 . + . ID=ctg012.mRNA0754.exon01;Parent=ctg012.mRNA0754
+tig00000012 EVM CDS 2181975 2182655 . + 0 ID=ctg012.mRNA0754.CDS01;Parent=ctg012.mRNA0754
+tig00000492 EVM gene 46225 47235 . - . ID=ctg492.gene0001;Name=gene0001
+tig00000492 EVM mRNA 46225 47235 . - . ID=ctg492.mRNA0001;Parent=ctg492.gene0001;Name=mRNA0001
+tig00000492 EVM exon 46225 47235 . - . ID=ctg492.mRNA0001.exon01;Parent=ctg492.mRNA0001
+tig00000492 EVM CDS 46225 47235 . - 0 ID=ctg492.mRNA0001.CDS01;Parent=ctg492.mRNA0001
+tig00000492 EVM gene 47351 48256 . - . ID=ctg492.gene0002;Name=gene0002
+tig00000492 EVM mRNA 47351 48256 . - . ID=ctg492.mRNA0002;Parent=ctg492.gene0002;Name=mRNA0002
+tig00000492 EVM exon 47351 48256 . - . ID=ctg492.mRNA0002.exon01;Parent=ctg492.mRNA0002
+tig00000492 EVM CDS 47351 48256 . - 0 ID=ctg492.mRNA0002.CDS01;Parent=ctg492.mRNA0002
+
+tig00000492 EVM gene 50000 50009 . - . ID=gene0
+
+# This is a long gene overlapping others. It should not yield an interfeature
+# with the previous gene (since it overlaps) but it also should not prevent
+# subsequent interfeatures. The docstring points out that nested features like
+# this should be merged. When genes are merged, then the next interfeature
+# shouldn't be until 50086 to 50089.
+tig00000492 EVM gene 50000 50085 . - . ID=gene00
+tig00000492 EVM gene 50009 50029 . - . ID=gene1
+
+# (no interfeature here since genes are contiguous)
+
+tig00000492 EVM gene 50030 50032 . - . ID=gene2
+
+# gene3 overlaps with gene2, so should not give interfeature here
+
+tig00000492 EVM gene 50030 50049 . - . ID=gene3
+tig00000492 EVM gene 50055 50070 . - . ID=gene4
+
+# interfeature created here should be length 1 (50071 to 50071)
+
+tig00000492 EVM gene 50072 50075 . - . ID=gene5
+
+# interfeature should be 50076 to 50089
+
+tig00000492 EVM gene 50090 50100 . - . ID=gene6
=====================================
gffutils/test/test_issues.py
=====================================
@@ -92,7 +92,7 @@ def test_issue_107():
)
assert [str(i) for i in interfeatures] == [
"chr1\tgffutils_derived\tinter_gene_gene\t6\t9\t.\t.\t.\tID=a,b;",
- "chr2\tgffutils_derived\tinter_gene_gene\t16\t54\t.\t-\t.\tID=c,d;",
+ "chr2\tgffutils_derived\tinter_gene_gene\t51\t54\t.\t-\t.\tID=c,d;",
]
@@ -184,9 +184,10 @@ def test_pr_139():
inter = list(db.interfeatures(exons))
# previously, the first exon's attributes would show up in subsequent merged features
- assert exons[0].attributes["Name"][0] not in inter[1].attributes["Name"]
- assert exons[0].attributes["Name"][0] not in inter[2].attributes["Name"]
- assert exons[0].attributes["Name"][0] not in inter[3].attributes["Name"]
+ first_name = exons[0].attributes["Name"][0]
+ for i in inter[1:]:
+ if "Name" in i.attributes:
+ assert first_name not in i.attributes["Name"], str(i)
def test_pr_144():
@@ -399,3 +400,30 @@ def test_issue_181():
return ','.join(f['ID'])
db.update(introns, id_spec={'intron': [intron_id]})
+
+def test_issue_197():
+
+ # Previously this would fail with ValueError due to using the stop position
+ # of the last item on the previous chrom as the start position.
+
+ db = gffutils.create_db(gffutils.example_filename('issue_197.gff'), ':memory:', merge_strategy='error')
+ genes = list(db.features_of_type('gene'))
+ igss = list( db.interfeatures(genes,new_featuretype='intergenic_space') )
+
+ def transform(f):
+ f['ID'] = [ '-'.join(f.attributes['ID']) ]
+ return f
+
+ db = db.update(igss, transform=transform, merge_strategy='error')
+
+ obs = list(db.features_of_type('intergenic_space'))
+ for i in obs:
+ print(i)
+
+ assert [str(i) for i in obs] == [
+ 'tig00000492\tgffutils_derived\tintergenic_space\t47236\t47350\t.\t-\t.\tID=ctg492.gene0001-ctg492.gene0002;Name=gene0001,gene0002',
+ 'tig00000492\tgffutils_derived\tintergenic_space\t48257\t49999\t.\t-\t.\tID=ctg492.gene0002-gene0;Name=gene0002',
+ 'tig00000492\tgffutils_derived\tintergenic_space\t50050\t50054\t.\t-\t.\tID=gene3-gene4',
+ 'tig00000492\tgffutils_derived\tintergenic_space\t50071\t50071\t.\t-\t.\tID=gene4-gene5',
+ 'tig00000492\tgffutils_derived\tintergenic_space\t50076\t50089\t.\t-\t.\tID=gene5-gene6',
+ ]
=====================================
gffutils/version.py
=====================================
@@ -1 +1 @@
-version = "0.11.0"
+version = "0.11.1"
View it on GitLab: https://salsa.debian.org/med-team/python-gffutils/-/commit/98709811a05602518cc414151fa7dccb65fde572
--
View it on GitLab: https://salsa.debian.org/med-team/python-gffutils/-/commit/98709811a05602518cc414151fa7dccb65fde572
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20221013/5a3c4170/attachment-0001.htm>
More information about the debian-med-commit mailing list