[med-svn] [Git][med-team/python-gffutils][upstream] New upstream version 0.10.1
Michael R. Crusoe
gitlab at salsa.debian.org
Wed Jan 1 11:55:02 GMT 2020
Michael R. Crusoe pushed to branch upstream at Debian Med / python-gffutils
Commits:
22db2456 by Michael R. Crusoe at 2020-01-01T11:45:45Z
New upstream version 0.10.1
- - - - -
25 changed files:
- PKG-INFO
- gffutils.egg-info/PKG-INFO
- gffutils.egg-info/SOURCES.txt
- gffutils.egg-info/requires.txt
- gffutils/attributes.py
- gffutils/bins.py
- gffutils/biopython_integration.py
- gffutils/create.py
- gffutils/feature.py
- gffutils/helpers.py
- − gffutils/inspection.py
- gffutils/interface.py
- gffutils/iterators.py
- + gffutils/merge_criteria.py
- gffutils/parser.py
- gffutils/pybedtools_integration.py
- + gffutils/test/data/synthetic.gff3
- + gffutils/test/synth_test_base.py
- gffutils/test/test.py
- gffutils/test/test_biopython_integration.py
- + gffutils/test/test_merge.py
- + gffutils/test/test_merge_all.py
- gffutils/version.py
- requirements.txt
- setup.cfg
Changes:
=====================================
PKG-INFO
=====================================
@@ -1,6 +1,6 @@
Metadata-Version: 1.1
Name: gffutils
-Version: 0.9
+Version: 0.10.1
Summary: Work with GFF and GTF files in a flexible database framework
Home-page: https://github.com/daler/gffutils
Author: Ryan Dale
=====================================
gffutils.egg-info/PKG-INFO
=====================================
@@ -1,6 +1,6 @@
Metadata-Version: 1.1
Name: gffutils
-Version: 0.9
+Version: 0.10.1
Summary: Work with GFF and GTF files in a flexible database framework
Home-page: https://github.com/daler/gffutils
Author: Ryan Dale
=====================================
gffutils.egg-info/SOURCES.txt
=====================================
@@ -16,9 +16,9 @@ gffutils/feature.py
gffutils/gffwriter.py
gffutils/helpers.py
gffutils/inspect.py
-gffutils/inspection.py
gffutils/interface.py
gffutils/iterators.py
+gffutils/merge_criteria.py
gffutils/parser.py
gffutils/pybedtools_integration.py
gffutils/version.py
@@ -36,8 +36,11 @@ gffutils/test/feature_test.py
gffutils/test/helpers_test.py
gffutils/test/parser_test.py
gffutils/test/performance_test.py
+gffutils/test/synth_test_base.py
gffutils/test/test.py
gffutils/test/test_biopython_integration.py
+gffutils/test/test_merge.py
+gffutils/test/test_merge_all.py
gffutils/test/data/F3-unique-3.v2.gff
gffutils/test/data/FBgn0031208.gff
gffutils/test/data/FBgn0031208.gtf
@@ -67,6 +70,7 @@ gffutils/test/data/mouse_extra_comma.gff3
gffutils/test/data/ncbi_gff3.txt
gffutils/test/data/nonascii
gffutils/test/data/random-chr.gff
+gffutils/test/data/synthetic.gff3
gffutils/test/data/unsanitized.gff
gffutils/test/data/wormbase_gff2.txt
gffutils/test/data/wormbase_gff2_alt.txt
\ No newline at end of file
=====================================
gffutils.egg-info/requires.txt
=====================================
@@ -1,5 +1,5 @@
-pyfaidx
-six
-argh
-argcomplete
+pyfaidx>=0.5.5.2
+six>=1.12.0
+argh>=0.26.2
+argcomplete>=1.9.4
simplejson
=====================================
gffutils/attributes.py
=====================================
@@ -1,11 +1,16 @@
import six
import collections
+
+try:
+ collectionsAbc = collections.abc
+except AttributeError:
+ collectionsAbc = collections
from gffutils import constants
# collections.MutableMapping is apparently the best way to provide dict-like
# interface (http://stackoverflow.com/a/3387975)
-class Attributes(collections.MutableMapping):
+class Attributes(collectionsAbc.MutableMapping):
def __init__(self, *args, **kwargs):
"""
An Attributes object acts much like a dictionary. However, values are
=====================================
gffutils/bins.py
=====================================
@@ -53,6 +53,7 @@ OFFSETS = [
# for BED (0-based, half-open) or GFF (1-based, closed intervals)
COORD_OFFSETS = {'bed': 0, 'gff': 1}
+MAX_CHROM_SIZE = 2 ** 29
def bins(start, stop, fmt='gff', one=True):
"""
@@ -72,6 +73,15 @@ def bins(start, stop, fmt='gff', one=True):
fmt : 'gff' | 'bed'
This specifies 1-based start coords (gff) or 0-based start coords (bed)
"""
+
+ # For very large coordinates, return 1 which is "somewhere on the
+ # chromosome".
+ if start >= MAX_CHROM_SIZE or stop >= MAX_CHROM_SIZE:
+ if one:
+ return 1
+ else:
+ return set([1])
+
# Jump to highest resolution bin that will fit these coords (depending on
# whether we have a BED or GFF-style coordinate).
#
@@ -114,6 +124,7 @@ def bins(start, stop, fmt='gff', one=True):
# larger bin size)
start >>= NEXT_SHIFT
stop >>= NEXT_SHIFT
+
return bins
@@ -214,6 +225,12 @@ def test():
assert bins(1, -1000, one=True) == 1
assert bins(1, -1000, one=False) == set([1])
+ # Juuuust fits inside the max chrom size
+ assert bins(536870910, 536870911, one=True) == 8776
+
+ # Too big; falls back to 1.
+ assert bins(536870911, 536870912, one=True) == 1
+
if __name__ == "__main__":
print_bin_sizes()
test()
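
A minimal sketch of how the binning helper behaves after this change, assuming gffutils 0.10.1 as imported below (the smaller coordinates are illustrative):

    from gffutils import bins

    # Smallest single bin containing a GFF-style (1-based) interval
    b = bins.bins(10000, 20000, fmt='gff', one=True)

    # All bins overlapping the interval, e.g. for building a SQL WHERE clause
    all_bins = bins.bins(10000, 20000, fmt='gff', one=False)

    # Coordinates at or beyond MAX_CHROM_SIZE (2 ** 29) now fall back to bin 1
    # ("somewhere on the chromosome") instead of miscomputing a bin.
    assert bins.bins(2 ** 29, 2 ** 29 + 10, one=True) == 1
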
=====================================
gffutils/biopython_integration.py
=====================================
@@ -5,7 +5,6 @@ objects.
import six
try:
from Bio.SeqFeature import SeqFeature, FeatureLocation
- from Bio import SeqIO
except ImportError:
import warnings
warnings.warn("BioPython must be installed to use this module")
@@ -79,4 +78,4 @@ def from_seqfeature(s, **kwargs):
attributes.pop('seqid', '.')
attributes.pop('frame', '.')
return Feature(seqid, source, featuretype, start, stop, score, strand,
- frame, attributes, **kwargs)
+ frame, attributes, id=id, **kwargs)
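
A short round-trip sketch of the integration after this change (BioPython must be installed; the input filename is hypothetical):

    import gffutils
    from gffutils import biopython_integration as bp

    db = gffutils.create_db('annotation.gff3', ':memory:')   # hypothetical file
    feature = next(db.features_of_type('gene'))

    s = bp.to_seqfeature(feature)
    # With id=id now passed through, the feature ID should survive the round trip
    f = bp.from_seqfeature(s, dialect=feature.dialect, keep_order=True)
    assert f == feature
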
=====================================
gffutils/create.py
=====================================
@@ -48,7 +48,7 @@ def deprecation_handler(kwargs):
class _DBCreator(object):
def __init__(self, data, dbfn, force=False, verbose=False, id_spec=None,
- merge_strategy='merge', checklines=10, transform=None,
+ merge_strategy='error', checklines=10, transform=None,
force_dialect_check=False, from_string=False, dialect=None,
default_encoding='utf-8',
disable_infer_genes=False,
@@ -97,7 +97,6 @@ class _DBCreator(object):
self.disable_infer_genes = disable_infer_genes
self.disable_infer_transcripts = disable_infer_transcripts
- self._autoincrements = collections.defaultdict(int)
if force:
if os.path.exists(dbfn):
os.unlink(dbfn)
@@ -124,7 +123,10 @@ class _DBCreator(object):
force_dialect_check=force_dialect_check, from_string=from_string,
dialect=dialect
)
-
+ if '_autoincrements' in kwargs:
+ self._autoincrements = kwargs['_autoincrements']
+ else:
+ self._autoincrements = collections.defaultdict(int)
def set_verbose(self, verbose=None):
if verbose == 'debug':
@@ -207,11 +209,12 @@ class _DBCreator(object):
Raise error
"warning"
- Log a warning
+ Log a warning, which indicates that all future instances of the
+ same ID will be ignored
"merge":
Combine old and new attributes -- but only if everything else
- matches; otherwise error. This can be slow, but is thorough.
+ matches; otherwise error. This can be slow, but is thorough.
"create_unique":
Autoincrement based on the ID, always creating a new ID.
@@ -248,8 +251,8 @@ class _DBCreator(object):
% ([i.id for i in self._candidate_merges(f)]))
# If force_merge_fields was provided, don't pay attention to these
- # fields if they're different. We are assuming attributes will be
- # different, hence the [:-1]
+ # fields if they're different. We are assuming the attributes field
+ # will be different, hence the [:-1]
_gffkeys_to_check = list(
set(constants._gffkeys[:-1])
.difference(self.force_merge_fields))
@@ -379,7 +382,7 @@ class _DBCreator(object):
def _candidate_merges(self, f):
"""
Identifies those features that originally had the same ID as `f`
- (according to the id_spec), but were modified because of duplicate
+ (according to the id_spec), but were modified because of duplicate
IDs.
"""
candidates = [self._get_feature(f.id)]
@@ -486,7 +489,7 @@ class _DBCreator(object):
c.execute('CREATE INDEX seqidstartendstrand ON features (seqid, start, end, strand)')
# speeds computation 1000x in some cases
- logger.info("Running ANALYSE features")
+ logger.info("Running ANALYZE features")
c.execute('ANALYZE features')
self.conn.commit()
@@ -505,6 +508,7 @@ class _DBCreator(object):
self._update_relations()
self._finalize()
+ # TODO: not sure this is used anywhere
def update(self, iterator):
self._populate_from_lines(iterator)
self._update_relations()
@@ -645,29 +649,29 @@ class _GFFDBCreator(_DBCreator):
else:
suffix = '.gffutils'
tmp = tempfile.NamedTemporaryFile(delete=False, suffix=suffix).name
- fout = open(tmp, 'w')
-
- # Here we look for "grandchildren" -- for each ID, get the child
- # (parenthetical subquery below); then for each of those get *its*
- # child (main query below).
- #
- # Results are written to temp file so that we don't read and write at
- # the same time, which would slow things down considerably.
-
- c.execute('SELECT id FROM features')
- for parent in c:
- c2.execute('''
- SELECT child FROM relations WHERE parent IN
- (SELECT child FROM relations WHERE parent = ?)
- ''', tuple(parent))
- for grandchild in c2:
- fout.write('\t'.join((parent[0], grandchild[0])) + '\n')
- fout.close()
+ with open(tmp, 'w') as fout:
+
+ # Here we look for "grandchildren" -- for each ID, get the child
+ # (parenthetical subquery below); then for each of those get *its*
+ # child (main query below).
+ #
+ # Results are written to temp file so that we don't read and write at
+ # the same time, which would slow things down considerably.
+
+ c.execute('SELECT id FROM features')
+ for parent in c:
+ c2.execute('''
+ SELECT child FROM relations WHERE parent IN
+ (SELECT child FROM relations WHERE parent = ?)
+ ''', tuple(parent))
+ for grandchild in c2:
+ fout.write('\t'.join((parent[0], grandchild[0])) + '\n')
def relations_generator():
- for line in open(fout.name):
- parent, child = line.strip().split('\t')
- yield dict(parent=parent, child=child, level=2)
+ with open(fout.name) as fin:
+ for line in fin:
+ parent, child = line.strip().split('\t')
+ yield dict(parent=parent, child=child, level=2)
c.executemany(
'''
@@ -720,7 +724,7 @@ class _GTFDBCreator(_DBCreator):
warnings.warn(
"It appears you have a transcript feature in your GTF "
"file. You may want to use the "
- "`disable_infer_transcripts` "
+ "`disable_infer_transcripts=True` "
"option to speed up database creation")
elif (
f.featuretype == 'gene' and
@@ -729,7 +733,7 @@ class _GTFDBCreator(_DBCreator):
warnings.warn(
"It appears you have a gene feature in your GTF "
"file. You may want to use the "
- "`disable_infer_genes` "
+ "`disable_infer_genes=True` "
"option to speed up database creation")
lines_seen = i + 1
@@ -838,101 +842,64 @@ class _GTFDBCreator(_DBCreator):
suffix = self._keep_tempfiles
else:
suffix = '.gffutils'
- tmp = tempfile.NamedTemporaryFile(delete=False, suffix=suffix).name
- fout = open(tmp, 'w')
-
- self._tmpfile = tmp
- # This takes some explanation...
- #
- # First, the nested subquery gets the level-1 parents of
- # self.subfeature featuretypes. For an on-spec GTF file,
- # self.subfeature = "exon". So this subquery translates to getting the
- # distinct level-1 parents of exons -- which are transcripts.
- #
- # OK, so this first subquery is now a list of transcripts; call it
- # "firstlevel".
- #
- # Then join firstlevel on relations, but the trick is to now consider
- # each transcript a *child* -- so that relations.parent (on the first
- # line of the query) will be the first-level parent of the transcript
- # (the gene).
- #
- #
- # The result is something like:
- #
- # transcript1 gene1
- # transcript2 gene1
- # transcript3 gene2
- #
- # Note that genes are repeated; below we need to ensure that only one
- # is added. To ensure this, the results are ordered by the gene ID.
- #
- # By the way, we do this even if we're only looking for transcripts or
- # only looking for genes.
+ tmp = tempfile.NamedTemporaryFile(delete=False, suffix=suffix).name
+ with open(tmp, 'w') as fout:
+ self._tmpfile = tmp
+
+ # This takes some explanation...
+ #
+ # First, the nested subquery gets the level-1 parents of
+ # self.subfeature featuretypes. For an on-spec GTF file,
+ # self.subfeature = "exon". So this subquery translates to getting the
+ # distinct level-1 parents of exons -- which are transcripts.
+ #
+ # OK, so this first subquery is now a list of transcripts; call it
+ # "firstlevel".
+ #
+ # Then join firstlevel on relations, but the trick is to now consider
+ # each transcript a *child* -- so that relations.parent (on the first
+ # line of the query) will be the first-level parent of the transcript
+ # (the gene).
+ #
+ #
+ # The result is something like:
+ #
+ # transcript1 gene1
+ # transcript2 gene1
+ # transcript3 gene2
+ #
+ # Note that genes are repeated; below we need to ensure that only one
+ # is added. To ensure this, the results are ordered by the gene ID.
+ #
+ # By the way, we do this even if we're only looking for transcripts or
+ # only looking for genes.
- c.execute(
- '''
- SELECT DISTINCT firstlevel.parent, relations.parent
- FROM (
- SELECT DISTINCT parent
- FROM relations
- JOIN features ON features.id = relations.child
- WHERE features.featuretype = ?
- AND relations.level = 1
- )
- AS firstlevel
- JOIN relations ON firstlevel.parent = child
- WHERE relations.level = 1
- ORDER BY relations.parent
- ''', (self.subfeature,))
-
- # Now we iterate through those results (using a new cursor) to infer
- # the extent of transcripts and/or genes.
-
- last_gene_id = None
- n_features = 0
- for transcript_id, gene_id in c:
-
- if not self.disable_infer_transcripts:
- # transcript extent
- c2.execute(
- '''
- SELECT MIN(start), MAX(end), strand, seqid
- FROM features
- JOIN relations ON
- features.id = relations.child
- WHERE parent = ? AND featuretype == ?
- ''', (transcript_id, self.subfeature))
- transcript_start, transcript_end, strand, seqid = c2.fetchone()
- transcript_attributes = {
- self.transcript_key: [transcript_id],
- self.gene_key: [gene_id]
- }
- transcript_bin = bins.bins(
- transcript_start, transcript_end, one=True)
-
- # Write out to file; we'll be reading it back in shortly. Omit
- # score, frame, source, and extra since they will always have
- # the same default values (".", ".", "gffutils_derived", and []
- # respectively)
-
- fout.write('\t'.join(map(str, [
- transcript_id,
- seqid,
- transcript_start,
- transcript_end,
- strand,
- 'transcript',
- transcript_bin,
- helpers._jsonify(transcript_attributes)
- ])) + '\n')
-
- n_features += 1
-
- if not self.disable_infer_genes:
- # Infer gene extent, but only if we haven't done so already
- if gene_id != last_gene_id:
+ c.execute(
+ '''
+ SELECT DISTINCT firstlevel.parent, relations.parent
+ FROM (
+ SELECT DISTINCT parent
+ FROM relations
+ JOIN features ON features.id = relations.child
+ WHERE features.featuretype = ?
+ AND relations.level = 1
+ )
+ AS firstlevel
+ JOIN relations ON firstlevel.parent = child
+ WHERE relations.level = 1
+ ORDER BY relations.parent
+ ''', (self.subfeature,))
+
+ # Now we iterate through those results (using a new cursor) to infer
+ # the extent of transcripts and/or genes.
+
+ last_gene_id = None
+ n_features = 0
+ for transcript_id, gene_id in c:
+
+ if not self.disable_infer_transcripts:
+ # transcript extent
c2.execute(
'''
SELECT MIN(start), MAX(end), strand, seqid
@@ -940,26 +907,61 @@ class _GTFDBCreator(_DBCreator):
JOIN relations ON
features.id = relations.child
WHERE parent = ? AND featuretype == ?
- ''', (gene_id, self.subfeature))
- gene_start, gene_end, strand, seqid = c2.fetchone()
- gene_attributes = {self.gene_key: [gene_id]}
- gene_bin = bins.bins(gene_start, gene_end, one=True)
+ ''', (transcript_id, self.subfeature))
+ transcript_start, transcript_end, strand, seqid = c2.fetchone()
+ transcript_attributes = {
+ self.transcript_key: [transcript_id],
+ self.gene_key: [gene_id]
+ }
+ transcript_bin = bins.bins(
+ transcript_start, transcript_end, one=True)
+
+ # Write out to file; we'll be reading it back in shortly. Omit
+ # score, frame, source, and extra since they will always have
+ # the same default values (".", ".", "gffutils_derived", and []
+ # respectively)
fout.write('\t'.join(map(str, [
- gene_id,
+ transcript_id,
seqid,
- gene_start,
- gene_end,
+ transcript_start,
+ transcript_end,
strand,
- 'gene',
- gene_bin,
- helpers._jsonify(gene_attributes)
+ 'transcript',
+ transcript_bin,
+ helpers._jsonify(transcript_attributes)
])) + '\n')
- last_gene_id = gene_id
- n_features += 1
+ n_features += 1
- fout.close()
+ if not self.disable_infer_genes:
+ # Infer gene extent, but only if we haven't done so already
+ if gene_id != last_gene_id:
+ c2.execute(
+ '''
+ SELECT MIN(start), MAX(end), strand, seqid
+ FROM features
+ JOIN relations ON
+ features.id = relations.child
+ WHERE parent = ? AND featuretype == ?
+ ''', (gene_id, self.subfeature))
+ gene_start, gene_end, strand, seqid = c2.fetchone()
+ gene_attributes = {self.gene_key: [gene_id]}
+ gene_bin = bins.bins(gene_start, gene_end, one=True)
+
+ fout.write('\t'.join(map(str, [
+ gene_id,
+ seqid,
+ gene_start,
+ gene_end,
+ strand,
+ 'gene',
+ gene_bin,
+ helpers._jsonify(gene_attributes)
+ ])) + '\n')
+
+ last_gene_id = gene_id
+ n_features += 1
def derived_feature_generator():
"""
@@ -967,17 +969,18 @@ class _GTFDBCreator(_DBCreator):
"""
keys = ['parent', 'seqid', 'start', 'end', 'strand',
'featuretype', 'bin', 'attributes']
- for line in open(fout.name):
- d = dict(list(zip(keys, line.strip().split('\t'))))
- d.pop('parent')
- d['score'] = '.'
- d['source'] = 'gffutils_derived'
- d['frame'] = '.'
- d['extra'] = []
- d['attributes'] = helpers._unjsonify(d['attributes'])
- f = feature.Feature(**d)
- f.id = self._id_handler(f)
- yield f
+ with open(fout.name) as fin:
+ for line in fin:
+ d = dict(list(zip(keys, line.strip().split('\t'))))
+ d.pop('parent')
+ d['score'] = '.'
+ d['source'] = 'gffutils_derived'
+ d['frame'] = '.'
+ d['extra'] = []
+ d['attributes'] = helpers._unjsonify(d['attributes'])
+ f = feature.Feature(**d)
+ f.id = self._id_handler(f)
+ yield f
# Drop the indexes so the inserts are faster
c.execute('DROP INDEX IF EXISTS relationsparent')
@@ -1009,7 +1012,8 @@ class _GTFDBCreator(_DBCreator):
if not self._keep_tempfiles:
os.unlink(fout.name)
- # TODO: recreate indexes?
+ # TODO: recreate indexes? Typically the _finalize() method will be
+ # called after this one, and indexes are created in _finalize().
def create_db(data, dbfn, id_spec=None, force=False, verbose=False,
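
For reference, a hedged sketch of choosing among the merge_strategy options described in the docstring above (the input filename is hypothetical):

    import gffutils

    # merge_strategy controls what happens when two features resolve to the
    # same ID:
    #   "error"          raise an error
    #   "warning"        warn; later features with that ID are ignored
    #   "merge"          combine attributes if all other fields match
    #   "create_unique"  autoincrement the ID so every feature is kept
    db = gffutils.create_db('annotation.gff3', ':memory:',
                            merge_strategy='create_unique')
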
=====================================
gffutils/feature.py
=====================================
@@ -1,6 +1,6 @@
from pyfaidx import Fasta
import six
-import simplejson
+import simplejson as json
from gffutils import constants
from gffutils import helpers
from gffutils import parser
@@ -145,7 +145,7 @@ class Feature(object):
attributes = helpers._unjsonify(attributes, isattributes=True)
# it's a string but not JSON: assume original attributes string.
- except simplejson.JSONDecodeError:
+ except json.JSONDecodeError:
# But Feature.attributes is still a dict
attributes, _dialect = parser._split_keyvals(attributes)
@@ -159,7 +159,7 @@ class Feature(object):
if isinstance(extra, six.string_types):
try:
extra = helpers._unjsonify(extra)
- except simplejson.JSONDecodeError:
+ except json.JSONDecodeError:
extra = extra.split('\t')
self.seqid = seqid
=====================================
gffutils/helpers.py
=====================================
@@ -1,7 +1,7 @@
import copy
import sys
import os
-import simplejson
+import simplejson as json
import time
import tempfile
import six
@@ -256,16 +256,16 @@ def _bin_from_dict(d):
def _jsonify(x):
"""Use most compact form of JSON"""
if isinstance(x, dict_class):
- return simplejson.dumps(x._d, separators=(',', ':'))
- return simplejson.dumps(x, separators=(',', ':'))
+ return json.dumps(x._d, separators=(',', ':'))
+ return json.dumps(x, separators=(',', ':'))
def _unjsonify(x, isattributes=False):
"""Convert JSON string to an ordered defaultdict."""
if isattributes:
- obj = simplejson.loads(x)
+ obj = json.loads(x)
return dict_class(obj)
- return simplejson.loads(x)
+ return json.loads(x)
def _feature_to_fields(f, jsonify=True):
@@ -318,7 +318,7 @@ def merge_attributes(attr1, attr2):
"""
new_d = copy.deepcopy(attr1)
- new_d.update(attr2)
+ new_d.update(copy.deepcopy(attr2))
#all of attr2 key : values just overwrote attr1, fix it
for k, v in new_d.items():
@@ -330,7 +330,7 @@ def merge_attributes(attr1, attr2):
if not isinstance(v, list):
v = [v]
new_d[k].extend(v)
- return dict((k, list(set(v))) for k, v in new_d.items())
+ return dict((k, sorted(set(v))) for k, v in new_d.items())
def dialect_compare(dialect1, dialect2):
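
A tiny sketch of the corrected merge_attributes() behavior (example values are illustrative):

    from gffutils.helpers import merge_attributes

    a = {'gene_id': ['g1'], 'source': ['curated']}
    b = {'gene_id': ['g1'], 'source': ['predicted']}

    merged = merge_attributes(a, b)
    # Values are deduplicated and now returned sorted, and thanks to the
    # deepcopy neither input dict is modified in place:
    # merged == {'gene_id': ['g1'], 'source': ['curated', 'predicted']}
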
=====================================
gffutils/inspection.py deleted
=====================================
@@ -1,105 +0,0 @@
-from gffutils import iterators
-from gffutils import interface
-from collections import Counter
-import sys
-
-
-def inspect(data, look_for=['featuretype', 'chrom', 'attribute_keys',
- 'feature_count'], limit=None, verbose=True):
- """
- Inspect a GFF or GTF data source.
-
- This function is useful for figuring out the different featuretypes found
- in a file (for potential removal before creating a FeatureDB).
-
- Returns a dictionary with a key for each item in `look_for` and
- a corresponding value that is a dictionary of how many of each unique item
- were found.
-
- There will always be a `feature_count` key, indicating how many features
- were looked at (if `limit` is provided, then `feature_count` will be the
- same as `limit`).
-
- For example, if `look_for` is ['chrom', 'featuretype'], then the result
- will be a dictionary like::
-
- {
- 'chrom': {
- 'chr1': 500,
- 'chr2': 435,
- 'chr3': 200,
- ...
- ...
- }.
-
- 'featuretype': {
- 'gene': 150,
- 'exon': 324,
- ...
- },
-
- 'feature_count': 5000
-
- }
-
-
- Parameters
- ----------
- data : str, FeatureDB instance, or iterator of Features
- If `data` is a string, assume it's a GFF or GTF filename. If it's
- a FeatureDB instance, then its `all_features()` method will be
- automatically called. Otherwise, assume it's an iterable of Feature
- objects.
-
- look_for : list
- List of things to keep track of. Options are:
-
- - any attribute of a Feature object, such as chrom, source, start,
- stop, strand.
-
- - "attribute_keys", which will look at all the individual
- attribute keys of each feature
-
- limit : int
- Number of features to look at. Default is no limit.
-
- verbose : bool
- Report how many features have been processed.
-
- Returns
- -------
- dict
- """
-
- results = {}
- obj_attrs = []
- for i in look_for:
- if i not in ['attribute_keys', 'feature_count']:
- obj_attrs.append(i)
- results[i] = Counter()
-
- attr_keys = 'attribute_keys' in look_for
-
- d = iterators.DataIterator(data)
- feature_count = 0
- for f in d:
- if verbose:
- sys.stderr.write('\r%s features inspected' % feature_count)
- sys.stderr.flush()
-
- for obj_attr in obj_attrs:
- results[obj_attr].update([getattr(f, obj_attr)])
-
- if attr_keys:
- results['attribute_keys'].update(f.attributes.keys())
-
- feature_count += 1
- if limit and feature_count == limit:
- break
-
- new_results = {}
- for k, v in results.items():
- new_results[k] = dict(v)
-
- new_results['feature_count'] = feature_count
- return new_results
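
The deleted module duplicated gffutils/inspect.py (still listed in SOURCES.txt above); a minimal usage sketch, assuming inspect.py keeps the inspect() signature documented in the removed docstring (the filename is hypothetical):

    from gffutils import inspect

    report = inspect.inspect('annotation.gff3',            # hypothetical file
                             look_for=['featuretype', 'chrom'],
                             limit=1000, verbose=False)
    # e.g. report['featuretype'] -> {'gene': 150, 'exon': 324, ...}
    # report['feature_count'] is always present.
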
=====================================
gffutils/interface.py
=====================================
@@ -1,3 +1,4 @@
+import collections
import os
import six
import sqlite3
@@ -6,10 +7,62 @@ import warnings
from gffutils import bins
from gffutils import helpers
from gffutils import constants
+from gffutils import merge_criteria as mc
from gffutils.feature import Feature
from gffutils.exceptions import FeatureNotFoundError
+def assign_child(parent, child):
+ """
+ Helper for add_relation()
+ Sets 'Parent' attribute to parent['ID']
+
+ Parameters
+ ----------
+
+ parent : Feature
+ Parent Feature
+
+ :param parent: parent Feature
+
+ child : Feature
+ Child Feature
+
+ Returns
+ -------
+
+ Child Feature
+ """
+ child.attributes['Parent'] = parent['ID']
+ return child
+
+
+# Reusable constant for FeatureDB.merge()
+no_children = tuple()
+
+def _finalize_merge(feature, feature_children):
+ """
+ Helper for FeatureDB.merge() to update source and assign children property
+
+ Parameters
+ ----------
+ feature : Feature
+ feature to finalise
+
+ feature_children
+ list of children to assign
+
+ Returns
+ -------
+ feature, modified
+ """
+ if len(feature_children) > 1:
+ feature.source = ','.join(set(child.source for child in feature_children))
+ feature.children = feature_children
+ else:
+ feature.children = no_children
+ return feature
+
class FeatureDB(object):
# docstring to be filled in for methods that call out to
# helpers.make_query()
@@ -147,7 +200,7 @@ class FeatureDB(object):
'''
SELECT base, n FROM autoincrements
''')
- self._autoincrements = dict(c)
+ self._autoincrements = collections.defaultdict(int, c)
self.set_pragmas(pragmas)
@@ -604,17 +657,15 @@ class FeatureDB(object):
# Only use bins if we have defined boundaries and completely_within is
# True. Otherwise you can't know how far away a feature stretches
# (which means bins are not computable ahead of time)
+ _bin_clause = ''
if (start is not None) and (end is not None) and completely_within:
- _bins = list(bins.bins(start, end, one=False))
- # See issue #45
- if len(_bins) < 900:
- _bin_clause = ' or ' .join(['bin = ?' for _ in _bins])
- _bin_clause = 'AND ( %s )' % _bin_clause
- args += _bins
- else:
- _bin_clause = ''
- else:
- _bin_clause = ''
+ if start <= bins.MAX_CHROM_SIZE and end <= bins.MAX_CHROM_SIZE:
+ _bins = list(bins.bins(start, end, one=False))
+ # See issue #45
+ if len(_bins) < 900:
+ _bin_clause = ' or ' .join(['bin = ?' for _ in _bins])
+ _bin_clause = 'AND ( %s )' % _bin_clause
+ args += _bins
query = ' '.join([
constants._SELECT,
@@ -662,11 +713,9 @@ class FeatureDB(object):
Providing N features will return N - 1 new features.
This method purposefully does *not* do any merging or sorting of
- coordinates, so you may want to use :meth:`FeatureDB.merge` first.
-
- The new features' attributes will be a merge of the neighboring
- features' attributes. This is useful if you have provided a list of
- exons; the introns will then retain the transcript and/or gene parents.
+ coordinates, so you may want to use :meth:`FeatureDB.merge` first, or
+ when selecting features use the `order_by` kwarg, e.g.,
+ `db.features_of_type('gene', order_by=('seqid', 'start'))`.
Parameters
----------
@@ -678,6 +727,14 @@ class FeatureDB(object):
then the featuretypes will be constructed from the neighboring
features, e.g., `inter_exon_exon`.
+ merge_attributes : bool
+ If True, new features' attributes will be a merge of the neighboring
+ features' attributes. This is useful if you have provided a list of
+ exons; the introns will then retain the transcript and/or gene
+ parents as a single item. Otherwise, if False, the attribute will
+ be a comma-separated list of values, potentially listing the same
+ gene ID twice.
+
attribute_func : callable or None
If None, then nothing special is done to the attributes. If
callable, then the callable accepts two attribute dictionaries and
@@ -705,9 +762,21 @@ class FeatureDB(object):
if new_featuretype is None:
new_featuretype = 'inter_%s_%s' % (
last_feature.featuretype, f.featuretype)
- assert last_feature.strand == f.strand
- assert last_feature.chrom == f.chrom
- strand = last_feature.strand
+ if last_feature.strand != f.strand:
+ new_strand = '.'
+ else:
+ new_strand = f.strand
+
+ if last_feature.chrom != f.chrom:
+ # We've moved to a new chromosome. For example, if we're
+ # getting intergenic regions from all genes, they will be on
+ # different chromosomes. We still assume sorted features, but
+ # don't complain if they're on different chromosomes -- just
+ # move on.
+ last_feature = f
+ continue
+
+ strand = new_strand
chrom = last_feature.chrom
# Shrink
@@ -748,6 +817,7 @@ class FeatureDB(object):
dialect = None
yield self._feature_returner(**fields)
interfeature_start = f.stop
+ last_feature = f
def delete(self, features, make_backup=True, **kwargs):
"""
@@ -797,7 +867,19 @@ class FeatureDB(object):
def update(self, data, make_backup=True, **kwargs):
"""
- Update database with features in `data`.
+ Update the on-disk database with features in `data`.
+
+ WARNING: If you used any non-default kwargs for gffutils.create_db when
+ creating the database in the first place (especially
+ `disable_infer_transcripts` or `disable_infer_genes`) then you should
+ use those same arguments here.
+
+ The returned object is the same FeatureDB, but since it is pointing to
+ the same database and that has been just updated, the new features can
+ now be accessed.
+
+ Parameters
+ ----------
data : str, iterable, FeatureDB instance
If FeatureDB, all data will be used. If string, assume it's
@@ -810,15 +892,15 @@ class FeatureDB(object):
makes a copy of the existing database and saves it with a .bak
extension.
- Notes
- -----
- Other kwargs are used in the same way as in gffutils.create_db; see the
- help for that function for details.
+ kwargs :
+ Additional kwargs are passed to gffutils.create_db; see the help
+ for that function for details.
Returns
-------
- FeatureDB with updated features.
+ Returns self but with the underlying database updated.
"""
+
from gffutils import create
from gffutils import iterators
if make_backup:
@@ -833,6 +915,7 @@ class FeatureDB(object):
# Handle all sorts of input
data = iterators.DataIterator(data, **_iterator_kwargs)
+ kwargs['_autoincrements'] = self._autoincrements
if self.dialect['fmt'] == 'gtf':
if 'id_spec' not in kwargs:
@@ -849,10 +932,18 @@ class FeatureDB(object):
else:
raise ValueError
+ peek, data._iter = iterators.peek(data._iter, 1)
+ if len(peek) == 0: return db # If the file is empty then do nothing
+
db._populate_from_lines(data)
db._update_relations()
+
+ # Note that the autoincrements gets updated here
db._finalize()
- return db
+
+ # Read it back in directly from the stored autoincrements table
+ self._autoincrements.update(db._autoincrements)
+ return self
def add_relation(self, parent, child, level, parent_func=None,
child_func=None):
@@ -1000,8 +1091,11 @@ class FeatureDB(object):
):
yield intron
- def merge(self, features, ignore_strand=False):
+ def _old_merge(self, features, ignore_strand=False):
"""
+ DEPRECATED, only retained here for backwards compatibility. Please use
+ merge().
+
Merge overlapping features together.
Parameters
@@ -1095,6 +1189,197 @@ class FeatureDB(object):
attributes='')
yield self._feature_returner(**merged_feature)
+
+ def merge(self, features,
+ merge_criteria=(mc.seqid, mc.overlap_end_inclusive, mc.strand, mc.feature_type),
+ multiline=False):
+ """
+ Merge features matching criteria together
+
+
+ Returned Features have a special property called 'children' that is
+ a list of the component features. This only exists for the lifetime of
+ the Feature instance.
+
+ Parameters
+ ----------
+ features : iterable
+ Iterable of Feature instances to merge
+
+ merge_criteria : list
+ List of merge criteria callbacks. All must evaluate to True in
+ order for a feature to be merged. See notes below on callback
+ signature.
+
+ multiline : bool
+ True to emit multiple features with the same ID attribute, False
+ otherwise.
+
+ Returns
+ -------
+ Generator yielding merged Features
+
+ Notes
+ -----
+
+ See the `gffutils.merge_criteria` module (imported here as `mc`) for
+ existing callback functions. For writing custom callbacks, functions
+ must have the following signature::
+
+ callback(
+ acc: gffutils.Feature,
+ cur: gffutils.Feature,
+ components: [gffutils.Feature]
+ ) -> bool
+
+ Where:
+
+ - `acc`: current accumulated feature
+ - `cur`: candidate feature to merge
+ - `components`: list of features that compose acc
+
+ The function should return True to merge `cur` into `acc`, False to set
+ `cur` to `acc` (that is, start a new merged feature).
+
+
+ If the merge criteria allow different feature types, the merged
+ features should have their featuretype property reassigned to a more
+ specific ontology value.
+ """
+ if not isinstance(merge_criteria, list):
+ try:
+ merge_criteria = list(merge_criteria)
+ except TypeError:
+ merge_criteria = [merge_criteria]
+
+ # To start, we create a merged feature of just the first feature.
+ features = iter(features)
+ last_id = None
+ current_merged = None
+ feature_children = []
+
+ for feature in features:
+ if current_merged is None:
+ if all(criteria(feature, feature, feature_children) for criteria in merge_criteria):
+ current_merged = feature
+ feature_children = [feature]
+ else:
+ yield _finalize_merge(feature, no_children)
+ last_id = None
+ continue
+
+ if len(feature_children) == 0: # current_merged is last feature and unchecked
+ if all(criteria(current_merged, current_merged, feature_children) for criteria in merge_criteria):
+ feature_children.append(current_merged)
+ else:
+ yield _finalize_merge(current_merged, no_children)
+ current_merged = feature
+ last_id = None
+ continue
+
+ if all(criteria(current_merged, feature, feature_children) for criteria in merge_criteria):
+ # Criteria satisfied, merge
+ # TODO Test multiline records and iron out the following code
+ # if multiline and (feature.start > current_merged.end + 1 or feature.end + 1 < current_merged.start):
+ # # Feature is possibly multiline (discontiguous), keep ID but start new record
+ # yield _finalize_merge(current_merged, feature_children)
+ # current_merged = feature
+ # feature_children = [feature]
+
+ if len(feature_children) == 1:
+ # Current merged is only child and merge is going to occur, make copy
+ current_merged = vars(current_merged).copy()
+ del current_merged['attributes']
+ del current_merged['extra']
+ del current_merged['dialect']
+ del current_merged['keep_order']
+ del current_merged['sort_attribute_values']
+ current_merged = self._feature_returner(**current_merged)
+ if not last_id:
+ # Generate unique ID for new Feature
+ self._autoincrements[current_merged.featuretype] += 1
+ last_id = current_merged.featuretype + '_' + str(
+ self._autoincrements[current_merged.featuretype])
+ current_merged['ID'] = last_id
+ current_merged.id = last_id
+
+ feature_children.append(feature)
+
+ # Set mismatched properties to ambiguous values
+ if feature.seqid not in current_merged.seqid.split(','): current_merged.seqid += ',' + feature.seqid
+ if feature.strand != current_merged.strand: current_merged.strand = '.'
+ if feature.frame != current_merged.frame: current_merged.frame = '.'
+ if feature.featuretype != current_merged.featuretype: current_merged.featuretype = "sequence_feature"
+
+ if feature.start < current_merged.start:
+ # Extends prior, so set a new start position
+ current_merged.start = feature.start
+
+ if feature.end > current_merged.end:
+ # Extends further, so set a new stop position
+ current_merged.end = feature.end
+
+ else:
+ yield _finalize_merge(current_merged, feature_children)
+ current_merged = feature
+ feature_children = []
+ last_id = None
+
+ if current_merged:
+ yield _finalize_merge(current_merged, feature_children)
+
+ def merge_all(self,
+ merge_order=('seqid', 'featuretype', 'strand', 'start'),
+ merge_criteria=(mc.seqid, mc.overlap_end_inclusive, mc.strand, mc.feature_type),
+ featuretypes_groups=(None,),
+ exclude_components=False):
+ """
+ Merge all features in database according to criteria.
+ Merged features will be assigned as children of the merged record.
+ The resulting records are added to the database.
+
+ Parameters
+ ----------
+ merge_order : list
+ Ordered list of columns with which to group features before evaluating criteria
+ merge_criteria : list
+ List of merge criteria callbacks. See merge().
+ featuretypes_groups : list
+ iterable of sets of featuretypes to merge together
+ exclude_components : bool
+ True: child features will be discarded. False to keep them.
+
+ Returns
+ -------
+ list of merged features
+ """
+
+ if not len(featuretypes_groups):
+ # Can't be empty
+ featuretypes_groups = (None,)
+
+ result_features = []
+
+ # Merge features per featuregroup
+ for featuregroup in featuretypes_groups:
+ for merged in self.merge(self.all_features(featuretype=featuregroup, order_by=merge_order),
+ merge_criteria=merge_criteria):
+ # If feature is result of merge
+ if merged.children:
+ self._insert(merged, self.conn.cursor())
+ if exclude_components:
+ # Remove child features from DB
+ self.delete(merged.children)
+ else:
+ # Add child relations to DB
+ for child in merged.children:
+ self.add_relation(merged, child, 1, child_func=assign_child)
+ result_features.append(merged)
+ else:
+ pass # Do nothing, feature is already in DB
+
+ return result_features
+
def children_bp(self, feature, child_featuretype='exon', merge=False,
ignore_strand=False):
"""
=====================================
gffutils/iterators.py
=====================================
@@ -115,6 +115,7 @@ class _FileIterator(_BaseIterator):
Subclass for iterating over features provided as a filename
"""
def open_function(self, data):
+ data = os.path.expanduser(data)
if data.endswith('.gz'):
import gzip
return gzip.open(data)
@@ -130,7 +131,7 @@ class _FileIterator(_BaseIterator):
self.current_item_number = i
if line == '##FASTA' or line.startswith('>'):
- raise StopIteration
+ return
if line.startswith('##'):
self._directive_handler(line)
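
A small sketch of the effect of the expanduser change (the path is hypothetical):

    from gffutils.iterators import DataIterator

    # Paths containing "~" are now expanded before being opened
    # (gzipped files are still handled transparently).
    for feature in DataIterator('~/annotation.gff3'):
        print(feature.featuretype, feature.start, feature.end)
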
=====================================
gffutils/merge_criteria.py
=====================================
@@ -0,0 +1,51 @@
+"""
+Merge criteria used by FeatureDB merge() and merge_all()
+"""
+from gffutils.feature import Feature
+
+
+def seqid(acc, cur, components):
+ return cur.seqid == acc.seqid
+
+
+def strand(acc, cur, components):
+ return acc.strand == cur.strand
+
+
+def feature_type(acc, cur, components):
+ return acc.featuretype == cur.featuretype
+
+
+def exact_coordinates_only(acc, cur, components):
+ return cur.start == acc.start and cur.stop == acc.end
+
+
+def overlap_end_inclusive(acc, cur, components):
+ return acc.start <= cur.start <= acc.end + 1
+
+
+def overlap_start_inclusive(acc, cur, components):
+ return acc.start <= cur.end + 1 <= acc.end + 1
+
+
+def overlap_any_inclusive(acc, cur, components):
+ return acc.start <= cur.start <= acc.end + 1 or acc.start <= cur.end + 1 <= acc.end + 1
+
+
+def overlap_end_threshold(threshold):
+ def partial(acc, cur, components):
+ return acc.start <= cur.start <= acc.end + threshold
+ return partial
+
+
+def overlap_start_threshold(threshold):
+ def partial(acc, cur, components):
+ return acc.start - threshold <= cur.end + 1 <= acc.end + 1
+
+ return partial
+
+
+def overlap_any_threshold(threshold):
+ def partial(acc, cur, components):
+ return acc.start - threshold <= cur.end + 1 <= acc.end + 1 or acc.start <= cur.start <= acc.end + threshold
+ return partial
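
A short sketch of a custom criterion compatible with merge(); it uses the same (acc, cur, components) signature as the callbacks above (the function name and 100 bp threshold are illustrative):

    def merge_within_100bp(acc, cur, components):
        # Merge if the candidate starts no more than 100 bp past the end of
        # the accumulated feature.
        return acc.start <= cur.start <= acc.end + 100

    # Equivalent to the factory form shipped in this module:
    #     overlap_end_threshold(100)
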
=====================================
gffutils/parser.py
=====================================
@@ -152,6 +152,8 @@ def _reconstruct(keyvals, dialect, keep_order=False,
# Typically "=" for GFF3 or " " otherwise
part = dialect['keyval separator'].join([key, val_str])
+ else:
+ part = key
else:
if dialect['fmt'] == 'gtf':
part = dialect['keyval separator'].join([key, '""'])
@@ -322,6 +324,7 @@ def _split_keyvals(keyval_str, dialect=None):
# Pathological cases where values of a key have within them the key-val
# separator, e.g.,
# Alias=SGN-M1347;ID=T0028;Note=marker name(s): T0028 SGN-M1347 |identity=99.58|escore=2e-126
+ # ^ ^
else:
key = item[0]
val = dialect['keyval separator'].join(item[1:])
=====================================
gffutils/pybedtools_integration.py
=====================================
@@ -2,11 +2,13 @@
Module for integration with pybedtools
"""
+import os
import pybedtools
from pybedtools import featurefuncs
from gffutils import helpers
import six
+
def to_bedtool(iterator):
"""
Convert any iterator into a pybedtools.BedTool object.
@@ -22,7 +24,7 @@ def to_bedtool(iterator):
def tsses(db, merge_overlapping=False, attrs=None, attrs_sep=":",
- merge_kwargs=dict(o='distinct', s=True, c=4), as_bed6=False):
+ merge_kwargs=None, as_bed6=False, bedtools_227_or_later=True):
"""
Create 1-bp transcription start sites for all transcripts in the database
and return as a sorted pybedtools.BedTool object pointing to a temporary
@@ -74,13 +76,21 @@ def tsses(db, merge_overlapping=False, attrs=None, attrs_sep=":",
attributes is supplied, e.g. ["gene_id", "transcript_id"], then these
will be joined by `attr_join_sep` and then placed in the name field.
-
attrs_sep: str
If `as_bed6=True` or `merge_overlapping=True`, then use this character
to separate attributes in the name field of the output BED. If also
using `merge_overlapping=True`, you'll probably want this to be
different than `merge_sep` in order to parse things out later.
+ bedtools_227_or_later : bool
+ In version 2.27, BEDTools changed the output for merge. By default,
+ this function expects BEDTools version 2.27 or later, but set this to
+ False to assume the older behavior.
+
+ For testing purposes, the environment variable
+ GFFUTILS_USES_BEDTOOLS_227_OR_LATER is set to either "true" or "false"
+ and is used to override this argument.
+
Examples
--------
@@ -146,7 +156,22 @@ def tsses(db, merge_overlapping=False, attrs=None, attrs_sep=":",
"""
- _merge_kwargs = dict(o='distinct', s=True, c=4)
+ _override = os.environ.get('GFFUTILS_USES_BEDTOOLS_227_OR_LATER', None)
+ if _override is not None:
+ if _override == 'true':
+ bedtools_227_or_later = True
+ elif _override == 'false':
+ bedtools_227_or_later = False
+ else:
+ raise ValueError(
+ "Unknown value for GFFUTILS_USES_BEDTOOLS_227_OR_LATER "
+ "environment variable: {0}".format(_override))
+
+ if bedtools_227_or_later:
+ _merge_kwargs = dict(o='distinct', s=True, c='4,5,6')
+ else:
+ _merge_kwargs = dict(o='distinct', s=True, c='4')
+
if merge_kwargs is not None:
_merge_kwargs.update(merge_kwargs)
@@ -195,18 +220,18 @@ def tsses(db, merge_overlapping=False, attrs=None, attrs_sep=":",
x = x.each(to_bed).saveas()
if merge_overlapping:
-
- def fix_merge(f):
- f = featurefuncs.extend_fields(f, 6)
- return pybedtools.Interval(
- f.chrom,
- f.start,
- f.stop,
- f[4],
- '.',
- f[3])
-
- x = x.merge(**_merge_kwargs).each(fix_merge).saveas()
-
+ if bedtools_227_or_later:
+ x = x.merge(**_merge_kwargs)
+ else:
+ def fix_merge(f):
+ f = featurefuncs.extend_fields(f, 6)
+ return pybedtools.Interval(
+ f.chrom,
+ f.start,
+ f.stop,
+ f[4],
+ '.',
+ f[3])
+ x = x.merge(**_merge_kwargs).saveas().each(fix_merge).saveas()
return x
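
A minimal sketch of calling tsses() with the new flag (the database filename is hypothetical; pybedtools and BEDTools must be installed):

    import gffutils
    from gffutils.pybedtools_integration import tsses

    db = gffutils.FeatureDB('annotation.db')    # hypothetical existing database

    # One 1-bp TSS per transcript, merged where they overlap. Pass
    # bedtools_227_or_later=False for BEDTools older than 2.27, or set the
    # GFFUTILS_USES_BEDTOOLS_227_OR_LATER environment variable to override.
    tss = tsses(db, merge_overlapping=True, as_bed6=True,
                bedtools_227_or_later=True)
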
=====================================
gffutils/test/data/synthetic.gff3
=====================================
@@ -0,0 +1,19 @@
+##gff-version 3
+seq1 synthetic1 sequence_feature 5 15 . . . ID=basic1
+seq1 synthetic1 sequence_feature 5 15 . . . ID=discontig1
+seq1 synthetic1 sequence_feature 20 30 . . . ID=discontig1
+seq1 synthetic1 sequence_feature 5 15 . . . ID=perfect_overlap1
+seq1 synthetic1 sequence_feature 16 20 . . . ID=adjacent1
+seq1 synthetic1 sequence_feature 16 16 . . . ID=adjacent2
+seq1 synthetic1 sequence_feature 1 4 . . . ID=adjacent3
+seq1 synthetic1 sequence_feature 4 4 . . . ID=adjacent4
+seq1 synthetic1 sequence_feature 15 20 . . . ID=overlap_end1
+seq1 synthetic1 sequence_feature 1 5 . . . ID=overlap_start1
+seq1 synthetic1 sequence_feature 10 20 . . . ID=overlap_past_end1
+seq1 synthetic1 sequence_feature 10 20 . + . ID=strand_plus1
+seq1 synthetic1 sequence_feature 10 20 . - . ID=strand_minus1
+seq1 synthetic1 sequence_feature 35 40 . . . ID=no_overlap1
+seq1 synthetic1 sequence_feature 21 29 . . . ID=no_overlap2
+seq1 synthetic1 misc_feature 1 10 . . . ID=different_type1
+seq1 synthetic2 sequence_feature 1 10 . . . ID=different_source1
+seq2 synthetic1 sequence_feature 1 10 . . . ID=different_seqid1
\ No newline at end of file
=====================================
gffutils/test/synth_test_base.py
=====================================
@@ -0,0 +1,14 @@
+from unittest import TestCase
+import gffutils
+synthetic_path = gffutils.example_filename('synthetic.gff3')
+num_synthetic_features = 18
+num_synthetic_overlap = 13
+
+
+class TestWithSynthDB(TestCase):
+ def setUp(self):
+ self.db = gffutils.create_db(synthetic_path, ":memory:", merge_strategy='create_unique') # type: gffutils.FeatureDB
+ self.assertEqual(num_synthetic_features, self.db.count_features_of_type())
+
+ def _dump_db(self):
+ return '\n'.join(map(str, self.db.all_features()))
=====================================
gffutils/test/test.py
=====================================
@@ -120,7 +120,7 @@ def test_update():
# Add attribute to each gene record
rec = gene_recs[0]
rec.attributes["new"] = ["new_value"]
- db.update([rec])
+ db.update([rec], merge_strategy='replace')
num_entries += 1
print(list(db.all_features()))
@@ -621,6 +621,7 @@ def test_create_db_from_url():
# Serving test/data folder
served_folder = gffutils.example_filename('')
+ savedir = os.getcwd()
os.chdir(served_folder)
print("Starting SimpleHTTPServer in thread")
@@ -654,6 +655,7 @@ def test_create_db_from_url():
print('Server shutdown.')
httpd.shutdown()
server_thread.join()
+ os.chdir(savedir)
def test_empty_files():
@@ -880,7 +882,7 @@ def test_tempfiles():
assert len(filelist) == 1, filelist
assert filelist[0].endswith('.gffutils')
- #...and another one for gff. This time, make sure the suffix
+ #...and another one for gff. This time, make sure the suffix
db = gffutils.create_db(
gffutils.example_filename('FBgn0031208.gff'), ':memory:', _keep_tempfiles=True)
filelist = os.listdir(tempdir)
@@ -1006,7 +1008,7 @@ def test_disable_infer():
def test_deprecation_handler():
- return
+ return
# TODO: when infer_gene_extent actually gets deprecated, test here.
assert_raises(ValueError, gffutils.create_db,
@@ -1141,7 +1143,7 @@ def test_unreasonable_unquoting():
assert f.attributes['null'][0] == '\x00'
assert f.attributes['comma'][0] == ','
- # Commas indicate
+ # Commas indicate
assert f.attributes['Parent'] == ['A,', 'B%', 'C']
assert str(f) == s
@@ -1174,6 +1176,145 @@ def test_db_unquoting():
assert db['e']['Note'] == [',']
assert db['f']['Note'] == [',']
+
+def test_issue_105():
+ fn = gffutils.example_filename('FBgn0031208.gtf')
+ home = os.path.expanduser('~')
+ newfn = os.path.join(home, '.gffutils.test')
+ with open(newfn, 'w') as fout:
+ fout.write(open(fn).read())
+ f = gffutils.iterators.DataIterator(newfn)
+ for i in f:
+ pass
+ os.unlink(newfn)
+
+
+def test_issue_107():
+ s = dedent(
+ '''
+ chr1\t.\tgene\t10\t15\t.\t+\t.\tID=b;
+ chr1\t.\tgene\t1\t5\t.\t-\t.\tID=a;
+ chr2\t.\tgene\t25\t50\t.\t-\t.\tID=c;
+ chr2\t.\tgene\t55\t60\t.\t-\t.\tID=d;
+ ''')
+ tmp = tempfile.NamedTemporaryFile(delete=False).name
+ with open(tmp, 'w') as fout:
+ fout.write(s + '\n')
+ db = gffutils.create_db(tmp, ':memory:')
+ interfeatures = list(db.interfeatures(
+ db.features_of_type('gene', order_by=('seqid', 'start'))))
+ assert [str(i) for i in interfeatures] == [
+ 'chr1\tgffutils_derived\tinter_gene_gene\t6\t9\t.\t.\t.\tID=a,b;',
+ 'chr2\tgffutils_derived\tinter_gene_gene\t16\t54\t.\t-\t.\tID=c,d;',
+ ]
+
+
+def test_issue_119():
+ # First file has these two exons with no ID:
+ #
+ # chr2L FlyBase exon 8193 8589 . + . Parent=FBtr0300690
+ # chr2L FlyBase exon 7529 8116 . + . Name=CG11023:1;Parent=FBtr0300689,FBtr0300690
+ #
+ db0 = gffutils.create_db(gffutils.example_filename('FBgn0031208.gff'),':memory:')
+
+ # And this one, a bunch of reads with no IDs anywhere
+ db1 = gffutils.create_db(gffutils.example_filename('F3-unique-3.v2.gff'),':memory:')
+
+ # When db1 is updated by db0
+ db2 = db1.update(db0)
+ assert (
+ db2._autoincrements
+ == db1._autoincrements
+ == {'exon': 2, 'read': 112}
+ ), db2._autoincrements
+
+
+ assert len(list(db0.features_of_type('exon'))) == 6
+
+ # Now we update that with db0 again
+ db3 = db2.update(db0, merge_strategy='replace')
+
+ # Using the "replace" strategy, we should have only gotten another 2 exons
+ assert len(list(db3.features_of_type('exon'))) == 8
+
+ # Make sure that the autoincrements for exons jumped by 2
+ assert (
+ db2._autoincrements
+ == db3._autoincrements
+ == {'exon': 4, 'read': 112}
+ ), db2._autoincrements
+
+ # More isolated test, merging two databases each created from the same file
+ # which itself contains only a single feature with no ID.
+ tmp = tempfile.NamedTemporaryFile(delete=False).name
+ with open(tmp, 'w') as fout:
+ fout.write('chr1\t.\tgene\t10\t15\t.\t+\t.\t\n')
+
+ db4 = gffutils.create_db(tmp, tmp + '.db')
+ db5 = gffutils.create_db(tmp, ':memory:')
+
+ assert db4._autoincrements == {'gene': 1}
+ assert db5._autoincrements == {'gene': 1}
+
+ db6 = db4.update(db5)
+
+ db7 = gffutils.FeatureDB(db4.dbfn)
+
+ # both db4 and db6 should now have the same, updated autoincrements because
+ # they both point to the same db.
+ assert db6._autoincrements == db4._autoincrements == {'gene': 2}
+
+ # But db5 was created independently and should have unchanged autoincrements
+ assert db5._autoincrements == {'gene': 1}
+
+ # db7 was created from the database pointed to by both db4 and db6. This
+ # tests that when a FeatureDB is created it should have the
+ # correctly-updated autoincrements read from the db
+ assert db7._autoincrements == {'gene': 2}
+
+def test_pr_131():
+ db = gffutils.create_db(gffutils.example_filename('FBgn0031208.gff'),':memory:')
+
+ # previously would raise ValueError("No lines parsed -- was an empty
+ # file provided?"); now just does nothing
+ db2 = db.update([])
+
+
+def test_pr_133():
+ # Previously, merge_attributes would not deep-copy the values from the
+ # second dict, and when the values are then modified, the second dict is
+ # unintentionally modified.
+ d1 = {'a': [1]}
+ d2 = {'a': [2]}
+ d1a = {'a': [1]}
+ d2a = {'a': [2]}
+ d3 = gffutils.helpers.merge_attributes(d1, d2)
+ assert d1 == d1a, d1
+ assert d2 == d2a, d2
+
+
+def test_pr_139():
+ db = gffutils.create_db(gffutils.example_filename('FBgn0031208.gff'),':memory:')
+ exons = list(db.features_of_type('exon'))
+ inter = list(db.interfeatures(exons))
+
+ # previously, the first exon's attributes would show up in subsequent merged features
+ assert exons[0].attributes['Name'][0] not in inter[1].attributes['Name']
+ assert exons[0].attributes['Name'][0] not in inter[2].attributes['Name']
+ assert exons[0].attributes['Name'][0] not in inter[3].attributes['Name']
+
+
+def test_pr_144():
+ # previously this would fail with:
+ # UnboundLocalError: local variable 'part' referenced before assignment
+ f = gffutils.Feature(attributes={'a': ['']})
+
+ # Make sure everything got converted correctly
+ assert f.attributes['a'] == ['']
+ assert str(f) == ". . . . . . . . a"
+ g = gffutils.feature.feature_from_line(str(f))
+ assert g == f
+
if __name__ == "__main__":
# this test case fails
#test_attributes_modify()
@@ -1181,3 +1322,6 @@ if __name__ == "__main__":
#test_random_chr()
#test_nonascii()
test_iterator_update()
+
+
+
=====================================
gffutils/test/test_biopython_integration.py
=====================================
@@ -14,5 +14,6 @@ def test_roundtrip():
s = bp.to_seqfeature(feature)
assert s.location.start.position == feature.start - 1
assert s.location.end.position == feature.stop
+ assert s.id == feature.id
f = bp.from_seqfeature(s, dialect=dialect, keep_order=True)
assert feature == f
=====================================
gffutils/test/test_merge.py
=====================================
@@ -0,0 +1,104 @@
+from copy import deepcopy
+
+from gffutils import merge_criteria as mc
+from gffutils.test.synth_test_base import TestWithSynthDB, num_synthetic_features, num_synthetic_overlap
+
+
+class TestMerge(TestWithSynthDB):
+ def _merge_and_compare(self, ids, expected_merge_children, **kwargs):
+ features = [self.db[id] for id in ids]
+ features_backup = deepcopy(features)
+ merged = list(self.db.merge(features, **kwargs))
+ dump = self._dump_db()
+ m_dump = '\n'.join(map(str, merged))
+
+ # Ensure db wasn't modified
+ self.assertEqual(num_synthetic_features, self.db.count_features_of_type(), dump)
+ for expected, actual in zip(features_backup, ids):
+ self.assertEqual(expected, self.db[actual], '\n'.join(map(str, [expected, self.db[actual]])) + '\n')
+
+ # Ensure input Features not modified
+ for expected, actual in zip(features_backup, features):
+ self.assertEqual(expected, actual, '\n'.join(map(str, [expected, actual])) + '\n')
+
+ # Ensure output matches expected
+ self.assertEqual(len(expected_merge_children), len(merged), m_dump)
+ for expected, actual in zip(expected_merge_children, merged):
+ c_dump = '\n'.join(map(str, actual.children)) + '\n' + str(actual) + '\n'
+ self.assertEqual(len(expected), len(actual.children), '\n'.join(map(str, [expected, actual])) + '\n')
+ self.assertTrue(all(child.id in expected for child in actual.children), c_dump)
+
+ if actual.strand != '.':
+ self.assertTrue(all(f.strand == actual.strand for f in actual.children), c_dump)
+
+ if actual.frame != '.':
+ self.assertTrue(all(f.frame == actual.frame for f in actual.children), c_dump)
+
+ if expected:
+ self.assertEqual(actual.start, min(f.start for f in actual.children), c_dump)
+ self.assertEqual(actual.end, max(f.end for f in actual.children), c_dump)
+ if all(f.seqid == actual.children[0].seqid for f in actual.children):
+ self.assertEqual(actual.seqid, actual.children[0].seqid, c_dump)
+ else:
+ self.assertEqual(set(f.seqid for f in actual.children), set(actual.seqid.split(',')), c_dump)
+
+ if all(f.featuretype == actual.children[0].featuretype for f in actual.children):
+ self.assertTrue(all(f.featuretype == actual.featuretype for f in actual.children), c_dump)
+ else:
+ self.assertEqual('sequence_feature', actual.featuretype, c_dump)
+
+ return merged
+
+ def test_defaults(self):
+ merged = list(self.db.merge(self.db.all_features(order_by=('seqid', 'featuretype', 'strand', 'start'))))
+ dump = '\n'.join(map(str, merged))
+ self.assertEqual(num_synthetic_features, self.db.count_features_of_type(), dump)
+ self.assertEqual(6, len(merged), dump)
+ merged_count = 0
+ for f in merged:
+ if f.children:
+ self.assertEqual(num_synthetic_overlap, len(f.children), dump)
+ merged_count += 1
+
+ self.assertEqual(1, merged_count)
+
+ def test_none(self):
+ merged = list(self.db.merge([]))
+ dump = self._dump_db()
+ self.assertEqual(num_synthetic_features, self.db.count_features_of_type(), dump)
+ self.assertEqual(0, len(merged))
+
+ def test_one(self):
+ self._merge_and_compare(['basic1'], [[]])
+
+ def test_no_overlap(self):
+ self._merge_and_compare(['basic1', 'no_overlap1'], [[],[]])
+
+ def test_perfect_overlap(self):
+ self._merge_and_compare(['basic1', 'perfect_overlap1'], [['basic1', 'perfect_overlap1']])
+
+ def test_overlap(self):
+ self._merge_and_compare(['basic1', 'adjacent1'], [['basic1', 'adjacent1']])
+
+ def test_any_overlap(self):
+ self._merge_and_compare(['basic1', 'overlap_start1'], [['basic1', 'overlap_start1']], merge_criteria=(mc.seqid, mc.overlap_any_inclusive, mc.strand, mc.feature_type))
+ self._merge_and_compare(['overlap_start1', 'basic1'], [['basic1', 'overlap_start1']])
+ self._merge_and_compare(['adjacent1', 'basic1'], [['basic1', 'adjacent1']], merge_criteria=(mc.seqid, mc.overlap_any_inclusive, mc.strand, mc.feature_type))
+ self._merge_and_compare(['adjacent4', 'basic1'], [['basic1', 'adjacent4']], merge_criteria=(mc.seqid, mc.overlap_any_inclusive, mc.strand, mc.feature_type))
+
+ def test_adjacent_len1(self):
+ self._merge_and_compare(['basic1', 'adjacent2'], [['basic1', 'adjacent2']])
+ self._merge_and_compare(['adjacent2', 'basic1'], [['basic1', 'adjacent2']], merge_criteria=(mc.seqid, mc.overlap_any_inclusive, mc.strand, mc.feature_type))
+ self._merge_and_compare(['basic1', 'adjacent3'], [['basic1', 'adjacent3']], merge_criteria=(mc.seqid, mc.overlap_any_inclusive, mc.strand, mc.feature_type))
+ self._merge_and_compare(['adjacent3', 'basic1'], [['basic1', 'adjacent3']])
+
+ def test_end_overlap(self):
+ self._merge_and_compare(['basic1', 'overlap_end1'], [['basic1', 'overlap_end1']])
+
+ # TODO test various feature orders
+
+ # TODO test various merge criteria
+
+ # TODO test multiline, this doesn't really work until gffutils better supports multiline features
+
+ # TODO add test for PR #152
=====================================
gffutils/test/test_merge_all.py
=====================================
@@ -0,0 +1,53 @@
+from gffutils.test.synth_test_base import TestWithSynthDB, num_synthetic_features, num_synthetic_overlap
+
+
+class TestMerge_all(TestWithSynthDB):
+ def test_defaults(self):
+ merged = self.db.merge_all()
+ dump = self._dump_db()
+ self.assertEqual(num_synthetic_features + 1, self.db.count_features_of_type(), dump)
+ self.assertEqual(1, len(merged), dump)
+ self.assertEqual(num_synthetic_overlap, len(merged[0].children), dump)
+
+ def test_empty(self):
+ self.db.delete(self.db.all_features())
+ dump = self._dump_db()
+ self.assertEqual(0, self.db.count_features_of_type(), dump)
+ merged = self.db.merge_all()
+ dump = self._dump_db()
+ self.assertEqual(0, self.db.count_features_of_type(), dump)
+ self.assertEqual(0, len(merged), dump)
+
+ def test_one(self):
+ features = self.db.all_features()
+ next(features)
+ self.db.delete(features)
+ dump = self._dump_db()
+ self.assertEqual(1, self.db.count_features_of_type(), dump)
+ merged = self.db.merge_all()
+ dump = self._dump_db()
+ self.assertEqual(1, self.db.count_features_of_type(), dump)
+ self.assertEqual(0, len(merged), dump)
+
+ def test_no_overlap(self):
+ self.db.delete(f for f in self.db.all_features() if f.id not in ('basic1', 'no_overlap1'))
+ dump = self._dump_db()
+ self.assertEqual(2, self.db.count_features_of_type(), dump)
+ merged = self.db.merge_all()
+ dump = self._dump_db()
+ self.assertEqual(2, self.db.count_features_of_type(), dump)
+ self.assertEqual(0, len(merged), dump)
+
+ def test_merge_groups(self):
+ merged = self.db.merge_all(featuretypes_groups=({'sequence_feature', 'misc_feature'},))
+ dump = self._dump_db()
+ self.assertEqual(num_synthetic_features + 1, self.db.count_features_of_type(), dump)
+ self.assertEqual(1, len(merged), dump)
+ self.assertEqual(num_synthetic_overlap, len(merged[0].children), dump)
+
+ def test_exclude_components(self):
+ merged = self.db.merge_all(exclude_components=True)
+ dump = self._dump_db()
+ self.assertEqual(6, self.db.count_features_of_type(), dump)
+ self.assertEqual(1, len(merged), dump)
+ self.assertEqual(num_synthetic_overlap, len(merged[0].children), dump)
=====================================
gffutils/version.py
=====================================
@@ -1 +1 @@
-version="0.9"
+version="0.10.1"
=====================================
requirements.txt
=====================================
@@ -1,5 +1,5 @@
-pyfaidx
-six
-argh
-argcomplete
+pyfaidx>=0.5.5.2
+six>=1.12.0
+argh>=0.26.2
+argcomplete>=1.9.4
simplejson
=====================================
setup.cfg
=====================================
@@ -1,5 +1,4 @@
[egg_info]
tag_build =
tag_date = 0
-tag_svn_revision = 0
View it on GitLab: https://salsa.debian.org/med-team/python-gffutils/commit/22db24567b5c6f4809ea721fbfe575f1aee283c7