[med-svn] [Git][med-team/python-gffutils][upstream] New upstream version 0.10.1

Michael R. Crusoe gitlab at salsa.debian.org
Wed Jan 1 11:55:02 GMT 2020



Michael R. Crusoe pushed to branch upstream at Debian Med / python-gffutils


Commits:
22db2456 by Michael R. Crusoe at 2020-01-01T11:45:45Z
New upstream version 0.10.1
- - - - -


25 changed files:

- PKG-INFO
- gffutils.egg-info/PKG-INFO
- gffutils.egg-info/SOURCES.txt
- gffutils.egg-info/requires.txt
- gffutils/attributes.py
- gffutils/bins.py
- gffutils/biopython_integration.py
- gffutils/create.py
- gffutils/feature.py
- gffutils/helpers.py
- − gffutils/inspection.py
- gffutils/interface.py
- gffutils/iterators.py
- + gffutils/merge_criteria.py
- gffutils/parser.py
- gffutils/pybedtools_integration.py
- + gffutils/test/data/synthetic.gff3
- + gffutils/test/synth_test_base.py
- gffutils/test/test.py
- gffutils/test/test_biopython_integration.py
- + gffutils/test/test_merge.py
- + gffutils/test/test_merge_all.py
- gffutils/version.py
- requirements.txt
- setup.cfg


Changes:

=====================================
PKG-INFO
=====================================
@@ -1,6 +1,6 @@
 Metadata-Version: 1.1
 Name: gffutils
-Version: 0.9
+Version: 0.10.1
 Summary: Work with GFF and GTF files in a flexible database framework
 Home-page: https://github.com/daler/gffutils
 Author: Ryan Dale


=====================================
gffutils.egg-info/PKG-INFO
=====================================
@@ -1,6 +1,6 @@
 Metadata-Version: 1.1
 Name: gffutils
-Version: 0.9
+Version: 0.10.1
 Summary: Work with GFF and GTF files in a flexible database framework
 Home-page: https://github.com/daler/gffutils
 Author: Ryan Dale


=====================================
gffutils.egg-info/SOURCES.txt
=====================================
@@ -16,9 +16,9 @@ gffutils/feature.py
 gffutils/gffwriter.py
 gffutils/helpers.py
 gffutils/inspect.py
-gffutils/inspection.py
 gffutils/interface.py
 gffutils/iterators.py
+gffutils/merge_criteria.py
 gffutils/parser.py
 gffutils/pybedtools_integration.py
 gffutils/version.py
@@ -36,8 +36,11 @@ gffutils/test/feature_test.py
 gffutils/test/helpers_test.py
 gffutils/test/parser_test.py
 gffutils/test/performance_test.py
+gffutils/test/synth_test_base.py
 gffutils/test/test.py
 gffutils/test/test_biopython_integration.py
+gffutils/test/test_merge.py
+gffutils/test/test_merge_all.py
 gffutils/test/data/F3-unique-3.v2.gff
 gffutils/test/data/FBgn0031208.gff
 gffutils/test/data/FBgn0031208.gtf
@@ -67,6 +70,7 @@ gffutils/test/data/mouse_extra_comma.gff3
 gffutils/test/data/ncbi_gff3.txt
 gffutils/test/data/nonascii
 gffutils/test/data/random-chr.gff
+gffutils/test/data/synthetic.gff3
 gffutils/test/data/unsanitized.gff
 gffutils/test/data/wormbase_gff2.txt
 gffutils/test/data/wormbase_gff2_alt.txt
\ No newline at end of file


=====================================
gffutils.egg-info/requires.txt
=====================================
@@ -1,5 +1,5 @@
-pyfaidx
-six
-argh
-argcomplete
+pyfaidx>=0.5.5.2
+six>=1.12.0
+argh>=0.26.2
+argcomplete>=1.9.4
 simplejson


=====================================
gffutils/attributes.py
=====================================
@@ -1,11 +1,16 @@
 import six
 import collections
+
+try:
+    collectionsAbc = collections.abc
+except AttributeError:
+    collectionsAbc = collections
 from gffutils import constants
 
 
 # collections.MutableMapping is apparently the best way to provide dict-like
 # interface (http://stackoverflow.com/a/3387975)
-class Attributes(collections.MutableMapping):
+class Attributes(collectionsAbc.MutableMapping):
     def __init__(self, *args, **kwargs):
         """
         An Attributes object acts much like a dictionary.  However, values are
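
For context, the `collectionsAbc` shim above is the standard way to keep one
codebase importable on both Python 2 and Python 3.3+, where the abstract base
classes moved to `collections.abc` (access via plain `collections` was
deprecated and removed in Python 3.10). A minimal standalone sketch of the
same pattern, independent of gffutils::

    import collections

    try:
        # Python 3.3+: the ABCs live in collections.abc
        collectionsAbc = collections.abc
    except AttributeError:
        # Python 2: the ABCs are attributes of collections itself
        collectionsAbc = collections

    class ReadOnlyMapping(collectionsAbc.Mapping):
        """Dict-like wrapper; only the three abstract methods are needed."""
        def __init__(self, d):
            self._d = dict(d)

        def __getitem__(self, key):
            return self._d[key]

        def __iter__(self):
            return iter(self._d)

        def __len__(self):
            return len(self._d)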


=====================================
gffutils/bins.py
=====================================
@@ -53,6 +53,7 @@ OFFSETS = [
 # for BED (0-based, half-open) or GFF (1-based, closed intervals)
 COORD_OFFSETS = {'bed': 0, 'gff': 1}
 
+MAX_CHROM_SIZE = 2 ** 29
 
 def bins(start, stop, fmt='gff', one=True):
     """
@@ -72,6 +73,15 @@ def bins(start, stop, fmt='gff', one=True):
     fmt : 'gff' | 'bed'
         This specifies 1-based start coords (gff) or 0-based start coords (bed)
     """
+
+    # For very large coordinates, return 1 which is "somewhere on the
+    # chromosome".
+    if start >= MAX_CHROM_SIZE or stop >= MAX_CHROM_SIZE:
+        if one:
+            return 1
+        else:
+            return set([1])
+
     # Jump to highest resolution bin that will fit these coords (depending on
     # whether we have a BED or GFF-style coordinate).
     #
@@ -114,6 +124,7 @@ def bins(start, stop, fmt='gff', one=True):
         # larger bin size)
         start >>= NEXT_SHIFT
         stop >>= NEXT_SHIFT
+
     return bins
 
 
@@ -214,6 +225,12 @@ def test():
     assert bins(1, -1000, one=True) == 1
     assert bins(1, -1000, one=False) == set([1])
 
+    # Juuuust fits inside the max chrom size
+    assert bins(536870910, 536870911, one=True) == 8776
+
+    # Too big; falls back to 1.
+    assert bins(536870911, 536870912, one=True) == 1
+
 if __name__ == "__main__":
     print_bin_sizes()
     test()
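
For context, `bins()` implements the UCSC-style hierarchical binning scheme,
whose largest representable coordinate is 2 ** 29; the new MAX_CHROM_SIZE
guard makes out-of-range coordinates fall back to bin 1 ("somewhere on the
chromosome") instead of computing a bogus bin. A small usage sketch (the
coordinates are made up)::

    from gffutils import bins

    # A small interval maps to a fine-grained bin.
    print(bins.bins(1000, 2000, one=True))

    # With one=False, all bins a range query must check are returned,
    # which is what FeatureDB uses to build its SQL bin clause.
    print(sorted(bins.bins(1000, 2000, one=False)))

    # Coordinates at or past 2 ** 29 fall back to bin 1.
    print(bins.bins(2 ** 29, 2 ** 29 + 100, one=True))  # -> 1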


=====================================
gffutils/biopython_integration.py
=====================================
@@ -5,7 +5,6 @@ objects.
 import six
 try:
     from Bio.SeqFeature import SeqFeature, FeatureLocation
-    from Bio import SeqIO
 except ImportError:
     import warnings
     warnings.warn("BioPython must be installed to use this module")
@@ -79,4 +78,4 @@ def from_seqfeature(s, **kwargs):
     attributes.pop('seqid', '.')
     attributes.pop('frame', '.')
     return Feature(seqid, source, featuretype, start, stop, score, strand,
-                   frame, attributes, **kwargs)
+                   frame, attributes, id=id, **kwargs)
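
The second hunk here fixes a round-trip bug: `from_seqfeature()` now passes
the recovered `id` through to the `Feature` constructor. A sketch of the
round trip, assuming Biopython is installed (the feature values are
illustrative)::

    from gffutils.feature import Feature
    from gffutils.biopython_integration import (
        to_seqfeature, from_seqfeature)

    f = Feature(seqid='chr1', source='.', featuretype='gene',
                start=10, end=100, strand='+',
                attributes={'ID': ['gene1']}, id='gene1')

    s = to_seqfeature(f)      # Bio.SeqFeature.SeqFeature
    f2 = from_seqfeature(s)   # back to a gffutils.Feature

    # With the id=id fix above, the ID survives the round trip.
    assert f2.id == 'gene1'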


=====================================
gffutils/create.py
=====================================
@@ -48,7 +48,7 @@ def deprecation_handler(kwargs):
 
 class _DBCreator(object):
     def __init__(self, data, dbfn, force=False, verbose=False, id_spec=None,
-                 merge_strategy='merge', checklines=10, transform=None,
+                 merge_strategy='error', checklines=10, transform=None,
                  force_dialect_check=False, from_string=False, dialect=None,
                  default_encoding='utf-8',
                  disable_infer_genes=False,
@@ -97,7 +97,6 @@ class _DBCreator(object):
         self.disable_infer_genes = disable_infer_genes
         self.disable_infer_transcripts = disable_infer_transcripts
 
-        self._autoincrements = collections.defaultdict(int)
         if force:
             if os.path.exists(dbfn):
                 os.unlink(dbfn)
@@ -124,7 +123,10 @@ class _DBCreator(object):
             force_dialect_check=force_dialect_check, from_string=from_string,
             dialect=dialect
         )
-
+        if '_autoincrements' in kwargs:
+            self._autoincrements = kwargs['_autoincrements']
+        else:
+            self._autoincrements = collections.defaultdict(int)
 
     def set_verbose(self, verbose=None):
         if verbose == 'debug':
@@ -207,11 +209,12 @@ class _DBCreator(object):
             Raise error
 
         "warning"
-            Log a warning
+            Log a warning, which indicates that all future instances of the
+            same ID will be ignored
 
         "merge":
             Combine old and new attributes -- but only if everything else
-            matches; otherwise error.  This can be slow, but is thorough.
+            matches; otherwise error. This can be slow, but is thorough.
 
         "create_unique":
             Autoincrement based on the ID, always creating a new ID.
@@ -248,8 +251,8 @@ class _DBCreator(object):
                              % ([i.id for i in self._candidate_merges(f)]))
 
             # If force_merge_fields was provided, don't pay attention to these
-            # fields if they're different.  We are assuming attributes will be
-            # different, hence the [:-1]
+            # fields if they're different. We are assuming the attributes field
+            # will be different, hence the [:-1]
             _gffkeys_to_check = list(
                 set(constants._gffkeys[:-1])
                 .difference(self.force_merge_fields))
@@ -379,7 +382,7 @@ class _DBCreator(object):
     def _candidate_merges(self, f):
         """
         Identifies those features that originally had the same ID as `f`
-        (according to the id_spec),  but were modified because of duplicate
+        (according to the id_spec), but were modified because of duplicate
         IDs.
         """
         candidates = [self._get_feature(f.id)]
@@ -486,7 +489,7 @@ class _DBCreator(object):
         c.execute('CREATE INDEX seqidstartendstrand ON features (seqid, start, end, strand)')
 
         # speeds computation 1000x in some cases
-        logger.info("Running ANALYSE features")
+        logger.info("Running ANALYZE features")
         c.execute('ANALYZE features')
 
         self.conn.commit()
@@ -505,6 +508,7 @@ class _DBCreator(object):
         self._update_relations()
         self._finalize()
 
+    # TODO: not sure this is used anywhere
     def update(self, iterator):
         self._populate_from_lines(iterator)
         self._update_relations()
@@ -645,29 +649,29 @@ class _GFFDBCreator(_DBCreator):
         else:
             suffix = '.gffutils'
         tmp = tempfile.NamedTemporaryFile(delete=False, suffix=suffix).name
-        fout = open(tmp, 'w')
-
-        # Here we look for "grandchildren" -- for each ID, get the child
-        # (parenthetical subquery below); then for each of those get *its*
-        # child (main query below).
-        #
-        # Results are written to temp file so that we don't read and write at
-        # the same time, which would slow things down considerably.
-
-        c.execute('SELECT id FROM features')
-        for parent in c:
-            c2.execute('''
-                       SELECT child FROM relations WHERE parent IN
-                       (SELECT child FROM relations WHERE parent = ?)
-                       ''', tuple(parent))
-            for grandchild in c2:
-                fout.write('\t'.join((parent[0], grandchild[0])) + '\n')
-        fout.close()
+        with open(tmp, 'w') as fout:
+
+            # Here we look for "grandchildren" -- for each ID, get the child
+            # (parenthetical subquery below); then for each of those get *its*
+            # child (main query below).
+            #
+            # Results are written to temp file so that we don't read and write at
+            # the same time, which would slow things down considerably.
+
+            c.execute('SELECT id FROM features')
+            for parent in c:
+                c2.execute('''
+                           SELECT child FROM relations WHERE parent IN
+                           (SELECT child FROM relations WHERE parent = ?)
+                           ''', tuple(parent))
+                for grandchild in c2:
+                    fout.write('\t'.join((parent[0], grandchild[0])) + '\n')
 
         def relations_generator():
-            for line in open(fout.name):
-                parent, child = line.strip().split('\t')
-                yield dict(parent=parent, child=child, level=2)
+            with open(fout.name) as fin:
+                for line in fin:
+                    parent, child = line.strip().split('\t')
+                    yield dict(parent=parent, child=child, level=2)
 
         c.executemany(
             '''
@@ -720,7 +724,7 @@ class _GTFDBCreator(_DBCreator):
                     warnings.warn(
                         "It appears you have a transcript feature in your GTF "
                         "file. You may want to use the "
-                        "`disable_infer_transcripts` "
+                        "`disable_infer_transcripts=True` "
                         "option to speed up database creation")
                 elif (
                     f.featuretype == 'gene' and
@@ -729,7 +733,7 @@ class _GTFDBCreator(_DBCreator):
                     warnings.warn(
                         "It appears you have a gene feature in your GTF "
                         "file. You may want to use the "
-                        "`disable_infer_genes` "
+                        "`disable_infer_genes=True` "
                         "option to speed up database creation")
 
             lines_seen = i + 1
@@ -838,101 +842,64 @@ class _GTFDBCreator(_DBCreator):
             suffix = self._keep_tempfiles
         else:
             suffix = '.gffutils'
-        tmp = tempfile.NamedTemporaryFile(delete=False, suffix=suffix).name
-        fout = open(tmp, 'w')
-
-        self._tmpfile = tmp
 
-        # This takes some explanation...
-        #
-        # First, the nested subquery gets the level-1 parents of
-        # self.subfeature featuretypes.  For an on-spec GTF file,
-        # self.subfeature = "exon". So this subquery translates to getting the
-        # distinct level-1 parents of exons -- which are transcripts.
-        #
-        # OK, so this first subquery is now a list of transcripts; call it
-        # "firstlevel".
-        #
-        # Then join firstlevel on relations, but the trick is to now consider
-        # each transcript a *child* -- so that relations.parent (on the first
-        # line of the query) will be the first-level parent of the transcript
-        # (the gene).
-        #
-        #
-        # The result is something like:
-        #
-        #   transcript1     gene1
-        #   transcript2     gene1
-        #   transcript3     gene2
-        #
-        # Note that genes are repeated; below we need to ensure that only one
-        # is added.  To ensure this, the results are ordered by the gene ID.
-        #
-        # By the way, we do this even if we're only looking for transcripts or
-        # only looking for genes.
+        tmp = tempfile.NamedTemporaryFile(delete=False, suffix=suffix).name
+        with open(tmp, 'w') as fout:
+            self._tmpfile = tmp
+
+            # This takes some explanation...
+            #
+            # First, the nested subquery gets the level-1 parents of
+            # self.subfeature featuretypes.  For an on-spec GTF file,
+            # self.subfeature = "exon". So this subquery translates to getting the
+            # distinct level-1 parents of exons -- which are transcripts.
+            #
+            # OK, so this first subquery is now a list of transcripts; call it
+            # "firstlevel".
+            #
+            # Then join firstlevel on relations, but the trick is to now consider
+            # each transcript a *child* -- so that relations.parent (on the first
+            # line of the query) will be the first-level parent of the transcript
+            # (the gene).
+            #
+            #
+            # The result is something like:
+            #
+            #   transcript1     gene1
+            #   transcript2     gene1
+            #   transcript3     gene2
+            #
+            # Note that genes are repeated; below we need to ensure that only one
+            # is added.  To ensure this, the results are ordered by the gene ID.
+            #
+            # By the way, we do this even if we're only looking for transcripts or
+            # only looking for genes.
 
-        c.execute(
-            '''
-            SELECT DISTINCT firstlevel.parent, relations.parent
-            FROM (
-                SELECT DISTINCT parent
-                FROM relations
-                JOIN features ON features.id = relations.child
-                WHERE features.featuretype = ?
-                AND relations.level = 1
-            )
-            AS firstlevel
-            JOIN relations ON firstlevel.parent = child
-            WHERE relations.level = 1
-            ORDER BY relations.parent
-            ''', (self.subfeature,))
-
-        # Now we iterate through those results (using a new cursor) to infer
-        # the extent of transcripts and/or genes.
-
-        last_gene_id = None
-        n_features = 0
-        for transcript_id, gene_id in c:
-
-            if not self.disable_infer_transcripts:
-                # transcript extent
-                c2.execute(
-                    '''
-                    SELECT MIN(start), MAX(end), strand, seqid
-                    FROM features
-                    JOIN relations ON
-                    features.id = relations.child
-                    WHERE parent = ? AND featuretype == ?
-                    ''', (transcript_id, self.subfeature))
-                transcript_start, transcript_end, strand, seqid = c2.fetchone()
-                transcript_attributes = {
-                    self.transcript_key: [transcript_id],
-                    self.gene_key: [gene_id]
-                }
-                transcript_bin = bins.bins(
-                    transcript_start, transcript_end, one=True)
-
-                # Write out to file; we'll be reading it back in shortly.  Omit
-                # score, frame, source, and extra since they will always have
-                # the same default values (".", ".", "gffutils_derived", and []
-                # respectively)
-
-                fout.write('\t'.join(map(str, [
-                    transcript_id,
-                    seqid,
-                    transcript_start,
-                    transcript_end,
-                    strand,
-                    'transcript',
-                    transcript_bin,
-                    helpers._jsonify(transcript_attributes)
-                ])) + '\n')
-
-                n_features += 1
-
-            if not self.disable_infer_genes:
-                # Infer gene extent, but only if we haven't done so already
-                if gene_id != last_gene_id:
+            c.execute(
+                '''
+                SELECT DISTINCT firstlevel.parent, relations.parent
+                FROM (
+                    SELECT DISTINCT parent
+                    FROM relations
+                    JOIN features ON features.id = relations.child
+                    WHERE features.featuretype = ?
+                    AND relations.level = 1
+                )
+                AS firstlevel
+                JOIN relations ON firstlevel.parent = child
+                WHERE relations.level = 1
+                ORDER BY relations.parent
+                ''', (self.subfeature,))
+
+            # Now we iterate through those results (using a new cursor) to infer
+            # the extent of transcripts and/or genes.
+
+            last_gene_id = None
+            n_features = 0
+            for transcript_id, gene_id in c:
+
+                if not self.disable_infer_transcripts:
+                    # transcript extent
                     c2.execute(
                         '''
                         SELECT MIN(start), MAX(end), strand, seqid
@@ -940,26 +907,61 @@ class _GTFDBCreator(_DBCreator):
                         JOIN relations ON
                         features.id = relations.child
                         WHERE parent = ? AND featuretype == ?
-                        ''', (gene_id, self.subfeature))
-                    gene_start, gene_end, strand, seqid = c2.fetchone()
-                    gene_attributes = {self.gene_key: [gene_id]}
-                    gene_bin = bins.bins(gene_start, gene_end, one=True)
+                        ''', (transcript_id, self.subfeature))
+                    transcript_start, transcript_end, strand, seqid = c2.fetchone()
+                    transcript_attributes = {
+                        self.transcript_key: [transcript_id],
+                        self.gene_key: [gene_id]
+                    }
+                    transcript_bin = bins.bins(
+                        transcript_start, transcript_end, one=True)
+
+                    # Write out to file; we'll be reading it back in shortly.  Omit
+                    # score, frame, source, and extra since they will always have
+                    # the same default values (".", ".", "gffutils_derived", and []
+                    # respectively)
 
                     fout.write('\t'.join(map(str, [
-                        gene_id,
+                        transcript_id,
                         seqid,
-                        gene_start,
-                        gene_end,
+                        transcript_start,
+                        transcript_end,
                         strand,
-                        'gene',
-                        gene_bin,
-                        helpers._jsonify(gene_attributes)
+                        'transcript',
+                        transcript_bin,
+                        helpers._jsonify(transcript_attributes)
                     ])) + '\n')
 
-                last_gene_id = gene_id
-                n_features += 1
+                    n_features += 1
 
-        fout.close()
+                if not self.disable_infer_genes:
+                    # Infer gene extent, but only if we haven't done so already
+                    if gene_id != last_gene_id:
+                        c2.execute(
+                            '''
+                            SELECT MIN(start), MAX(end), strand, seqid
+                            FROM features
+                            JOIN relations ON
+                            features.id = relations.child
+                            WHERE parent = ? AND featuretype == ?
+                            ''', (gene_id, self.subfeature))
+                        gene_start, gene_end, strand, seqid = c2.fetchone()
+                        gene_attributes = {self.gene_key: [gene_id]}
+                        gene_bin = bins.bins(gene_start, gene_end, one=True)
+
+                        fout.write('\t'.join(map(str, [
+                            gene_id,
+                            seqid,
+                            gene_start,
+                            gene_end,
+                            strand,
+                            'gene',
+                            gene_bin,
+                            helpers._jsonify(gene_attributes)
+                        ])) + '\n')
+
+                    last_gene_id = gene_id
+                    n_features += 1
 
         def derived_feature_generator():
             """
@@ -967,17 +969,18 @@ class _GTFDBCreator(_DBCreator):
             """
             keys = ['parent', 'seqid', 'start', 'end', 'strand',
                     'featuretype', 'bin', 'attributes']
-            for line in open(fout.name):
-                d = dict(list(zip(keys, line.strip().split('\t'))))
-                d.pop('parent')
-                d['score'] = '.'
-                d['source'] = 'gffutils_derived'
-                d['frame'] = '.'
-                d['extra'] = []
-                d['attributes'] = helpers._unjsonify(d['attributes'])
-                f = feature.Feature(**d)
-                f.id = self._id_handler(f)
-                yield f
+            with open(fout.name) as fin:
+                for line in fin:
+                    d = dict(list(zip(keys, line.strip().split('\t'))))
+                    d.pop('parent')
+                    d['score'] = '.'
+                    d['source'] = 'gffutils_derived'
+                    d['frame'] = '.'
+                    d['extra'] = []
+                    d['attributes'] = helpers._unjsonify(d['attributes'])
+                    f = feature.Feature(**d)
+                    f.id = self._id_handler(f)
+                    yield f
 
         # Drop the indexes so the inserts are faster
         c.execute('DROP INDEX IF EXISTS relationsparent')
@@ -1009,7 +1012,8 @@ class _GTFDBCreator(_DBCreator):
         if not self._keep_tempfiles:
             os.unlink(fout.name)
 
-        # TODO: recreate indexes?
+        # TODO: recreate indexes? Typically the _finalize() method will be
+        # called after this one, and indexes are created in _finalize().
 
 
 def create_db(data, dbfn, id_spec=None, force=False, verbose=False,
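
Two behavior changes in this file are worth noting: the `_DBCreator` default
merge strategy is now 'error' rather than 'merge', so duplicate IDs fail
loudly unless a strategy is chosen explicitly, and the GTF warnings now spell
out the exact kwargs to pass. A usage sketch (the filenames are
placeholders)::

    import gffutils

    db = gffutils.create_db(
        'annotation.gtf',
        'annotation.db',
        # If the GTF already contains gene and transcript lines,
        # disabling inference speeds up database creation considerably:
        disable_infer_genes=True,
        disable_infer_transcripts=True,
        # Be explicit about duplicate-ID handling rather than relying
        # on the default:
        merge_strategy='create_unique',
    )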


=====================================
gffutils/feature.py
=====================================
@@ -1,6 +1,6 @@
 from pyfaidx import Fasta
 import six
-import simplejson
+import simplejson as json
 from gffutils import constants
 from gffutils import helpers
 from gffutils import parser
@@ -145,7 +145,7 @@ class Feature(object):
                 attributes = helpers._unjsonify(attributes, isattributes=True)
 
             # it's a string but not JSON: assume original attributes string.
-            except simplejson.JSONDecodeError:
+            except json.JSONDecodeError:
 
                 # But Feature.attributes is still a dict
                 attributes, _dialect = parser._split_keyvals(attributes)
@@ -159,7 +159,7 @@ class Feature(object):
         if isinstance(extra, six.string_types):
             try:
                 extra = helpers._unjsonify(extra)
-            except simplejson.JSONDecodeError:
+            except json.JSONDecodeError:
                 extra = extra.split('\t')
 
         self.seqid = seqid


=====================================
gffutils/helpers.py
=====================================
@@ -1,7 +1,7 @@
 import copy
 import sys
 import os
-import simplejson
+import simplejson as json
 import time
 import tempfile
 import six
@@ -256,16 +256,16 @@ def _bin_from_dict(d):
 def _jsonify(x):
     """Use most compact form of JSON"""
     if isinstance(x, dict_class):
-        return simplejson.dumps(x._d, separators=(',', ':'))
-    return simplejson.dumps(x, separators=(',', ':'))
+        return json.dumps(x._d, separators=(',', ':'))
+    return json.dumps(x, separators=(',', ':'))
 
 
 def _unjsonify(x, isattributes=False):
     """Convert JSON string to an ordered defaultdict."""
     if isattributes:
-        obj = simplejson.loads(x)
+        obj = json.loads(x)
         return dict_class(obj)
-    return simplejson.loads(x)
+    return json.loads(x)
 
 
 def _feature_to_fields(f, jsonify=True):
@@ -318,7 +318,7 @@ def merge_attributes(attr1, attr2):
     """
 
     new_d = copy.deepcopy(attr1)
-    new_d.update(attr2)
+    new_d.update(copy.deepcopy(attr2))
 
     #all of attr2 key : values just overwrote attr1, fix it
     for k, v in new_d.items():
@@ -330,7 +330,7 @@ def merge_attributes(attr1, attr2):
             if not isinstance(v, list):
                 v = [v]
             new_d[k].extend(v)
-    return dict((k, list(set(v))) for k, v in new_d.items())
+    return dict((k, sorted(set(v))) for k, v in new_d.items())
 
 
 def dialect_compare(dialect1, dialect2):
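
The `merge_attributes()` changes fix two subtle issues: deep-copying `attr2`
keeps the merged dict from sharing (and later mutating) the caller's value
lists, and `sorted(set(v))` makes the merged value order deterministic rather
than dependent on set iteration order. A sketch of the resulting behavior
(the attribute values are made up)::

    from gffutils.helpers import merge_attributes

    a = {'Parent': ['gene1'], 'tag': ['x']}
    b = {'Parent': ['gene1', 'gene0'], 'note': ['y']}

    merged = merge_attributes(a, b)
    # Values are de-duplicated and sorted, so the result is stable:
    # {'Parent': ['gene0', 'gene1'], 'tag': ['x'], 'note': ['y']}

    # The inputs are left untouched thanks to the deep copies.
    assert b == {'Parent': ['gene1', 'gene0'], 'note': ['y']}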


=====================================
gffutils/inspection.py deleted
=====================================
@@ -1,105 +0,0 @@
-from gffutils import iterators
-from gffutils import interface
-from collections import Counter
-import sys
-
-
-def inspect(data, look_for=['featuretype', 'chrom', 'attribute_keys',
-                            'feature_count'], limit=None, verbose=True):
-    """
-    Inspect a GFF or GTF data source.
-
-    This function is useful for figuring out the different featuretypes found
-    in a file (for potential removal before creating a FeatureDB).
-
-    Returns a dictionary with a key for each item in `look_for` and
-    a corresponding value that is a dictionary of how many of each unique item
-    were found.
-
-    There will always be a `feature_count` key, indicating how many features
-    were looked at (if `limit` is provided, then `feature_count` will be the
-    same as `limit`).
-
-    For example, if `look_for` is ['chrom', 'featuretype'], then the result
-    will be a dictionary like::
-
-        {
-            'chrom': {
-                'chr1': 500,
-                'chr2': 435,
-                'chr3': 200,
-                ...
-                ...
-            }.
-
-            'featuretype': {
-                'gene': 150,
-                'exon': 324,
-                ...
-            },
-
-            'feature_count': 5000
-
-        }
-
-
-    Parameters
-    ----------
-    data : str, FeatureDB instance, or iterator of Features
-        If `data` is a string, assume it's a GFF or GTF filename.  If it's
-        a FeatureDB instance, then its `all_features()` method will be
-        automatically called. Otherwise, assume it's an iterable of Feature
-        objects.
-
-    look_for : list
-        List of things to keep track of. Options are:
-
-            - any attribute of a Feature object, such as chrom, source, start,
-              stop, strand.
-
-            - "attribute_keys", which will look at all the individual
-              attribute keys of each feature
-
-    limit : int
-        Number of features to look at.  Default is no limit.
-
-    verbose : bool
-        Report how many features have been processed.
-
-    Returns
-    -------
-    dict
-    """
-
-    results = {}
-    obj_attrs = []
-    for i in look_for:
-        if i not in ['attribute_keys', 'feature_count']:
-            obj_attrs.append(i)
-        results[i] = Counter()
-
-    attr_keys = 'attribute_keys' in look_for
-
-    d = iterators.DataIterator(data)
-    feature_count = 0
-    for f in d:
-        if verbose:
-            sys.stderr.write('\r%s features inspected' % feature_count)
-            sys.stderr.flush()
-
-        for obj_attr in obj_attrs:
-            results[obj_attr].update([getattr(f, obj_attr)])
-
-        if attr_keys:
-            results['attribute_keys'].update(f.attributes.keys())
-
-        feature_count += 1
-        if limit and feature_count == limit:
-            break
-
-    new_results = {}
-    for k, v in results.items():
-        new_results[k] = dict(v)
-
-    new_results['feature_count'] = feature_count
-    return new_results
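
Note that only `gffutils/inspection.py` is removed; as the SOURCES.txt hunk
above shows, `gffutils/inspect.py` remains, and it provides the same
`inspect()` entry point documented in the deleted module. A usage sketch,
assuming the surviving module keeps that signature (the filename is a
placeholder)::

    from gffutils import inspect

    report = inspect.inspect('annotation.gff3', limit=1000, verbose=False)
    print(report['feature_count'])   # number of features examined
    print(report['featuretype'])     # e.g. {'gene': 150, 'exon': 324, ...}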


=====================================
gffutils/interface.py
=====================================
@@ -1,3 +1,4 @@
+import collections
 import os
 import six
 import sqlite3
@@ -6,10 +7,62 @@ import warnings
 from gffutils import bins
 from gffutils import helpers
 from gffutils import constants
+from gffutils import merge_criteria as mc
 from gffutils.feature import Feature
 from gffutils.exceptions import FeatureNotFoundError
 
 
+def assign_child(parent, child):
+    """
+    Helper for add_relation()
+    Sets 'Parent' attribute to parent['ID']
+
+    Parameters
+    ----------
+
+    parent : Feature
+        Parent Feature
+
+    child : Feature
+        Child Feature
+
+    Returns
+    -------
+
+    Child Feature
+    """
+    child.attributes['Parent'] = parent['ID']
+    return child
+
+
+# Reusable constant for FeatureDB.merge()
+no_children = tuple()
+
+def _finalize_merge(feature, feature_children):
+    """
+    Helper for FeatureDB.merge() to update source and assign children property
+
+    Parameters
+    ----------
+    feature : Feature
+        feature to finalise
+    
+    feature_children
+        list of children to assign
+    
+    Returns
+    -------
+    feature, modified
+    """
+    if len(feature_children) > 1:
+        feature.source = ','.join(set(child.source for child in feature_children))
+        feature.children = feature_children
+    else:
+        feature.children = no_children
+    return feature
+
 class FeatureDB(object):
     # docstring to be filled in for methods that call out to
     # helpers.make_query()
@@ -147,7 +200,7 @@ class FeatureDB(object):
             '''
             SELECT base, n FROM autoincrements
             ''')
-        self._autoincrements = dict(c)
+        self._autoincrements = collections.defaultdict(int, c)
 
         self.set_pragmas(pragmas)
 
@@ -604,17 +657,15 @@ class FeatureDB(object):
         # Only use bins if we have defined boundaries and completely_within is
         # True. Otherwise you can't know how far away a feature stretches
         # (which means bins are not computable ahead of time)
+        _bin_clause = ''
         if (start is not None) and (end is not None) and completely_within:
-            _bins = list(bins.bins(start, end, one=False))
-            # See issue #45
-            if len(_bins) < 900:
-                _bin_clause = ' or ' .join(['bin = ?' for _ in _bins])
-                _bin_clause = 'AND ( %s )' % _bin_clause
-                args += _bins
-            else:
-                _bin_clause = ''
-        else:
-            _bin_clause = ''
+            if start <= bins.MAX_CHROM_SIZE and end <= bins.MAX_CHROM_SIZE:
+                _bins = list(bins.bins(start, end, one=False))
+                # See issue #45
+                if len(_bins) < 900:
+                    _bin_clause = ' or ' .join(['bin = ?' for _ in _bins])
+                    _bin_clause = 'AND ( %s )' % _bin_clause
+                    args += _bins
 
         query = ' '.join([
             constants._SELECT,
@@ -662,11 +713,9 @@ class FeatureDB(object):
         Providing N features will return N - 1 new features.
 
         This method purposefully does *not* do any merging or sorting of
-        coordinates, so you may want to use :meth:`FeatureDB.merge` first.
-
-        The new features' attributes will be a merge of the neighboring
-        features' attributes.  This is useful if you have provided a list of
-        exons; the introns will then retain the transcript and/or gene parents.
+        coordinates, so you may want to use :meth:`FeatureDB.merge` first, or
+        when selecting features use the `order_by` kwarg, e.g.,
+        `db.features_of_type('gene', order_by=('seqid', 'start'))`.
 
         Parameters
         ----------
@@ -678,6 +727,14 @@ class FeatureDB(object):
             then the featuretypes will be constructed from the neighboring
             features, e.g., `inter_exon_exon`.
 
+        merge_attributes : bool
+            If True, new features' attributes will be a merge of the neighboring
+            features' attributes.  This is useful if you have provided a list of
+            exons; the introns will then retain the transcript and/or gene
+            parents as a single item. Otherwise, if False, the attribute will
+            be a comma-separated list of values, potentially listing the same
+            gene ID twice.
+
         attribute_func : callable or None
             If None, then nothing special is done to the attributes.  If
             callable, then the callable accepts two attribute dictionaries and
@@ -705,9 +762,21 @@ class FeatureDB(object):
             if new_featuretype is None:
                 new_featuretype = 'inter_%s_%s' % (
                     last_feature.featuretype, f.featuretype)
-            assert last_feature.strand == f.strand
-            assert last_feature.chrom == f.chrom
-            strand = last_feature.strand
+            if last_feature.strand != f.strand:
+                new_strand = '.'
+            else:
+                new_strand = f.strand
+
+            if last_feature.chrom != f.chrom:
+                # We've moved to a new chromosome.  For example, if we're
+                # getting intergenic regions from all genes, they will be on
+                # different chromosomes. We still assume sorted features, but
+                # don't complain if they're on different chromosomes -- just
+                # move on.
+                last_feature = f
+                continue
+
+            strand = new_strand
             chrom = last_feature.chrom
 
             # Shrink
@@ -748,6 +817,7 @@ class FeatureDB(object):
                     dialect = None
             yield self._feature_returner(**fields)
             interfeature_start = f.stop
+            last_feature = f
 
     def delete(self, features, make_backup=True, **kwargs):
         """
@@ -797,7 +867,19 @@ class FeatureDB(object):
 
     def update(self, data, make_backup=True, **kwargs):
         """
-        Update database with features in `data`.
+        Update the on-disk database with features in `data`.
+
+        WARNING: If you used any non-default kwargs for gffutils.create_db when
+        creating the database in the first place (especially
+        `disable_infer_transcripts` or `disable_infer_genes`) then you should
+        use those same arguments here.
+
+        The returned object is the same FeatureDB, but since it points to the
+        same database, which has just been updated, the new features can now
+        be accessed.
+
+        Parameters
+        ----------
 
         data : str, iterable, FeatureDB instance
             If FeatureDB, all data will be used. If string, assume it's
@@ -810,15 +892,15 @@ class FeatureDB(object):
             makes a copy of the existing database and saves it with a .bak
             extension.
 
-        Notes
-        -----
-        Other kwargs are used in the same way as in gffutils.create_db; see the
-        help for that function for details.
+        kwargs :
+            Additional kwargs are passed to gffutils.create_db; see the help
+            for that function for details.
 
         Returns
         -------
-        FeatureDB with updated features.
+        Returns self but with the underlying database updated.
         """
+
         from gffutils import create
         from gffutils import iterators
         if make_backup:
@@ -833,6 +915,7 @@ class FeatureDB(object):
 
         # Handle all sorts of input
         data = iterators.DataIterator(data, **_iterator_kwargs)
+        kwargs['_autoincrements'] = self._autoincrements
 
         if self.dialect['fmt'] == 'gtf':
             if 'id_spec' not in kwargs:
@@ -849,10 +932,18 @@ class FeatureDB(object):
         else:
             raise ValueError
 
+        peek, data._iter = iterators.peek(data._iter, 1)
+        if len(peek) == 0: return db  # If the file is empty then do nothing
+
         db._populate_from_lines(data)
         db._update_relations()
+
+        # Note that the autoincrements gets updated here
         db._finalize()
-        return db
+
+        # Read it back in directly from the stored autoincrements table
+        self._autoincrements.update(db._autoincrements)
+        return self
 
     def add_relation(self, parent, child, level, parent_func=None,
                      child_func=None):
@@ -1000,8 +1091,11 @@ class FeatureDB(object):
             ):
                 yield intron
 
-    def merge(self, features, ignore_strand=False):
+    def _old_merge(self, features, ignore_strand=False):
         """
+        DEPRECATED, only retained here for backwards compatibility. Please use
+        merge().
+
         Merge overlapping features together.
 
         Parameters
@@ -1095,6 +1189,197 @@ class FeatureDB(object):
             attributes='')
         yield self._feature_returner(**merged_feature)
 
+
+    def merge(self, features,
+              merge_criteria=(mc.seqid, mc.overlap_end_inclusive, mc.strand, mc.feature_type),
+              multiline=False):
+        """
+        Merge features matching criteria together
+
+
+        Returned Features have a special property called 'children' that is
+        a list of the component features.  This only exists for the lifetime of
+        the Feature instance.
+
+        Parameters
+        ----------
+        features : iterable
+            Iterable of Feature instances to merge
+
+        merge_criteria : list
+            List of merge criteria callbacks. All must evaluate to True in
+            order for a feature to be merged. See notes below on callback
+            signature.
+
+        multiline : bool
+            True to emit multiple features with the same ID attribute, False
+            otherwise.
+
+        Returns
+        -------
+        Generator yielding merged Features
+
+        Notes
+        -----
+
+        See the `gffutils.merge_criteria` module (imported here as `mc`) for
+        existing callback functions. For writing custom callbacks, functions
+        must have the following signature::
+
+            callback(
+                acc: gffutils.Feature,
+                cur: gffutils.Feature,
+                components: [gffutils.Feature]
+            ) -> bool
+
+        Where:
+
+            - `acc`: current accumulated feature
+            - `cur`: candidate feature to merge
+            - `components`: list of features that compose acc
+
+        The function should return True to merge `cur` into `acc`, False to set
+        `cur` to `acc` (that is, start a new merged feature).
+
+
+        If the merge criteria allow different feature types, then merged
+        features should have their featuretype property reassigned to a more
+        specific ontology value.
+        """
+        if not isinstance(merge_criteria, list):
+            try:
+                merge_criteria = list(merge_criteria)
+            except TypeError:
+                merge_criteria = [merge_criteria]
+
+        # To start, we create a merged feature of just the first feature.
+        features = iter(features)
+        last_id = None
+        current_merged = None
+        feature_children = []
+
+        for feature in features:
+            if current_merged is None:
+                if all(criteria(feature, feature, feature_children) for criteria in merge_criteria):
+                    current_merged = feature
+                    feature_children = [feature]
+                else:
+                    yield _finalize_merge(feature, no_children)
+                    last_id = None
+                continue
+
+            if len(feature_children) == 0:  # current_merged is last feature and unchecked
+                if all(criteria(current_merged, current_merged, feature_children) for criteria in merge_criteria):
+                    feature_children.append(current_merged)
+                else:
+                    yield _finalize_merge(current_merged, no_children)
+                    current_merged = feature
+                    last_id = None
+                    continue
+
+            if all(criteria(current_merged, feature, feature_children) for criteria in merge_criteria):
+                # Criteria satisfied, merge
+                # TODO Test multiline records and iron out the following code
+                # if multiline and (feature.start > current_merged.end + 1 or feature.end + 1 < current_merged.start):
+                #    # Feature is possibly multiline (discontiguous), keep ID but start new record
+                #    yield _finalize_merge(current_merged, feature_children)
+                #    current_merged = feature
+                #    feature_children = [feature]
+
+                if len(feature_children) == 1:
+                    # Current merged is only child and merge is going to occur, make copy
+                    current_merged = vars(current_merged).copy()
+                    del current_merged['attributes']
+                    del current_merged['extra']
+                    del current_merged['dialect']
+                    del current_merged['keep_order']
+                    del current_merged['sort_attribute_values']
+                    current_merged = self._feature_returner(**current_merged)
+                    if not last_id:
+                        # Generate unique ID for new Feature
+                        self._autoincrements[current_merged.featuretype] += 1
+                        last_id = current_merged.featuretype + '_' + str(
+                            self._autoincrements[current_merged.featuretype])
+                    current_merged['ID'] = last_id
+                    current_merged.id = last_id
+
+                feature_children.append(feature)
+
+                # Set mismatched properties to ambiguous values
+                if feature.seqid not in current_merged.seqid.split(','): current_merged.seqid += ',' + feature.seqid
+                if feature.strand != current_merged.strand: current_merged.strand = '.'
+                if feature.frame != current_merged.frame: current_merged.frame = '.'
+                if feature.featuretype != current_merged.featuretype: current_merged.featuretype = "sequence_feature"
+
+                if feature.start < current_merged.start:
+                    # Extends prior, so set a new start position
+                    current_merged.start = feature.start
+
+                if feature.end > current_merged.end:
+                    # Extends further, so set a new stop position
+                    current_merged.end = feature.end
+
+            else:
+                yield _finalize_merge(current_merged, feature_children)
+                current_merged = feature
+                feature_children = []
+                last_id = None
+
+        if current_merged:
+            yield _finalize_merge(current_merged, feature_children)
+
+    def merge_all(self,
+                  merge_order=('seqid', 'featuretype', 'strand', 'start'),
+                  merge_criteria=(mc.seqid, mc.overlap_end_inclusive, mc.strand, mc.feature_type),
+                  featuretypes_groups=(None,),
+                  exclude_components=False):
+        """
+        Merge all features in database according to criteria.
+        Component features will be assigned as children of the merged record.
+        The resulting records are added to the database.
+
+        Parameters
+        ----------
+        merge_order : list
+            Ordered list of columns with which to group features before evaluating criteria
+        merge_criteria : list
+            List of merge criteria callbacks. See merge().
+        featuretypes_groups : list
+            Iterable of sets of featuretypes to merge together
+        exclude_components : bool
+            If True, child features are discarded; if False, they are kept.
+
+        Returns
+        -------
+            list of merged features
+        """
+
+        if not len(featuretypes_groups):
+            # Can't be empty
+            featuretypes_groups = (None,)
+
+        result_features = []
+
+        # Merge features per featuregroup
+        for featuregroup in featuretypes_groups:
+            for merged in self.merge(self.all_features(featuretype=featuregroup, order_by=merge_order),
+                                merge_criteria=merge_criteria):
+                # If feature is result of merge
+                if merged.children:
+                    self._insert(merged, self.conn.cursor())
+                    if exclude_components:
+                        # Remove child features from DB
+                        self.delete(merged.children)
+                    else:
+                        # Add child relations to DB
+                        for child in merged.children:
+                            self.add_relation(merged, child, 1, child_func=assign_child)
+                    result_features.append(merged)
+                else:
+                    pass  # Do nothing, feature is already in DB
+
+        return result_features
+
     def children_bp(self, feature, child_featuretype='exon', merge=False,
                     ignore_strand=False):
         """


=====================================
gffutils/iterators.py
=====================================
@@ -115,6 +115,7 @@ class _FileIterator(_BaseIterator):
     Subclass for iterating over features provided as a filename
     """
     def open_function(self, data):
+        data = os.path.expanduser(data)
         if data.endswith('.gz'):
             import gzip
             return gzip.open(data)
@@ -130,7 +131,7 @@ class _FileIterator(_BaseIterator):
             self.current_item_number = i
 
             if line == '##FASTA' or line.startswith('>'):
-                raise StopIteration
+                return
 
             if line.startswith('##'):
                 self._directive_handler(line)
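
The `raise StopIteration` fix matters under PEP 479, which became the default
in Python 3.7: inside a generator, an explicit StopIteration is converted to
a RuntimeError rather than silently ending iteration, so a plain `return` is
required. The other fix simply expands `~` in filenames via
os.path.expanduser. A minimal illustration of the PEP 479 difference::

    from __future__ import generator_stop  # implicit on Python 3.7+

    def bad_gen():
        yield 1
        raise StopIteration   # becomes RuntimeError under PEP 479

    def good_gen():
        yield 1
        return                # cleanly ends the generator

    print(list(good_gen()))   # [1]
    # list(bad_gen()) raises "RuntimeError: generator raised StopIteration"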


=====================================
gffutils/merge_criteria.py
=====================================
@@ -0,0 +1,51 @@
+"""
+Merge criteria used by FeatureDB merge() and merge_all()
+"""
+from gffutils.feature import Feature
+
+
+def seqid(acc, cur, components):
+    return cur.seqid == acc.seqid
+
+
+def strand(acc, cur, components):
+    return acc.strand == cur.strand
+
+
+def feature_type(acc, cur, components):
+    return acc.featuretype == cur.featuretype
+
+
+def exact_coordinates_only(acc, cur, components):
+    return cur.start == acc.start and cur.end == acc.end
+
+
+def overlap_end_inclusive(acc, cur, components):
+    return acc.start <= cur.start <= acc.end + 1
+
+
+def overlap_start_inclusive(acc, cur, components):
+    return acc.start <= cur.end + 1 <= acc.end + 1
+
+
+def overlap_any_inclusive(acc, cur, components):
+    return acc.start <= cur.start <= acc.end + 1 or acc.start <= cur.end + 1 <= acc.end + 1
+
+
+def overlap_end_threshold(threshold):
+    def partial(acc, cur, components):
+        return acc.start <= cur.start <= acc.end + threshold
+    return partial
+
+
+def overlap_start_threshold(threshold):
+    def partial(acc, cur, components):
+        return acc.start - threshold <= cur.end + 1 <= acc.end + 1
+
+    return partial
+
+
+def overlap_any_threshold(threshold):
+    def partial(acc, cur, components):
+        return acc.start - threshold <= cur.end + 1 <= acc.end + 1 or acc.start <= cur.start <= acc.end + threshold
+    return partial
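
Each criterion is just a callable with the `(acc, cur, components)` signature
described in `merge()` above, so custom checks compose freely with these
built-ins; the threshold variants show the closure pattern to use for
parameterized criteria. A sketch of a custom criterion (the attribute name is
illustrative)::

    def same_gene_id(acc, cur, components):
        """Only merge features annotated with the same gene_id."""
        return (acc.attributes.get('gene_id', [])
                == cur.attributes.get('gene_id', []))

    # Then combine it with the built-ins:
    # db.merge(features, merge_criteria=(mc.seqid,
    #                                    mc.overlap_end_inclusive,
    #                                    same_gene_id))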


=====================================
gffutils/parser.py
=====================================
@@ -152,6 +152,8 @@ def _reconstruct(keyvals, dialect, keep_order=False,
 
                 # Typically "=" for GFF3 or " " otherwise
                 part = dialect['keyval separator'].join([key, val_str])
+            else:
+                part = key
         else:
             if dialect['fmt'] == 'gtf':
                 part = dialect['keyval separator'].join([key, '""'])
@@ -322,6 +324,7 @@ def _split_keyvals(keyval_str, dialect=None):
         # Pathological cases where values of a key have within them the key-val
         # separator, e.g.,
         #  Alias=SGN-M1347;ID=T0028;Note=marker name(s): T0028 SGN-M1347 |identity=99.58|escore=2e-126
+        #                                                                         ^            ^
         else:
             key = item[0]
             val = dialect['keyval separator'].join(item[1:])


=====================================
gffutils/pybedtools_integration.py
=====================================
@@ -2,11 +2,13 @@
 Module for integration with pybedtools
 """
 
+import os
 import pybedtools
 from pybedtools import featurefuncs
 from gffutils import helpers
 import six
 
+
 def to_bedtool(iterator):
     """
     Convert any iterator into a pybedtools.BedTool object.
@@ -22,7 +24,7 @@ def to_bedtool(iterator):
 
 
 def tsses(db, merge_overlapping=False, attrs=None, attrs_sep=":",
-          merge_kwargs=dict(o='distinct', s=True, c=4), as_bed6=False):
+          merge_kwargs=None, as_bed6=False, bedtools_227_or_later=True):
     """
     Create 1-bp transcription start sites for all transcripts in the database
     and return as a sorted pybedtools.BedTool object pointing to a temporary
@@ -74,13 +76,21 @@ def tsses(db, merge_overlapping=False, attrs=None, attrs_sep=":",
         attributes is supplied, e.g. ["gene_id", "transcript_id"], then these
         will be joined by `attr_join_sep` and then placed in the name field.
 
-
     attrs_sep: str
         If `as_bed6=True` or `merge_overlapping=True`, then use this character
         to separate attributes in the name field of the output BED. If also
         using `merge_overlapping=True`, you'll probably want this to be
         different than `merge_sep` in order to parse things out later.
 
+    bedtools_227_or_later : bool
+        In version 2.27, BEDTools changed the output for merge. By default,
+        this function expects BEDTools version 2.27 or later, but set this to
+        False to assume the older behavior.
+
+        For testing purposes, the environment variable
+        GFFUTILS_USES_BEDTOOLS_227_OR_LATER is set to either "true" or "false"
+        and is used to override this argument.
+
     Examples
     --------
 
@@ -146,7 +156,22 @@ def tsses(db, merge_overlapping=False, attrs=None, attrs_sep=":",
 
 
     """
-    _merge_kwargs = dict(o='distinct', s=True, c=4)
+    _override = os.environ.get('GFFUTILS_USES_BEDTOOLS_227_OR_LATER', None)
+    if _override is not None:
+        if _override == 'true':
+            bedtools_227_or_later = True
+        elif _override == 'false':
+            bedtools_227_or_later = False
+        else:
+            raise ValueError(
+                "Unknown value for GFFUTILS_USES_BEDTOOLS_227_OR_LATER "
+                "environment variable: {0}".format(_override))
+
+    if bedtools_227_or_later:
+        _merge_kwargs = dict(o='distinct', s=True, c='4,5,6')
+    else:
+        _merge_kwargs = dict(o='distinct', s=True, c='4')
+
     if merge_kwargs is not None:
         _merge_kwargs.update(merge_kwargs)
 
@@ -195,18 +220,18 @@ def tsses(db, merge_overlapping=False, attrs=None, attrs_sep=":",
         x = x.each(to_bed).saveas()
 
     if merge_overlapping:
-
-        def fix_merge(f):
-            f = featurefuncs.extend_fields(f, 6)
-            return pybedtools.Interval(
-                f.chrom,
-                f.start,
-                f.stop,
-                f[4],
-                '.',
-                f[3])
-
-        x = x.merge(**_merge_kwargs).each(fix_merge).saveas()
-
+        if bedtools_227_or_later:
+            x = x.merge(**_merge_kwargs)
+        else:
+            def fix_merge(f):
+                f = featurefuncs.extend_fields(f, 6)
+                return pybedtools.Interval(
+                    f.chrom,
+                    f.start,
+                    f.stop,
+                    f[4],
+                    '.',
+                    f[3])
+            x = x.merge(**_merge_kwargs).saveas().each(fix_merge).saveas()
 
     return x
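
The new `bedtools_227_or_later` flag tracks the change in BEDTools 2.27 merge
output columns, and the GFFUTILS_USES_BEDTOOLS_227_OR_LATER environment
variable (exactly "true" or "false") overrides it, which is what the test
suite uses. A usage sketch (the database path is a placeholder)::

    import gffutils
    from gffutils.pybedtools_integration import tsses

    db = gffutils.FeatureDB('annotation.db')

    # Merged, strand-aware TSSes; pass bedtools_227_or_later=False when
    # an older BEDTools is installed:
    tss = tsses(db, merge_overlapping=True, as_bed6=True,
                bedtools_227_or_later=False)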


=====================================
gffutils/test/data/synthetic.gff3
=====================================
@@ -0,0 +1,19 @@
+##gff-version 3
+seq1	synthetic1	sequence_feature	5	15	.	.	.	ID=basic1
+seq1	synthetic1	sequence_feature	5	15	.	.	.	ID=discontig1
+seq1	synthetic1	sequence_feature	20	30	.	.	.	ID=discontig1
+seq1	synthetic1	sequence_feature	5	15	.	.	.	ID=perfect_overlap1
+seq1	synthetic1	sequence_feature	16	20	.	.	.	ID=adjacent1
+seq1	synthetic1	sequence_feature	16	16	.	.	.	ID=adjacent2
+seq1	synthetic1	sequence_feature	1	4	.	.	.	ID=adjacent3
+seq1	synthetic1	sequence_feature	4	4	.	.	.	ID=adjacent4
+seq1	synthetic1	sequence_feature	15	20	.	.	.	ID=overlap_end1
+seq1	synthetic1	sequence_feature	1	5	.	.	.	ID=overlap_start1
+seq1	synthetic1	sequence_feature	10	20	.	.	.	ID=overlap_past_end1
+seq1	synthetic1	sequence_feature	10	20	.	+	.	ID=strand_plus1
+seq1	synthetic1	sequence_feature	10	20	.	-	.	ID=strand_minus1
+seq1	synthetic1	sequence_feature	35	40	.	.	.	ID=no_overlap1
+seq1	synthetic1	sequence_feature	21	29	.	.	.	ID=no_overlap2
+seq1	synthetic1	misc_feature	1	10	.	.	.	ID=different_type1
+seq1	synthetic2	sequence_feature	1	10	.	.	.	ID=different_source1
+seq2	synthetic1	sequence_feature	1	10	.	.	.	ID=different_seqid1
\ No newline at end of file


=====================================
gffutils/test/synth_test_base.py
=====================================
@@ -0,0 +1,14 @@
+from unittest import TestCase
+import gffutils
+synthetic_path = gffutils.example_filename('synthetic.gff3')
+num_synthetic_features = 18
+num_synthetic_overlap = 13
+
+
+class TestWithSynthDB(TestCase):
+    def setUp(self):
+        self.db = gffutils.create_db(synthetic_path, ":memory:", merge_strategy='create_unique')  # type: gffutils.FeatureDB
+        self.assertEqual(num_synthetic_features, self.db.count_features_of_type())
+
+    def _dump_db(self):
+        return '\n'.join(map(str, self.db.all_features()))


=====================================
gffutils/test/test.py
=====================================
@@ -120,7 +120,7 @@ def test_update():
         # Add attribute to each gene record
         rec = gene_recs[0]
         rec.attributes["new"] = ["new_value"]
-        db.update([rec])
+        db.update([rec], merge_strategy='replace')
         num_entries += 1
     print(list(db.all_features()))
 
@@ -621,6 +621,7 @@ def test_create_db_from_url():
 
     # Serving test/data folder
     served_folder = gffutils.example_filename('')
+    savedir = os.getcwd()
     os.chdir(served_folder)
 
     print("Starting SimpleHTTPServer in thread")
@@ -654,6 +655,7 @@ def test_create_db_from_url():
         print('Server shutdown.')
         httpd.shutdown()
         server_thread.join()
+        os.chdir(savedir)
 
 
 def test_empty_files():
@@ -880,7 +882,7 @@ def test_tempfiles():
     assert len(filelist) == 1, filelist
     assert filelist[0].endswith('.gffutils')
 
-    #...and another one for gff. This time, make sure the suffix 
+    #...and another one for gff. This time, make sure the suffix
     db = gffutils.create_db(
         gffutils.example_filename('FBgn0031208.gff'), ':memory:', _keep_tempfiles=True)
     filelist = os.listdir(tempdir)
@@ -1006,7 +1008,7 @@ def test_disable_infer():
 
 
 def test_deprecation_handler():
-    return 
+    return
 
     # TODO: when infer_gene_extent actually gets deprecated, test here.
     assert_raises(ValueError, gffutils.create_db,
@@ -1141,7 +1143,7 @@ def test_unreasonable_unquoting():
     assert f.attributes['null'][0] == '\x00'
     assert f.attributes['comma'][0] == ','
 
-    # Commas indicate 
+    # Commas indicate
     assert f.attributes['Parent'] == ['A,', 'B%', 'C']
     assert str(f) == s
 
@@ -1174,6 +1176,145 @@ def test_db_unquoting():
     assert db['e']['Note'] == [',']
     assert db['f']['Note'] == [',']
 
+
+def test_issue_105():
+    fn = gffutils.example_filename('FBgn0031208.gtf')
+    home = os.path.expanduser('~')
+    newfn = os.path.join(home, '.gffutils.test')
+    with open(newfn, 'w') as fout:
+        fout.write(open(fn).read())
+    f = gffutils.iterators.DataIterator(newfn)
+    for i in f:
+        pass
+    os.unlink(newfn)
+
+
+def test_issue_107():
+    s = dedent(
+        '''
+        chr1\t.\tgene\t10\t15\t.\t+\t.\tID=b;
+        chr1\t.\tgene\t1\t5\t.\t-\t.\tID=a;
+        chr2\t.\tgene\t25\t50\t.\t-\t.\tID=c;
+        chr2\t.\tgene\t55\t60\t.\t-\t.\tID=d;
+        ''')
+    tmp = tempfile.NamedTemporaryFile(delete=False).name
+    with open(tmp, 'w') as fout:
+        fout.write(s + '\n')
+    db = gffutils.create_db(tmp, ':memory:')
+    interfeatures = list(db.interfeatures(
+        db.features_of_type('gene', order_by=('seqid', 'start'))))
+    assert [str(i) for i in interfeatures] == [
+            'chr1\tgffutils_derived\tinter_gene_gene\t6\t9\t.\t.\t.\tID=a,b;',
+            'chr2\tgffutils_derived\tinter_gene_gene\t16\t54\t.\t-\t.\tID=c,d;',
+    ]
+
+
+def test_issue_119():
+    # First file has these two exons with no ID:
+    #
+    #   chr2L FlyBase exon  8193  8589  .  +  .  Parent=FBtr0300690
+    #   chr2L FlyBase exon  7529  8116  .  +  .  Name=CG11023:1;Parent=FBtr0300689,FBtr0300690
+    #
+    db0  = gffutils.create_db(gffutils.example_filename('FBgn0031208.gff'),':memory:')
+
+    # And this one, a bunch of reads with no IDs anywhere
+    db1 = gffutils.create_db(gffutils.example_filename('F3-unique-3.v2.gff'),':memory:')
+
+    # When db1 is updated by db0
+    db2 = db1.update(db0)
+    assert (
+        db2._autoincrements
+        == db1._autoincrements
+        == {'exon': 2, 'read': 112}
+    ), db2._autoincrements
+
+
+    assert len(list(db0.features_of_type('exon'))) == 6
+
+    # Now we update that with db0 again
+    db3 = db2.update(db0, merge_strategy='replace')
+
+    # Using the "replace" strategy, we should have only gotten another 2 exons
+    assert len(list(db3.features_of_type('exon'))) == 8
+
+    # Make sure that the autoincrements for exons jumped by 2
+    assert (
+        db2._autoincrements
+        == db3._autoincrements
+        == {'exon': 4, 'read': 112}
+    ), db2._autoincrements
+
+    # More isolated test, merging two databases each created from the same file
+    # which itself contains only a single feature with no ID.
+    tmp = tempfile.NamedTemporaryFile(delete=False).name
+    with open(tmp, 'w') as fout:
+        fout.write('chr1\t.\tgene\t10\t15\t.\t+\t.\t\n')
+
+    db4 = gffutils.create_db(tmp, tmp + '.db')
+    db5 = gffutils.create_db(tmp, ':memory:')
+
+    assert db4._autoincrements == {'gene': 1}
+    assert db5._autoincrements == {'gene': 1}
+
+    db6 = db4.update(db5)
+
+    db7 = gffutils.FeatureDB(db4.dbfn)
+
+    # both db4 and db6 should now have the same, updated autoincrements because
+    # they both point to the same db.
+    assert db6._autoincrements == db4._autoincrements == {'gene': 2}
+
+    # But db5 was created independently and should have unchanged autoincrements
+    assert db5._autoincrements == {'gene': 1}
+
+    # db7 was created from the database pointed to by both db4 and db6. This
+    # tests that when a FeatureDB is created it should have the
+    # correctly-updated autoincrements read from the db
+    assert db7._autoincrements == {'gene': 2}
+
+def test_pr_131():
+    db  = gffutils.create_db(gffutils.example_filename('FBgn0031208.gff'),':memory:')
+
+    # previously would raise ValueError("No lines parsed -- was an empty
+    # file provided?"); now just does nothing
+    db2 = db.update([])
+
+
+def test_pr_133():
+    # Previously, merge_attributes would not deep-copy the values from the
+    # second dict, and when the values are then modified, the second dict is
+    # unintentionally modified.
+    d1 = {'a': [1]}
+    d2 = {'a': [2]}
+    d1a = {'a': [1]}
+    d2a = {'a': [2]}
+    d3 = gffutils.helpers.merge_attributes(d1, d2)
+    assert d1 == d1a, d1
+    assert d2 == d2a, d2
+
+
+def test_pr_139():
+    db  = gffutils.create_db(gffutils.example_filename('FBgn0031208.gff'),':memory:')
+    exons = list(db.features_of_type('exon'))
+    inter = list(db.interfeatures(exons))
+
+    # previously, the first exon's attributes would show up in subsequent merged features
+    assert exons[0].attributes['Name'][0] not in inter[1].attributes['Name']
+    assert exons[0].attributes['Name'][0] not in inter[2].attributes['Name']
+    assert exons[0].attributes['Name'][0] not in inter[3].attributes['Name']
+
+
+def test_pr_144():
+    # previously this would fail with:
+    #   UnboundLocalError: local variable 'part' referenced before assignment
+    f = gffutils.Feature(attributes={'a': ['']})
+
+    # Make sure everything got converted correctly
+    assert f.attributes['a'] == ['']
+    assert str(f) == ".	.	.	.	.	.	.	.	a"
+    g = gffutils.feature.feature_from_line(str(f))
+    assert g == f
+
 if __name__ == "__main__":
     # this test case fails
     #test_attributes_modify()
@@ -1181,3 +1322,6 @@ if __name__ == "__main__":
     #test_random_chr()
     #test_nonascii()
     test_iterator_update()
+
+
+

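test_issue_107 above doubles as a compact spec for interfeatures(): each derived feature covers the gap between consecutive input features (genes at 1-5 and 10-15 yield 6-9), and strand is preserved only when both neighbours agree, which is why the chr1 record gets '.' while the chr2 record keeps '-'. The call shape, sketched against a hypothetical genes.gff3:

    import gffutils

    db = gffutils.create_db('genes.gff3', ':memory:')  # hypothetical input file
    genes = db.features_of_type('gene', order_by=('seqid', 'start'))
    for gap in db.interfeatures(genes):
        print(gap)  # inter_gene_gene features spanning the gaps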

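Similarly, test_pr_133 pins down the contract of helpers.merge_attributes(): combine the values without mutating either input. For reference, a sketch (treat the ordering within the merged list as an implementation detail):

    from gffutils.helpers import merge_attributes

    d1, d2 = {'a': [1]}, {'a': [2]}
    d3 = merge_attributes(d1, d2)
    assert sorted(d3['a']) == [1, 2]                # values combined...
    assert d1 == {'a': [1]} and d2 == {'a': [2]}    # ...inputs left untouched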
=====================================
gffutils/test/test_biopython_integration.py
=====================================
@@ -14,5 +14,6 @@ def test_roundtrip():
     s = bp.to_seqfeature(feature)
     assert s.location.start.position == feature.start - 1
     assert s.location.end.position == feature.stop
+    assert s.id == feature.id
     f = bp.from_seqfeature(s, dialect=dialect, keep_order=True)
     assert feature == f


=====================================
gffutils/test/test_merge.py
=====================================
@@ -0,0 +1,104 @@
+from copy import deepcopy
+
+from gffutils import merge_criteria as mc
+from gffutils.test.synth_test_base import TestWithSynthDB, num_synthetic_features, num_synthetic_overlap
+
+
+class TestMerge(TestWithSynthDB):
+    def _merge_and_compare(self, ids, expected_merge_children, **kwargs):
+        features = [self.db[id] for id in ids]
+        features_backup = deepcopy(features)
+        merged = list(self.db.merge(features, **kwargs))
+        dump = self._dump_db()
+        m_dump = '\n'.join(map(str, merged))
+
+        # Ensure db wasn't modified
+        self.assertEqual(num_synthetic_features, self.db.count_features_of_type(), dump)
+        for expected, actual in zip(features_backup, ids):
+            self.assertEqual(expected, self.db[actual], '\n'.join(map(str, [expected, self.db[actual]])) + '\n')
+
+        # Ensure input Features not modified
+        for expected, actual in zip(features_backup, features):
+            self.assertEqual(expected, actual, '\n'.join(map(str, [expected, actual])) + '\n')
+
+        # Ensure output matches expected
+        self.assertEqual(len(expected_merge_children), len(merged), m_dump)
+        for expected, actual in zip(expected_merge_children, merged):
+            c_dump = '\n'.join(map(str, actual.children)) + '\n' + str(actual) + '\n'
+            self.assertEqual(len(expected), len(actual.children), '\n'.join(map(str, [expected, actual])) + '\n')
+            self.assertTrue(all(child.id in expected for child in actual.children), c_dump)
+
+            if actual.strand != '.':
+                self.assertTrue(all(f.strand == actual.strand for f in actual.children), c_dump)
+
+            if actual.frame != '.':
+                self.assertTrue(all(f.frame == actual.frame for f in actual.children), c_dump)
+
+            if expected:
+                self.assertEqual(actual.start, min(f.start for f in actual.children), c_dump)
+                self.assertEqual(actual.end, max(f.end for f in actual.children), c_dump)
+                if all(f.seqid == actual.children[0].seqid for f in actual.children):
+                    self.assertEqual(actual.seqid, actual.children[0].seqid, c_dump)
+                else:
+                    self.assertEqual(set(f.seqid for f in actual.children), set(actual.seqid.split(',')), c_dump)
+
+                if all(f.featuretype == actual.children[0].featuretype for f in actual.children):
+                    self.assertTrue(all(f.featuretype == actual.featuretype for f in actual.children), c_dump)
+                else:
+                    self.assertEqual('sequence_feature', actual.featuretype, c_dump)
+
+        return merged
+
+    def test_defaults(self):
+        merged = list(self.db.merge(self.db.all_features(order_by=('seqid', 'featuretype', 'strand', 'start'))))
+        dump = '\n'.join(map(str, merged))
+        self.assertEqual(num_synthetic_features, self.db.count_features_of_type(), dump)
+        self.assertEqual(6, len(merged), dump)
+        merged_count = 0
+        for f in merged:
+            if f.children:
+                self.assertEqual(num_synthetic_overlap, len(f.children), dump)
+                merged_count += 1
+
+        self.assertEqual(1, merged_count)
+
+    def test_none(self):
+        merged = list(self.db.merge([]))
+        dump = self._dump_db()
+        self.assertEqual(num_synthetic_features, self.db.count_features_of_type(), dump)
+        self.assertEqual(0, len(merged))
+
+    def test_one(self):
+        self._merge_and_compare(['basic1'], [[]])
+
+    def test_no_overlap(self):
+        self._merge_and_compare(['basic1', 'no_overlap1'], [[],[]])
+
+    def test_perfect_overlap(self):
+        self._merge_and_compare(['basic1', 'perfect_overlap1'], [['basic1', 'perfect_overlap1']])
+
+    def test_overlap(self):
+        self._merge_and_compare(['basic1', 'adjacent1'], [['basic1', 'adjacent1']])
+
+    def test_any_overlap(self):
+        self._merge_and_compare(['basic1', 'overlap_start1'], [['basic1', 'overlap_start1']], merge_criteria=(mc.seqid, mc.overlap_any_inclusive, mc.strand, mc.feature_type))
+        self._merge_and_compare(['overlap_start1', 'basic1'], [['basic1', 'overlap_start1']])
+        self._merge_and_compare(['adjacent1', 'basic1'], [['basic1', 'adjacent1']], merge_criteria=(mc.seqid, mc.overlap_any_inclusive, mc.strand, mc.feature_type))
+        self._merge_and_compare(['adjacent4', 'basic1'], [['basic1', 'adjacent4']], merge_criteria=(mc.seqid, mc.overlap_any_inclusive, mc.strand, mc.feature_type))
+
+    def test_adjacent_len1(self):
+        self._merge_and_compare(['basic1', 'adjacent2'], [['basic1', 'adjacent2']])
+        self._merge_and_compare(['adjacent2', 'basic1'], [['basic1', 'adjacent2']], merge_criteria=(mc.seqid, mc.overlap_any_inclusive, mc.strand, mc.feature_type))
+        self._merge_and_compare(['basic1', 'adjacent3'], [['basic1', 'adjacent3']], merge_criteria=(mc.seqid, mc.overlap_any_inclusive, mc.strand, mc.feature_type))
+        self._merge_and_compare(['adjacent3', 'basic1'], [['basic1', 'adjacent3']])
+
+    def test_end_overlap(self):
+        self._merge_and_compare(['basic1', 'overlap_end1'], [['basic1', 'overlap_end1']])
+
+    # TODO test various feature orders
+
+    # TODO test various merge criteria
+
+    # TODO test multiline, this doesn't really work until gffutils better supports multiline features
+
+    # TODO add test for PR #152

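The criteria tuples threaded through these tests are the heart of the new API: db.merge() fuses consecutive features only while every callable in merge_criteria accepts the candidate. A minimal sketch against the synthetic fixtures, using the same criteria the tests pass in (overlap_start1 at 1-5 touches basic1 at 5-15, so they fuse into a single 1-15 parent):

    import gffutils
    from gffutils import merge_criteria as mc

    db = gffutils.create_db(gffutils.example_filename('synthetic.gff3'),
                            ':memory:', merge_strategy='create_unique')
    merged = list(db.merge(
        [db['overlap_start1'], db['basic1']],
        merge_criteria=(mc.seqid, mc.overlap_any_inclusive,
                        mc.strand, mc.feature_type)))
    print(merged[0].start, merged[0].end)  # expected: 1 15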

=====================================
gffutils/test/test_merge_all.py
=====================================
@@ -0,0 +1,53 @@
+from gffutils.test.synth_test_base import TestWithSynthDB, num_synthetic_features, num_synthetic_overlap
+
+
+class TestMerge_all(TestWithSynthDB):
+    def test_defaults(self):
+        merged = self.db.merge_all()
+        dump = self._dump_db()
+        self.assertEqual(num_synthetic_features + 1, self.db.count_features_of_type(), dump)
+        self.assertEqual(1, len(merged), dump)
+        self.assertEqual(num_synthetic_overlap, len(merged[0].children), dump)
+
+    def test_empty(self):
+        self.db.delete(self.db.all_features())
+        dump = self._dump_db()
+        self.assertEqual(0, self.db.count_features_of_type(), dump)
+        merged = self.db.merge_all()
+        dump = self._dump_db()
+        self.assertEqual(0, self.db.count_features_of_type(), dump)
+        self.assertEqual(0, len(merged), dump)
+
+    def test_one(self):
+        features = self.db.all_features()
+        next(features)
+        self.db.delete(features)
+        dump = self._dump_db()
+        self.assertEqual(1, self.db.count_features_of_type(), dump)
+        merged = self.db.merge_all()
+        dump = self._dump_db()
+        self.assertEqual(1, self.db.count_features_of_type(), dump)
+        self.assertEqual(0, len(merged), dump)
+
+    def test_no_overlap(self):
+        self.db.delete(f for f in self.db.all_features() if f.id not in ('basic1', 'no_overlap1'))
+        dump = self._dump_db()
+        self.assertEqual(2, self.db.count_features_of_type(), dump)
+        merged = self.db.merge_all()
+        dump = self._dump_db()
+        self.assertEqual(2, self.db.count_features_of_type(), dump)
+        self.assertEqual(0, len(merged), dump)
+
+    def test_merge_groups(self):
+        merged = self.db.merge_all(featuretypes_groups=({'sequence_feature', 'misc_feature'},))
+        dump = self._dump_db()
+        self.assertEqual(num_synthetic_features + 1, self.db.count_features_of_type(), dump)
+        self.assertEqual(1, len(merged), dump)
+        self.assertEqual(num_synthetic_overlap, len(merged[0].children), dump)
+
+    def test_exclude_components(self):
+        merged = self.db.merge_all(exclude_components=True)
+        dump = self._dump_db()
+        self.assertEqual(6, self.db.count_features_of_type(), dump)
+        self.assertEqual(1, len(merged), dump)
+        self.assertEqual(num_synthetic_overlap, len(merged[0].children), dump)

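merge_all() differs from merge() in that it writes each merged parent back into the database, which is why the counts above grow by one, and exclude_components=True additionally deletes the merged-away children. Under the same fixture assumptions:

    import gffutils

    db = gffutils.create_db(gffutils.example_filename('synthetic.gff3'),
                            ':memory:', merge_strategy='create_unique')
    merged = db.merge_all(exclude_components=True)
    # 13 overlapping fixtures collapse into one stored parent,
    # leaving 18 - 13 + 1 = 6 features in the database
    print(len(merged), db.count_features_of_type())  # expected: 1 6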

=====================================
gffutils/version.py
=====================================
@@ -1 +1 @@
-version="0.9"
+version="0.10.1"


=====================================
requirements.txt
=====================================
@@ -1,5 +1,5 @@
-pyfaidx
-six
-argh
-argcomplete
+pyfaidx>=0.5.5.2
+six>=1.12.0
+argh>=0.26.2
+argcomplete>=1.9.4
 simplejson


=====================================
setup.cfg
=====================================
@@ -1,5 +1,4 @@
 [egg_info]
 tag_build = 
 tag_date = 0
-tag_svn_revision = 0
 



View it on GitLab: https://salsa.debian.org/med-team/python-gffutils/commit/22db24567b5c6f4809ea721fbfe575f1aee283c7
