[med-svn] [Git][med-team/python-bcbio-gff][upstream] New upstream version 0.6.7
Étienne Mollier (@emollier)
gitlab at salsa.debian.org
Sat Oct 9 20:40:19 BST 2021
Étienne Mollier pushed to branch upstream at Debian Med / python-bcbio-gff
Commits:
0195e227 by Étienne Mollier at 2021-10-09T21:21:38+02:00
New upstream version 0.6.7
- - - - -
30 changed files:
- BCBio/GFF/GFFParser.py
- BCBio/GFF/__init__.py
- PKG-INFO
- + Scripts/gff/access_gff_index.py
- + Scripts/gff/genbank_to_gff.py
- + Scripts/gff/gff2_to_gff3.py
- + Scripts/gff/gff_to_biosql.py
- + Scripts/gff/gff_to_genbank.py
- + Tests/GFF/F3-unique-3.v2.gff
- + Tests/GFF/c_elegans_WS199_ann_gff.txt
- + Tests/GFF/c_elegans_WS199_dna_shortened.fa
- + Tests/GFF/c_elegans_WS199_shortened_gff.txt
- + Tests/GFF/ensembl_gtf.txt
- + Tests/GFF/glimmer_nokeyval.gff3
- + Tests/GFF/hybrid1.gff3
- + Tests/GFF/jgi_gff2.txt
- + Tests/GFF/mouse_extra_comma.gff3
- + Tests/GFF/ncbi_gff3.txt
- + Tests/GFF/problem_sequence_region.gff3
- + Tests/GFF/spaces.gff3
- + Tests/GFF/trans_splicing.gff3
- + Tests/GFF/transcripts.gff3
- + Tests/GFF/unescaped-semicolon.gff3
- + Tests/GFF/wormbase_gff2.txt
- + Tests/GFF/wormbase_gff2_alt.txt
- + Tests/test_GFFSeqIOFeatureAdder.py
- bcbio_gff.egg-info/PKG-INFO
- bcbio_gff.egg-info/SOURCES.txt
- bcbio_gff.egg-info/requires.txt
- setup.py
Changes:
=====================================
BCBio/GFF/GFFParser.py
=====================================
@@ -19,8 +19,10 @@ import os
import copy
import re
import collections
+import io
import itertools
import warnings
+import six
from six.moves import urllib
# Make defaultdict compatible with versions of python older than 2.4
try:
@@ -34,8 +36,10 @@ from Bio.SeqRecord import SeqRecord
from Bio import SeqFeature
from Bio import SeqIO
from Bio import BiopythonDeprecationWarning
+
warnings.simplefilter("ignore", BiopythonDeprecationWarning)
+
def _gff_line_map(line, params):
"""Map part of Map-Reduce; parses a line of GFF into a dictionary.
@@ -46,6 +50,7 @@ def _gff_line_map(line, params):
- determines the type of attribute (flat, parent, child or annotation)
- generates a dictionary of GFF info which can be serialized as JSON
"""
+
def _merge_keyvals(parts):
"""Merge key-values escaped by quotes that are improperly split at semicolons.
"""
@@ -62,6 +67,7 @@ def _gff_line_map(line, params):
return out
gff3_kw_pat = re.compile("\w+=")
+
def _split_keyvals(keyval_str):
"""Split key-value pairs in a GFF2, GTF and GFF3 compatible way.
@@ -151,15 +157,14 @@ def _gff_line_map(line, params):
gff_parts["id"] = gff_parts["quals"][flat_name][0]
gff_parts["quals"]["ID"] = [gff_parts["id"]]
# children types
- elif gff_parts["type"] in ["intron", "exon", "three_prime_UTR",
- "coding_exon", "five_prime_UTR", "CDS", "stop_codon",
- "start_codon"]:
+ elif gff_parts["type"] in ["intron", "exon", "three_prime_UTR", "coding_exon", "five_prime_UTR", "CDS",
+ "stop_codon", "start_codon"]:
gff_parts["quals"]["Parent"] = gff_parts["quals"][flat_name]
break
return gff_parts
- strand_map = {'+' : 1, '-' : -1, '?' : None, None: None}
+ strand_map = {'+': 1, '-': -1, '?': None, None: None}
line = line.strip()
if line[:2] == "##":
return [('directive', line[2:])]
@@ -168,8 +173,7 @@ def _gff_line_map(line, params):
should_do = True
if params.limit_info:
for limit_name, limit_values in params.limit_info.items():
- cur_id = tuple([parts[i] for i in
- params.filter_info[limit_name]])
+ cur_id = tuple([parts[i] for i in params.filter_info[limit_name]])
if cur_id not in limit_values:
should_do = False
break
@@ -200,8 +204,7 @@ def _gff_line_map(line, params):
gff_info['rec_id'] = gff_parts[0]
# if we are describing a location, then we are a feature
if gff_parts[3] and gff_parts[4]:
- gff_info['location'] = [int(gff_parts[3]) - 1,
- int(gff_parts[4])]
+ gff_info['location'] = [int(gff_parts[3]) - 1, int(gff_parts[4])]
gff_info['type'] = gff_parts[2]
gff_info['id'] = quals.get('ID', [''])[0]
gff_info['strand'] = strand_map.get(gff_parts[6], None)
@@ -232,6 +235,7 @@ def _gff_line_map(line, params):
return [(final_key, gff_info)]
return []
+
def _gff_line_reduce(map_results, out, params):
"""Reduce part of Map-Reduce; combines results of parsed features.
"""
@@ -248,6 +252,7 @@ def _gff_line_reduce(map_results, out, params):
vals = simplejson.dumps(vals)
out.add(key, vals)
+
class _MultiIDRemapper:
"""Provide an ID remapping for cases where a parent has a non-unique ID.
@@ -255,6 +260,7 @@ class _MultiIDRemapper:
by using the unique sequence region to assign children to the right
parent.
"""
+
def __init__(self, base_id, all_parents):
self._base_id = base_id
self._parents = all_parents
@@ -271,9 +277,11 @@ class _MultiIDRemapper:
# if we haven't found a location match but parents are unambiguous, return that
if len(self._parents) == 1:
return self._base_id
- raise ValueError("Did not find remapped ID location: %s, %s, %s" % (
- self._base_id, [p['location'] for p in self._parents],
- feature_dict['location']))
+ raise ValueError(
+ "Did not find remapped ID location: %s, %s, %s" %
+ (self._base_id, [p['location'] for p in self._parents], feature_dict['location'])
+ )
+
class _AbstractMapReduceGFF:
"""Base class providing general GFF parsing for local and remote classes.
@@ -283,6 +291,7 @@ class _AbstractMapReduceGFF:
the _gff_process function, which returns a dictionary of SeqRecord
information.
"""
+
def __init__(self, create_missing=True):
"""Initialize GFF parser
@@ -311,8 +320,7 @@ class _AbstractMapReduceGFF:
for rec in self.parse_in_parts(gff_files, base_dict, limit_info):
yield rec
- def parse_in_parts(self, gff_files, base_dict=None, limit_info=None,
- target_lines=None):
+ def parse_in_parts(self, gff_files, base_dict=None, limit_info=None, target_lines=None):
"""Parse a region of a GFF file specified, returning info as generated.
target_lines -- The number of lines in the file which should be used
@@ -363,8 +371,7 @@ class _AbstractMapReduceGFF:
base = self._add_annotations(base, results.get('annotation', []))
for feature in results.get('feature', []):
(_, base) = self._add_toplevel_feature(base, feature)
- base = self._add_parent_child_features(base, results.get('parent', []),
- results.get('child', []))
+ base = self._add_parent_child_features(base, results.get('parent', []), results.get('child', []))
base = self._add_seqs(base, results.get('fasta', []))
base = self._add_directives(base, results.get('directive', []))
return base
@@ -384,8 +391,11 @@ class _AbstractMapReduceGFF:
else:
val = tuple(parts[1:])
# specific directives that need special handling
- if key == "sequence-region": # convert to Python 0-based coordinates
- val = (val[0], int(val[1]) - 1, int(val[2]))
+ if key == "sequence-region": # convert to Python 0-based coordinates
+ if len(val) == 2: # handle regions missing contig
+ val = (int(val[0]) - 1, int(val[1]))
+ elif len(val) == 3:
+ val = (val[0], int(val[1]) - 1, int(val[2]))
dir_keyvals[key].append(val)
for key, vals in dir_keyvals.items():
for rec in base.values():
@@ -414,18 +424,15 @@ class _AbstractMapReduceGFF:
if pid in multi_remap:
pid = multi_remap[pid].remap_id(child_dict)
child_feature.qualifiers['Parent'][pindex] = pid
- children_prep[pid].append((child_dict['rec_id'],
- child_feature))
+ children_prep[pid].append((child_dict['rec_id'], child_feature))
children = dict(children_prep)
# add children to parents that exist
for cur_parent_dict in parents:
cur_id = cur_parent_dict['id']
if cur_id in multi_remap:
- cur_parent_dict['id'] = multi_remap[cur_id].remap_id(
- cur_parent_dict)
+ cur_parent_dict['id'] = multi_remap[cur_id].remap_id(cur_parent_dict)
cur_parent, base = self._add_toplevel_feature(base, cur_parent_dict)
- cur_parent, children = self._add_children_to_parent(cur_parent,
- children)
+ cur_parent, children = self._add_children_to_parent(cur_parent, children)
# create parents for children without them (GFF2 or split/bad files)
while len(children) > 0:
parent_id, cur_children = next(itertools.islice(children.items(), 1))
@@ -433,15 +440,12 @@ class _AbstractMapReduceGFF:
if len(cur_children) == 1:
rec_id, child = cur_children[0]
loc = (child.location.nofuzzy_start, child.location.nofuzzy_end)
- rec, base = self._get_rec(base,
- dict(rec_id=rec_id, location=loc))
+ rec, base = self._get_rec(base, dict(rec_id=rec_id, location=loc))
rec.features.append(child)
del children[parent_id]
else:
- cur_parent, base = self._add_missing_parent(base, parent_id,
- cur_children)
- cur_parent, children = self._add_children_to_parent(cur_parent,
- children)
+ cur_parent, base = self._add_missing_parent(base, parent_id, cur_children)
+ cur_parent, children = self._add_children_to_parent(cur_parent, children)
return base
def _identify_dup_ids(self, parents):
@@ -454,8 +458,7 @@ class _AbstractMapReduceGFF:
multi_ids = collections.defaultdict(list)
for parent in parents:
multi_ids[parent['id']].append(parent)
- multi_ids = [(mid, ps) for (mid, ps) in multi_ids.items()
- if len(parents) > 1]
+ multi_ids = [(mid, ps) for (mid, ps) in multi_ids.items() if len(parents) > 1]
multi_remap = dict()
for mid, parents in multi_ids:
multi_remap[mid] = _MultiIDRemapper(mid, parents)
@@ -525,13 +528,15 @@ class _AbstractMapReduceGFF:
child_strands = list(set(c[1].strand for c in cur_children))
inferred_strand = child_strands[0] if len(child_strands) == 1 else None
assert len(base_rec_id) > 0
- feature_dict = dict(id=parent_id, strand=inferred_strand,
- type="inferred_parent", quals=dict(ID=[parent_id]),
- rec_id=base_rec_id[0])
- coords = [(c.location.nofuzzy_start, c.location.nofuzzy_end)
- for r, c in cur_children]
- feature_dict["location"] = (min([c[0] for c in coords]),
- max([c[1] for c in coords]))
+ feature_dict = dict(
+ id=parent_id,
+ strand=inferred_strand,
+ type="inferred_parent",
+ quals=dict(ID=[parent_id]),
+ rec_id=base_rec_id[0]
+ )
+ coords = [(c.location.nofuzzy_start, c.location.nofuzzy_end) for r, c in cur_children]
+ feature_dict["location"] = (min([c[0] for c in coords]), max([c[1] for c in coords]))
return self._add_toplevel_feature(base, feature_dict)
def _add_toplevel_feature(self, base, feature_dict):
@@ -546,8 +551,9 @@ class _AbstractMapReduceGFF:
"""Retrieve a Biopython feature from our dictionary representation.
"""
location = SeqFeature.FeatureLocation(*feature_dict['location'])
- new_feature = SeqFeature.SeqFeature(location, feature_dict['type'],
- id=feature_dict['id'], strand=feature_dict['strand'])
+ new_feature = SeqFeature.SeqFeature(
+ location, feature_dict['type'], id=feature_dict['id'], strand=feature_dict['strand']
+ )
# Support for Biopython 1.68 and above, which removed sub_features
if not hasattr(new_feature, "sub_features"):
new_feature.sub_features = []
@@ -559,9 +565,11 @@ class _AbstractMapReduceGFF:
"""
return list(SeqIO.parse(in_handle, "fasta"))
+
class _GFFParserLocalOut:
"""Provide a collector for local GFF MapReduce file parsing.
"""
+
def __init__(self, smart_breaks=False):
self._items = dict()
self._smart_breaks = smart_breaks
@@ -619,13 +627,15 @@ class _GFFParserLocalOut:
self._last_parent = None
return self._items
+
class GFFParser(_AbstractMapReduceGFF):
"""Local GFF parser providing standardized parsing of GFF3 and GFF2 files.
"""
+
def __init__(self, line_adjust_fn=None, create_missing=True):
_AbstractMapReduceGFF.__init__(self, create_missing=create_missing)
self._line_adjust_fn = line_adjust_fn
-
+
def _gff_process(self, gff_files, limit_info, target_lines):
"""Process GFF addition without any parallelization.
@@ -655,40 +665,46 @@ class GFFParser(_AbstractMapReduceGFF):
if need_close:
in_handle.close()
- def _lines_to_out_info(self, line_iter, limit_info=None,
- target_lines=None):
+ def _lines_to_out_info(self, line_iter, limit_info=None, target_lines=None):
"""Generate SeqRecord and SeqFeatures from GFF file lines.
"""
params = self._examiner._get_local_params(limit_info)
- out_info = _GFFParserLocalOut((target_lines is not None and
- target_lines > 1))
+ out_info = _GFFParserLocalOut((target_lines is not None and target_lines > 1))
found_seqs = False
for line in line_iter:
results = self._map_fn(line, params)
if self._line_adjust_fn and results:
if results[0][0] not in ['directive']:
- results = [(results[0][0],
- self._line_adjust_fn(results[0][1]))]
+ results = [(results[0][0], self._line_adjust_fn(results[0][1]))]
self._reduce_fn(results, out_info, params)
- if (target_lines and out_info.num_lines >= target_lines and
- out_info.can_break):
+ if (target_lines and out_info.num_lines >= target_lines and out_info.can_break):
yield out_info.get_results()
- out_info = _GFFParserLocalOut((target_lines is not None and
- target_lines > 1))
- if (results and results[0][0] == 'directive' and
- results[0][1] == 'FASTA'):
+ out_info = _GFFParserLocalOut((target_lines is not None and target_lines > 1))
+ if (results and results[0][0] == 'directive' and results[0][1] == 'FASTA'):
found_seqs = True
break
class FakeHandle:
+
def __init__(self, line_iter):
self._iter = line_iter
+
def __iter__(self):
return self
+
def __next__(self):
return next(self._iter)
- def read(self):
- return "".join(l for l in self._iter)
+
+ next = __next__
+
+ def read(self, size=-1):
+ if size < 0:
+ return "".join(l for l in self._iter)
+ elif size == 0:
+ return "" # Used by Biopython to sniff unicode vs bytes
+ else:
+ raise NotImplementedError
+
def readline(self):
try:
return next(self._iter)
@@ -701,9 +717,11 @@ class GFFParser(_AbstractMapReduceGFF):
if out_info.has_items():
yield out_info.get_results()
+
class DiscoGFFParser(_AbstractMapReduceGFF):
"""GFF Parser with parallelization through Disco (http://discoproject.org.
"""
+
def __init__(self, disco_host, create_missing=True):
"""Initialize parser.
@@ -720,32 +738,36 @@ class DiscoGFFParser(_AbstractMapReduceGFF):
# make these imports local; only need them when using disco
import simplejson
import disco
- # absolute path names unless they are special disco files
+ # absolute path names unless they are special disco files
full_files = []
for f in gff_files:
if f.split(":")[0] != "disco":
full_files.append(os.path.abspath(f))
else:
full_files.append(f)
- results = disco.job(self._disco_host, name="gff_reader",
- input=full_files,
- params=disco.Params(limit_info=limit_info, jsonify=True,
- filter_info=self._examiner._filter_info),
- required_modules=["simplejson", "collections", "re"],
- map=self._map_fn, reduce=self._reduce_fn)
+ results = disco.job(
+ self._disco_host,
+ name="gff_reader",
+ input=full_files,
+ params=disco.Params(limit_info=limit_info, jsonify=True, filter_info=self._examiner._filter_info),
+ required_modules=["simplejson", "collections", "re"],
+ map=self._map_fn,
+ reduce=self._reduce_fn
+ )
processed = dict()
for out_key, out_val in disco.result_iterator(results):
processed[out_key] = simplejson.loads(out_val)
yield processed
+
def parse(gff_files, base_dict=None, limit_info=None, target_lines=None):
"""High level interface to parse GFF files into SeqRecords and SeqFeatures.
"""
parser = GFFParser()
- for rec in parser.parse_in_parts(gff_files, base_dict, limit_info,
- target_lines):
+ for rec in parser.parse_in_parts(gff_files, base_dict, limit_info, target_lines):
yield rec
+
def parse_simple(gff_files, limit_info=None):
"""Parse GFF files as line by line dictionary of parts.
"""
@@ -756,18 +778,24 @@ def parse_simple(gff_files, limit_info=None):
yield rec["child"][0]
elif "parent" in rec:
yield rec["parent"][0]
+ elif "feature" in rec:
+ yield rec["feature"][0]
# ignore directive lines
else:
assert "directive" in rec
+
def _file_or_handle(fn):
"""Decorator to handle either an input handle or a file.
"""
+
def _file_or_handle_inside(*args, **kwargs):
in_file = args[1]
if hasattr(in_file, "read"):
need_close = False
in_handle = in_file
+ if six.PY3 and not isinstance(in_handle, io.TextIOBase):
+ raise TypeError('input handle must be opened in text mode')
else:
need_close = True
in_handle = open(in_file)
@@ -776,8 +804,10 @@ def _file_or_handle(fn):
if need_close:
in_handle.close()
return out
+
return _file_or_handle_inside
+
class GFFExaminer:
"""Provide high level details about a GFF file to refine parsing.
@@ -787,19 +817,22 @@ class GFFExaminer:
information you need. This class provides high level summary details to
help in learning.
"""
+
def __init__(self):
- self._filter_info = dict(gff_id = [0], gff_source_type = [1, 2],
- gff_source = [1], gff_type = [2])
-
+ self._filter_info = dict(gff_id=[0], gff_source_type=[1, 2], gff_source=[1], gff_type=[2])
+
def _get_local_params(self, limit_info=None):
+
class _LocalParams:
+
def __init__(self):
self.jsonify = False
+
params = _LocalParams()
params.limit_info = limit_info
params.filter_info = self._filter_info
return params
-
+
@_file_or_handle
def available_limits(self, gff_handle):
"""Return dictionary information on possible limits for this file.
@@ -856,16 +889,12 @@ class GFFExaminer:
if line.startswith("##FASTA"):
break
if line.strip() and not line.startswith("#"):
- line_type, line_info = _gff_line_map(line,
- self._get_local_params())[0]
- if (line_type == 'parent' or (line_type == 'child' and
- line_info['id'])):
- parent_sts[line_info['id']] = (
- line_info['quals'].get('source', [""])[0], line_info['type'])
+ line_type, line_info = _gff_line_map(line, self._get_local_params())[0]
+ if (line_type == 'parent' or (line_type == 'child' and line_info['id'])):
+ parent_sts[line_info['id']] = (line_info['quals'].get('source', [""])[0], line_info['type'])
if line_type == 'child':
for parent_id in line_info['quals']['Parent']:
- child_sts[parent_id].append((
- line_info['quals'].get('source', [""])[0], line_info['type']))
+ child_sts[parent_id].append((line_info['quals'].get('source', [""])[0], line_info['type']))
#print parent_sts, child_sts
# generate a dictionary of the unique final type relationships
pc_map = collections.defaultdict(list)
=====================================
BCBio/GFF/__init__.py
=====================================
@@ -3,4 +3,4 @@
from BCBio.GFF.GFFParser import GFFParser, DiscoGFFParser, GFFExaminer, parse, parse_simple
from BCBio.GFF.GFFOutput import GFF3Writer, write
-__version__="0.6.6"
+__version__ = "0.6.7"
=====================================
PKG-INFO
=====================================
@@ -1,10 +1,10 @@
Metadata-Version: 1.0
Name: bcbio-gff
-Version: 0.6.6
+Version: 0.6.7
Summary: Read and write Generic Feature Format (GFF) with Biopython integration.
Home-page: https://github.com/chapmanb/bcbb/tree/master/gff
Author: Brad Chapman
Author-email: chapmanb at 50mail.com
-License: UNKNOWN
+License: Biopython License
Description: UNKNOWN
Platform: UNKNOWN
=====================================
Scripts/gff/access_gff_index.py
=====================================
@@ -0,0 +1,98 @@
+"""Access an GFF file using bx-python's interval indexing.
+
+Requires:
+ bx-python: http://bitbucket.org/james_taylor/bx-python/wiki/Home
+ gff library: http://github.com/chapmanb/bcbb/tree/master/gff
+
+Index time:
+ 44 Mb file
+ 11 seconds
+ Index is 7.5Mb
+"""
+from __future__ import with_statement
+import os
+import sys
+
+from bx import interval_index_file
+
+from BCBio import GFF
+
+def main(gff_file):
+ gff_index = gff_file + ".index"
+ if not os.path.exists(gff_index):
+ print "Indexing GFF file"
+ index(gff_file)
+ index = GFFIndexedAccess(gff_file, keep_open=True)
+ print index.seqids
+ print
+ for feature in index.get_features_in_region("Chr2", 17500, 20000):
+ print feature
+ for feature in index.get_features_in_region("Chr5", 500000, 502500):
+ print feature
+
+ exam = GFF.GFFExaminer()
+ #print exam.available_limits(gff_file)
+ #print exam.parent_child_map(gff_file)
+
+ found = 0
+ limit_info = dict(
+ gff_type = ["protein", "gene", "mRNA", "exon", "CDS", "five_prime_UTR",
+ "three_prime_UTR"]
+ )
+ for feature in index.get_features_in_region("Chr1", 0, 50000,
+ limit_info):
+ found += 1
+ print found
+
+class GFFIndexedAccess(interval_index_file.AbstractIndexedAccess):
+ """Provide indexed access to a GFF file.
+ """
+ def __init__(self, *args, **kwargs):
+ interval_index_file.AbstractIndexedAccess.__init__(self, *args,
+ **kwargs)
+ self._parser = GFF.GFFParser()
+
+ @property
+ def seqids(self):
+ return self.indexes.indexes.keys()
+
+ def get_features_in_region(self, seqid, start, end, limit_info=None):
+ """Retrieve features located on a given region in start/end coordinates.
+ """
+ limit_info = self._parser._normalize_limit_info(limit_info)
+ line_gen = self.get_as_iterator(seqid, int(start), int(end))
+ recs = None
+ for results in self._parser._lines_to_out_info(line_gen, limit_info):
+ assert not recs, "Unexpected multiple results"
+ recs = self._parser._results_to_features(dict(), results)
+ if recs is None:
+ return []
+ else:
+ assert len(recs) == 1
+ rec = recs[seqid]
+ return rec.features
+
+ def read_at_current_offset(self, handle, **kwargs):
+ line = handle.readline()
+ return line
+
+def index(gff_file, index_file=None):
+ index = interval_index_file.Indexes()
+ with open(gff_file) as in_handle:
+ while 1:
+ pos = in_handle.tell()
+ line = in_handle.readline()
+ if not line:
+ break
+ if not line.startswith("#"):
+ parts = line.split("\t")
+ (seqid, gtype, source, start, end) = parts[:5]
+ index.add(seqid, int(start), int(end), pos)
+ if index_file is None:
+ index_file = gff_file + ".index"
+ with open(index_file, "w") as index_handle:
+ index.write(index_handle)
+ return index_file
+
+if __name__ == "__main__":
+ main(*sys.argv[1:])
=====================================
Scripts/gff/genbank_to_gff.py
=====================================
@@ -0,0 +1,21 @@
+#!/usr/bin/env python
+"""Convert a GenBank file into GFF format.
+
+Usage:
+ genbank_to_gff.py <genbank_file>
+"""
+import sys
+import os
+
+from Bio import SeqIO
+from Bio import Seq
+
+from BCBio import GFF
+
+def main(gb_file):
+ out_file = "%s.gff" % os.path.splitext(gb_file)[0]
+ with open(out_file, "w") as out_handle:
+ GFF.write(SeqIO.parse(gb_file, "genbank"), out_handle)
+
+if __name__ == "__main__":
+ main(*sys.argv[1:])
=====================================
Scripts/gff/gff2_to_gff3.py
=====================================
@@ -0,0 +1,30 @@
+#!/usr/bin/env python
+"""Convert a GFF2 file to an updated GFF3 format file.
+
+Usage:
+ gff2_to_gff3.py <in_gff2_file>
+
+The output file has the same name with the extension gff3.
+"""
+import sys
+import os
+
+from BCBio.GFF import GFFParser, GFF3Writer
+
+def main(in_file):
+ base, ext = os.path.splitext(in_file)
+ out_file = "%s.gff3" % (base)
+ in_handle = open(in_file)
+ out_handle = open(out_file, "w")
+ reader = GFFParser()
+ writer = GFF3Writer()
+ writer.write(reader.parse_in_parts(in_handle, target_lines=25000),
+ out_handle)
+ in_handle.close()
+ out_handle.close()
+
+if __name__ == "__main__":
+ if len(sys.argv) != 2:
+ print __doc__
+ sys.exit()
+ main(sys.argv[1])
=====================================
Scripts/gff/gff_to_biosql.py
=====================================
@@ -0,0 +1,76 @@
+#!/usr/bin/env python
+"""Load a fasta file of sequences and associated GFF file into BioSQL.
+
+You will need to adjust the database parameters and have a BioSQL database set
+up. See:
+
+http://biopython.org/wiki/BioSQL
+
+Depending on the size of the sequences being loaded, you may also get errors on
+loading very large chromosome sequences. Updating these options can help:
+
+ set global max_allowed_packet=1000000000;
+ set global net_buffer_length=1000000;
+
+Usage:
+ gff_to_biosql.py <fasta file> <gff file>
+"""
+from __future__ import with_statement
+import sys
+
+from BioSQL import BioSeqDatabase
+from Bio import SeqIO
+
+from BCBio.GFF import GFFParser
+
+def main(seq_file, gff_file):
+ # -- To be customized
+ # You need to update these parameters to point to your local database
+ # XXX demo example could be swapped to use SQLite when that is integrated
+ user = "chapmanb"
+ passwd = "cdev"
+ host = "localhost"
+ db_name = "wb199_gff"
+ biodb_name = "wb199_gff_cds_pcr"
+ # These need to be updated to reflect what you would like to parse
+ # out of the GFF file. Set limit_info=None to parse everything, but
+ # be sure the file is small or you may deal with memory issues.
+ rnai_types = [('Orfeome', 'PCR_product'),
+ ('GenePair_STS', 'PCR_product'),
+ ('Promoterome', 'PCR_product')]
+ gene_types = [('Non_coding_transcript', 'gene'),
+ ('Coding_transcript', 'gene'),
+ ('Coding_transcript', 'mRNA'),
+ ('Coding_transcript', 'CDS')]
+ limit_info = dict(gff_source_type = rnai_types + gene_types)
+ # --
+ print "Parsing FASTA sequence file..."
+ with open(seq_file) as seq_handle:
+ seq_dict = SeqIO.to_dict(SeqIO.parse(seq_handle, "fasta"))
+
+ print "Parsing GFF data file..."
+ parser = GFFParser()
+ recs = parser.parse(gff_file, seq_dict, limit_info=limit_info)
+
+ print "Writing to BioSQL database..."
+ server = BioSeqDatabase.open_database(driver="MySQLdb", user=user,
+ passwd=passwd, host=host, db=db_name)
+ try:
+ if biodb_name not in server.keys():
+ server.new_database(biodb_name)
+ else:
+ server.remove_database(biodb_name)
+ server.adaptor.commit()
+ server.new_database(biodb_name)
+ db = server[biodb_name]
+ db.load(recs)
+ server.adaptor.commit()
+ except:
+ server.adaptor.rollback()
+ raise
+
+if __name__ == "__main__":
+ if len(sys.argv) != 3:
+ print __doc__
+ sys.exit()
+ main(sys.argv[1], sys.argv[2])
=====================================
Scripts/gff/gff_to_genbank.py
=====================================
@@ -0,0 +1,73 @@
+#!/usr/bin/env python
+"""Convert a GFF and associated FASTA file into GenBank format.
+
+Usage:
+ gff_to_genbank.py <GFF annotation file> [<FASTA sequence file> <molecule type>]
+
+ FASTA sequence file: input sequences matching records in GFF. Optional if sequences
+ are in the GFF
+ molecule type: type of molecule in the GFF file. Defaults to DNA, the most common case.
+"""
+from __future__ import print_function
+
+import sys
+import os
+
+from Bio import SeqIO
+
+from BCBio import GFF
+
+
+def main(gff_file, fasta_file=None, molecule_type="DNA"):
+ out_file = "%s.gb" % os.path.splitext(gff_file)[0]
+ if fasta_file:
+ fasta_input = SeqIO.to_dict(SeqIO.parse(fasta_file, "fasta"))
+ else:
+ fasta_input = {}
+ gff_iter = GFF.parse(gff_file, fasta_input)
+ SeqIO.write(_check_gff(_fix_ncbi_id(gff_iter), molecule_type), out_file, "genbank")
+
+
+def _fix_ncbi_id(fasta_iter):
+ """GenBank identifiers can only be 16 characters; try to shorten NCBI.
+ """
+ for rec in fasta_iter:
+ if len(rec.name) > 16 and rec.name.find("|") > 0:
+ new_id = [x for x in rec.name.split("|") if x][-1]
+ print("Warning: shortening NCBI name %s to %s" % (rec.id, new_id))
+ rec.id = new_id
+ rec.name = new_id
+ yield rec
+
+
+def _check_gff(gff_iterator, molecule_type):
+ """Check GFF files before feeding to SeqIO to be sure they have sequences.
+ """
+ for rec in gff_iterator:
+ if "molecule_type" not in rec.annotations:
+ rec.annotations["molecule_type"] = molecule_type
+ yield _flatten_features(rec)
+
+
+def _flatten_features(rec):
+ """Make sub_features in an input rec flat for output.
+
+ GenBank does not handle nested features, so we want to make
+ everything top level.
+ """
+ out = []
+ for f in rec.features:
+ cur = [f]
+ while len(cur) > 0:
+ nextf = []
+ for curf in cur:
+ out.append(curf)
+ if len(curf.sub_features) > 0:
+ nextf.extend(curf.sub_features)
+ cur = nextf
+ rec.features = out
+ return rec
+
+
+if __name__ == "__main__":
+ main(*sys.argv[1:])
=====================================
Tests/GFF/F3-unique-3.v2.gff
=====================================
@@ -0,0 +1,128 @@
+##solid-gff-version 0.2
+##gff-version 2
+##source-version MaToGff.java v1.5
+##date 2008-05-28
+##time 13:11:03
+##Type solid_read
+##color-code AA=0,AC=1,AG=2,AT=3,CA=1,CC=0,CG=3,CT=2,GA=2,GC=3,GG=0,GT=1,TA=3,TC=2,TG=1,TT=0
+##primer-base F3=T
+##max-num-mismatches 3
+##max-read-length 20
+##line-order fragment
+##history filter_fasta.pl --noduplicates --output=/data/results/DAEMON/DAEMON_MATE_PAIRS_2_20070326/S1/results.01/primary.20071218094706805 --name=DAEMON_MATE_PAIRS_2_20070326_S1 --tag=F3 --minlength=20 --prefix=T /data/results/DAEMON/DAEMON_MATE_PAIRS_2_20070326/S1/jobs/postPrimerSetPrimary.117/rawseq
+##history map /data/results/RegressionDriver/CaseManager/results/r12/integration/case0002/reads1/test_S1_F3.csfasta /data/results/RegressionDriver/CaseManager/knownData/validatedReference/matchingPipeline/ecoli_k12_MG1655.fasta T=30 L=19 C=1 E=.Tmpfile1211939575SVhDtd F=0 B=1 D=1 u=1 r=0 n=1 Z=1000 P="0000000111111111111" M=0 U=0.000000 H=0 > .Tmpfile1211939575SVhDtd.out.1
+##history MaToGff.java --sort --qvs=test_S1_F3_QV.qual.txt --convert=unique --clear=3 --tempdir=../tmp test_S1_F3.csfasta.ma.20.3
+##hdr seqname source feature start end score strand frame [attributes] [comments]
+3_336_815_F3 solid read 55409 55428 10.4 + . g=A3233312322232122211;i=1;p=1.000;q=23,12,18,17,10,24,19,14,27,9,23,9,16,20,11,7,8,4,4,14;u=0,0,0,1
+3_142_1011_F3 solid read 91290 91309 5.0 - . g=T0330222333132222222;i=1;p=1.000;q=4,4,14,4,4,4,4,21,4,4,4,4,25,4,4,4,5,21,4,4;u=0,0,0,1
+3_341_424_F3 solid read 102717 102736 10.6 - . g=T2203031313223113212;i=1;p=1.000;q=9,27,25,16,18,9,27,26,23,13,14,25,27,5,24,5,26,26,4,5;u=0,0,1
+3_6_37_F3 solid read 181053 181072 9.4 + . g=C3220221332111020310;i=1;p=1.000;q=9,5,13,9,10,22,6,12,21,7,13,4,21,16,23,6,20,20,13,6;u=0,0,0,1
+3_34_202_F3 solid read 284207 284226 6.9 + . g=G0301333332232122333;i=1;p=1.000;q=6,15,21,8,12,4,4,5,12,8,4,12,4,7,10,6,8,16,4,6;u=0,1
+3_277_712_F3 solid read 304136 304155 11.8 - . g=A2033101122223322133;i=1;p=1.000;q=26,11,14,27,4,17,4,26,26,23,17,25,26,27,21,23,5,20,26,23;u=0,1
+3_394_71_F3 solid read 308736 308755 10.8 + . g=T3203322323203312331;i=1;p=1.000;q=9,24,19,15,20,18,20,10,13,13,11,21,12,7,4,11,20,24,4,25;u=0,1
+3_285_1497_F3 solid read 404055 404074 8.4 - . g=T1221231003202232221;i=1;p=1.000;q=8,10,6,25,16,14,23,27,8,14,21,19,5,4,4,6,22,12,4,6;u=0,0,0,1
+3_228_178_F3 solid read 453227 453246 9.5 - . g=G1130333332331110323;i=1;p=1.000;q=4,19,25,18,18,5,19,6,8,24,4,26,21,11,15,4,26,13,13,15;u=0,0,0,1
+3_406_794_F3 solid read 504835 504854 8.3 - . g=T3033331301320201111;i=1;p=1.000;q=27,4,13,4,21,11,7,11,5,26,10,8,9,4,6,18,9,26,17,6;u=0,0,0,1
+3_303_251_F3 solid read 561501 561520 5.3 + . g=C0011111112222112221;i=1;p=1.000;q=9,8,4,4,10,4,4,4,6,14,4,4,4,4,16,4,4,4,4,23;u=0,0,1
+3_152_112_F3 solid read 624012 624031 7.7 - . g=G0301122312213122221;i=1;p=1.000;q=22,14,7,13,18,5,11,4,15,6,6,11,4,8,15,5,10,4,6,24;u=0,0,0,1
+3_112_1154_F3 solid read 630582 630601 11.3 - . g=T1333312011131131011;i=1;p=1.000;q=27,27,4,5,17,24,20,19,7,4,25,17,18,15,22,23,17,25,16,26;u=0,0,1
+3_196_392_F3 solid read 661664 661683 19.7 - . g=T3321013301122133323;i=1;p=1.000;q=27,25,13,26,21,25,23,27,27,27,27,11,16,27,27,19,26,27,26,27;u=1
+3_192_1248_F3 solid read 672037 672056 4.5 - . g=A0333232333121222222;i=1;p=1.000;q=4,7,4,4,4,4,4,4,6,4,4,4,4,4,7,7,4,4,6,4;u=0,0,0,1
+3_63_479_F3 solid read 742582 742601 7.9 - . g=A0133333333233232332;i=1;p=1.000;q=4,9,6,11,20,12,11,9,13,20,18,4,4,14,9,15,4,6,21,4;u=0,0,0,1
+3_30_710_F3 solid read 816069 816088 9.2 - . g=T3311001223313333313;i=1;p=1.000;q=22,27,18,25,25,7,26,25,14,23,6,25,5,11,7,4,15,7,4,6;u=0,0,0,1
+3_284_77_F3 solid read 864876 864895 7.4 + . g=T2003133033233112331;i=1;p=1.000;q=13,19,4,11,22,24,6,16,4,6,13,4,12,18,4,6,7,11,4,5;u=0,0,0,1
+3_411_1040_F3 solid read 876023 876042 10.9 - . g=T2121301233200033221;i=1;p=1.000;q=9,9,5,12,11,8,4,16,27,27,18,21,24,9,18,24,21,9,23,17;u=0,0,0,1
+3_188_171_F3 solid read 884683 884702 5.8 - . g=A1322330132213322231;i=1;p=1.000;q=4,8,4,5,7,6,5,4,11,6,6,11,4,8,4,8,4,6,4,15;u=0,0,0,1
+3_63_787_F3 solid read 1022149 1022168 7.5 + . g=C3131132013020123031;i=1;p=1.000;q=12,13,26,14,9,9,13,14,4,7,8,5,11,4,17,4,4,6,4,21;u=0,1
+3_391_2015_F3 solid read 1074989 1075008 18.5 - . g=A2323101222321232322;i=1;p=1.000;q=27,25,18,20,27,27,24,23,27,23,27,25,19,26,12,26,9,21,27,21;u=1
+3_8_425_F3 solid read 1119124 1119143 6.7 - . g=T0321201132230303323;i=1;p=1.000;q=6,5,8,6,4,4,23,9,12,10,15,4,13,13,8,4,4,5,5,12;u=0,0,1
+3_53_745_F3 solid read 1130179 1130198 7.6 - . g=C0213313233333113321;i=1;p=1.000;q=27,6,9,22,18,9,8,15,6,8,14,5,8,6,16,4,5,4,4,14;u=0,0,0,1
+3_123_576_F3 solid read 1219122 1219141 8.7 + . g=A3333133323333323323;i=1;p=1.000;q=18,22,5,11,16,16,8,14,8,5,19,8,9,10,7,11,6,11,9,4;u=0,0,1
+3_81_12_F3 solid read 1236732 1236751 8.6 + . g=G2210332302233112321;i=1;p=1.000;q=7,16,17,9,7,9,9,16,9,4,10,21,17,8,4,6,9,16,6,12;u=0,0,0,1
+3_96_1862_F3 solid read 1264409 1264428 6.9 - . g=G0301032323231222021;i=1;p=1.000;q=26,23,11,20,15,8,6,4,6,6,9,7,6,4,8,6,4,5,6,5;u=0,0,0,1
+3_40_136_F3 solid read 1266177 1266196 7.4 - . g=T2332222332203312221;i=1;p=1.000;q=9,23,6,19,13,9,4,8,17,9,4,4,13,9,8,5,4,6,10,8;u=0,0,1
+3_124_1781_F3 solid read 1385416 1385435 10.3 + . g=A1322302333332222132;i=1;p=1.000;q=13,17,8,6,5,9,24,4,7,9,18,27,18,16,16,23,18,18,11,23;u=0,0,1
+3_134_1165_F3 solid read 1393169 1393188 9.0 - . g=T3301123202321131311;i=1;p=1.000;q=4,27,18,7,27,4,27,26,4,20,4,27,26,9,27,4,27,14,10,27;u=1
+3_224_587_F3 solid read 1490044 1490063 6.1 + . g=G2032313231111233321;i=1;p=1.000;q=4,4,6,6,13,24,4,4,5,15,6,7,9,14,4,4,4,25,5,5;u=0,0,0,1
+3_25_747_F3 solid read 1513598 1513617 9.5 + . g=T1223213101133121231;i=1;p=1.000;q=26,27,8,27,27,27,26,27,26,19,8,14,4,17,11,5,7,4,7,6;u=0,0,1
+3_143_14_F3 solid read 1528236 1528255 9.7 + . g=T3233113323230202011;i=1;p=1.000;q=13,23,17,19,23,16,24,25,14,15,9,6,4,11,4,9,12,4,16,10;u=0,0,0,1
+3_164_1025_F3 solid read 1570107 1570126 7.9 - . g=T3220332323303320231;i=1;p=1.000;q=7,10,20,8,4,24,4,4,21,6,26,22,9,6,11,9,6,4,17,14;u=0,0,0,1
+3_137_552_F3 solid read 1630276 1630295 9.1 - . g=G3030333223233102131;i=1;p=1.000;q=6,28,9,4,6,26,27,6,10,9,27,21,6,16,9,25,6,7,23,12;u=0,0,0,1
+3_125_1810_F3 solid read 1634104 1634123 10.5 + . g=G1232220322032311332;i=1;p=1.000;q=27,8,26,26,10,6,26,12,27,27,26,4,27,27,23,8,8,4,27,12;u=0,0,0,1
+3_314_1310_F3 solid read 1639981 1640000 9.2 + . g=A2221332230322203033;i=1;p=1.000;q=19,12,6,27,11,27,6,11,5,6,9,13,27,27,8,18,5,22,4,27;u=0,0,0,1
+3_384_591_F3 solid read 1654341 1654360 6.8 + . g=A3323221133121102313;i=1;p=1.000;q=19,8,7,7,15,4,20,7,4,6,14,7,19,6,8,4,5,9,4,4;u=0,0,0,1
+3_145_739_F3 solid read 1791040 1791059 11.9 - . g=A0221223333323131212;i=1;p=1.000;q=20,27,23,13,27,14,27,28,27,25,12,24,8,16,8,4,8,21,9,11;u=0,0,0,1
+3_326_2020_F3 solid read 1830564 1830583 9.3 + . g=A3321322331103233322;i=1;p=1.000;q=14,4,25,16,10,12,16,5,14,10,25,5,25,5,9,18,13,26,4,26;u=0,0,0,1
+3_233_1265_F3 solid read 1857564 1857583 8.9 + . g=T3112113020130223311;i=1;p=1.000;q=7,27,25,26,27,14,26,27,27,27,4,6,5,10,17,4,5,7,6,12;u=0,0,1
+3_235_100_F3 solid read 1912460 1912479 9.6 - . g=G2233020000132311231;i=1;p=1.000;q=23,24,25,16,17,6,21,25,9,4,6,11,8,19,6,6,19,14,13,6;u=0,0,0,1
+3_111_107_F3 solid read 1944496 1944515 7.6 - . g=C3023223333211322231;i=1;p=1.000;q=15,5,6,14,5,13,4,12,11,4,9,9,11,12,4,11,11,13,6,6;u=0,0,0,1
+3_457_1514_F3 solid read 1956598 1956617 9.9 - . g=T0013331013332110221;i=1;p=1.000;q=18,24,10,24,23,25,22,11,20,10,15,11,4,5,27,4,9,13,5,27;u=0,1
+3_183_74_F3 solid read 1992040 1992059 9.8 + . g=C3332233131131222322;i=1;p=1.000;q=27,27,25,23,25,8,11,11,7,11,4,12,14,10,15,7,14,4,9,12;u=0,0,1
+3_357_1303_F3 solid read 2037917 2037936 10.9 - . g=T3331331323320311331;i=1;p=1.000;q=7,27,5,19,26,8,27,12,14,27,8,27,23,9,19,4,26,20,9,27;u=0,0,0,1
+3_153_186_F3 solid read 2083441 2083460 6.7 + . g=T3112233331133323322;i=1;p=1.000;q=7,14,19,7,12,6,11,4,11,8,4,6,6,4,11,4,6,4,4,18;u=0,1
+3_65_1741_F3 solid read 2107441 2107460 8.4 + . g=T3333332330233132123;i=1;p=1.000;q=4,4,6,25,9,4,26,16,21,9,18,15,27,27,4,21,9,7,9,6;u=0,0,0,1
+3_98_323_F3 solid read 2118821 2118840 7.5 + . g=A3222212322131112031;i=1;p=1.000;q=13,14,8,10,8,14,4,13,10,7,15,4,6,4,4,12,6,11,6,8;u=0,0,1
+3_48_258_F3 solid read 2153882 2153901 9.4 - . g=G0330113313201122321;i=1;p=1.000;q=22,15,20,4,16,17,14,24,4,5,4,22,19,8,10,9,13,22,8,15;u=0,0,0,1
+3_140_1125_F3 solid read 2182909 2182928 7.9 + . g=T3231331302232001131;i=1;p=1.000;q=10,4,12,6,4,12,13,6,18,5,8,11,4,26,6,25,5,18,11,12;u=0,0,0,1
+3_359_118_F3 solid read 2188393 2188412 8.4 + . g=A0301311133331131322;i=1;p=1.000;q=11,5,7,13,20,6,6,25,8,18,9,15,27,9,6,7,15,17,4,4;u=0,0,0,1
+3_203_483_F3 solid read 2272874 2272893 9.1 - . g=C3031223110333133311;i=1;p=1.000;q=23,21,25,27,10,5,22,15,17,18,5,18,17,5,19,4,4,13,4,22;u=0,0,0,1
+3_66_301_F3 solid read 2286038 2286057 6.6 - . g=C1113113330132222311;i=1;p=1.000;q=10,4,6,4,8,13,9,4,10,9,4,6,13,9,5,6,11,6,4,9;u=0,0,0,1
+3_78_130_F3 solid read 2291021 2291040 7.6 + . g=G3233131332212222321;i=1;p=1.000;q=13,16,6,12,17,11,10,4,12,8,13,4,8,6,4,4,12,10,4,11;u=0,0,0,1
+3_141_110_F3 solid read 2291354 2291373 9.3 + . g=T1312203322212123321;i=1;p=1.000;q=9,21,24,11,16,4,23,27,16,16,8,22,6,10,16,4,9,4,7,25;u=0,0,1
+3_51_1383_F3 solid read 2374918 2374937 8.8 + . g=T3311203033322222231;i=1;p=1.000;q=24,26,6,27,27,23,27,4,21,27,4,27,6,9,24,4,23,4,4,27;u=0,0,1
+3_231_366_F3 solid read 2392091 2392110 10.0 - . g=T2022333223101331322;i=1;p=1.000;q=18,12,9,9,13,8,7,22,7,7,4,26,12,17,9,20,24,8,18,14;u=0,0,0,1
+3_214_1802_F3 solid read 2394604 2394623 8.8 - . g=T1232111001220211133;i=1;p=1.000;q=17,18,14,6,19,4,21,4,6,12,11,4,26,20,9,18,7,16,5,18;u=0,0,0,1
+3_67_1434_F3 solid read 2454508 2454527 15.2 - . g=T3121311232222231203;i=1;p=1.000;q=9,27,27,18,16,14,25,27,26,21,19,27,27,27,15,5,24,27,24,24;u=0,0,1
+3_124_1647_F3 solid read 2493617 2493636 7.5 + . g=A0211320203220231332;i=1;p=1.000;q=9,12,12,9,6,14,12,7,4,4,12,9,4,9,16,4,4,9,9,16;u=0,0,0,1
+3_39_328_F3 solid read 2500759 2500778 7.8 + . g=T1332333033231132333;i=1;p=1.000;q=24,27,26,26,25,21,7,8,4,5,20,4,11,6,8,4,6,4,11,7;u=0,0,1
+3_378_322_F3 solid read 2541624 2541643 8.9 + . g=T2333331001023011220;i=1;p=1.000;q=14,6,13,25,27,4,24,22,14,19,9,23,15,6,8,4,22,4,4,20;u=0,0,0,1
+3_216_848_F3 solid read 2550573 2550592 11.5 - . g=G2320322020031220322;i=1;p=1.000;q=21,24,8,21,20,25,18,6,24,14,21,9,7,18,8,18,7,9,19,12;u=0,0,0,1
+3_221_516_F3 solid read 2607559 2607578 11.1 - . g=T2132333313222333332;i=1;p=1.000;q=9,19,27,26,24,26,26,25,25,26,21,4,6,10,21,6,20,13,5,24;u=0,0,0,1
+3_56_45_F3 solid read 2662103 2662122 5.5 + . g=G3021122332232122321;i=1;p=1.000;q=4,4,4,6,4,6,4,5,18,9,4,16,10,4,4,4,12,4,6,6;u=0,0,0,1
+3_127_210_F3 solid read 2798906 2798925 10.2 + . g=G2331321333232203222;i=1;p=1.000;q=11,25,9,4,23,16,26,14,7,22,9,25,9,8,21,8,15,17,4,26;u=0,0,1
+3_417_422_F3 solid read 2812322 2812341 8.8 - . g=T3321222333313333132;i=1;p=1.000;q=9,26,7,19,7,13,23,4,25,4,6,19,4,16,15,15,23,4,19,13;u=0,0,0,1
+3_42_1403_F3 solid read 2830264 2830283 9.6 - . g=T3212330132120221212;i=1;p=1.000;q=7,4,25,18,6,17,12,12,17,14,8,26,13,15,10,4,21,5,12,22;u=0,1
+3_457_42_F3 solid read 2874245 2874264 7.6 - . g=G0301123332223122221;i=1;p=1.000;q=18,10,14,9,19,4,10,8,11,10,6,8,5,8,11,4,13,6,4,6;u=0,0,1
+3_361_728_F3 solid read 2893879 2893898 14.6 + . g=C3213223312310132221;i=1;p=1.000;q=14,18,7,7,17,19,23,24,17,26,12,15,21,23,21,19,17,20,22,24;u=0,0,0,1
+3_77_718_F3 solid read 2913092 2913111 9.4 + . g=T3021331333313131231;i=1;p=1.000;q=15,26,7,24,20,18,5,6,17,18,6,11,4,13,19,15,7,4,22,25;u=0,0,0,1
+3_116_154_F3 solid read 2917672 2917691 9.8 - . g=A0323231223233132311;i=1;p=1.000;q=20,9,19,18,10,18,8,16,25,6,18,6,12,24,6,7,5,15,7,17;u=0,0,0,1
+3_239_1415_F3 solid read 2923256 2923275 19.2 + . g=T3233113121300032200;i=1;p=1.000;q=25,27,27,26,27,24,27,27,25,27,22,27,21,26,22,19,26,9,14,21;u=1
+3_142_1468_F3 solid read 2930117 2930136 10.5 - . g=A3233323333303103330;i=1;p=1.000;q=9,20,6,26,16,18,8,13,20,25,25,18,6,12,11,18,4,16,16,6;u=0,0,1
+3_394_295_F3 solid read 2930118 2930137 8.1 - . g=T3023333333333311331;i=1;p=1.000;q=4,14,6,12,7,22,10,4,13,24,18,12,12,4,6,9,9,9,14,4;u=0,0,0,1
+3_222_1773_F3 solid read 2934040 2934059 11.6 + . g=T1303031311123232302;i=1;p=1.000;q=11,10,24,15,28,6,19,5,13,27,8,26,8,22,25,27,26,27,8,13;u=0,0,0,1
+3_276_1344_F3 solid read 2969950 2969969 13.2 - . g=G3211212131233322233;i=1;p=1.000;q=27,27,12,16,11,23,27,8,23,12,27,22,20,12,15,25,8,27,16,6;u=0,1
+3_155_1814_F3 solid read 3107393 3107412 13.6 + . g=A2332222213113120221;i=1;p=1.000;q=27,26,20,25,26,27,12,27,26,18,26,4,27,10,23,26,6,23,26,26;u=0,0,0,1
+3_373_2014_F3 solid read 3143956 3143975 12.0 - . g=T3013322223222221211;i=1;p=1.000;q=16,8,17,21,10,10,18,18,18,13,4,23,16,24,8,19,14,15,23,11;u=0,1
+3_81_1637_F3 solid read 3413619 3413638 9.1 + . g=G2313032322122302111;i=1;p=1.000;q=9,4,7,19,27,6,11,5,12,15,20,27,8,27,6,16,6,27,21,6;u=0,0,1
+3_291_969_F3 solid read 3438323 3438342 17.4 + . g=T0021120212032121313;i=1;p=1.000;q=24,27,6,27,27,27,27,13,27,27,25,27,26,27,27,20,23,26,27,20;u=1
+3_179_1617_F3 solid read 3475164 3475183 8.0 + . g=A2100132222332123123;i=1;p=1.000;q=21,25,11,22,4,19,7,21,20,4,5,24,25,16,4,4,11,19,4,4;u=0,0,0,1
+3_446_861_F3 solid read 3476173 3476192 11.6 - . g=G1213302212022132321;i=1;p=1.000;q=27,27,27,27,26,25,12,27,24,18,24,6,27,26,20,9,6,6,4,23;u=0,0,1
+3_397_317_F3 solid read 3545152 3545171 11.1 + . g=T3110031332233111131;i=1;p=1.000;q=22,27,9,9,26,5,22,20,9,10,16,22,24,6,23,25,22,4,17,18;u=0,0,0,1
+3_323_713_F3 solid read 3575287 3575306 16.2 - . g=A0322222200213223302;i=1;p=1.000;q=27,25,21,27,26,26,24,26,27,18,27,26,26,27,22,22,6,26,25,8;u=0,1
+3_294_1906_F3 solid read 3727542 3727561 8.4 - . g=A3030310223202311021;i=1;p=1.000;q=14,7,5,4,7,18,4,6,13,6,12,12,10,11,15,14,16,7,9,12;u=0,0,0,1
+3_443_223_F3 solid read 3730805 3730824 17.1 - . g=T1113320033330133111;i=1;p=1.000;q=28,27,18,27,27,27,20,26,27,14,25,16,19,19,8,23,16,21,16,15;u=0,0,1
+3_94_809_F3 solid read 3841898 3841917 21.8 - . g=A2032223110001131310;i=1;p=1.000;q=27,27,27,27,26,27,25,24,27,27,27,25,27,27,27,12,23,16,27,27;u=0,0,0,1
+3_245_387_F3 solid read 3878549 3878568 24.4 - . g=A0222211220333132122;i=1;p=1.000;q=27,27,26,27,26,27,27,25,27,25,26,27,18,21,26,25,26,23,24,24;u=1
+3_190_1089_F3 solid read 3900038 3900057 13.7 - . g=T1111110323122301202;i=1;p=1.000;q=27,11,27,11,8,9,27,9,9,26,25,27,11,27,23,14,24,20,22,26;u=0,0,1
+3_442_1501_F3 solid read 3912610 3912629 8.5 + . g=A0012333103302132301;i=1;p=1.000;q=11,11,15,19,15,6,12,10,4,11,21,5,9,16,7,14,4,4,8,19;u=0,0,1
+3_342_678_F3 solid read 4044575 4044594 4.0 + . g=A3333112332213322323;i=1;p=1.000;q=4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4;u=0,0,0,1
+3_56_1294_F3 solid read 4058789 4058808 12.7 + . g=G3323331232322213322;i=1;p=1.000;q=26,17,18,27,23,8,8,24,27,27,9,27,25,14,26,4,27,9,24,23;u=0,0,0,1
+3_69_1575_F3 solid read 4070467 4070486 9.9 + . g=A2222011012222112121;i=1;p=1.000;q=16,25,14,9,9,9,21,9,4,24,6,21,13,6,27,10,19,8,6,27;u=0,0,0,1
+3_198_476_F3 solid read 4080622 4080641 8.9 + . g=C2010231122212011133;i=1;p=1.000;q=16,8,8,16,12,17,4,16,12,15,10,4,9,6,4,25,9,9,23,11;u=0,1
+3_24_715_F3 solid read 4136503 4136522 4.0 - . g=G1313332132232313233;i=1;p=1.000;q=4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4;u=0,0,0,1
+3_151_283_F3 solid read 4148264 4148283 9.7 + . g=T3230210232022111220;i=1;p=1.000;q=9,14,6,25,25,19,6,4,16,11,12,20,10,13,26,19,6,4,19,14;u=0,0,1
+3_164_774_F3 solid read 4156157 4156176 9.6 + . g=G2311112210110223313;i=1;p=1.000;q=8,24,19,7,6,16,12,9,4,8,26,14,26,24,7,18,6,16,14,7;u=0,0,0,1
+3_275_1212_F3 solid read 4171385 4171404 8.3 + . g=G0223122231333302232;i=1;p=1.000;q=13,8,5,4,10,7,12,25,4,25,6,15,6,27,6,11,12,7,14,10;u=0,0,0,1
+3_148_289_F3 solid read 4177672 4177691 8.0 - . g=T1203101332223323323;i=1;p=1.000;q=9,21,11,6,5,7,25,24,26,24,8,9,7,12,7,4,11,9,4,4;u=0,0,0,1
+3_437_1000_F3 solid read 4179623 4179642 12.3 + . g=A0112222212231131001;i=1;p=1.000;q=26,27,26,27,4,27,17,6,22,13,27,24,6,27,21,27,22,15,24,9;u=0,0,1
+3_318_2011_F3 solid read 4218181 4218200 12.9 - . g=T2133330223033303323;i=1;p=1.000;q=25,27,27,5,5,16,27,16,27,15,18,25,26,11,27,19,16,24,9,15;u=0,0,0,1
+3_14_11_F3 solid read 4222697 4222716 7.8 - . g=T2323310222232322122;i=1;p=1.000;q=6,23,16,25,25,9,7,4,12,4,14,6,10,7,6,9,18,4,10,4;u=0,0,0,1
+3_402_391_F3 solid read 4274545 4274564 6.2 - . g=C3303323321111111111;i=1;p=1.000;q=10,19,15,15,7,8,13,4,7,4,5,16,4,4,5,4,9,4,4,4;u=0,0,0,1
+3_293_504_F3 solid read 4339235 4339254 9.5 + . g=C2133223303331120213;i=1;p=1.000;q=6,4,5,26,13,7,17,6,24,10,27,24,5,9,21,9,23,24,20,14;u=0,0,0,1
+3_360_914_F3 solid read 4407004 4407023 10.7 + . g=T3012102130232022001;i=1;p=1.000;q=23,24,19,17,24,6,26,17,25,15,7,24,14,11,26,9,22,4,8,5;u=0,0,0,1
+3_118_1532_F3 solid read 4431702 4431721 10.2 + . g=C3233220201223200322;i=1;p=1.000;q=20,9,17,22,17,23,13,4,9,5,16,11,10,6,17,7,9,22,27,27;u=0,0,1
+3_358_133_F3 solid read 4460191 4460210 9.1 + . g=T0221223112322112233;i=1;p=1.000;q=6,23,12,22,7,6,7,4,13,5,9,23,12,9,24,8,14,7,20,26;u=0,0,0,1
+3_397_195_F3 solid read 4499390 4499409 6.9 - . g=T3302332313332212121;i=1;p=1.000;q=23,14,15,5,9,8,6,4,4,13,4,16,13,16,4,7,4,12,4,5;u=0,0,0,1
+3_158_642_F3 solid read 4533144 4533163 7.1 - . g=A1332103332323233212;i=1;p=1.000;q=8,20,9,22,8,14,4,16,17,4,8,13,7,8,4,12,5,4,4,4;u=0,0,0,1
+3_300_1439_F3 solid read 4580452 4580471 12.3 - . g=A0331111211302100201;i=1;p=1.000;q=5,17,21,14,4,16,11,27,21,9,17,17,27,23,12,21,16,27,25,25;u=0,0,0,1
+# Elapsed time 0.846 secs
=====================================
Tests/GFF/c_elegans_WS199_ann_gff.txt
=====================================
@@ -0,0 +1,2 @@
+# modified GFF file to remove location coordinates and test annotations
+I Expr_profile experimental_result_region . . . + . expr_profile=B0019.1
=====================================
Tests/GFF/c_elegans_WS199_dna_shortened.fa
=====================================
@@ -0,0 +1,21 @@
+>I
+gcctaagcctaagcctaagcctaagcctaagcctaagcctaagcctaagcctaagcctaa
+gcctaagcctaagcctaagcctaagcctaagcctaagcct
+>II
+cctaagcctaagcctaagcctaagcctaagcctaagcctaagcctaagcctaagcctaag
+cctaagcctaagcctaagcctaagcctaagcctaagccta
+>III
+cctaagcctaagcctaagcctaagcctaagcctaagcctaagcctaagcctaagcctaag
+cctaagcctaagcctaagcctaagcctaagcctaagccta
+>IV
+cctaagcctaagcctaagcctaagcctaagcctaagcctaagcctaagcctaagcctaag
+cctaagcctaagcctaagcctaagcctaagcctaagccta
+>V
+gaattcctaagcctaagcctaagcctaagcctaagcctaagcctaagcctaagcctaagc
+ctaagcctaagcctaagcctaagcctaagcctaagcctaa
+>X
+ctaagcctaagcctaagcctaagcctaagcctaagcctaagcctaagcctaagcctaagc
+ctaagcctaagcctaagcctaagcctaagcctaagcctaa
+>MtDNA
+cagtaaatagtttaataaaaatatagcatttgggttgctaagatattattactgatagaa
+tttttagtttaatttagaatgtatcacttacaatgatggg
=====================================
Tests/GFF/c_elegans_WS199_shortened_gff.txt
=====================================
@@ -0,0 +1,177 @@
+I Orfeome PCR_product 12759747 12764936 . - . amplified=1;pcr_product=mv_B0019.1
+I SAGE_tag_unambiguously_mapped SAGE_tag 12763533 12763553 . - . count=1;gene=amx-2;sequence=SAGE:ggcagagtcttttggca;transcript=B0019.1
+I SAGE_tag_unambiguously_mapped SAGE_tag 12761492 12761512 . - . count=5;gene=amx-2;sequence=SAGE:aacggagccgtacacgc;transcript=B0019.1
+I SAGE_tag_most_three_prime SAGE_tag 12761499 12761512 . - . count=9;gene=amx-2;sequence=SAGE:aacggagccg;transcript=B0019.1
+X SAGE_tag SAGE_tag 6819353 6819366 . + . count=9;gene=amx-2;sequence=SAGE:aacggagccg;transcript=B0019.1
+I Expr_profile experimental_result_region 12762449 12764118 . + . expr_profile=B0019.1
+I Coding_transcript CDS 12759745 12759828 . - 0 ID=CDS:B0019.1;Parent=Transcript:B0019.1;locus=amx-2;status=Partially_confirmed;wormpep=CE:CE40797
+I Coding_transcript CDS 12759949 12760013 . - 2 ID=CDS:B0019.1;Parent=Transcript:B0019.1;locus=amx-2;status=Partially_confirmed;wormpep=CE:CE40797
+I Coding_transcript CDS 12760227 12760319 . - 2 ID=CDS:B0019.1;Parent=Transcript:B0019.1;locus=amx-2;status=Partially_confirmed;wormpep=CE:CE40797
+I Coding_transcript CDS 12760365 12760494 . - 0 ID=CDS:B0019.1;Parent=Transcript:B0019.1;locus=amx-2;status=Partially_confirmed;wormpep=CE:CE40797
+I Coding_transcript CDS 12760834 12760904 . - 2 ID=CDS:B0019.1;Parent=Transcript:B0019.1;locus=amx-2;status=Partially_confirmed;wormpep=CE:CE40797
+I Coding_transcript CDS 12761172 12761516 . - 2 ID=CDS:B0019.1;Parent=Transcript:B0019.1;locus=amx-2;status=Partially_confirmed;wormpep=CE:CE40797
+I Coding_transcript CDS 12761799 12761953 . - 1 ID=CDS:B0019.1;Parent=Transcript:B0019.1;locus=amx-2;status=Partially_confirmed;wormpep=CE:CE40797
+I Coding_transcript CDS 12762127 12762268 . - 2 ID=CDS:B0019.1;Parent=Transcript:B0019.1;locus=amx-2;status=Partially_confirmed;wormpep=CE:CE40797
+I Coding_transcript CDS 12762648 12762806 . - 2 ID=CDS:B0019.1;Parent=Transcript:B0019.1;locus=amx-2;status=Partially_confirmed;wormpep=CE:CE40797
+I Coding_transcript CDS 12763112 12763249 . - 2 ID=CDS:B0019.1;Parent=Transcript:B0019.1;locus=amx-2;status=Partially_confirmed;wormpep=CE:CE40797
+I Coding_transcript CDS 12763448 12763655 . - 0 ID=CDS:B0019.1;Parent=Transcript:B0019.1;locus=amx-2;status=Partially_confirmed;wormpep=CE:CE40797
+I Coding_transcript CDS 12763729 12763882 . - 1 ID=CDS:B0019.1;Parent=Transcript:B0019.1;locus=amx-2;status=Partially_confirmed;wormpep=CE:CE40797
+I Coding_transcript CDS 12763979 12764102 . - 2 ID=CDS:B0019.1;Parent=Transcript:B0019.1;locus=amx-2;status=Partially_confirmed;wormpep=CE:CE40797
+I Coding_transcript CDS 12764291 12764471 . - 0 ID=CDS:B0019.1;Parent=Transcript:B0019.1;locus=amx-2;status=Partially_confirmed;wormpep=CE:CE40797
+I Coding_transcript CDS 12764812 12764937 . - 0 ID=CDS:B0019.1;Parent=Transcript:B0019.1;locus=amx-2;status=Partially_confirmed;wormpep=CE:CE40797
+I history CDS 12759745 12759828 . - 0 ID=CDS:B0019.1:wp173
+I history CDS 12759949 12760013 . - 2 ID=CDS:B0019.1:wp173
+I history CDS 12760227 12760319 . - 2 ID=CDS:B0019.1:wp173
+I history CDS 12760365 12760494 . - 0 ID=CDS:B0019.1:wp173
+I history CDS 12760834 12760904 . - 2 ID=CDS:B0019.1:wp173
+I history CDS 12761172 12761516 . - 2 ID=CDS:B0019.1:wp173
+I history CDS 12761577 12761626 . - 1 ID=CDS:B0019.1:wp173
+I history CDS 12761795 12761953 . - 1 ID=CDS:B0019.1:wp173
+I history CDS 12762127 12762268 . - 2 ID=CDS:B0019.1:wp173
+I history CDS 12762648 12762806 . - 2 ID=CDS:B0019.1:wp173
+I history CDS 12763112 12763249 . - 2 ID=CDS:B0019.1:wp173
+I history CDS 12763448 12763655 . - 0 ID=CDS:B0019.1:wp173
+I history CDS 12763729 12763882 . - 1 ID=CDS:B0019.1:wp173
+I history CDS 12763979 12764102 . - 2 ID=CDS:B0019.1:wp173
+I history CDS 12764291 12764471 . - 0 ID=CDS:B0019.1:wp173
+I history CDS 12764812 12764937 . - 0 ID=CDS:B0019.1:wp173
+I history CDS 12759745 12759828 . - 0 ID=CDS:B0019.1:wp90
+I history CDS 12759949 12760013 . - 2 ID=CDS:B0019.1:wp90
+I history CDS 12760227 12760319 . - 2 ID=CDS:B0019.1:wp90
+I history CDS 12761172 12761516 . - 2 ID=CDS:B0019.1:wp90
+I history CDS 12761577 12761626 . - 1 ID=CDS:B0019.1:wp90
+I history CDS 12761795 12761953 . - 1 ID=CDS:B0019.1:wp90
+I history CDS 12762127 12762268 . - 2 ID=CDS:B0019.1:wp90
+I history CDS 12762648 12762806 . - 2 ID=CDS:B0019.1:wp90
+I history CDS 12763112 12763249 . - 2 ID=CDS:B0019.1:wp90
+I history CDS 12763469 12763655 . - 0 ID=CDS:B0019.1:wp90
+I history CDS 12763729 12763882 . - 1 ID=CDS:B0019.1:wp90
+I history CDS 12763979 12764102 . - 2 ID=CDS:B0019.1:wp90
+I history CDS 12764291 12764471 . - 0 ID=CDS:B0019.1:wp90
+I history CDS 12764812 12764937 . - 0 ID=CDS:B0019.1:wp90
+I mass_spec_genome translated_nucleotide_match 12761920 12761953 . - . ID=Target:381130;Target=Mass_spec_peptide:MSP:FADFSPLDVSDVNFATDDLAK 10 21 +;Note=MSP:FADFSPLDVSDVNFATDDLAK;cds_matches=B0019.1;protein_matches=WP:CE40797;times_observed=3
+I mass_spec_genome translated_nucleotide_match 12762127 12762155 . - . ID=Target:381130;Target=Mass_spec_peptide:MSP:FADFSPLDVSDVNFATDDLAK 1 10 +;Note=MSP:FADFSPLDVSDVNFATDDLAK;cds_matches=B0019.1;protein_matches=WP:CE40797;times_observed=3
+I mass_spec_genome translated_nucleotide_match 12763506 12763559 . - . ID=Target:381133;Target=Mass_spec_peptide:MSP:FGHGQSLLAQGGMNEVVR 1 18 +;Note=MSP:FGHGQSLLAQGGMNEVVR;cds_matches=B0019.1;protein_matches=WP:CE40797;times_observed=1
+I mass_spec_genome translated_nucleotide_match 12764361 12764411 . - . ID=Target:381144;Target=Mass_spec_peptide:MSP:NIQQNRPGLSVLVLEAR 1 17 +;Note=MSP:NIQQNRPGLSVLVLEAR;cds_matches=B0019.1;protein_matches=WP:CE40797;times_observed=2
+I Coding_transcript mRNA 12759582 12764949 . - . ID=Transcript:B0019.1;Note=amx-2;Parent=Gene:WBGene00000138;cds=B0019.1;prediction_status=Partially_confirmed;wormpep=CE:CE40797
+I Allele SNP 12764272 12764272 . + . interpolated_map_position=14.003;rflp=No;variation=snp_B0019[1]
+I Oligo_set reagent 12759745 12761589 . - . oligo_set=Aff_B0019.1
+I Coding_transcript exon 12759745 12759828 . - 0 Parent=Transcript:B0019.1
+I Coding_transcript exon 12759949 12760013 . - 2 Parent=Transcript:B0019.1
+I Coding_transcript exon 12760227 12760319 . - 2 Parent=Transcript:B0019.1
+I Coding_transcript exon 12760365 12760494 . - 0 Parent=Transcript:B0019.1
+I Coding_transcript exon 12760834 12760904 . - 2 Parent=Transcript:B0019.1
+I Coding_transcript exon 12761172 12761516 . - 2 Parent=Transcript:B0019.1
+I Coding_transcript exon 12761799 12761953 . - 1 Parent=Transcript:B0019.1
+I Coding_transcript exon 12762127 12762268 . - 2 Parent=Transcript:B0019.1
+I Coding_transcript exon 12762648 12762806 . - 2 Parent=Transcript:B0019.1
+I Coding_transcript exon 12763112 12763249 . - 2 Parent=Transcript:B0019.1
+I Coding_transcript exon 12763448 12763655 . - 0 Parent=Transcript:B0019.1
+I Coding_transcript exon 12763729 12763882 . - 1 Parent=Transcript:B0019.1
+I Coding_transcript exon 12763979 12764102 . - 2 Parent=Transcript:B0019.1
+I Coding_transcript exon 12764291 12764471 . - 0 Parent=Transcript:B0019.1
+I Coding_transcript exon 12764812 12764937 . - 0 Parent=Transcript:B0019.1
+I Coding_transcript five_prime_UTR 12764938 12764949 . - . Parent=Transcript:B0019.1
+I Coding_transcript three_prime_UTR 12759582 12759744 . - . Parent=Transcript:B0019.1
+I Coding_transcript intron 12760495 12760833 . - . Parent=Transcript:B0019.1;confirmed_est=EC027594
+I Coding_transcript intron 12760905 12761171 . - . Parent=Transcript:B0019.1;confirmed_est=EC027594
+I Coding_transcript intron 12761517 12761798 . - . Parent=Transcript:B0019.1;confirmed_est=EC027594
+I Coding_transcript intron 12759829 12759948 . - . Parent=Transcript:B0019.1;confirmed_est=EC034652
+I Coding_transcript intron 12760014 12760226 . - . Parent=Transcript:B0019.1;confirmed_est=EC034652
+I Coding_transcript intron 12760320 12760364 . - . Parent=Transcript:B0019.1;confirmed_est=yk1054h04.3
+I Coding_transcript intron 12763883 12763978 . - . Parent=Transcript:B0019.1;confirmed_est=yk1054h04.5,OSTF088D9_1
+I Coding_transcript intron 12764103 12764290 . - . Parent=Transcript:B0019.1;confirmed_est=yk1054h04.5,OSTF088D9_1
+I Coding_transcript intron 12764472 12764811 . - . Parent=Transcript:B0019.1;confirmed_est=yk1054h04.5,OSTF088D9_1
+I Coding_transcript intron 12762807 12763111 . - . Parent=Transcript:B0019.1;confirmed_est=yk1056c07.5
+I Coding_transcript intron 12763250 12763447 . - . Parent=Transcript:B0019.1;confirmed_est=yk1056c07.5
+I Coding_transcript intron 12763656 12763728 . - . Parent=Transcript:B0019.1;confirmed_est=yk1056c07.5
+I Coding_transcript intron 12761954 12762126 . - . Parent=Transcript:B0019.1;confirmed_est=yk262g9.5
+I Coding_transcript intron 12762269 12762647 . - . Parent=Transcript:B0019.1;confirmed_est=yk262g9.5
+I Promoterome PCR_product 12764938 12766937 . + . pcr_product=p_B0019.1_93
+I GenePair_STS PCR_product 12762449 12764118 . + . pcr_product=sjj_B0019.1
+I Coding_transcript gene 12759582 12764949 . - . ID=Gene:WBGene00000138
+III Orfeome PCR_product 13780230 13780850 . + . amplified=1;pcr_product=mv_3R5.1.v6
+IV Orfeome PCR_product 17486939 17488952 . - . amplified=1;pcr_product=mv_4R79.1
+IV Orfeome PCR_product 17480353 17483284 . - . amplified=1;pcr_product=mv_4R79.2
+X Orfeome PCR_product 17714881 17718531 . + . amplified=1;pcr_product=mv_6R55.1
+X Orfeome PCR_product 17712787 17714742 . + . amplified=1;pcr_product=mv_6R55.2
+II Orfeome PCR_product 6995874 7010146 . + . amplified=1;pcr_product=mv_AAA03517
+III Orfeome PCR_product 5625097 5631795 . + . amplified=1;pcr_product=mv_AAA03544
+X GenePair_STS PCR_product 9962853 9963737 . + . pcr_product=cenix:102-c3
+II GenePair_STS PCR_product 5507236 5508135 . + . pcr_product=cenix:102-c4
+V GenePair_STS PCR_product 10117842 10118735 . + . pcr_product=cenix:102-c5
+IV GenePair_STS PCR_product 3566130 3567025 . + . pcr_product=cenix:102-c6
+X GenePair_STS PCR_product 6117180 6117930 . + . pcr_product=cenix:102-c7
+IV GenePair_STS PCR_product 7189492 7190369 . + . pcr_product=cenix:102-c9
+II GenePair_STS PCR_product 14462527 14463202 . + . pcr_product=cenix:102-d1
+X Promoterome PCR_product 2258069 2259336 . + . pcr_product=p_AH9.2_93
+IV Promoterome PCR_product 12157449 12159448 . + . pcr_product=p_B0001.6_93
+I Promoterome PCR_product 12764938 12766937 . + . pcr_product=p_B0019.1_93
+V Promoterome PCR_product 10320122 10320689 . + . pcr_product=p_B0024.12_93
+I Coding_transcript CDS 4581214 4581237 . - 0 ID=CDS:D1007.5b;Parent=Transcript:D1007.5b.2,Transcript:D1007.5b.1;status=Confirmed;wormpep=WP:CE33577
+I Coding_transcript CDS 4581664 4582026 . - 0 ID=CDS:D1007.5b;Parent=Transcript:D1007.5b.2,Transcript:D1007.5b.1;status=Confirmed;wormpep=WP:CE33577
+I Coding_transcript CDS 4582412 4582718 . - 1 ID=CDS:D1007.5b;Parent=Transcript:D1007.5b.2,Transcript:D1007.5b.1;status=Confirmed;wormpep=WP:CE33577
+I Coding_transcript CDS 4583190 4583374 . - 0 ID=CDS:D1007.5b;Parent=Transcript:D1007.5b.2,Transcript:D1007.5b.1;status=Confirmed;wormpep=WP:CE33577
+I Coding_transcript CDS 4583426 4583509 . - 0 ID=CDS:D1007.5b;Parent=Transcript:D1007.5b.2,Transcript:D1007.5b.1;status=Confirmed;wormpep=WP:CE33577
+I Coding_transcript CDS 4583560 4583805 . - 0 ID=CDS:D1007.5b;Parent=Transcript:D1007.5b.2,Transcript:D1007.5b.1;status=Confirmed;wormpep=WP:CE33577
+I Coding_transcript mRNA 4580734 4583815 . - . ID=Transcript:D1007.5b.1;Parent=Gene:WBGene00017003;cds=D1007.5b;prediction_status=Confirmed;wormpep=WP:CE33577
+I Coding_transcript mRNA 4581214 4583811 . - . ID=Transcript:D1007.5b.2;Parent=Gene:WBGene00017003;cds=D1007.5b;prediction_status=Confirmed;wormpep=WP:CE33577
+I Coding_transcript exon 4581214 4581237 . - 0 Parent=Transcript:D1007.5b.1
+I Coding_transcript exon 4581664 4582026 . - 0 Parent=Transcript:D1007.5b.1
+I Coding_transcript exon 4582412 4582718 . - 1 Parent=Transcript:D1007.5b.1
+I Coding_transcript exon 4583190 4583374 . - 0 Parent=Transcript:D1007.5b.1
+I Coding_transcript exon 4583426 4583509 . - 0 Parent=Transcript:D1007.5b.1
+I Coding_transcript exon 4583560 4583805 . - 0 Parent=Transcript:D1007.5b.1
+I Coding_transcript five_prime_UTR 4583806 4583815 . - . Parent=Transcript:D1007.5b.1
+I Coding_transcript three_prime_UTR 4580734 4581213 . - . Parent=Transcript:D1007.5b.1
+I Coding_transcript intron 4582027 4582411 . - . Parent=Transcript:D1007.5b.1;confirmed_est=EB994038
+I Coding_transcript intron 4583375 4583425 . - . Parent=Transcript:D1007.5b.1;confirmed_est=EC038345,OSTF085G5_1
+I Coding_transcript intron 4583510 4583559 . - . Parent=Transcript:D1007.5b.1;confirmed_est=EC038345,OSTF085G5_1
+I Coding_transcript intron 4582719 4583189 . - . Parent=Transcript:D1007.5b.1;confirmed_est=yk1055g06.5,OSTF085G5_1
+I Coding_transcript intron 4581238 4581663 . - . Parent=Transcript:D1007.5b.1;confirmed_est=yk1057e08.3
+I Coding_transcript exon 4581214 4581237 . - 0 Parent=Transcript:D1007.5b.2
+I Coding_transcript exon 4581664 4582026 . - 0 Parent=Transcript:D1007.5b.2
+I Coding_transcript exon 4582412 4582718 . - 1 Parent=Transcript:D1007.5b.2
+I Coding_transcript exon 4583190 4583374 . - 0 Parent=Transcript:D1007.5b.2
+I Coding_transcript exon 4583426 4583509 . - 0 Parent=Transcript:D1007.5b.2
+I Coding_transcript exon 4583560 4583805 . - 0 Parent=Transcript:D1007.5b.2
+I Coding_transcript five_prime_UTR 4583806 4583811 . - . Parent=Transcript:D1007.5b.2
+I Coding_transcript intron 4582027 4582411 . - . Parent=Transcript:D1007.5b.2;confirmed_est=EB994038
+I Coding_transcript intron 4583375 4583425 . - . Parent=Transcript:D1007.5b.2;confirmed_est=EC038345,OSTF085G5_1
+I Coding_transcript intron 4583510 4583559 . - . Parent=Transcript:D1007.5b.2;confirmed_est=EC038345,OSTF085G5_1
+I Coding_transcript intron 4582719 4583189 . - . Parent=Transcript:D1007.5b.2;confirmed_est=yk1055g06.5,OSTF085G5_1
+I Coding_transcript intron 4581238 4581663 . - . Parent=Transcript:D1007.5b.2;confirmed_est=yk1057e08.3
+I Coding_transcript gene 4580693 4583815 . - . ID=Gene:WBGene00017003
+I SAGE_tag_unambiguously_mapped SAGE_tag 4581093 4581113 . - . count=10;gene=D1007.5;sequence=SAGE:tttgcgaattacttgct;transcript=D1007.5b.1,D1007.5a
+I SAGE_tag_unambiguously_mapped SAGE_tag 4580748 4580768 . - . count=112;gene=D1007.5;sequence=SAGE:ttttccattaattttga;transcript=D1007.5b.1,D1007.5a
+I SAGE_tag_unambiguously_mapped SAGE_tag 4582415 4582428 . - . count=1;gene=D1007.5;sequence=SAGE:cattttcgtg;transcript=D1007.5b.2,D1007.5b.1,D1007.5a
+I SAGE_tag_unambiguously_mapped SAGE_tag 4580914 4580927 . - . count=1;gene=D1007.5;sequence=SAGE:taaatttcaa;transcript=D1007.5b.1,D1007.5a
+I SAGE_tag_unambiguously_mapped SAGE_tag 4581193 4581206 . - . count=1;gene=D1007.5;sequence=SAGE:tgctcgttcg;transcript=D1007.5b.1,D1007.5a
+I SAGE_tag_unambiguously_mapped SAGE_tag 4583465 4583478 . - . count=1;gene=D1007.5;sequence=SAGE:tgttggcctt;transcript=D1007.5b.2,D1007.5b.1,D1007.5a
+I SAGE_tag_unambiguously_mapped SAGE_tag 4583458 4583478 . - . count=1;gene=D1007.5;sequence=SAGE:tgttggccttttacttg;transcript=D1007.5b.2,D1007.5b.1,D1007.5a
+I SAGE_tag_unambiguously_mapped SAGE_tag 4582533 4582553 . - . count=2;gene=D1007.5;sequence=SAGE:tgcagtgatagtccagc;transcript=D1007.5b.2,D1007.5b.1,D1007.5a
+I SAGE_tag_unambiguously_mapped SAGE_tag 4581100 4581113 . - . count=2;gene=D1007.5;sequence=SAGE:tttgcgaatt;transcript=D1007.5b.1,D1007.5a
+I SAGE_tag_unambiguously_mapped SAGE_tag 4580755 4580768 . - . count=43;gene=D1007.5;sequence=SAGE:ttttccatta;transcript=D1007.5b.1,D1007.5a
+I Coding_transcript CDS 4580993 4581241 . - 0 ID=CDS:D1007.5a;Parent=Transcript:D1007.5a;status=Confirmed;wormpep=CE:CE29034
+I Coding_transcript CDS 4581664 4582026 . - 0 ID=CDS:D1007.5a;Parent=Transcript:D1007.5a;status=Confirmed;wormpep=CE:CE29034
+I Coding_transcript CDS 4582412 4582718 . - 1 ID=CDS:D1007.5a;Parent=Transcript:D1007.5a;status=Confirmed;wormpep=CE:CE29034
+I Coding_transcript CDS 4583190 4583374 . - 0 ID=CDS:D1007.5a;Parent=Transcript:D1007.5a;status=Confirmed;wormpep=CE:CE29034
+I Coding_transcript CDS 4583426 4583509 . - 0 ID=CDS:D1007.5a;Parent=Transcript:D1007.5a;status=Confirmed;wormpep=CE:CE29034
+I Coding_transcript CDS 4583560 4583805 . - 0 ID=CDS:D1007.5a;Parent=Transcript:D1007.5a;status=Confirmed;wormpep=CE:CE29034
+I mass_spec_genome translated_nucleotide_match 4580996 4581052 . - . ID=Target:277116;Target=Mass_spec_peptide:MSP:IYEPSQEDLLLMHQLQQER 1 19 +;Note=MSP:IYEPSQEDLLLMHQLQQER;cds_matches=D1007.5a;protein_matches=WP:CE29034;times_observed=1
+I mass_spec_genome translated_nucleotide_match 4581838 4581882 . - . ID=Target:277138;Target=Mass_spec_peptide:MSP:AAIHLGSWHQIEGPR 1 15 +;Note=MSP:AAIHLGSWHQIEGPR;cds_matches=D1007.5b D1007.5a;protein_matches=WP:CE33577 WP:CE29034;times_observed=1
+I mass_spec_genome translated_nucleotide_match 4583581 4583601 . - . ID=Target:277176;Target=Mass_spec_peptide:MSP:TLWWLPK 1 7 +;Note=MSP:TLWWLPK;cds_matches=D1007.5b D1007.5a;protein_matches=WP:CE33577 WP:CE29034;times_observed=1
+I Coding_transcript mRNA 4580693 4583811 . - . ID=Transcript:D1007.5a;Parent=Gene:WBGene00017003;cds=D1007.5a;prediction_status=Confirmed;wormpep=CE:CE29034
+I Coding_transcript exon 4580993 4581241 . - 0 Parent=Transcript:D1007.5a
+I Coding_transcript exon 4581664 4582026 . - 0 Parent=Transcript:D1007.5a
+I Coding_transcript exon 4582412 4582718 . - 1 Parent=Transcript:D1007.5a
+I Coding_transcript exon 4583190 4583374 . - 0 Parent=Transcript:D1007.5a
+I Coding_transcript exon 4583426 4583509 . - 0 Parent=Transcript:D1007.5a
+I Coding_transcript exon 4583560 4583805 . - 0 Parent=Transcript:D1007.5a
+I Coding_transcript five_prime_UTR 4583806 4583811 . - . Parent=Transcript:D1007.5a
+I Coding_transcript three_prime_UTR 4580693 4580992 . - . Parent=Transcript:D1007.5a
+I Coding_transcript intron 4582027 4582411 . - . Parent=Transcript:D1007.5a;confirmed_est=EB994038
+I Coding_transcript intron 4581242 4581663 . - . Parent=Transcript:D1007.5a;confirmed_est=EB994038,OSTR085G5_1
+I Coding_transcript intron 4583375 4583425 . - . Parent=Transcript:D1007.5a;confirmed_est=EC038345,OSTF085G5_1
+I Coding_transcript intron 4583510 4583559 . - . Parent=Transcript:D1007.5a;confirmed_est=EC038345,OSTF085G5_1
+I Coding_transcript intron 4582719 4583189 . - . Parent=Transcript:D1007.5a;confirmed_est=yk1055g06.5,OSTF085G5_1
=====================================
Tests/GFF/ensembl_gtf.txt
=====================================
@@ -0,0 +1,33 @@
+I snoRNA exon 3747 3909 . - . gene_id "Y74C9A.6"; transcript_id "Y74C9A.6"; exon_number "1"; gene_name "Y74C9A.6"; transcript_name "NR_001477.2";
+I protein_coding exon 12764812 12764949 . - . gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "1"; gene_name "amx-2"; transcript_name "B0019.1";
+I protein_coding CDS 12764812 12764937 . - 0 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "1"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1";
+I protein_coding start_codon 12764935 12764937 . - 0 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "1"; gene_name "amx-2"; transcript_name "B0019.1";
+I protein_coding exon 12764291 12764471 . - . gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "2"; gene_name "amx-2"; transcript_name "B0019.1";
+I protein_coding CDS 12764291 12764471 . - 0 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "2"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1";
+I protein_coding exon 12763979 12764102 . - . gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "3"; gene_name "amx-2"; transcript_name "B0019.1";
+I protein_coding CDS 12763979 12764102 . - 2 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "3"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1";
+I protein_coding exon 12763729 12763882 . - . gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "4"; gene_name "amx-2"; transcript_name "B0019.1";
+I protein_coding CDS 12763729 12763882 . - 1 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "4"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1";
+I protein_coding exon 12763448 12763655 . - . gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "5"; gene_name "amx-2"; transcript_name "B0019.1";
+I protein_coding CDS 12763448 12763655 . - 0 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "5"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1";
+I protein_coding exon 12763112 12763249 . - . gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "6"; gene_name "amx-2"; transcript_name "B0019.1";
+I protein_coding CDS 12763112 12763249 . - 2 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "6"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1";
+I protein_coding exon 12762648 12762806 . - . gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "7"; gene_name "amx-2"; transcript_name "B0019.1";
+I protein_coding CDS 12762648 12762806 . - 2 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "7"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1";
+I protein_coding exon 12762127 12762268 . - . gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "8"; gene_name "amx-2"; transcript_name "B0019.1";
+I protein_coding CDS 12762127 12762268 . - 2 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "8"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1";
+I protein_coding exon 12761799 12761953 . - . gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "9"; gene_name "amx-2"; transcript_name "B0019.1";
+I protein_coding CDS 12761799 12761953 . - 1 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "9"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1";
+I protein_coding exon 12761172 12761516 . - . gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "10"; gene_name "amx-2"; transcript_name "B0019.1";
+I protein_coding CDS 12761172 12761516 . - 2 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "10"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1";
+I protein_coding exon 12760834 12760904 . - . gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "11"; gene_name "amx-2"; transcript_name "B0019.1";
+I protein_coding CDS 12760834 12760904 . - 2 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "11"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1";
+I protein_coding exon 12760365 12760494 . - . gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "12"; gene_name "amx-2"; transcript_name "B0019.1";
+I protein_coding CDS 12760365 12760494 . - 0 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "12"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1";
+I protein_coding exon 12760227 12760319 . - . gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "13"; gene_name "amx-2"; transcript_name "B0019.1";
+I protein_coding CDS 12760227 12760319 . - 2 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "13"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1";
+I protein_coding exon 12759949 12760013 . - . gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "14"; gene_name "amx-2"; transcript_name "B0019.1";
+I protein_coding CDS 12759949 12760013 . - 2 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "14"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1";
+I protein_coding exon 12759579 12759828 . - . gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "15"; gene_name "amx-2"; transcript_name "B0019.1";
+I protein_coding CDS 12759748 12759828 . - 0 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "15"; gene_name "amx-2"; transcript_name "B0019.1"; protein_id "B0019.1";
+I protein_coding stop_codon 12759745 12759747 . - 0 gene_id "B0019.1"; transcript_id "B0019.1"; exon_number "15"; gene_name "amx-2"; transcript_name "B0019.1";
=====================================
Tests/GFF/glimmer_nokeyval.gff3
=====================================
@@ -0,0 +1,6 @@
+##gff-version 3
+##sequence-region scaffold4215_3 1 6526
+scaffold4215_3 glimmer gene 3 62 . - . ID=GL0000006;Name=GL0000006;Lack 3'-end;
+scaffold4215_3 glimmer mRNA 3 62 . - . ID=GL0000006;Name=GL0000006;Parent=GL0000006;Lack 3'-end;
+scaffold4215_3 glimmer CDS 3 62 2.84 - 0 Parent=GL0000006;Lack 3'-end;
+scaffold4215_3 glimmer gene 124 1983 . - . ID=GL0000007;Name=GL0000007;Complete;
=====================================
Tests/GFF/hybrid1.gff3
=====================================
@@ -0,0 +1,17 @@
+##gff-version 3
+##sequence-region foo 1 100
+##feature-ontology bar
+##attribute-ontology baz
+##source-ontology boo
+##sequence-region chr17 62467934 62469545
+chr17 UCSC mRNA 62467934 62469545 . - . ID=A00469;Dbxref=AFFX-U133:205840_x_at,Locuslink:2688,Genbank-mRNA:A00469,Swissprot:P01241,PFAM:PF00103,AFFX-U95:1332_f_at,Swissprot:SOMA_HUMAN;Note=growth%20hormone%201;Alias=GH1
+chr17 UCSC CDS 62468039 62468236 . - 1 Parent=A00469
+chr17 UCSC CDS 62468490 62468654 . - 2 Parent=A00469
+chr17 UCSC CDS 62468747 62468866 . - 1 Parent=A00469
+chr17 UCSC CDS 62469076 62469236 . - 1 Parent=A00469
+chr17 UCSC CDS 62469497 62469506 . - 0 Parent=A00469
+###
+##FASTA
+>chr17
+GATTACA
+GATTACA
=====================================
Tests/GFF/jgi_gff2.txt
=====================================
@@ -0,0 +1,6 @@
+chr_1 JGI exon 37061 37174 . - . name "fgenesh1_pg.C_chr_1000007"; transcriptId 873
+chr_1 JGI CDS 37061 37174 . - 0 name "fgenesh1_pg.C_chr_1000007"; proteinId 873; exonNumber 3
+chr_1 JGI exon 37315 37620 . - . name "fgenesh1_pg.C_chr_1000007"; transcriptId 873
+chr_1 JGI CDS 37315 37620 . - 0 name "fgenesh1_pg.C_chr_1000007"; proteinId 873; exonNumber 2
+chr_1 JGI exon 37752 38216 . - . name "fgenesh1_pg.C_chr_1000007"; transcriptId 873
+chr_1 JGI CDS 37752 38216 . - 0 name "fgenesh1_pg.C_chr_1000007"; proteinId 873; exonNumber 1
=====================================
Tests/GFF/mouse_extra_comma.gff3
=====================================
@@ -0,0 +1,17 @@
+chr17 RefSeq gene 6797760 6818159 . + . ID=NC_000083.5:LOC100040603;Name=NC_000083.5:LOC100040603
+chr17 RefSeq mRNA 6797760 6818159 . + . ID=XM_001475631.1;Parent=NC_000083.5:LOC100040603
+chr17 RefSeq protein 6806527 6812289 . + . ID=;Parent=XM_001475631.1
+chr17 RefSeq five_prime_UTR 6797760 6797769 . + . Parent=XM_001475631.1
+chr17 RefSeq five_prime_UTR 6806513 6806526 . + . Parent=XM_001475631.1
+chr17 RefSeq CDS 6806527 6806553 . + 0 Name=CDS:NC_000083.5:LOC100040603;Parent=XM_001475631.1,
+chr17 RefSeq CDS 6808204 6808245 . + 0 Name=CDS:NC_000083.5:LOC100040603;Parent=XM_001475631.1,
+chr17 RefSeq CDS 6811330 6811453 . + 0 Name=CDS:NC_000083.5:LOC100040603;Parent=XM_001475631.1,
+chr17 RefSeq CDS 6811792 6811869 . + 2 Name=CDS:NC_000083.5:LOC100040603;Parent=XM_001475631.1,
+chr17 RefSeq CDS 6812219 6812289 . + 2 Name=CDS:NC_000083.5:LOC100040603;Parent=XM_001475631.1,
+chr17 RefSeq three_prime_UTR 6812290 6818159 . + . Parent=XM_001475631.1
+chr17 RefSeq exon 6797760 6797769 . + . Parent=XM_001475631.1
+chr17 RefSeq exon 6806513 6806553 . + . Parent=XM_001475631.1
+chr17 RefSeq exon 6808204 6808245 . + . Parent=XM_001475631.1
+chr17 RefSeq exon 6811330 6811453 . + . Parent=XM_001475631.1
+chr17 RefSeq exon 6811792 6811869 . + . Parent=XM_001475631.1
+chr17 RefSeq exon 6812219 6818159 . + . Parent=XM_001475631.1
=====================================
Tests/GFF/ncbi_gff3.txt
=====================================
@@ -0,0 +1,21 @@
+##gff-version 3
+##source-version NCBI C++ formatter 0.2
+##date 2009-04-25
+##Type DNA NC_008596.1
+NC_008596.1 RefSeq gene 12272 13301 . + . locus_tag=MSMEG_0013;note=ferric%20enterobactin%20transport%20system%20permease%20protein%20FepG%3B%20this%20gene%20contains%20a%20frame%20shift%20which%20is%20not%20the%20result%20of%20sequencing%20error%3B%20identified%20by%20match%20to%20protein%20family%20HMM%20PF01032;pseudo=;db_xref=GeneID:4537201
+NC_008596.1 RefSeq gene 1137579 1138550 . + . ID=NC_008596.1:speB;locus_tag=MSMEG_1072;db_xref=GeneID:4535378
+NC_008596.1 RefSeq CDS 1137579 1138547 . + 0 ID=NC_008596.1:speB:unknown_transcript_1;Parent=NC_008596.1:speB;locus_tag=MSMEG_1072;EC_number=3.5.3.11;note=identified%20by%20match%20to%20protein%20family%20HMM%20PF00491%3B%20match%20to%20protein%20family%20HMM%20TIGR01230;transl_table=11;product=agmatinase;protein_id=YP_885468.1;db_xref=GI:118469242;db_xref=GeneID:4535378;exon_number=1
+NC_008596.1 RefSeq start_codon 1137579 1137581 . + 0 ID=NC_008596.1:speB:unknown_transcript_1;Parent=NC_008596.1:speB;locus_tag=MSMEG_1072;EC_number=3.5.3.11;note=identified%20by%20match%20to%20protein%20family%20HMM%20PF00491%3B%20match%20to%20protein%20family%20HMM%20TIGR01230;transl_table=11;product=agmatinase;protein_id=YP_885468.1;db_xref=GI:118469242;db_xref=GeneID:4535378;exon_number=1
+NC_008596.1 RefSeq stop_codon 1138548 1138550 . + 0 ID=NC_008596.1:speB:unknown_transcript_1;Parent=NC_008596.1:speB;locus_tag=MSMEG_1072;EC_number=3.5.3.11;note=identified%20by%20match%20to%20protein%20family%20HMM%20PF00491%3B%20match%20to%20protein%20family%20HMM%20TIGR01230;transl_table=11;product=agmatinase;protein_id=YP_885468.1;db_xref=GI:118469242;db_xref=GeneID:4535378;exon_number=1
+NC_008596.1 RefSeq gene 3597069 3598112 . + . ID=NC_008596.1:speB;locus_tag=MSMEG_3535;db_xref=GeneID:4533678
+NC_008596.1 RefSeq CDS 3597069 3598109 . + 0 ID=NC_008596.1:speB:unknown_transcript_2;Parent=NC_008596.1:speB;locus_tag=MSMEG_3535;EC_number=3.5.3.11;note=identified%20by%20match%20to%20protein%20family%20HMM%20PF00491%3B%20match%20to%20protein%20family%20HMM%20TIGR01230;transl_table=11;product=agmatinase;protein_id=YP_887838.1;db_xref=GI:118470943;db_xref=GeneID:4533678;exon_number=1
+NC_008596.1 RefSeq start_codon 3597069 3597071 . + 0 ID=NC_008596.1:speB:unknown_transcript_2;Parent=NC_008596.1:speB;locus_tag=MSMEG_3535;EC_number=3.5.3.11;note=identified%20by%20match%20to%20protein%20family%20HMM%20PF00491%3B%20match%20to%20protein%20family%20HMM%20TIGR01230;transl_table=11;product=agmatinase;protein_id=YP_887838.1;db_xref=GI:118470943;db_xref=GeneID:4533678;exon_number=1
+NC_008596.1 RefSeq stop_codon 3598110 3598112 . + 0 ID=NC_008596.1:speB:unknown_transcript_2;Parent=NC_008596.1:speB;locus_tag=MSMEG_3535;EC_number=3.5.3.11;note=identified%20by%20match%20to%20protein%20family%20HMM%20PF00491%3B%20match%20to%20protein%20family%20HMM%20TIGR01230;transl_table=11;product=agmatinase;protein_id=YP_887838.1;db_xref=GI:118470943;db_xref=GeneID:4533678;exon_number=1
+NC_008596.1 RefSeq gene 4460713 4461672 . - . ID=NC_008596.1:speB;locus_tag=MSMEG_4374;db_xref=GeneID:4535424
+NC_008596.1 RefSeq CDS 4460716 4461672 . - 0 ID=NC_008596.1:speB:unknown_transcript_3;Parent=NC_008596.1:speB;locus_tag=MSMEG_4374;EC_number=3.5.3.11;note=identified%20by%20match%20to%20protein%20family%20HMM%20PF00491%3B%20match%20to%20protein%20family%20HMM%20TIGR01230;transl_table=11;product=agmatinase;protein_id=YP_888649.1;db_xref=GI:118469662;db_xref=GeneID:4535424;exon_number=1
+NC_008596.1 RefSeq start_codon 4461670 4461672 . - 0 ID=NC_008596.1:speB:unknown_transcript_3;Parent=NC_008596.1:speB;locus_tag=MSMEG_4374;EC_number=3.5.3.11;note=identified%20by%20match%20to%20protein%20family%20HMM%20PF00491%3B%20match%20to%20protein%20family%20HMM%20TIGR01230;transl_table=11;product=agmatinase;protein_id=YP_888649.1;db_xref=GI:118469662;db_xref=GeneID:4535424;exon_number=1
+NC_008596.1 RefSeq stop_codon 4460713 4460715 . - 0 ID=NC_008596.1:speB:unknown_transcript_3;Parent=NC_008596.1:speB;locus_tag=MSMEG_4374;EC_number=3.5.3.11;note=identified%20by%20match%20to%20protein%20family%20HMM%20PF00491%3B%20match%20to%20protein%20family%20HMM%20TIGR01230;transl_table=11;product=agmatinase;protein_id=YP_888649.1;db_xref=GI:118469662;db_xref=GeneID:4535424;exon_number=1
+NC_008596.1 RefSeq gene 4539385 4540344 . + . ID=NC_008596.1:speB;locus_tag=MSMEG_4459;db_xref=GeneID:4537057
+NC_008596.1 RefSeq CDS 4539385 4540341 . + 0 ID=NC_008596.1:speB:unknown_transcript_4;Parent=NC_008596.1:speB;locus_tag=MSMEG_4459;EC_number=3.5.3.11;note=identified%20by%20match%20to%20protein%20family%20HMM%20PF00491%3B%20match%20to%20protein%20family%20HMM%20TIGR01230;transl_table=11;product=agmatinase;protein_id=YP_888732.1;db_xref=GI:118472833;db_xref=GeneID:4537057;exon_number=1
+NC_008596.1 RefSeq start_codon 4539385 4539387 . + 0 ID=NC_008596.1:speB:unknown_transcript_4;Parent=NC_008596.1:speB;locus_tag=MSMEG_4459;EC_number=3.5.3.11;note=identified%20by%20match%20to%20protein%20family%20HMM%20PF00491%3B%20match%20to%20protein%20family%20HMM%20TIGR01230;transl_table=11;product=agmatinase;protein_id=YP_888732.1;db_xref=GI:118472833;db_xref=GeneID:4537057;exon_number=1
+NC_008596.1 RefSeq stop_codon 4540342 4540344 . + 0 ID=NC_008596.1:speB:unknown_transcript_4;Parent=NC_008596.1:speB;locus_tag=MSMEG_4459;EC_number=3.5.3.11;note=identified%20by%20match%20to%20protein%20family%20HMM%20PF00491%3B%20match%20to%20protein%20family%20HMM%20TIGR01230;transl_table=11;product=agmatinase;protein_id=YP_888732.1;db_xref=GI:118472833;db_xref=GeneID:4537057;exon_number=1
=====================================
Tests/GFF/problem_sequence_region.gff3
=====================================
@@ -0,0 +1,7 @@
+##gff-version 3
+#!gff-spec-version 1.21
+#!processor NCBI annotwriter
+##sequence-region 1 2482535
+##species https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=1282
+1 Local region 1 2482535 . + . ID=1:1..2482535;Dbxref=taxon:1282;Is_circular=true;Name=ANONYMOUS;gbkey=Src;genome=chromosome;mol_type=genomic DNA
+1 . gene 1 1356 . + . ID=gene-test_000001;Name=dnaA;gbkey=Gene;gene=dnaA;gene_biotype=protein_coding
=====================================
Tests/GFF/spaces.gff3
=====================================
@@ -0,0 +1,5 @@
+##gff-version 3
+contig1 . gene 1544 2057 . - . ID=contig1.1
+contig1 . mRNA 1544 2057 . - . ID=mRNA.contig1.1;Parent=contig1.1
+contig1 . mRNA 1544 2057 . - . foo=bar;ID=mRNA.contig1.1;Parent=contig1.1
+contig1 . mRNA 1544 2057 . - . ID=mRNA.contig1.1;Parent=contig1.1; foo=bar
=====================================
Tests/GFF/trans_splicing.gff3
=====================================
@@ -0,0 +1,11 @@
+1 manual gene 9559 9672 . + . ID=gene83;Name=rps12|lcl|NC_021456.1_cdsid_YP_008082803.1_8-gene;exception=trans-splicing
+1 manual gene 112442 113241 . + . ID=gene84;Name=rps12|lcl|NC_021456.1_cdsid_YP_008082803.1_8-gene;exception=trans-splicing
+1 manual mRNA 9559 9672 . + . ID=mRNA43;Parent=gene83,gene84;Name=rps12|lcl|NC_021456.1_cdsid_YP_008082803.1_8;exception=trans-splicing
+1 manual mRNA 112442 113241 . + . ID=mRNA43;Parent=gene83,gene84;Name=rps12|lcl|NC_021456.1_cdsid_YP_008082803.1_8;exception=trans-splicing
+1 manual exon 9559 9672 . + . Parent=mRNA43
+1 manual CDS 9559 9672 . + 0 Parent=mRNA43
+1 manual exon 112442 112673 . + . Parent=mRNA43
+1 manual CDS 112442 112673 . + 0 Parent=mRNA43
+1 manual intron 112674 113215 . + . Parent=mRNA43
+1 manual exon 113216 113241 . + . Parent=mRNA43
+1 manual CDS 113216 113241 . + 2 Parent=mRNA43
=====================================
Tests/GFF/transcripts.gff3
=====================================
@@ -0,0 +1,18 @@
+##gff-version 3
+##date 2013-11-13
+edit_test.fa . gene 500 2610 . + . ID=newGene
+edit_test.fa . mRNA 500 2385 . + . Parent=newGene;Namo=reinhard+did+this;Name=t1%28newGene%29;ID=t1;uri=http%3A//www.yahoo.com
+edit_test.fa . five_prime_UTR 500 802 . + . Parent=t1
+edit_test.fa . CDS 803 1012 . + . Parent=t1
+edit_test.fa . three_prime_UTR 1013 1168 . + . Parent=t1
+edit_test.fa . three_prime_UTR 1475 1654 . + . Parent=t1
+edit_test.fa . three_prime_UTR 1720 1908 . + . Parent=t1
+edit_test.fa . three_prime_UTR 2047 2385 . + . Parent=t1
+edit_test.fa . mRNA 1050 2610 . + . Parent=newGene;Name=t2%28newGene%29;ID=t2
+edit_test.fa . CDS 1050 1196 . + . Parent=t2
+edit_test.fa . CDS 1472 1651 . + . Parent=t2
+edit_test.fa . CDS 1732 2610 . + . Parent=t2
+edit_test.fa . mRNA 1050 2610 . + . Parent=newGene;Name=t3%28newGene%29;ID=t3
+edit_test.fa . CDS 1050 1196 . + . Parent=t3
+edit_test.fa . CDS 1472 1651 . + . Parent=t3
+edit_test.fa . CDS 1732 2610 . + . Parent=t3
=====================================
Tests/GFF/unescaped-semicolon.gff3
=====================================
@@ -0,0 +1,2 @@
+##gff-version 3
+chr1 . gene 1 100 . + . ID=PH01000020G1780;Description="osFTL6 FT-Like6 homologous to Flowering Locus T gene; contains Pfam profile PF01161: Phosphatidylethanolamine-binding protein, expressed"
\ No newline at end of file
=====================================
Tests/GFF/wormbase_gff2.txt
=====================================
@@ -0,0 +1,63 @@
+I Genomic_canonical region 1 2679 . + . Sequence "cTel33B" ; Note "Clone cTel33B; Genbank AC199162" ; Note "Clone cTel33B; Genbank AC199162"
+I Coding_transcript Transcript 12759582 12764949 . - . Transcript "B0019.1" ; WormPep "WP:CE40797" ; Note "amx-2" ; Prediction_status "Partially_confirmed" ; Gene "WBGene00000138" ; CDS "B0019.1" ; WormPep "WP:CE40797" ; Note "amx-2" ; Prediction_status "Partially_confirmed" ; Gene "WBGene00000138"
+I Coding_transcript intron 12759829 12759948 . - . Transcript "B0019.1" ; Confirmed_EST EC034652
+I Coding_transcript intron 12760014 12760226 . - . Transcript "B0019.1" ; Confirmed_EST EC034652
+I Coding_transcript intron 12760320 12760364 . - . Transcript "B0019.1" ; Confirmed_EST yk1054h04.3
+I Coding_transcript intron 12760495 12760833 . - . Transcript "B0019.1" ; Confirmed_EST EC027594
+I Coding_transcript intron 12760905 12761171 . - . Transcript "B0019.1" ; Confirmed_EST EC027594
+I Coding_transcript intron 12761517 12761798 . - . Transcript "B0019.1" ; Confirmed_EST EC027594
+I Coding_transcript intron 12761954 12762126 . - . Transcript "B0019.1" ; Confirmed_EST yk262g9.5
+I Coding_transcript intron 12762269 12762647 . - . Transcript "B0019.1" ; Confirmed_EST yk262g9.5
+I Coding_transcript intron 12762807 12763111 . - . Transcript "B0019.1" ; Confirmed_EST yk1056c07.5
+I Coding_transcript intron 12763250 12763447 . - . Transcript "B0019.1" ; Confirmed_EST yk1056c07.5
+I Coding_transcript intron 12763656 12763728 . - . Transcript "B0019.1" ; Confirmed_EST yk1056c07.5
+I Coding_transcript intron 12763883 12763978 . - . Transcript "B0019.1" ; Confirmed_EST yk1054h04.5 ; Confirmed_EST OSTF088D9_1
+I Coding_transcript intron 12764103 12764290 . - . Transcript "B0019.1" ; Confirmed_EST yk1054h04.5 ; Confirmed_EST OSTF088D9_1
+I Coding_transcript intron 12764472 12764811 . - . Transcript "B0019.1" ; Confirmed_EST yk1054h04.5 ; Confirmed_EST OSTF088D9_1
+I Coding_transcript exon 12759582 12759828 . - . Transcript "B0019.1"
+I Coding_transcript exon 12759949 12760013 . - . Transcript "B0019.1"
+I Coding_transcript exon 12760227 12760319 . - . Transcript "B0019.1"
+I Coding_transcript exon 12760365 12760494 . - . Transcript "B0019.1"
+I Coding_transcript exon 12760834 12760904 . - . Transcript "B0019.1"
+I Coding_transcript exon 12761172 12761516 . - . Transcript "B0019.1"
+I Coding_transcript exon 12761799 12761953 . - . Transcript "B0019.1"
+I Coding_transcript exon 12762127 12762268 . - . Transcript "B0019.1"
+I Coding_transcript exon 12762648 12762806 . - . Transcript "B0019.1"
+I Coding_transcript exon 12763112 12763249 . - . Transcript "B0019.1"
+I Coding_transcript exon 12763448 12763655 . - . Transcript "B0019.1"
+I Coding_transcript exon 12763729 12763882 . - . Transcript "B0019.1"
+I Coding_transcript exon 12763979 12764102 . - . Transcript "B0019.1"
+I Coding_transcript exon 12764291 12764471 . - . Transcript "B0019.1"
+I Coding_transcript exon 12764812 12764949 . - . Transcript "B0019.1"
+I SAGE_tag_unambiguously_mapped SAGE_tag 12761492 12761512 . - . Sequence SAGE:aacggagccgtacacgc;count 5;Gene amx-2;Transcript B0019.1
+I SAGE_tag_most_three_prime SAGE_tag 12761499 12761512 . - . Sequence SAGE:aacggagccg;count 9;Gene amx-2;Transcript B0019.1
+I mass_spec_genome translated_nucleotide_match 12761920 12761953 . - . Target "Mass_spec_peptide:MSP:FADFSPLDVSDVNFATDDLAK" 10 21 ; Note "MSP:FADFSPLDVSDVNFATDDLAK" ; Protein_matches "WP:CE40797" ; CDS_matches "B0019.1" ; Times_observed "3"
+I mass_spec_genome translated_nucleotide_match 12762127 12762155 . - . Target "Mass_spec_peptide:MSP:FADFSPLDVSDVNFATDDLAK" 1 10 ; Note "MSP:FADFSPLDVSDVNFATDDLAK" ; Protein_matches "WP:CE40797" ; CDS_matches "B0019.1" ; Times_observed "3"
+I mass_spec_genome translated_nucleotide_match 12763506 12763559 . - . Target "Mass_spec_peptide:MSP:FGHGQSLLAQGGMNEVVR" 1 18 ; Note "MSP:FGHGQSLLAQGGMNEVVR" ; Protein_matches "WP:CE40797" ; CDS_matches "B0019.1" ; Times_observed "1"
+I SAGE_tag_unambiguously_mapped SAGE_tag 12763533 12763553 . - . Sequence SAGE:ggcagagtcttttggca;count 1;Gene amx-2;Transcript B0019.1
+I mass_spec_genome translated_nucleotide_match 12764361 12764411 . - . Target "Mass_spec_peptide:MSP:NIQQNRPGLSVLVLEAR" 1 17 ; Note "MSP:NIQQNRPGLSVLVLEAR" ; Protein_matches "WP:CE40797" ; CDS_matches "B0019.1" ; Times_observed "2"
+I GenePair_STS PCR_product 12762449 12764118 . + . PCR_product "sjj_B0019.1"
+I Expr_profile experimental_result_region 12762449 12764118 . + . Expr_profile "B0019.1"
+I Allele SNP 12764272 12764272 . + . Variation "snp_B0019[1]" ; Interpolated_map_position "14.003" ; ; RFLP "No"
+I Promoterome PCR_product 12764938 12766937 . + . PCR_product "p_B0019.1_93"
+I Oligo_set reagent 12759745 12761589 . - . Oligo_set "Aff_B0019.1"
+I Orfeome PCR_product 12759747 12764936 . - . PCR_product "mv_B0019.1" ; Amplified 1 ; Amplified 1
+I Coding_transcript three_prime_UTR 12759582 12759744 . - . Transcript "B0019.1"
+I Coding_transcript coding_exon 12759745 12759828 . - 0 Transcript "B0019.1" ; CDS "B0019.1"
+I Coding_transcript coding_exon 12759949 12760013 . - 2 Transcript "B0019.1" ; CDS "B0019.1"
+I Coding_transcript coding_exon 12760227 12760319 . - 2 Transcript "B0019.1" ; CDS "B0019.1"
+I Coding_transcript coding_exon 12760365 12760494 . - 0 Transcript "B0019.1" ; CDS "B0019.1"
+I Coding_transcript coding_exon 12760834 12760904 . - 2 Transcript "B0019.1" ; CDS "B0019.1"
+I Coding_transcript coding_exon 12761172 12761516 . - 2 Transcript "B0019.1" ; CDS "B0019.1"
+I Coding_transcript coding_exon 12761799 12761953 . - 1 Transcript "B0019.1" ; CDS "B0019.1"
+I Coding_transcript coding_exon 12762127 12762268 . - 2 Transcript "B0019.1" ; CDS "B0019.1"
+I Coding_transcript coding_exon 12762648 12762806 . - 2 Transcript "B0019.1" ; CDS "B0019.1"
+I Coding_transcript coding_exon 12763112 12763249 . - 2 Transcript "B0019.1" ; CDS "B0019.1"
+I Coding_transcript coding_exon 12763448 12763655 . - 0 Transcript "B0019.1" ; CDS "B0019.1"
+I Coding_transcript coding_exon 12763729 12763882 . - 1 Transcript "B0019.1" ; CDS "B0019.1"
+I Coding_transcript coding_exon 12763979 12764102 . - 2 Transcript "B0019.1" ; CDS "B0019.1"
+I Coding_transcript coding_exon 12764291 12764471 . - 0 Transcript "B0019.1" ; CDS "B0019.1"
+I Coding_transcript five_prime_UTR 12764938 12764949 . - . Transcript "B0019.1"
+I Coding_transcript coding_exon 12764812 12764937 . - 0 Transcript "B0019.1" ; CDS "B0019.1"
+X SAGE_tag SAGE_tag 6819353 6819366 . + . Sequence SAGE:aacggagccg;count 9;Gene amx-2;Transcript B0019.1
+X gene processed_transcript 944828 948883 . - . Gene "WBGene00004893"
=====================================
Tests/GFF/wormbase_gff2_alt.txt
=====================================
@@ -0,0 +1,9 @@
+Remanei_genome Genomic_canonical region 1 7816 . + . Sequence "Contig1020";
+Contig102 WU_MERGED CDS 1629 3377 . - . CDS "cr01.sctg102.wum.2.1"
+Contig102 WU_MERGED coding_exon 2927 3377 . - . CDS "cr01.sctg102.wum.2.1"
+Contig102 WU_MERGED coding_exon 2474 2875 . - . CDS "cr01.sctg102.wum.2.1"
+Contig102 WU_MERGED coding_exon 1928 2430 . - . CDS "cr01.sctg102.wum.2.1"
+Contig102 WU_MERGED coding_exon 1629 1883 . - . CDS "cr01.sctg102.wum.2.1"
+Contig102 WU_MERGED intron 2876 2926 . - . CDS "cr01.sctg102.wum.2.1"
+Contig102 WU_MERGED intron 2431 2473 . - . CDS "cr01.sctg102.wum.2.1"
+Contig102 WU_MERGED intron 1884 1927 . - . CDS "cr01.sctg102.wum.2.1"
=====================================
Tests/test_GFFSeqIOFeatureAdder.py
=====================================
@@ -0,0 +1,684 @@
+"""Test decoration of existing SeqRecords with GFF through a SeqIO interface.
+"""
+import sys
+import os
+import unittest
+import pprint
+
+import six
+from six import StringIO
+
+from Bio import SeqIO
+from BCBio import GFF
+from Bio.Seq import Seq
+from Bio.SeqRecord import SeqRecord
+from Bio.SeqFeature import SeqFeature, FeatureLocation
+from BCBio.GFF import (GFFExaminer, GFFParser, DiscoGFFParser)
+
+
+class MapReduceGFFTest(unittest.TestCase):
+    """Tests GFF parsing using a map-reduce framework for parallelization.
+
+    Both tests parse the shortened C. elegans WS199 GFF file and restrict the
+    result to record 'I' through ``limit_info``.
+    """
+
+    def setUp(self):
+        """Locate the GFF test file and record the disco master address."""
+        self._test_dir = os.path.join(os.path.dirname(__file__), "GFF")
+        self._test_gff_file = os.path.join(self._test_dir, "c_elegans_WS199_shortened_gff.txt")
+        self._disco_host = "http://localhost:7000"
+
+    def t_local_map_reduce(self):
+        """General map reduce framework without parallelization.
+        """
+        cds_limit_info = dict(gff_type=["gene", "mRNA", "CDS"], gff_id=['I'])
+        rec_dict = SeqIO.to_dict(GFF.parse(self._test_gff_file, limit_info=cds_limit_info))
+        test_rec = rec_dict['I']
+        assert len(test_rec.features) == 32
+
+    def t_disco_map_reduce(self):
+        """Map reduce framework parallelized using disco.
+
+        The test is reported as skipped (rather than silently passing) when
+        the optional disco/simplejson modules cannot be imported.
+        """
+        # this needs to be more generalized but fails okay with no disco
+        try:
+            # Imported only to check availability; the names are not used here.
+            import disco
+            import simplejson
+        except ImportError:
+            # skipTest raises, so nothing below runs without the dependencies.
+            self.skipTest("disco and json not found")
+        cds_limit_info = dict(
+            gff_source_type=[('Non_coding_transcript', 'gene'), ('Coding_transcript', 'gene'),
+                             ('Coding_transcript', 'mRNA'), ('Coding_transcript', 'CDS')],
+            gff_id=['I']
+        )
+        parser = DiscoGFFParser(disco_host=self._disco_host)
+        rec_dict = SeqIO.to_dict(parser.parse(self._test_gff_file, limit_info=cds_limit_info))
+        final_rec = rec_dict['I']
+        # second gene feature is multi-parent
+        assert len(final_rec.features) == 2  # two gene features
+
+
+class GFF3Test(unittest.TestCase):
+    """Real live GFF3 tests from WormBase and NCBI.
+
+    Uses GFF3 data from:
+
+    ftp://ftp.wormbase.org/pub/wormbase/genomes/c_elegans/
+    genome_feature_tables/GFF3/
+    ftp://ftp.wormbase.org/pub/wormbase/genomes/c_elegans/sequences/dna/
+
+    and from NCBI.
+    """
+
+    def setUp(self):
+        """Resolve the paths of the data files used by the tests below."""
+        self._test_dir = os.path.join(os.path.dirname(__file__), "GFF")
+        self._test_seq_file = os.path.join(self._test_dir, "c_elegans_WS199_dna_shortened.fa")
+        self._test_gff_file = os.path.join(self._test_dir, "c_elegans_WS199_shortened_gff.txt")
+        self._test_gff_ann_file = os.path.join(self._test_dir, "c_elegans_WS199_ann_gff.txt")
+        # Only used by not_t_full_celegans, which is not collected by the loader.
+        self._full_dir = "/usr/home/chapmanb/mgh/ruvkun_rnai/wormbase/" + \
+            "data_files_WS198"
+        self._test_ncbi = os.path.join(self._test_dir, "ncbi_gff3.txt")
+
+    def not_t_full_celegans(self):
+        """Test the full C elegans chromosome and GFF files.
+
+        This is used to test GFF on large files and is not run as a standard
+        test. You will need to download the files and adjust the paths
+        to run this.
+        """
+        # read the sequence information
+        seq_file = os.path.join(self._full_dir, "c_elegans.WS199.dna.fa")
+        gff_file = os.path.join(self._full_dir, "c_elegans.WS199.gff3")
+        # The context manager closes the handle even if FASTA parsing raises.
+        with open(seq_file) as seq_handle:
+            seq_dict = SeqIO.to_dict(SeqIO.parse(seq_handle, "fasta"))
+        rnai_types = [('Orfeome', 'PCR_product'), ('GenePair_STS', 'PCR_product'), ('Promoterome', 'PCR_product')]
+        gene_types = [('Non_coding_transcript', 'gene'), ('Coding_transcript', 'gene'), ('Coding_transcript', 'mRNA'),
+                      ('Coding_transcript', 'CDS')]
+        limit_info = dict(gff_source_type=rnai_types + gene_types)
+        # Exhaust the parser; the test only checks that parsing completes.
+        for rec in GFF.parse(gff_file, seq_dict, limit_info=limit_info):
+            pass
+
+    def _get_seq_dict(self):
+        """Internal reusable function to get the sequence dictionary.
+
+        Returns the records of the shortened FASTA test file keyed by id, as
+        built by ``SeqIO.to_dict``.
+        """
+        # The context manager closes the handle even if FASTA parsing raises.
+        with open(self._test_seq_file) as seq_handle:
+            return SeqIO.to_dict(SeqIO.parse(seq_handle, "fasta"))
+
+    def t_possible_limits(self):
+        """Calculate possible queries to limit a GFF file.
+        """
+        gff_examiner = GFFExaminer()
+        possible_limits = gff_examiner.available_limits(self._test_gff_file)
+        print()
+        pprint.pprint(possible_limits)
+
+    def t_parent_child(self):
+        """Summarize parent-child relationships in a GFF file.
+        """
+        gff_examiner = GFFExaminer()
+        pc_map = gff_examiner.parent_child_map(self._test_gff_file)
+        print()
+        pprint.pprint(pc_map)
+
+    def t_parent_child_file_modes(self):
+        """Check parent-child summaries for filename, text and binary handles.
+        """
+        gff_examiner = GFFExaminer()
+        # Use the loaded-from-filename as reference
+        pc_map = gff_examiner.parent_child_map(self._test_gff_file)
+
+        with open(self._test_gff_file, "rt") as handle:
+            assert pc_map == gff_examiner.parent_child_map(handle)
+
+        with open(self._test_gff_file, "rb") as handle:
+            if six.PY2:
+                assert pc_map == gff_examiner.parent_child_map(handle)
+            else:
+                # On Python 3 a binary handle is expected to be rejected.
+                try:
+                    gff_examiner.parent_child_map(handle)
+                except TypeError as e:
+                    assert str(e) == "input handle must be opened in text mode", e
+                else:
+                    assert False, "expected TypeError to be raised"
+
+    def t_flat_features(self):
+        """Check addition of flat non-nested features to multiple records.
+        """
+        seq_dict = self._get_seq_dict()
+        pcr_limit_info = dict(
+            gff_source_type=[('Orfeome', 'PCR_product'), ('GenePair_STS',
+                                                          'PCR_product'), ('Promoterome', 'PCR_product')]
+        )
+        parser = GFFParser()
+        rec_dict = SeqIO.to_dict(parser.parse(self._test_gff_file, seq_dict, limit_info=pcr_limit_info))
+        assert len(rec_dict['I'].features) == 4
+        assert len(rec_dict['X'].features) == 5
+
+    def t_nested_features(self):
+        """Check three-deep nesting of features with gene, mRNA and CDS.
+        """
+        seq_dict = self._get_seq_dict()
+        cds_limit_info = dict(
+            gff_source_type=[('Coding_transcript', 'gene'), ('Coding_transcript', 'mRNA'),
+                             ('Coding_transcript', 'CDS')],
+            gff_id=['I']
+        )
+        parser = GFFParser()
+        rec_dict = SeqIO.to_dict(parser.parse(self._test_gff_file, seq_dict, limit_info=cds_limit_info))
+        final_rec = rec_dict['I']
+        # first gene feature is plain
+        assert len(final_rec.features) == 2  # two gene features
+        assert len(final_rec.features[0].sub_features) == 1  # one transcript
+        # 15 final CDS regions
+        assert len(final_rec.features[0].sub_features[0].sub_features) == 15
+
+    def t_nested_multiparent_features(self):
+        """Verify correct nesting of features with multiple parents.
+        """
+        seq_dict = self._get_seq_dict()
+        cds_limit_info = dict(
+            gff_source_type=[('Coding_transcript', 'gene'), ('Coding_transcript', 'mRNA'),
+                             ('Coding_transcript', 'CDS')],
+            gff_id=['I']
+        )
+        parser = GFFParser()
+        rec_dict = SeqIO.to_dict(parser.parse(self._test_gff_file, seq_dict, limit_info=cds_limit_info))
+        final_rec = rec_dict['I']
+        # second gene feature is multi-parent
+        assert len(final_rec.features) == 2  # two gene features
+        cur_subs = final_rec.features[1].sub_features
+        assert len(cur_subs) == 3  # three transcripts
+        # the first and second transcript have the same CDSs
+        assert len(cur_subs[0].sub_features) == 6
+        assert len(cur_subs[1].sub_features) == 6
+        # Shared CDS features must be the very same object, not copies.
+        assert cur_subs[0].sub_features[0] is cur_subs[1].sub_features[0]
+
+    def t_no_dict_error(self):
+        """Ensure an error is raised when no dictionary to map to is present.
+        """
+        parser = GFFParser(create_missing=False)
+        try:
+            for rec in parser.parse(self._test_gff_file):
+                pass
+        except KeyError:
+            pass
+        else:
+            # no error -- problem
+            raise AssertionError('Did not complain with missing dictionary')
+
+    def t_unknown_seq(self):
+        """Prepare unknown base sequences with the correct length.
+        """
+        rec_dict = SeqIO.to_dict(GFF.parse(self._test_gff_file))
+        assert len(rec_dict["I"].seq) == 12766937
+        assert len(rec_dict["X"].seq) == 17718531
+
+    def t_gff_annotations(self):
+        """Check GFF annotations placed on an entire sequence.
+        """
+        parser = GFFParser()
+        rec_dict = SeqIO.to_dict(parser.parse(self._test_gff_ann_file))
+        final_rec = rec_dict['I']
+        assert len(final_rec.annotations.keys()) == 2
+        assert final_rec.annotations['source'] == ['Expr_profile']
+        assert final_rec.annotations['expr_profile'] == ['B0019.1']
+
+    def t_gff3_iterator(self):
+        """Iterated parsing in GFF3 files with nested features.
+        """
+        parser = GFFParser()
+        recs = [r for r in parser.parse_in_parts(self._test_gff_file, target_lines=70)]
+        # Original note: should be one big set because we don't have a good
+        # place to split.
+        # NOTE(review): the assertion below expects 6 records, so that note
+        # looks stale -- confirm which is intended.
+        assert len(recs) == 6
+        assert len(recs[0].features) == 59
+
+    def t_gff3_iterator_limit(self):
+        """Iterated interface using a limit query on GFF3 files.
+        """
+        cds_limit_info = dict(
+            gff_source_type=[('Coding_transcript', 'gene'), ('Coding_transcript', 'mRNA'),
+                             ('Coding_transcript', 'CDS')],
+            gff_id=['I']
+        )
+        parser = GFFParser()
+        rec_dict = SeqIO.to_dict(parser.parse(self._test_gff_file, limit_info=cds_limit_info))
+        assert len(rec_dict) == 1
+        tfeature = rec_dict["I"].features[0].sub_features[0]
+        for sub_test in tfeature.sub_features:
+            assert sub_test.type == "CDS", sub_test
+
+    def t_gff3_noval_attrib(self):
+        """Parse GFF3 file from NCBI with a key/value pair with no value.
+        """
+        parser = GFFParser()
+        rec_dict = SeqIO.to_dict(parser.parse(self._test_ncbi))
+        assert len(rec_dict) == 1
+        t_feature = list(rec_dict.values())[0].features[0]
+        assert t_feature.qualifiers["pseudo"] == ["true"]
+
+    def t_gff3_multiple_ids(self):
+        """Deal with GFF3 with non-unique ID attributes, using NCBI example.
+        """
+        parser = GFFParser()
+        rec_dict = SeqIO.to_dict(parser.parse(self._test_ncbi))
+        assert len(rec_dict) == 1
+        t_features = list(rec_dict.values())[0].features[1:]
+        # 4 feature sets, same ID, different positions, different attributes
+        assert len(t_features) == 4
+        for f in t_features:
+            assert len(f.sub_features) == 3
+
+    def t_simple_parsing(self):
+        """Parse GFF into a simple line by line dictionary without nesting.
+        """
+        parser = GFFParser()
+        num_lines = 0
+        for line_info in parser.parse_simple(self._test_gff_file):
+            num_lines += 1
+        assert num_lines == 177, num_lines
+        # line_info still holds the last parsed entry after the loop.
+        line_info = line_info['child'][0]
+        assert line_info['quals']['confirmed_est'] == \
+            ['yk1055g06.5', 'OSTF085G5_1']
+        assert line_info['location'] == [4582718, 4583189]
+
+    def t_simple_parsing_nesting(self):
+        """Simple parsing for lines with nesting, using the simplified API.
+        """
+        test_gff = os.path.join(self._test_dir, "transcripts.gff3")
+        num_lines = 0
+        for line_info in GFF.parse_simple(test_gff):
+            num_lines += 1
+        assert num_lines == 16, num_lines
+
+    def t_extra_comma(self):
+        """Correctly handle GFF3 files with extra trailing commas.
+        """
+        tfile = os.path.join(self._test_dir, "mouse_extra_comma.gff3")
+        rec = None
+        # The context manager closes the handle even if parsing raises.
+        with open(tfile) as in_handle:
+            # Exhaust the parser; only the last record is inspected below.
+            for rec in GFF.parse(in_handle):
+                pass
+        assert rec is not None, "No records parsed from %s" % tfile
+        tested = False
+        for sub_top in rec.features[0].sub_features:
+            for sub in sub_top.sub_features:
+                if sub.qualifiers.get("Name", "") == ["CDS:NC_000083.5:LOC100040603"]:
+                    tested = True
+                    assert len(sub.qualifiers["Parent"]) == 1
+        assert tested, "Did not find sub-feature to test"
+
+    def t_novalue_key(self):
+        """Handle GFF3 files with keys and no values.
+        """
+        tfile = os.path.join(self._test_dir, "glimmer_nokeyval.gff3")
+        rec = six.next(GFF.parse(tfile))
+        f1, f2 = rec.features
+        assert f1.qualifiers['ID'] == ['GL0000006']
+        assert len(f1.sub_features) == 2
+        assert f1.sub_features[0].qualifiers["Lack 3'-end"] == ["true"]
+        assert "ID" not in f1.sub_features[0].qualifiers
+        assert f2.qualifiers["Complete"] == ["true"]
+
+    def t_key_whitespace(self):
+        """Fix keys with problematic whitespace.
+        """
+        tfile = os.path.join(self._test_dir, "spaces.gff3")
+        for i, line_info in enumerate(GFF.parse_simple(tfile)):
+            # Only entries after the first three are checked for the key.
+            if i > 2:
+                assert line_info["quals"]["foo"] == ["bar"]
+
+    def t_trans_spliicing(self):
+        """Parsing of transspliced genes from GFF3 spec where child locations don't match to parents.
+        """
+        fname = os.path.join(self._test_dir, "trans_splicing.gff3")
+        with open(fname) as in_handle:
+            rec = six.next(GFF.parse(in_handle))
+            assert len(rec.features) == 2
+            assert rec.features[0].id == "gene83"
+            assert len(rec.features[0].sub_features) == 2
+            assert len(rec.features[0].sub_features[0].sub_features) == 7
+
+            assert rec.features[1].id == "gene84"
+            assert len(rec.features[1].sub_features) == 2
+            assert len(rec.features[1].sub_features[0].sub_features) == 7
+
+
+class SolidGFFTester(unittest.TestCase):
+    """Exercise the parser on GFF3 output produced by SOLiD analysis.
+
+    Background on SOLiD GFF:
+
+    http://solidsoftwaretools.com/gf/project/matogff/
+    """
+
+    def setUp(self):
+        """Point at the SOLiD sample file inside the GFF test directory."""
+        here = os.path.dirname(__file__)
+        self._test_dir = os.path.join(here, "GFF")
+        self._test_gff_file = os.path.join(self._test_dir, "F3-unique-3.v2.gff")
+
+    def t_basic_solid_parse(self):
+        """Check location, strand, type and qualifiers of one SOLiD read.
+        """
+        records = SeqIO.to_dict(GFFParser().parse(self._test_gff_file))
+        read = records['3_341_424_F3'].features[0]
+        quals = read.qualifiers
+        assert read.location.nofuzzy_start == 102716
+        assert read.location.nofuzzy_end == 102736
+        assert len(quals) == 7
+        assert quals['score'] == ['10.6']
+        assert quals['source'] == ['solid']
+        assert read.strand == -1
+        assert read.type == 'read'
+        assert quals['g'] == ['T2203031313223113212']
+        assert len(quals['q']) == 20
+
+    def t_solid_iterator(self):
+        """Parse a flat, un-nested file in parts and count features per part.
+        """
+        parts = GFFParser().parse_in_parts(self._test_gff_file, target_lines=5)
+        feature_sizes = [len(part.features) for part in parts]
+        assert len(feature_sizes) == 112
+        assert max(feature_sizes) == 1
+
+    def t_line_adjust(self):
+        """Rewrite each parsed line through a user-supplied adjustment hook.
+        """
+
+        def adjust_fn(results):
+            # Swap roles: the 'i' qualifier becomes the record id and the
+            # former record id is kept as a 'read_name' qualifier.
+            quals = results['quals']
+            new_id, old_id = quals['i'][0], results['rec_id']
+            quals['read_name'] = [old_id]
+            results['rec_id'] = new_id
+            return results
+
+        adjusting_parser = GFFParser(line_adjust_fn=adjust_fn)
+        recs = list(adjusting_parser.parse(self._test_gff_file))
+        assert len(recs) == 1
+        only_rec = recs[0]
+        assert only_rec.id == '1'
+        assert len(only_rec.features) == 112
+        first_quals = only_rec.features[0].qualifiers
+        assert first_quals['read_name'] == ['3_336_815_F3']
+
+
+class GFF2Tester(unittest.TestCase):
+    """Build features from GFF2 and GTF inputs.
+    """
+
+    def setUp(self):
+        """Record the paths of the Ensembl, WormBase and JGI sample files."""
+        data_dir = os.path.join(os.path.dirname(__file__), "GFF")
+        self._test_dir = data_dir
+        self._ensembl_file = os.path.join(data_dir, "ensembl_gtf.txt")
+        self._wormbase_file = os.path.join(data_dir, "wormbase_gff2.txt")
+        self._jgi_file = os.path.join(data_dir, "jgi_gff2.txt")
+        self._wb_alt_file = os.path.join(data_dir, "wormbase_gff2_alt.txt")
+
+    def t_basic_attributes(self):
+        """Read the basic GFF2 attributes of a snoRNA exon from Ensembl GTF.
+        """
+        only_snorna_exons = dict(gff_source_type=[('snoRNA', 'exon')])
+        records = SeqIO.to_dict(GFF.parse(self._ensembl_file, limit_info=only_snorna_exons))
+        features = records['I'].features
+        assert len(features) == 1
+        quals = features[0].qualifiers
+        assert sorted(quals.keys()) == [
+            'Parent', 'exon_number', 'gene_id', 'gene_name', 'source', 'transcript_id', 'transcript_name'
+        ]
+        assert quals['source'] == ['snoRNA']
+        assert quals['transcript_name'] == ['NR_001477.2']
+        assert quals['exon_number'] == ['1']
+
+    def t_tricky_semicolons(self):
+        """Keep semi-colons that sit inside WormBase GFF2 values.
+        """
+        only_regions = dict(gff_source_type=[('Genomic_canonical', 'region')])
+        records = SeqIO.to_dict(GFF.parse(self._wormbase_file, limit_info=only_regions))
+        features = records['I'].features
+        assert len(features) == 1
+        notes = features[0].qualifiers['Note']
+        expected = ['Clone cTel33B; Genbank AC199162', 'Clone cTel33B; Genbank AC199162']
+        assert notes == expected, notes
+
+    def t_unescaped_semicolons(self):
+        """Parse inputs with unescaped semi-colons.
+        This is a band-aid to not fail rather than correct parsing, since
+        the combined feature will not be maintained.
+        """
+        gff_path = os.path.join(self._test_dir, "unescaped-semicolon.gff3")
+        records = SeqIO.to_dict(GFF.parse(gff_path))
+        description = records['chr1'].features[0].qualifiers["Description"][0]
+        assert description.startswith('osFTL6')
+        assert description.endswith('protein, expressed')
+
+    def t_jgi_gff(self):
+        """Parse JGI formatted GFF2, nested using transcriptId and proteinID
+        """
+        records = SeqIO.to_dict(GFF.parse(self._jgi_file))
+        parent = records['chr_1'].features[0]
+        assert parent.location.nofuzzy_start == 37060
+        assert parent.location.nofuzzy_end == 38216
+        assert parent.type == 'inferred_parent'
+        assert len(parent.sub_features) == 6
+        second_child = parent.sub_features[1]
+        assert second_child.qualifiers['proteinId'] == ['873']
+        assert second_child.qualifiers['phase'] == ['0']
+
+    def t_ensembl_nested_features(self):
+        """Nest GFF2 features by transcript_id.
+
+        XXX sub_features no longer supported in Biopython
+        """
+        records = SeqIO.to_dict(GFF.parse(self._ensembl_file))
+        top_level = records["I"].features
+        assert len(top_level) == 2
+        first_feature = top_level[0]
+        #assert len(first_feature.sub_features) == 32, len(first_feature.sub_features)
+
+    def t_wormbase_nested_features(self):
+        """Nest GFF2 features when only a Transcript key is available.
+        """
+        records = SeqIO.to_dict(GFF.parse(self._wormbase_file))
+        assert len(records) == 3
+        chrom_features = records["I"].features
+        transcripts = [feat for feat in chrom_features if feat.type == "Transcript"]
+        assert len(transcripts) == 1
+        inferred = [feat for feat in chrom_features if feat.type == "inferred_parent"]
+        assert len(inferred) == 0
+        transcript = transcripts[0]
+        assert transcript.qualifiers["WormPep"][0] == "WP:CE40797"
+        assert len(transcript.sub_features) == 46
+
+    def t_wb_cds_nested_features(self):
+        """Nest GFF2 features that share a flat CDS key value pair.
+        """
+        records = SeqIO.to_dict(GFF.parse(self._wb_alt_file))
+        assert len(records) == 2
+        first_record = list(records.values())[0]
+        assert len(first_record.features) == 1
+        cds_parent = first_record.features[0]
+        assert cds_parent.id == "cr01.sctg102.wum.2.1"
+        assert len(cds_parent.sub_features) == 7
+
+    def t_gff2_iteration(self):
+        """Iterate over GFF2 features in chunks, breaking without parents.
+        """
+        recs = list(GFF.parse(self._wormbase_file, target_lines=15))
+        assert len(recs) == 4
+        first_features = recs[0].features
+        assert first_features[0].type == 'region'
+        assert first_features[1].type == 'SAGE_tag'
+        assert len(first_features[2].sub_features) == 29
+
+
+class DirectivesTest(unittest.TestCase):
+    """Check handling of GFF3 directives and other meta-data.
+    """
+
+    def setUp(self):
+        """Record the paths of the two directive sample files."""
+        data_dir = os.path.join(os.path.dirname(__file__), "GFF")
+        self._test_dir = data_dir
+        self._gff_file = os.path.join(data_dir, "hybrid1.gff3")
+        self._problem_seq_region_file = os.path.join(data_dir, "problem_sequence_region.gff3")
+
+    def t_basic_directives(self):
+        """Expose top level GFF3 meta-data as record annotations.
+        """
+        annotations = SeqIO.to_dict(GFF.parse(self._gff_file))['chr17'].annotations
+        assert annotations['gff-version'] == ['3']
+        assert annotations['attribute-ontology'] == ['baz']
+        assert annotations['feature-ontology'] == ['bar']
+        assert annotations['source-ontology'] == ['boo']
+        assert annotations['sequence-region'] == [('foo', 0, 100), ('chr17', 62467933, 62469545)]
+
+    def t_fasta_directive(self):
+        """Pick up FASTA sequence data embedded in a GFF3 file.
+        """
+        by_id = SeqIO.to_dict(GFF.parse(self._gff_file))
+        assert len(by_id) == 1
+        sequence = by_id['chr17'].seq
+        assert str(sequence) == "GATTACAGATTACA"
+
+    def t_examiner_with_fasta(self):
+        """Run the high level examiner on a file with FASTA directives.
+        """
+        examiner = GFFExaminer()
+        parent_children = examiner.parent_child_map(self._gff_file)
+        assert parent_children[('UCSC', 'mRNA')] == [('UCSC', 'CDS')]
+        limits = examiner.available_limits(self._gff_file)
+        first_id_key = list(limits['gff_id'].keys())[0]
+        assert first_id_key[0] == 'chr17'
+        source_types = sorted(limits['gff_source_type'].keys())
+        assert source_types == [('UCSC', 'CDS'), ('UCSC', 'mRNA')]
+
+    def t_problem_sequence_region(self):
+        """Cope with sequence-region directives that lack a contig name
+        """
+        annotations = SeqIO.to_dict(GFF.parse(self._problem_seq_region_file))['1'].annotations
+        assert annotations['gff-version'] == ['3']
+        assert annotations['sequence-region'] == [(0, 2482535)]
+
+
+class OutputTest(unittest.TestCase):
+    """Tests to write SeqFeatures to GFF3 output format.
+    """
+
+    def setUp(self):
+        """Resolve the paths of the data files used by the tests below."""
+        self._test_dir = os.path.join(os.path.dirname(__file__), "GFF")
+        self._test_seq_file = os.path.join(self._test_dir, "c_elegans_WS199_dna_shortened.fa")
+        self._test_gff_file = os.path.join(self._test_dir, "c_elegans_WS199_shortened_gff.txt")
+        self._test_gff_ann_file = os.path.join(self._test_dir, "c_elegans_WS199_ann_gff.txt")
+        self._wormbase_file = os.path.join(self._test_dir, "wormbase_gff2.txt")
+
+    def t_gff3_to_gff3(self):
+        """Read in and write out GFF3 without any loss of information.
+        """
+        recs = SeqIO.to_dict(GFF.parse(self._test_gff_file))
+        out_handle = StringIO()
+        GFF.write(recs.values(), out_handle)
+        wrote_handle = StringIO(out_handle.getvalue())
+        recs_two = SeqIO.to_dict(GFF.parse(wrote_handle))
+
+        orig_rec = list(recs.values())[0]
+        # Compare against the re-parsed record with the same id. Reading it
+        # back from ``recs`` would compare the original with itself and make
+        # the round-trip check vacuous.
+        re_rec = recs_two[orig_rec.id]
+        assert len(orig_rec.features) == len(re_rec.features)
+        for i, orig_f in enumerate(orig_rec.features):
+            assert str(orig_f) == str(re_rec.features[i])
+
+    def t_gff2_to_gff3(self):
+        """Read in GFF2 and write out as GFF3.
+        """
+        recs = SeqIO.to_dict(GFF.parse(self._wormbase_file))
+        out_handle = StringIO()
+        GFF.write(recs.values(), out_handle)
+        wrote_handle = StringIO(out_handle.getvalue())
+        # check some tricky lines in the GFF2 file
+        checks = 0
+        for line in wrote_handle:
+            if line.find("Interpolated_map_position") >= 0:
+                checks += 1
+                assert line.find("RFLP=No") > 0
+            if line.find("Gene=WBGene00000138") > 0:
+                checks += 1
+                assert line.find("ID=B0019.1") > 0
+            if line.find("translated_nucleotide_match\t12762127") > 0:
+                checks += 1
+                assert line.find("Note=MSP:FADFSPLDVSDVNFATDDLAK") > 0
+        # Each of the three marker lines must have been seen exactly once.
+        assert checks == 3, "Missing check line"
+
+    def t_write_from_recs(self):
+        """Write out GFF3 from SeqRecord inputs.
+        """
+        seq = Seq("GATCGATCGATCGATCGATC")
+        rec = SeqRecord(seq, "ID1")
+        qualifiers = {"source": "prediction", "score": 10.0, "other": ["Some", "annotations"], "ID": "gene1"}
+        sub_qualifiers = {"source": "prediction"}
+        top_feature = SeqFeature(FeatureLocation(0, 20), type="gene", strand=1, qualifiers=qualifiers)
+        top_feature.sub_features = [
+            SeqFeature(FeatureLocation(0, 5), type="exon", strand=1, qualifiers=sub_qualifiers),
+            SeqFeature(FeatureLocation(15, 20), type="exon", strand=1, qualifiers=sub_qualifiers)
+        ]
+        rec.features = [top_feature]
+        out_handle = StringIO()
+        GFF.write([rec], out_handle)
+        wrote_info = out_handle.getvalue().split("\n")
+        assert wrote_info[0] == "##gff-version 3"
+        assert wrote_info[1] == "##sequence-region ID1 1 20"
+        print(wrote_info[2].split("\t"))
+        assert wrote_info[2].split("\t") == [
+            'ID1', 'prediction', 'gene', '1', '20', '10.0', '+', '.', 'ID=gene1;other=Some,annotations'
+        ]
+        assert wrote_info[3].split("\t") == ['ID1', 'prediction', 'exon', '1', '5', '.', '+', '.', 'Parent=gene1']
+
+    def t_write_fasta(self):
+        """Include FASTA records in GFF output.
+        """
+        seq = Seq("GATCGATCGATCGATCGATC")
+        rec = SeqRecord(seq, "ID1")
+        qualifiers = {"source": "prediction", "score": 10.0, "other": ["Some", "annotations"], "ID": "gene1"}
+        rec.features = [SeqFeature(FeatureLocation(0, 20), type="gene", strand=1, qualifiers=qualifiers)]
+        out_handle = StringIO()
+        GFF.write([rec], out_handle, include_fasta=True)
+        wrote_info = out_handle.getvalue().split("\n")
+        # The first three output lines are skipped; the FASTA section follows.
+        fasta_parts = wrote_info[3:]
+        assert fasta_parts[0] == "##FASTA"
+        assert fasta_parts[1] == ">ID1 <unknown description>"
+        assert fasta_parts[2] == str(seq)
+
+    def t_write_seqrecord(self):
+        """Write single SeqRecords.
+        """
+        seq = Seq("GATCGATCGATCGATCGATC")
+        rec = SeqRecord(seq, "ID1")
+        qualifiers = {"source": "prediction", "score": 10.0, "other": ["Some", "annotations"], "ID": "gene1"}
+        rec.features = [SeqFeature(FeatureLocation(0, 20), type="gene", strand=1, qualifiers=qualifiers)]
+        out_handle = StringIO()
+        GFF.write([rec], out_handle, include_fasta=True)
+        wrote_info = out_handle.getvalue().split("\n")
+        gff_line = wrote_info[2]
+        assert gff_line.split("\t")[0] == "ID1"
+
+
+def run_tests(argv):
+    """Run the complete GFF test suite and return a process exit status.
+
+    ``argv`` is accepted so the function can be called with ``sys.argv``; it
+    is not consulted. Returns 0 when every test passed and 1 otherwise, so a
+    caller passing the result to ``sys.exit`` reports failures to the shell.
+    """
+    test_suite = testing_suite()
+    runner = unittest.TextTestRunner(sys.stdout, verbosity=2)
+    result = runner.run(test_suite)
+    return 0 if result.wasSuccessful() else 1
+
+
+def testing_suite():
+    """Assemble one suite holding the ``t_``-prefixed tests of every test case.
+    """
+    loader = unittest.TestLoader()
+    # Test methods in this module are named t_*, not the default test*.
+    loader.testMethodPrefix = 't_'
+    suite = unittest.TestSuite()
+    for case in [GFF3Test, MapReduceGFFTest, SolidGFFTester, GFF2Tester, DirectivesTest, OutputTest]:
+        suite.addTest(loader.loadTestsFromTestCase(case))
+    return suite
+
+
+# Script entry point: run the suite and pass run_tests' return value to sys.exit.
+if __name__ == "__main__":
+ sys.exit(run_tests(sys.argv))
=====================================
bcbio_gff.egg-info/PKG-INFO
=====================================
@@ -1,10 +1,10 @@
Metadata-Version: 1.0
Name: bcbio-gff
-Version: 0.6.6
+Version: 0.6.7
Summary: Read and write Generic Feature Format (GFF) with Biopython integration.
Home-page: https://github.com/chapmanb/bcbb/tree/master/gff
Author: Brad Chapman
Author-email: chapmanb at 50mail.com
-License: UNKNOWN
+License: Biopython License
Description: UNKNOWN
Platform: UNKNOWN
=====================================
bcbio_gff.egg-info/SOURCES.txt
=====================================
@@ -8,6 +8,29 @@ BCBio/GFF/GFFOutput.py
BCBio/GFF/GFFParser.py
BCBio/GFF/__init__.py
BCBio/GFF/_utils.py
+Scripts/gff/access_gff_index.py
+Scripts/gff/genbank_to_gff.py
+Scripts/gff/gff2_to_gff3.py
+Scripts/gff/gff_to_biosql.py
+Scripts/gff/gff_to_genbank.py
+Tests/test_GFFSeqIOFeatureAdder.py
+Tests/GFF/F3-unique-3.v2.gff
+Tests/GFF/c_elegans_WS199_ann_gff.txt
+Tests/GFF/c_elegans_WS199_dna_shortened.fa
+Tests/GFF/c_elegans_WS199_shortened_gff.txt
+Tests/GFF/ensembl_gtf.txt
+Tests/GFF/glimmer_nokeyval.gff3
+Tests/GFF/hybrid1.gff3
+Tests/GFF/jgi_gff2.txt
+Tests/GFF/mouse_extra_comma.gff3
+Tests/GFF/ncbi_gff3.txt
+Tests/GFF/problem_sequence_region.gff3
+Tests/GFF/spaces.gff3
+Tests/GFF/trans_splicing.gff3
+Tests/GFF/transcripts.gff3
+Tests/GFF/unescaped-semicolon.gff3
+Tests/GFF/wormbase_gff2.txt
+Tests/GFF/wormbase_gff2_alt.txt
bcbio_gff.egg-info/PKG-INFO
bcbio_gff.egg-info/SOURCES.txt
bcbio_gff.egg-info/dependency_links.txt
=====================================
bcbio_gff.egg-info/requires.txt
=====================================
@@ -1 +1,2 @@
six
+biopython
=====================================
setup.py
=====================================
@@ -14,8 +14,9 @@ setup(name="bcbio-gff",
version=__version__,
author="Brad Chapman",
author_email="chapmanb at 50mail.com",
+ license="Biopython License",
description="Read and write Generic Feature Format (GFF) with Biopython integration.",
url="https://github.com/chapmanb/bcbb/tree/master/gff",
packages=find_packages(),
- install_requires=["six"]
+ install_requires=["six", "biopython"]
)
View it on GitLab: https://salsa.debian.org/med-team/python-bcbio-gff/-/commit/0195e22774cb530a026fc57fdfe9096c350997d6
--
View it on GitLab: https://salsa.debian.org/med-team/python-bcbio-gff/-/commit/0195e22774cb530a026fc57fdfe9096c350997d6
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20211009/9a04fab0/attachment-0001.htm>
More information about the debian-med-commit
mailing list