[med-svn] [python-biotools] 01/02: New upstream version 1.2.12

Andreas Tille tille at debian.org
Thu Mar 2 12:01:26 UTC 2017


This is an automated email from the git hooks/post-receive script.

tille pushed a commit to branch master
in repository python-biotools.

commit a79b3c093765aea079251badc88b9621864f6a1c
Author: Andreas Tille <tille at debian.org>
Date:   Thu Mar 2 12:43:00 2017 +0100

    New upstream version 1.2.12
---
 PKG-INFO                 |  21 ++++
 prok-geneseek            | 102 ++++++++++++++++
 setup.py                 |  30 +++++
 src/BLAST.py             | 310 +++++++++++++++++++++++++++++++++++++++++++++++
 src/IO/__init__.py       | 165 +++++++++++++++++++++++++
 src/IO/clustal.py        |  55 +++++++++
 src/IO/fasta.py          |  64 ++++++++++
 src/IO/fastq.py          |  72 +++++++++++
 src/IO/gff.py            |  28 +++++
 src/IO/manager.py        |  77 ++++++++++++
 src/IO/phylip.py         |  57 +++++++++
 src/__init__.py          |   3 +
 src/align.py             | 165 +++++++++++++++++++++++++
 src/analysis/__init__.py |   0
 src/analysis/cluster.py  | 116 ++++++++++++++++++
 src/analysis/loaddata.py | 176 +++++++++++++++++++++++++++
 src/analysis/options.py  | 168 +++++++++++++++++++++++++
 src/analysis/plot.py     | 122 +++++++++++++++++++
 src/analysis/predict.py  | 230 +++++++++++++++++++++++++++++++++++
 src/analysis/renamer.py  |  61 ++++++++++
 src/analysis/report.py   |  89 ++++++++++++++
 src/analysis/run.py      |  57 +++++++++
 src/analysis/variance.py |  95 +++++++++++++++
 src/annotation.py        | 112 +++++++++++++++++
 src/clustal.py           |  53 ++++++++
 src/complement.py        |  49 ++++++++
 src/sequence.py          | 174 ++++++++++++++++++++++++++
 src/translate.py         |  57 +++++++++
 28 files changed, 2708 insertions(+)

diff --git a/PKG-INFO b/PKG-INFO
new file mode 100644
index 0000000..d39577a
--- /dev/null
+++ b/PKG-INFO
@@ -0,0 +1,21 @@
+Metadata-Version: 1.1
+Name: biotools
+Version: 1.2.12
+Summary: A bunch of bioinformatics utilities.
+Home-page: https://github.com/sonwell/biotools
+Author: Andrew Kassen
+Author-email: atkassen at gmail.com
+License: UNKNOWN
+Description: Accompanies Bart, Rebecca, *et al.* High-throughput genomic sequencing of Cassava Bacterial Blight strains identifies conserved effectors to target for durable resistance. *PNAS Plus*.
+        
+        Currently depends on `clustalw <ftp://ftp.ebi.ac.uk/pub/software/clustalw2/2.1/>`_ and `BLAST <ftp://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/>`_.
+        
+        You can grab the most current code from `github <https://github.com/sonwell/biotools>`_ or via git
+            git clone git://github.com/sonwell/biotools.git
+Keywords: gene prediction,prokaryotes,effectors
+Platform: UNKNOWN
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Science/Research
+Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
+Requires: numpy
+Requires: matplotlib
diff --git a/prok-geneseek b/prok-geneseek
new file mode 100755
index 0000000..ed42dca
--- /dev/null
+++ b/prok-geneseek
@@ -0,0 +1,102 @@
+#!/usr/bin/env python
+
+# 1. predict
+# 2. cluster
+# 3. rename
+# 4. snps
+
+import biotools.analysis.run as pr
+import biotools.analysis.cluster as cw
+import biotools.analysis.renamer as mv
+import biotools.analysis.variance as cs
+import biotools.analysis.options as op
+import biotools.analysis.loaddata as ld
+import sys
+import os
+
+
+if __name__ == "__main__":
+    op.parse(sys.argv[1:])
+    if len(op.args) < 2:
+        op.help()
+        exit(0)
+
+    direc = op.DIRECTORY
+    plotter = op.PLOTTER
+    database = op.args[0]
+    names = op.args[1:]
+    sep = os.sep
+
+    predict = op.predicting
+    cluster = op.clustering
+    rename = op.renaming
+    plot = op.plotting
+    report = op.reporting
+    calculate = op.calculating
+
+    rename = rename and (cluster or not predict)
+    calculate = calculate and (cluster or not predict)
+    plot = plot and (calculate or not (predict or cluster))
+    report = report and calculate
+
+    if not (predict or cluster or rename or calculate or plot or report):
+        print "I don't know what you want, I give up."
+        exit(0)
+
+    if predict:
+        names = pr.run(database, names)
+        print "Homologous sequences written to " + direc + 'sequences' + sep
+
+    if cluster:
+        names = cw.run(direc + 'clusters' + sep, names)
+        print "Clustalw files written to " + direc + "clusters" + sep
+
+    if rename:
+        names = mv.rename(direc + 'clusters' + sep, database, names)
+
+    if plot and plotter.lower() != 'none':
+        try:
+            cv = __import__(plotter, globals(), locals(), ['plot'], -1)
+        except ImportError:
+            try:
+                if not plotter.endswith('.py'):
+                    plotter += '.py'
+                open(plotter, 'r')
+            except:
+                plot = False
+            else:
+                p = plotter.rfind(sep)
+                if p > -1:
+                    sys.path.append(plotter[:p])
+                plotter = plotter[p + len(sep):]
+
+                try:
+                    cv = __import__(plotter, globals(), locals(), ['plot'], -1)
+                except ImportError:
+                    plot = False
+
+    if calculate:
+        gen = cs.var(names)
+    elif plot:
+        gen = (ld.parse(s) for s in names)
+    else:
+        gen = []
+    for entry in gen:
+        plotdata = entry['plotdata']
+        metadata = entry['metadata']
+        if plot:
+            try:
+                cv.plot(plotdata, direc + 'plots' + sep,
+                        filename=metadata['filename'])
+            except Exception as e:
+                print metadata['filename'], e
+        if report:
+            try:
+                os.mkdir(direc + 'data' + sep)
+            except:
+                pass
+
+            fh = open(direc + 'data' + sep + metadata['strain'] + '.py', 'w')
+            fh.write('plotdata = ' + repr(plotdata) + '\n')
+            fh.write('metadata = ' + repr(metadata) + '\n')
+    print "Done"
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..b20bad0
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,30 @@
+#!/usr/bin/env python
+
+from distutils.core import setup
+
+setup(name="biotools",
+      version='1.2.12',
+      description="A bunch of bioinformatics utilities.",
+      long_description="""Accompanies Bart, Rebecca, *et al.* High-throughput genomic sequencing of Cassava Bacterial Blight strains identifies conserved effectors to target for durable resistance. *PNAS Plus*.
+
+Currently depends on `clustalw <ftp://ftp.ebi.ac.uk/pub/software/clustalw2/2.1/>`_ and `BLAST <ftp://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/>`_.
+
+You can grab the most current code from `github <https://github.com/sonwell/biotools>`_ or via git
+    git clone git://github.com/sonwell/biotools.git""",
+      author="Andrew Kassen",
+      maintainer="Andrew Kassen",
+      author_email="atkassen at gmail.com",
+      maintainer_email="atkassen at gmail.com",
+      url="https://github.com/sonwell/biotools",
+      requires=['numpy','matplotlib'],
+      packages=['biotools', 'biotools.analysis', 'biotools.IO'],
+      package_dir={'biotools': 'src',
+                   'biotools.analysis': 'src/analysis',
+                   'biotools.IO': 'src/IO'},
+      scripts=['prok-geneseek'],
+      keywords='gene prediction, prokaryotes, effectors',
+      classifiers=[
+        'Development Status :: 4 - Beta',
+        'Intended Audience :: Science/Research',
+        'Topic :: Scientific/Engineering :: Bio-Informatics'
+      ])
diff --git a/src/BLAST.py b/src/BLAST.py
new file mode 100644
index 0000000..6b68737
--- /dev/null
+++ b/src/BLAST.py
@@ -0,0 +1,310 @@
+#!/usr/bin/env python
+
+'''
+A module to manage BLAST databases and interface with the BLAST+ standalone
+program available from NCBI.
+'''
+
+import biotools.IO as io
+import subprocess
+from os import sep, getenv, listdir
+import shutil
+
+
+def run(db, sfile, mega_blast=False, **kwargs):
+    '''
+    Takes a database and a query and runs the appropriate type of BLAST on
+    them. The database can be an existing BLAST database or a fasta/fastq
+    file. If it is a sequence file, this function will look in the places
+    where BLAST would look for an existing database created from that file and
+    use that instead. If there is no such database, this function will make
+    one for you and then use the newly created database with BLAST.
+
+    Optional named arguments can currently only be `evalue`, `num_threads`,
+    `gapopen`, or `gapextend`. The correspond to the BLAST options of the same
+    name.
+    '''
+
+    cmds = {
+        'prot': {
+            'prot': 'blastp',
+            'nucl': 'tblastn'
+        },
+        'nucl': {
+            'nucl': 'blastn',
+            'prot': 'blastx'
+        }
+    }
+
+    seq = io.open(sfile, 'r').next()
+    qtype = seq.type
+
+    rcloc = ''
+    for loc in (".:~:" + (getenv("NCBI") or "")).split(':'):
+        if loc and loc[-1] == sep:
+            loc += sep
+        try:
+            for line in (l.strip() for l in open(loc + '.ncbirc', 'r')):
+                pos = line.find('=')
+                if pos >= 0 and line[:pos].strip() == "BLASTDB":
+                    rcloc = line[pos + 1:].strip()
+        except IOError:
+            pass
+
+    dbtype = None
+    bdbenv = getenv("BLASTDB")
+    dblocations = (":." + ((':' + bdbenv) if bdbenv else '') + 
+                   ((':' + rcloc) if rcloc else '')).split(':')
+    for loc in dblocations:
+        if loc and loc[-1] != sep:
+            loc += sep
+        try:
+            open(loc + db + '.pin', 'r')
+            dbtype = 'prot'
+            break
+        except IOError:
+            try:
+                open(loc + db + '.nin', 'r')
+                dbtype = 'nucl'
+                break
+            except IOError:
+                pass
+
+    if not dbtype:
+        odb = db
+        pos = db.rfind(".")
+        for seq in io.open(db, 'r'):
+            dbtype = seq.type
+            break
+        if not dbtype:
+            raise IOError("Database not found: " + odb)
+
+        ndb = None
+        sp = db.rfind(sep)
+        if sp > -1:
+            dbdir, db = db[:sp], db[sp + 1:pos]
+        else:
+            dbdir, db = '.', db[:pos]
+
+        for file in listdir(dbdir):
+            dpos = file.rfind('.')
+            if dpos >= 0 and file[dpos + 1:] == dbtype[0] + 'in':
+                fh = open(dbdir + sep + file, 'r')
+                c = ord(fh.read(12)[-1])
+                fname = fh.read(c)
+                if fname[0] in ("'", '"'):
+                    fname = fname[1:-1]
+                if fname.endswith(odb):
+                    ndb = dbdir + sep + file[:dpos]
+                    break
+        if not ndb:
+            ndb = '_'.join(db.split())
+            try:
+                ignore = open('/dev/null', 'w')
+            except IOError:
+                ignore = open('nul', 'w')
+
+            try:  # possible race condition
+                open(ndb, 'r').close()
+            except IOError:
+                subprocess.call(["makeblastdb", "-in", '"%s"' % odb,
+                                 "-out", ndb, "-dbtype", dbtype],
+                                 stdout=ignore)
+                try:
+                    for suff in ['in', 'hr', 'sq']:
+                        name = ndb + '.' + dbtype[0] + suff
+                        shutil.move(name, dbdir + sep + name)
+                except shutil.Error:
+                    pass
+            db = dbdir + sep + ndb
+        else:
+            db = ndb
+    else:
+        raise IOError("Database not found: " + db)
+    allowed = set(["evalue", "gapopen", "gapextend", "num_threads"]) & \
+        set(kwargs.keys())
+    cmd = cmds[qtype][dbtype]
+    pn = ["-db", "-query"]
+    if mega_blast:
+        cmd = "megablast"
+        pn = ["-d", "-i"]
+        allowed = ["e", "a"]
+    args = [cmd, pn[0], db, pn[1], sfile] + [arg for pair in
+            [["-" + k, str(kwargs[k])] for k in allowed] for arg in pair]
+    proc = subprocess.Popen(args, bufsize=1, stdout=subprocess.PIPE)
+    return Result(iter(proc.stdout.readline, ''))
+
+
+class Result(object):
+
+    '''
+    A class which take the raw output from BLAST and generates dictionaries
+    from the data from BLAST. This data includes the alignment, percent
+    identity, gaps, e-value, score, length of subject, length of query, and
+    start and stop positions for both sequences. This class should be used in
+    a for loop like so:
+
+    ```python
+        for res in Result(file_or_data):
+            pass
+    ```
+
+    The class instance has a single other property, `headers`, which are the
+    lines in BLAST results before the BLAST hits (e.g., citation info, etc.).
+    '''
+
+    def __init__(self, file):
+        self.file = file
+        self.headers = []
+
+    def __iter__(self):
+        try:
+            ipt = open(self.file, 'r')
+        except (IOError, TypeError):
+            try:
+                ipt = self.file.split('\n')
+            except:
+                ipt = self.file
+        mode = 0
+        headers = []
+        curr = None
+        length = 0
+
+        def sh(sn, qn, l):
+            qdl = ''
+            space = qn.find(' ')
+            if space > -1:
+                qn, qdl = qn[:space], qn[space + 1:].lstrip()
+            return {
+                'subject': {
+                    'name':  sn.lstrip(),
+                    'defline': '',
+                    'start': None,
+                    'end':   None,
+                    'sequence': ''
+                },
+                'query': {
+                    'name':  qn,
+                    'defline': qdl,
+                    'start': None,
+                    'end':   None,
+                    'sequence': ''
+                },
+                'length':  l
+            }
+
+        def ra(sh):
+            for res in ('subject', 'query'):
+                sh[res]['start'] = int(sh[res]['start'])
+                sh[res]['end'] = int(sh[res]['end'])
+                sh[res]['length'] = abs(sh[res]['end'] - sh[res]['start'] + 1)
+            return sh
+
+        def sh_fmt(l):
+            for pairs in (a.strip() for a in l.split(',')):
+                l, r = tuple(a.strip() for a in (pairs.split('=')[:2]
+                             if '=' in pairs else pairs.split(':')[:2]))
+                subheaders[l.lower().split('(')[0]] = r
+
+        for line in ipt:
+            line = line.rstrip('\n').lstrip()
+            if not line:
+                if mode == 4:
+                    mode = 5
+                continue
+
+            if mode == 0:
+                if line[:6] == 'Query=':
+                    mode = 1
+                    qname = line[6:].lstrip()
+                    self.headers = headers
+                else:
+                    headers.append(line)
+
+            elif mode == 1:
+                if line[0] == '>':
+                    mode = 3
+                    subheaders = sh(line[1:], qname, length)
+                elif line[:7] == 'Length=':
+                    length = int(''.join(line[7:].strip().split(',')))
+                    mode = 2
+                elif line[0] == '(' and line.endswith('letters)'):
+                    length = int(''.join(line[1:-8].strip().split(',')))
+                    mode = 2
+                elif line[:6] == 'Query=':
+                    qname = line[6:].lstrip()
+                else:
+                    qname += line
+
+            elif mode == 2:
+                if line[0] == '>':
+                    mode = 3
+                    subheaders = sh(line[1:], qname, length)
+                elif line[:6] == 'Query=':
+                    qname = line[6:].lstrip()
+                    mode = 1
+
+            elif mode == 3:
+                if line[:5] == 'Score':
+                    snm = subheaders['subject']['name']
+                    defline = ''
+                    space = snm.find(' ')
+                    if space > -1:
+                        snm, defline = snm[:space], snm[space + 1:]
+                    subheaders['subject']['name'] = snm
+                    subheaders['subject']['defline'] = defline
+                    sh_fmt(line)
+                    mode = 4
+                elif line[:7] == 'Length=':
+                    pass
+                elif line[0] == '(' and line.endswith('letters)'):
+                    pass
+                else:
+                    subheaders['subject']['name'] += line
+
+            elif mode == 4:
+                sh_fmt(line)
+
+            elif mode == 5:
+                if line[:6] == 'Query=':
+                    mode = 1
+                    qname = line[6:].lstrip()
+                    yield ra(subheaders)
+                    continue
+                elif line[0] == '>':
+                    yield ra(subheaders)
+                    subheaders = sh(line[1:], qname, length)
+                    mode = 3
+                    continue
+                elif line[:5] == 'Score':
+                    yield ra(subheaders)
+                    subheaders = sh(subheaders['subject']['name'], qname,
+                                    length)
+                    sh_fmt(line)
+                    mode = 4
+                    continue
+                elif line[:5] == 'Sbjct':
+                    curr = 'subject'
+                elif line[:5] == 'Query':
+                    curr = 'query'
+                else:
+                    continue
+
+                _, start, seq, end = line.split()
+                subheaders[curr]['start'] = subheaders[curr]['start'] or start
+                subheaders[curr]['end'] = end
+                subheaders[curr]['sequence'] += seq
+
+        try:
+            yield ra(subheaders)
+        except UnboundLocalError:
+            pass
+        raise StopIteration()
+
+if __name__ == '__main__':
+    import sys
+
+    if len(sys.argv) > 1:
+        output = open(sys.argv[1]).read()
+        for result in Result(output):
+            print(result)
diff --git a/src/IO/__init__.py b/src/IO/__init__.py
new file mode 100644
index 0000000..39ca342
--- /dev/null
+++ b/src/IO/__init__.py
@@ -0,0 +1,165 @@
+'''
+A module for reading and writing to sequence and annotation files. Currently
+supported file types are: FASTA, FASTQ, CLUSTAL alignments, and GFF3 files.
+'''
+
+from biotools.IO import fasta, fastq, gff, clustal
+from biotools.IO.manager import IOManager
+try:
+    import __builtin__
+except ImportError:
+    import builtins as __builtin__
+
+
+def get_methods():
+    methods = {}
+    nil = lambda *x: iter([])
+    for module in [fasta, fastq, gff, clustal]:
+        modname = module.__name__.split('.')[-1]
+        methods[modname] = {}
+        for method in ['read', 'write', 'rhook', 'whook', 'probe']:
+            methods[modname][method] = module.__dict__.get(method, nil)
+    return methods
+
+
+class IOBase(object):
+    '''
+    Generic IO class for sequence files.
+    '''
+    methods = IOManager(get_methods())
+
+    def __init__(self, name, mode):
+        '''
+        Opens file name with mode mode. This function will attempt to guess at
+        the filetype by 1. looking at the file extension and failing that,
+        will 2. read the first few lines to determine the file type.
+
+        Recoginized file extensions include fa, fsa, fas, fasta, fastq,
+        clustalw, clustal, aln.
+        '''
+
+        self.file = name
+        self.handle = __builtin__.open(name, mode)
+        self.method = self.methods.default
+        self.type = None
+
+        self.suffixes = {
+            'fsa': 'fasta',
+            'fa':  'fasta',
+            'fs': 'fasta',
+            'fas': 'fasta',
+            'fna': 'fasta',
+            'fasta': 'fasta',
+            'clustalw': 'clustal',
+            'clustal': 'clustal',
+            'aln': 'clustal',
+            'fastq': 'fastq',
+            'gff': 'gff',
+            'gff3': 'gff'
+        }
+
+        p = name.rfind('.')
+        if p > -1:
+            ext = name[p + 1:]
+            if ext in self.suffixes:
+                try:
+                    self.format(self.suffixes[ext])
+                    return
+                except ValueError:
+                    pass
+        try:
+            for method in IOBase.methods:
+                try:
+                    self.format(method)
+                    return
+                except ValueError:
+                    pass
+        except IOError:
+            raise
+
+    def format(self, fmt):
+        '''
+        Forces a file to be parsed as a particular format. By default, the
+        values for fmt can be any recognized format.
+        '''
+        if fmt in self.methods:
+            method = self.methods[fmt]
+            ret = method['probe'](__builtin__.open(self.file, 'r'))
+            if ret:
+                self.method = method
+                for key in ret:
+                    object.__setattr__(self, key, ret[key])
+                return self
+            else:
+                raise ValueError("File cannot be parsed as type %s." % fmt)
+        self.method = self.methods.default
+        return self
+
+    def close(self):
+        '''
+        Close the file handle.
+        '''
+        self.handle.close()
+
+
+class Reader(IOBase):
+    '''
+    A class that wraps IOBase and restricts the ability to write.
+    '''
+    def __init__(self, filename, mode='r'):
+        IOBase.__init__(self, filename, mode)
+        self.method['rhook'](self.handle)
+        self.iter = self.method['read'](self.handle)
+
+    def read(self, n=None):
+        '''
+        If `n` is provided, the next (up to) `n` entries are parsed and 
+        returned. Otherwise, all remaining entries are parsed and returned.
+        '''
+        if n is None:
+            return [s for s in self]
+        return [self.iter.next() for i in xrange(int(n))]
+
+    def __iter__(self):
+        return self.iter
+
+    def next(self):
+        '''
+        Reads a single entry in the file and returns it.
+        '''
+        try:
+            return self.read(1)[0]
+        except (StopIteration, ValueError, IndexError):
+            raise StopIteration()
+
+
+class Writer(IOBase):
+    '''
+    A class that wraps IOBase and restricts the ability to read.
+    '''
+
+    def __init__(self, filename, mode='w'):
+        IOBase.__init__(self, filename, mode)
+        self.haswritten = False
+
+    def write(self, sequence):
+        '''
+        Writes sequence as the correct format to the file.
+        '''
+        if not self.haswritten:
+            self.method['whook'](self.handle)
+            self.haswritten = True
+        self.method['write'](self.handle, sequence)
+
+
+def open(filename, mode='r'):
+    '''
+    Open a file for parsing or creation. Returns either a Reader or Writer
+    object, depending on the open mode.
+    '''
+    if mode == 'r':
+        return Reader(filename)
+    if mode == 'w':
+        return Writer(filename, mode='w')
+    if mode == 'a':
+        return Writer(filename, mode='a')
diff --git a/src/IO/clustal.py b/src/IO/clustal.py
new file mode 100644
index 0000000..40fad12
--- /dev/null
+++ b/src/IO/clustal.py
@@ -0,0 +1,55 @@
+'''
+Methods for manipulating clustalw alignment files.
+'''
+
+from biotools.sequence import Sequence
+
+
+def read(fh):
+    def clean_alignment(x):
+        i = 0
+        for c in x:
+            if c != "-":
+                break
+            i += 1
+        j = 0
+        for c in reversed(x):
+            if c != "-":
+                break
+            j += 1
+        return (' ' * i + x[(i or None):(-j or None)] + ' ' * j, i, len(x) - j)
+
+    seqs = {}
+    for line in fh:
+        if line.startswith(' '):
+            continue
+        st = line.strip()
+        if st:
+            bits = st.split()
+            if len(bits) != 2:
+                continue
+            if bits[0] not in seqs:
+                seqs[bits[0]] = ''
+            seqs[bits[0]] += bits[1]
+
+    for k in seqs:
+        seq, start, end = clean_alignment(seqs[k])
+        yield Sequence(k, seq, start=start, end=end)
+    raise StopIteration()
+
+
+def probe(fh):
+    for line in fh:
+        st = line.strip()
+        if st:
+            if st.startswith('CLUSTAL'):
+                return {'type': 'clustalw'}
+            return False
+    return {'type': 'clustalw'}
+
+
+def rhook(fh):
+    try:
+        fh.next()
+    except StopIteration:
+        pass
diff --git a/src/IO/fasta.py b/src/IO/fasta.py
new file mode 100644
index 0000000..e10982e
--- /dev/null
+++ b/src/IO/fasta.py
@@ -0,0 +1,64 @@
+'''
+Functions for manipulating FASTA files.
+'''
+
+from biotools.sequence import Sequence, chop
+
+
+def read(fh):
+    '''
+    Read sequences in FASTA format; identifiers (names and definition lines) 
+    are on lines that begin with carets ('>') and sequence is on lines that 
+    intervene between the carets. This function is a generator that yields
+    `Sequence` objects.
+    '''
+    name, defline, seq = '', '', ''
+    for line in fh:
+        line = line.strip()
+        if not line:
+            continue
+        if line[0] == '>':
+            if name or seq:
+                yield Sequence(name, seq, defline=defline)
+            seq = ''
+            name = line[1:].split()[0]
+            defline = line[1 + len(name):].strip()
+            continue
+        seq += line
+    if name or seq:
+        yield Sequence(name, seq, defline=defline)
+    raise StopIteration()
+
+
+def write(fh, s):
+    '''
+    Write sequences in FASTA format, i.e.,
+    
+    ```
+    >name defline
+    sequence ...
+    ```
+    
+    Sequences are wrapped to 70 characters by default.
+    '''
+    fh.write('>%s %s\n' % (s.name, s.defline) +
+             '\n'.join(chop(s.seq, 70)) + '\n')
+
+
+def probe(fh):
+    '''
+    Probe a file to determine whether or not it is a FASTA file. That is,
+    the first non-empty line should begin with a caret ('>'). If no caret is
+    found on the first line, then we conclude that it is not a FASTA file
+    and return False, otherwise, we return a dictionary with information
+    relevant to the FASTA file type.
+    '''
+    for line in fh:
+        st = line.strip()
+        if st:
+            fh.close()
+            if st[0] == '>':
+                return {'type': 'fasta'}
+            return False
+    fh.close()
+    return {'type': 'fasta'}
diff --git a/src/IO/fastq.py b/src/IO/fastq.py
new file mode 100644
index 0000000..2b0729c
--- /dev/null
+++ b/src/IO/fastq.py
@@ -0,0 +1,72 @@
+'''
+Functions for manipulating FASTQ files.
+'''
+
+from biotools.sequence import Sequence
+
+
+def read(fh):
+    '''
+    Read sequences in FASTQ format; identifiers are on lines that begin with 
+    at symbols ('@'), sequence follows on the next line, then a line that 
+    begins and sequence is with a plus sign ('+') and finally the quality 
+    scores on the subsequent line. Quality scores are encoded in Phred format,
+    the type of which (either 32 or 64) is determined when the file is probed
+    for opening. The scores are decoded into a list of integers. This function 
+    is a generator that yields `Sequence` objects.
+    '''
+    while 1:
+        try:
+            line = fh.next().strip()
+            if line[0] == '@':
+                name = line[1:].split()[0]
+                defline = line[1 + len(name):].strip()
+                seq = f.next().strip()
+                fh.next()
+                qual = [ord(c) - self.phred for c in fh.next().strip()]
+                yield Sequence(name, seq, qual=qual, defline=defline)
+        except StopIteration:
+            raise
+        finally:
+            fh.close()
+
+
+def write(fh, s):
+    '''
+    Write sequences in FASTA format, i.e.,
+    
+    ```
+    @name
+    sequence ...
+    +
+    quality scores
+    ```
+    '''
+    fh.write('@%s %s\n%s\n+\n%s\n' % (s.name, s.defline, s.seq,
+             ''.join(q + chr('A') - 1 for q in s.qual)) + '\n')
+
+
+def probe(fh):
+    '''
+    Probe a file to determine whether or not it is a FASTQ file. That is,
+    the first non-empty line should begin with a caret ('@') and the 3rd line
+    following that first non-empty line should contain no character with
+    ordinal value less than 32. If none of the characters have ordinal value
+    less than 64, then the file is guessed to be encoded in Phred64, otherwise
+    it is encoded in Phred32. This function will return False if the file is
+    not in FASTQ format and will return a dictionary with the phred score and
+    type ('fastq') if the file is FASTQ.
+    '''
+    for line in fh:
+        st = line.strip()
+        if st:
+            fh.close()
+            if st[0] == '@':
+                fh.next()
+                fh.next()
+                qual = [ord(c) for c in fh.next().strip()]
+                phred = 32 if min(qual) < ord('A') else 64
+                qual = [q - phred for q in qual]
+                return {'type': 'fastq', 'phred': phred}
+            return False
+    return {'type': 'fastq', 'phread': 64}
diff --git a/src/IO/gff.py b/src/IO/gff.py
new file mode 100644
index 0000000..c2f11f1
--- /dev/null
+++ b/src/IO/gff.py
@@ -0,0 +1,28 @@
+from biotools.annotation import Annotation
+
+
+def read(fh):
+    for line in fh:
+        if line[0] != '#':
+            yield Annotation(*line.split('\t'))
+    raise StopIteration()
+
+
+def write(fh, a):
+    # TODO: improve this...
+    fh.write(str(a) + '\n')
+
+
+def probe(fh):
+    for line in fh:
+        line = line.strip()
+        if line:
+            bits = line.split()
+            if bits[0] == '##gff-version':
+                return {'type': 'gff', 'version': float(bits[1])}
+            return False
+    return {'type': 'gff', 'version': 3}
+
+
+def whook(fh):
+    fh.write('##gff-version 3\n')
diff --git a/src/IO/manager.py b/src/IO/manager.py
new file mode 100644
index 0000000..9046496
--- /dev/null
+++ b/src/IO/manager.py
@@ -0,0 +1,77 @@
+'''
+This module is home to the IOManager class, which manages the various input
+and output formats (specifically, FASTA, FASTQ, CLUSTAL alignments, and GFF
+files, currently).
+'''
+
+
+class IOManager(object):
+    '''
+    A class used by the `IOBase` class to manage the various input and output
+    methods for the different file types. Additional file types can be added 
+    to the manager by using 
+
+    ```python
+    manager[format] = methods
+    ```
+
+    From the above example, `methods` is a dictionary with keys `rhook`, 
+    `read`, `whook`, `write`, and `probe`. Each of the values must be callable
+    object:
+    * `rhook` => takes a file handle opened for reading; called before reading
+        of the file has begun,
+    * `whook` => takes a file handle opened for writing; called before writing
+        to the file has begun,
+    * `read`  => takes a file handle opened for reading; should be a generator
+        that yields entries,
+    * `write` => takes a file handle opened for writing and a single entry;
+        writes the entry to the file,
+    * `probe` => takes a file handle opened for reading; returns a dictionary
+        of attributes to be applied to the `IOBase` instance.
+
+    This class behaves similarly to a dictionary, except that the get method
+    will default to the default method (which does nothing) if no truthy
+    second parameter is passed.
+    '''
+
+    def __init__(self, methods=None):
+        '''
+        Instantiates an `IOManager` with methods, where the keys of methods are
+        the formats and the values are dictionaries with `rhook`, `whook`, 
+        `read`, `write`, and `probe` callables.
+        '''
+        self.methods = methods or {}
+        self.protected = set(self.methods.keys())
+
+        nil = lambda *x: iter([])
+        self.default = {
+            'rhook': nil,
+            'read':  nil,
+            'whook': nil,
+            'write': nil,
+            'probe': nil
+        }
+
+    def __contains__(self, key):
+        return key in self.methods
+
+    def __getitem__(self, key):
+        return self.methods.get(key, self.default)
+
+    def get(self, key, default=None):
+        '''
+        Try to get a set of methods via format (e.g., 'fasta') or fall-back
+        to the default methods (which do nothing).
+        '''
+        try:
+            return self.methods[key]
+        except KeyError:
+            return default or self.default
+
+    def __setitem__(self, key, value):
+        if key not in self.protected:
+            self.methods[key] = value
+        return value
+
+    def __iter__(self):
+        return iter(self.methods)
diff --git a/src/IO/phylip.py b/src/IO/phylip.py
new file mode 100644
index 0000000..0489714
--- /dev/null
+++ b/src/IO/phylip.py
@@ -0,0 +1,57 @@
+'''
+Functions for manipulating PHYLIP files.
+'''
+
+from biotools.sequence import Sequence, chop
+
+
+def read(fh):
+    '''
+    Read sequences in PHYLIP format; This function is a generator that yields
+    `Sequence` objects.
+    '''
+    grid, seqs, names = None, [], []
+    
+    try:
+        line = fh.next().strip()
+        while not line:
+            line = fh.next.strip()
+    except StopIteration:
+        fh.close()
+        raise StopIteration()
+    grid = [int(x) for x in line.strip()]
+    while True:
+        lines = []
+        try:
+            for i in xrange(grid[0]):
+                line = fh.next().strip()
+                while not line:
+                    line = fh.next().strip()
+                lines.append(line)
+            if not names:
+                names = [l.split()[0] for l in lines]
+                try:
+                    seqs = [''.join(l.split()[1:]) for l in lines]
+                except IndexError:
+                    seqs = [''] * grid[0]
+            else:
+                temp = [''.join(l.split()) for l in lines]
+                seqs = [a + b for a, b in zip(seqs, temp)]
+        except StopIteration:
+            break
+    for name, seq in zip(names, seqs):
+        yield Sequence(name, seq)
+    fh.close()
+    raise StopIteration()
+
+
+def probe(fh):
+    '''
+    '''
+    for line in fh:
+        bits = line.split()
+        if len(bits) == 2 and int(bits[0]) > 0 and int(bits[1]) > 0:
+            return {'type': 'phylip'}
+        else:
+            return False
+    return {'type': 'phylip'}
diff --git a/src/__init__.py b/src/__init__.py
new file mode 100644
index 0000000..8b85d47
--- /dev/null
+++ b/src/__init__.py
@@ -0,0 +1,3 @@
+'''
+A bunch of bioinformatics utilities.
+'''
diff --git a/src/align.py b/src/align.py
new file mode 100644
index 0000000..36a086e
--- /dev/null
+++ b/src/align.py
@@ -0,0 +1,165 @@
+#!/usr/bin/env python
+
+'''
+This module is used to align sequences. Currently, there is only a single
+alignment algorithm implementented; it is a hybrid between Needleman-Wunsch
+and Smith-Waterman and is used to find the subsequence within a larger sequence
+that best aligns to a reference.
+'''
+
+from biotools.translate import translate
+import biotools.analysis.options as options
+
+DIAG_MARK, VGAP_MARK, HGAP_MARK = 3, 2, 1
+bl = {
+ '*': {'*': 0, 'A': -9, 'C': -9, 'E': -9, 'D': -9, 'G': -9, 'F': -9, 'I': -9,
+       'H': -9, 'K': -9, 'M': -9, 'L': -9, 'N': -9, 'Q': -9, 'P': -9, 'S': -9,
+       'R': -9, 'T': -9, 'W': -9, 'V': -9, 'Y': -9, 'X': 0},
+ 'A': {'*': -9, 'A': 4, 'C': 0, 'E': -1, 'D': -2, 'G': 0, 'F': -2, 'I': -1,
+       'H': -2, 'K': -1, 'M': -1, 'L': -1, 'N': -1, 'Q': -1, 'P': -1, 'S': 1,
+       'R': -1, 'T': -1, 'W': -3, 'V': -2, 'Y': -2, 'X': 0},
+ 'C': {'*': -9, 'A': 0, 'C': 9, 'E': -4, 'D': -3, 'G': -3, 'F': -2, 'I': -1,
+       'H': -3, 'K': -3, 'M': -1, 'L': -1, 'N': -3, 'Q': -3, 'P': -3, 'S': -1,
+       'R': -3, 'T': -1, 'W': -2, 'V': -1, 'Y': -2, 'X': 0},
+ 'E': {'*': -9, 'A': -1, 'C': -4, 'E': 5, 'D': 2, 'G': -2, 'F': -3, 'I': -3,
+       'H': 0, 'K': 1, 'M': -2, 'L': -3, 'N': 0, 'Q': 2, 'P': -1, 'S': 0,
+       'R': 0, 'T': 0, 'W': -3, 'V': -3, 'Y': -2, 'X': 0},
+ 'D': {'*': -9, 'A': -2, 'C': -3, 'E': 2, 'D': 6, 'G': -1, 'F': -3, 'I': -3,
+       'H': -1, 'K': -1, 'M': -3, 'L': -4, 'N': 1, 'Q': 0, 'P': -1, 'S': 0,
+       'R': -2, 'T': 1, 'W': -4, 'V': -3, 'Y': -3, 'X': 0},
+ 'G': {'*': -9, 'A': 0, 'C': -3, 'E': -2, 'D': -1, 'G': 6, 'F': -3, 'I': -4,
+       'H': -2, 'K': -2, 'M': -3, 'L': -4, 'N': -2, 'Q': -2, 'P': -2, 'S': 0,
+       'R': -2, 'T': 1, 'W': -2, 'V': 0, 'Y': -3, 'X': 0},
+ 'F': {'*': -9, 'A': -2, 'C': -2, 'E': -3, 'D': -3, 'G': -3, 'F': 6, 'I': 0,
+       'H': -1, 'K': -3, 'M': 0, 'L': 0, 'N': -3, 'Q': -3, 'P': -4, 'S': -2,
+       'R': -3, 'T': -2, 'W': 1, 'V': -1, 'Y': 3, 'X': 0},
+ 'I': {'*': -9, 'A': -1, 'C': -1, 'E': -3, 'D': -3, 'G': -4, 'F': 0, 'I': 4,
+       'H': -3, 'K': -3, 'M': 1, 'L': 2, 'N': -3, 'Q': -3, 'P': -3, 'S': -2,
+       'R': -3, 'T': -2, 'W': -3, 'V': 1, 'Y': -1, 'X': 0},
+ 'H': {'*': -9, 'A': -2, 'C': -3, 'E': 0, 'D': 1, 'G': -2, 'F': -1, 'I': -3,
+       'H': 8, 'K': -1, 'M': -2, 'L': -3, 'N': 1, 'Q': 0, 'P': -2, 'S': -1,
+       'R': 0, 'T': 0, 'W': -2, 'V': -2, 'Y': 2, 'X': 0},
+ 'K': {'*': -9, 'A': -1, 'C': -3, 'E': 1, 'D': -1, 'G': -2, 'F': -3, 'I': -3,
+       'H': -1, 'K': 5, 'M': -1, 'L': -2, 'N': 0, 'Q': 1, 'P': -1, 'S': 0,
+       'R': 2, 'T': 0, 'W': -3, 'V': -3, 'Y': -2, 'X': 0},
+ 'M': {'*': -9, 'A': -1, 'C': -1, 'E': -2, 'D': -3, 'G': -3, 'F': 0, 'I': 1,
+       'H': -2, 'K': -1, 'M': 5, 'L': 2, 'N': -2, 'Q': 0, 'P': -2, 'S': -1,
+       'R': -1, 'T': -1, 'W': -1, 'V': -2, 'Y': -1, 'X': 0},
+ 'L': {'*': -9, 'A': -1, 'C': -1, 'E': -3, 'D': -4, 'G': -4, 'F': 0, 'I': 2,
+       'H': -3, 'K': -2, 'M': 2, 'L': 4, 'N': -3, 'Q': -2, 'P': -3, 'S': -2,
+       'R': -2, 'T': -2, 'W': -2, 'V': 3, 'Y': -1, 'X': 0},
+ 'N': {'*': -9, 'A': -2, 'C': -3, 'E': 0, 'D': 1, 'G': 0, 'F': -3, 'I': -3,
+       'H': -1, 'K': 0, 'M': -2, 'L': -3, 'N': 6, 'Q': 0, 'P': -2, 'S': 1,
+       'R': 0, 'T': 0, 'W': -4, 'V': -3, 'Y': -2, 'X': 0},
+ 'Q': {'*': -9, 'A': -1, 'C': -3, 'E': 2, 'D': 0, 'G': -2, 'F': -3, 'I': -3,
+       'H': 0, 'K': 1, 'M': 0, 'L': -2, 'N': 0, 'Q': 5, 'P': -1, 'S': 0,
+       'R': 1, 'T': 0, 'W': -2, 'V': -2, 'Y': -1, 'X': 0},
+ 'P': {'*': -9, 'A': -1, 'C': -3, 'E': -1, 'D': -1, 'G': -2, 'F': -4, 'I': -3,
+       'H': -2, 'K': -1, 'M': -2, 'L': -3, 'N': -1, 'Q': -1, 'P': 7, 'S': -1,
+       'R': -2, 'T': 1, 'W': -4, 'V': -2, 'Y': -3, 'X': 0},
+ 'S': {'*': -9, 'A': 1, 'C': -1, 'E': 0, 'D': 0, 'G': 0, 'F': -2, 'I': -2,
+       'H': -1, 'K': 0, 'M': -1, 'L': -2, 'N': 1, 'Q': 0, 'P': -1, 'S': 4,
+       'R': -1, 'T': 1, 'W': -3, 'V': -2, 'Y': -2, 'X': 0},
+ 'R': {'*': -9, 'A': -1, 'C': -3, 'E': 0, 'D': -2, 'G': -2, 'F': -3, 'I': -3,
+       'H': 0, 'K': 2, 'M': -1, 'L': -2, 'N': 0, 'Q': 1, 'P': -2, 'S': -1,
+       'R': 5, 'T': -1, 'W': -3, 'V': -3, 'Y': -2, 'X': 0},
+ 'T': {'*': -9, 'A': -1, 'C': -1, 'E': 0, 'D': 1, 'G': 1, 'F': -2, 'I': -2,
+       'H': 0, 'K': 0, 'M': -1, 'L': -2, 'N': 0, 'Q': 0, 'P': 1, 'S': 1,
+       'R': -1, 'T': 4, 'W': -3, 'V': -2, 'Y': -2, 'X': 0},
+ 'W': {'*': -9, 'A': -3, 'C': -2, 'E': -3, 'D': -4, 'G': -2, 'F': 1, 'I': -3,
+       'H': -2, 'K': -3, 'M': -1, 'L': -2, 'N': -4, 'Q': -2, 'P': -4, 'S': -3,
+       'R': -3, 'T': -3, 'W': 11, 'V': -3, 'Y': 2, 'X': 0},
+ 'V': {'*': -9, 'A': 0, 'C': -1, 'E': -2, 'D': -3, 'G': -3, 'F': -1, 'I': 3,
+       'H': -3, 'K': -2, 'M': 1, 'L': 1, 'N': -3, 'Q': -2, 'P': -2, 'S': -2,
+       'R': -3, 'T': -2, 'W': -3, 'V': 4, 'Y': -1, 'X': 0},
+ 'Y': {'*': -9, 'A': -2, 'C': -2, 'E': -2, 'D': -3, 'G': -3, 'F': 3, 'I': -1,
+       'H': 2, 'K': -2, 'M': -1, 'L': -1, 'N': -2, 'Q': -1, 'P': -3, 'S': -2,
+       'R': -2, 'T': -2, 'W': 2, 'V': -1, 'Y': 7, 'X': 0},
+ 'X': {'*': 0, 'A': 0, 'C': 0, 'E': 0, 'D': 0, 'G': 0, 'F': 0, 'I': 0,
+       'H': 0, 'K': 0, 'M': 0, 'L': 0, 'N': 0, 'Q': 0, 'P': 0, 'S': 0,
+       'R': 0, 'T': 0, 'W': 0, 'V': 0, 'Y': 0, 'X': 0}
+}
+
+
+def OptimalCTether(reference, translation, extend=1, create=10):
+    '''
+    This function will take two sequences: a `reference` sequence and  another
+    protein sequence (`translation`; usually, this is an open reading frame
+    that has been translated). Needleman-Wunsch alignment will be performed
+    and the substring of translation with the highest identity that begins
+    with a start codon [default: `['ATG']`] is reported.
+
+    This function returns a dictionary of relevent information from the
+    alignment; specifically, the alignments itself [keys: `query`, `subject`],
+    the score [key: `score`], the length of the alignment [key: `length`], the
+    length of the substring of translation used [key: `sublength`], the number
+    of identities [key: `identities`], and the number of gaps [key: `gaps`].
+    '''
+
+    starts = set(translate(s) for s in options.START_CODONS)
+    v, w = reference, translation
+
+    try:
+        v = v.seq
+    except AttributeError:
+        pass
+    try:
+        w = w.seq
+    except AttributeError:
+        pass
+    if not starts & set(w):
+        raise ValueError("Open reading frame does not contain a start codon.")
+
+    v, w = v[::-1], w[::-1]
+    lv, lw = len(v), len(w)
+    rv, rw = range(lv + 1), range(lw + 1)
+    gpc = [[create * int(not (i | j)) for i in rw] for j in rv]
+    mat = [[-(i + j) * extend - create * (not (i | j) and w[0] != v[0])
+           for i in rw] for j in rv]
+    pnt = [[VGAP_MARK if i > j else HGAP_MARK if j > i else DIAG_MARK
+           for i in rw] for j in rv]
+    ids = [[0 for i in rw] for j in rv]
+    optimal = [None, 0, 0]
+
+    for i in range(lv):
+        for j in range(lw):
+            vals = [[mat[i][j] + bl[v[i]][w[j]], DIAG_MARK],
+                    [mat[i + 1][j] - extend - gpc[i + 1][j], VGAP_MARK],
+                    [mat[i][j + 1] - extend - gpc[i][j + 1], HGAP_MARK]]
+            mat[i + 1][j + 1], pnt[i + 1][j + 1] = max(vals)
+            gpc[i + 1][j + 1] = create * int(pnt[i + 1][j + 1] == DIAG_MARK)
+            if (optimal[0] is None or mat[i + 1][j + 1] > optimal[0]) and \
+                    abs(lv - i) / float(lv) <= options.LENGTH_ERR and \
+                    w[j] in starts:
+                optimal = [mat[i + 1][j + 1], i + 1, j + 1]
+
+    i, j = optimal[1], optimal[2]
+    seq, ids = ['', ''], 0
+    gapcount, length, sublen = 0, 0, 0
+    methods = {
+        VGAP_MARK:
+            lambda s, i, j, l, g, n:
+                (['-' + s[0], w[j - 1] + s[1]], i, j - 1, l + 1, g + 1, n),
+        DIAG_MARK:
+            lambda s, i, j, l, g, n:
+                ([v[i - 1] + s[0], w[j - 1] + s[1]], i - 1, j - 1,
+                 l + 1, g, n + (w[j - 1] == v[i - 1])),
+        HGAP_MARK:
+            lambda s, i, j, l, g, n:
+                ([v[i - 1] + s[0], '-' + s[1]], i - 1, j, l, g + 1, n)
+    }
+
+    while [i, j] != [0, 0]:
+        length += 1
+        state = (seq, i, j, sublen, gapcount, ids)
+        seq, i, j, sublen, gapcount, ids = methods[pnt[i][j]](*state)
+
+    return {
+        'subject': seq[0][::-1],
+        'query': seq[1][::-1],
+        'score': optimal[0],
+        'gaps': gapcount,
+        'length': length,
+        'sublength': sublen,
+        'identities': ids
+    }
diff --git a/src/analysis/__init__.py b/src/analysis/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/analysis/cluster.py b/src/analysis/cluster.py
new file mode 100644
index 0000000..8e53d79
--- /dev/null
+++ b/src/analysis/cluster.py
@@ -0,0 +1,116 @@
+#!/usr/bin/env python
+import biotools.sequence as sequ
+import biotools.IO as io
+import biotools.translate as tran
+import biotools.clustal as clustal
+import biotools.analysis.options as options
+try:
+    import Queue as queue
+except ImportError:
+    import queue
+import hashlib
+import subprocess
+import threading
+from os import sep, mkdir
+
+
+def run(direc, inputs):
+    '''
+    Takes a collection of files generated by gene prediction, creates clusters
+    based off of the genes that have homology to those predicted genes, and
+    creates new fasta files in the clusters sub directory under the given
+    directory and separated according to whether they are nucleotide or amino
+    acid sequnces. These new fasta files are then used to create clustalw
+    alignments of the genes if more than 1 sequence exists in the fasta file.
+    '''
+
+    clusters = {}
+    all_ids = set()
+    ids = {}
+    q = queue.Queue()
+    filenames = []
+
+    def run_clustal():
+        while not q.empty():
+            cid = q.get()
+            dig = hashlib.md5()
+            dig.update(' '.join(cid))
+            dig = dig.hexdigest()
+
+            fpre = direc + 'nt' + sep + dig
+            apre = direc + 'aa' + sep + dig
+            fname = fpre + ".fasta"
+            aname = apre + ".fasta"
+
+            fh = io.open(fname, 'w')
+            ah = io.open(aname, 'w')
+            for ipt in clusters:
+                counter = 0
+                name = '_'.join(ipt.split(sep)[-1].split('.')[0].split())
+                for cluster in clusters[ipt]:
+                    if cid & cluster[0]:
+                        nm = name + '_' + str(counter)
+                        seq = cluster[1]
+                        curr = sequ.Sequence(nm, seq, defline=', '.join(cid))
+                        tr = tran.translate(curr)
+                        tr.name = curr.name
+                        fh.write(curr)
+                        ah.write(tr)
+                        counter += 1
+            fh.close()
+            ah.close()
+
+            try:
+                clustal.run(fname, fpre + '.clustalw')
+                clustal.run(aname, apre + '.clustalw')
+                filenames.append(dig + '.fasta')
+            except ValueError:
+                pass
+
+            q.task_done()
+
+    if direc:
+        for d in [direc, direc + 'nt' + sep, direc + 'aa' + sep]:
+            try:
+                mkdir(d)
+            except OSError:
+                pass
+
+    for ipt in inputs:
+        seqs = {}
+        ids[ipt] = set()
+        for seq in io.open(ipt, 'r'):
+            ids[ipt].add(seq.name)
+            all_ids.add(seq.name)
+            if seq.seq not in seqs:
+                seqs[seq.seq] = set()
+            seqs[seq.seq].add(seq.name)
+        clusters[ipt] = [(seqs[k], k) for k in seqs]
+        del seqs
+
+    sub_ids = []
+    while all_ids:
+        cid = all_ids.pop()
+        subcluster = (all_ids | set([cid])) & \
+            set(i for ipt in clusters for cluster in clusters[ipt]
+                for i in cluster[0] if cid in cluster[0])
+
+        for ipt in clusters:
+            for cluster in clusters[ipt]:
+                if cid in cluster[0]:
+                    subcluster = (subcluster & cluster[0]) | \
+                        (subcluster - ids[ipt])
+        sub_ids.append(subcluster)
+        all_ids -= subcluster
+
+    for cid in sub_ids:
+        q.put(cid)
+
+    threads = []
+    for i in xrange(options.NUM_PROCESSES - 1):
+        curr = threading.Thread(target=run_clustal)
+        threads.append(curr)
+        curr.start()
+    run_clustal()
+    q.join()
+    return filenames
diff --git a/src/analysis/loaddata.py b/src/analysis/loaddata.py
new file mode 100644
index 0000000..a267912
--- /dev/null
+++ b/src/analysis/loaddata.py
@@ -0,0 +1,176 @@
+'''
+This is a pretty simple JSON-like parser. Specifically, it can load Python-like
+object, list, and other literals, i.e., the sort of stuff you'd get it you
+dumped the the string representation of some data into a file.
+
+The real difference is that you must specify a variable name, e.g.:
+
+```python
+my_stuff = { ... }
+```
+
+These variable names don't need to be on a newline or anything like that, you
+should be able to omit any and all whitespace. The result of a successful 
+parse is a dictionary:
+
+```python
+{'my_stuff': { ... }}
+```
+
+This function really only works for `None`, `True`, `False`, numbers, strings, 
+dictionaries, and lists.
+'''
+
+def parse(ipt):
+    par = {'char': '', 'pos': -1}
+    leng = 0
+    val = ""
+    ret = {}
+
+    def advance(test=None):
+        curr = par['char']
+        if test and test != curr:
+            raise ValueError("Expected %s, saw %s." % (test, curr))
+
+        par['pos'] += 1
+        if par['pos'] < leng:
+            par['char'] = val[par['pos']]
+        else:
+            par['char'] = ''
+
+    def whitespace():
+        while par['char'] and ord(par['char']) <= ord(' '):
+            advance()
+
+    def variable():
+        whitespace()
+        start = par['pos']
+        while \
+                ('0' <= par['char'] <= '9') or \
+                ('A' <= par['char'] <= 'Z') or \
+                ('a' <= par['char'] <= 'z'):
+            advance()
+        stop = par['pos']
+        whitespace()
+        advance('=')
+
+        return (val[start:stop], value())
+
+    def value():
+        whitespace()
+        if par['char'] == '{':
+            return dictionary()
+        if par['char'] == '[':
+            return array()
+        if par['char'] in ("'", '"'):
+            return string()
+        if '0' <= par['char'] <= '9' or par['char'] == '-':
+            return number()
+        at = par['pos']
+        if par['char'] == 'T':
+            advance('r')
+            advance('u')
+            advance('e')
+            return True
+        if par['char'] == 'F':
+            advance('a')
+            advance('l')
+            advance('s')
+            advance('e')
+            return False
+        if par['char'] == 'N':
+            advance('o')
+            advance('n')
+            advance('e')
+            return None
+        prefix = val[at:par['pos']]
+        raise ValueError("Unexpected value starting with %s." % prefix)
+
+    def dictionary():
+        ret = {}
+        advance('{')
+        whitespace()
+        while par['char'] != '}':
+            s = value()
+            advance(':')
+            v = value()
+            ret[s] = v
+            if par['char'] == ',':
+                advance(',')
+            else:
+                break
+        advance('}')
+        whitespace()
+        return ret
+
+    def array():
+        ret = []
+        advance('[')
+        whitespace()
+        while par['char'] != ']':
+            ret.append(value())
+            if par['char'] == ',':
+                advance(',')
+            else:
+                break
+        advance(']')
+        whitespace()
+        return ret
+
+    def string():
+        ret = ""
+        q = par['char']
+        advance(q)
+        while par['char'] != q:
+            if par['char'] == '\\':
+                advance('\\')
+                if par['char'] == 'n':
+                    ret += '\n'
+                elif par['char'] == 't':
+                    ret += '\t'
+                else:
+                    ret += par['char']
+                advance()
+                continue
+            ret += par['char']
+            advance()
+        advance(q)
+        whitespace()
+        return ret
+
+    def number():
+        ret = ""
+        if par['char'] == '-':
+            ret += '-'
+            advance('-')
+        while '0' <= par['char'] <= '9':
+            ret += par['char']
+            advance()
+        if par['char'] == '.':
+            ret += '.'
+            advance('.')
+            while '0' <= par['char'] <= '9':
+                ret += par['char']
+                advance()
+        if par['char'] in ('e', 'E'):
+            ret += par['char']
+            advance()
+            if par['char'] == '-':
+                ret += '-'
+                advance('-')
+            while '0' <= par['char'] <= '9':
+                ret += par['char']
+                advance()
+        whitespace()
+        return float(ret)
+
+    for line in ipt:
+        val += line
+        leng += len(line)
+
+    advance()
+    while par['pos'] < leng:
+        k, v = variable()
+        ret[k] = v
+
+    return ret
diff --git a/src/analysis/options.py b/src/analysis/options.py
new file mode 100644
index 0000000..9f8a79d
--- /dev/null
+++ b/src/analysis/options.py
@@ -0,0 +1,168 @@
+from optparse import OptionParser
+from threading import Lock
+from os import sep, makedirs
+from sys import stderr
+
+LENGTH_ERR = 0.2
+MIN_IDENTITY = 0.45
+MAX_EVALUE = 1e-30
+MIN_ORFLEN = 300
+NUM_THREADS = 16
+NUM_PROCESSES = 2
+DIRECTORY = '.' + sep
+PLOTTER = 'biotools.analysis.plot'
+
+START_CODONS = ["ATG"]
+STOP_CODONS = ["TAG", "TAA", "TGA"]
+args = tuple()
+
+predicting = True
+clustering = True
+renaming = True
+calculating = True
+reporting = True
+plotting = True
+verbose = False
+
+lock = Lock()
+
+parser = OptionParser(usage="Usage: %prog [options] " +
+                      "<database> <sequences ...>")
+parser.add_option("-S", "--start", action="append", dest="start",
+                  default=START_CODONS, type="string",
+                  help="define a start codon [default: %s]" %
+                  ' '.join("-S " + s for s in START_CODONS))
+parser.add_option("-E", "--stop", action="append", dest="stop",
+                  default=STOP_CODONS, type="string",
+                  help="define a stop codon [default: %s]" %
+                  ' '.join("-E " + s for s in STOP_CODONS))
+parser.add_option("-j", "--threads", action="store", dest="threads",
+                  default=NUM_THREADS, type="int",
+                  help="number of threads [default: %default]")
+parser.add_option("-p", "--processes", action="store", dest="processes",
+                  default=NUM_PROCESSES, type="int",
+                  help="number of parallel processes to run " +
+                  "[default: %default]")
+parser.add_option("-e", "--evalue", action="store", dest="evalue",
+                  default=MAX_EVALUE, type="float",
+                  help="maximum e-value [default: %default]")
+parser.add_option("-I", "--identity", action="store", dest="identity",
+                  default=MIN_IDENTITY, type="float",
+                  help="minimum percent identity [default: %default]")
+parser.add_option("-L", "--length", action="store", dest="fraction",
+                  default=LENGTH_ERR, type="float",
+                  help="allowable relative error in hit length " +
+                  "[default: %default]")
+parser.add_option("-O", "--orflen", action="store", dest="orflen",
+                  metavar="bases", default=MIN_ORFLEN, type="int",
+                  help="minimum allowable length for ORFs [default: %default]")
+parser.add_option("-d", "--directory", action="store", dest="directory",
+                  default=DIRECTORY, type="string",
+                  help="set working directory [default: current]")
+parser.add_option("-P", "--plotter", action="store", dest="plotter",
+                  default=PLOTTER, type="string",
+                  help="plotting module [default: %default]")
+parser.add_option("-v", "--verbose", action="store_true", dest="verbose",
+                  default=verbose,
+                  help="print debug messages [default: False]")
+
+parser.add_option("--no-plots", action="store_false", dest="plotting",
+                  default=plotting,
+                  help="suppress the drawing of plots [default: False]")
+parser.add_option("--no-predict", action="store_false", dest="predicting",
+                  default=predicting,
+                  help="don't predict genes, instead treat the input files " +
+                  "as predicted genes [default: False]")
+parser.add_option("--no-cluster", action="store_false", dest="clustering",
+                  default=True, help="don't cluster the sequences, " +
+                  "instead treat the input files as alignments " +
+                  "[default: False]")
+parser.add_option("--no-rename", action="store_false", dest="renaming",
+                  default=True,
+                  help="don't rename the fasta and clustal files " +
+                  "[default: False]")
+parser.add_option("--no-reports", action="store_false", dest="reporting",
+                  default=True,
+                  help="don't generate files for variance data " +
+                  "[default: False]")
+parser.add_option("--no-calculation", action="store_false",
+                  dest="calculating", default=True,
+                  help="don't calculate sequence variance [default: False]")
+
+
+def debug(msg):
+    if verbose:
+        lock.acquire(True)
+        stderr.write(str(msg) + '\n')
+        lock.release()
+
+
+def parse(pargs):
+    '''
+    Parses `pargs` and sets global variables to be accessible to other
+    modules.
+
+    These variables are:
+    * `LENGTH_ERR`
+    * `MIN_IDENTITY`
+    * `MAX_EVALUE`
+    * `NUM_THREADS`
+    * `NUM_PROCESSES`
+    * `START_CODONS`
+    * `START_CODONS`
+    * `DIRECTORY`
+    * `PLOTTER`
+    * `args`
+    '''
+    global \
+        LENGTH_ERR, MIN_IDENTITY, MAX_EVALUE, MIN_ORFLEN, \
+        NUM_THREADS, NUM_PROCESSES, START_CODONS, STOP_CODONS, \
+        DIRECTORY, PLOTTER, args, predicting, clustering, \
+        renaming, calculating, reporting, plotting, verbose
+
+    opts, largs = parser.parse_args(pargs)
+
+    if opts.directory[-1] != sep:
+        opts.directory += sep
+    try:
+        makedirs(opts.directory)
+    except OSError:
+        pass
+
+    if '*' in opts.start:
+        DNA = 'ATCG'
+        opts.start = set(i + j + k for i in DNA for j in DNA for k in DNA) - \
+            set(opts.stop)
+
+    opts.processes = min(opts.processes, len(largs))
+    if not (0 <= opts.fraction <= 1):
+        raise RuntimeError("Allowable length error must be between 0 and 1.")
+
+    LENGTH_ERR = opts.fraction
+    MIN_IDENTITY = opts.identity
+    MAX_EVALUE = opts.evalue
+    MIN_ORFLEN = opts.orflen
+    NUM_THREADS = opts.threads
+    NUM_PROCESSES = opts.processes
+    STOP_CODONS = opts.stop
+    START_CODONS = opts.start
+    DIRECTORY = opts.directory
+    PLOTTER = opts.plotter
+
+    predicting = opts.predicting
+    clustering = opts.clustering
+    renaming = opts.renaming
+    calculating = opts.calculating
+    reporting = opts.reporting
+    plotting = opts.plotting
+
+    verbose = opts.verbose
+
+    args = largs
+
+
+def help():
+    '''
+    Prints the usage.
+    '''
+    parser.print_help()
diff --git a/src/analysis/plot.py b/src/analysis/plot.py
new file mode 100644
index 0000000..1854aba
--- /dev/null
+++ b/src/analysis/plot.py
@@ -0,0 +1,122 @@
+import numpy as np
+import matplotlib.pyplot as plt
+import math
+from os import sep, mkdir
+
+
+def smoothed(unsmoothed, factor):
+    l = len(unsmoothed)
+    r = [math.exp(-(i - l) ** 2 / float(factor)) for i in xrange(1, l)]
+    w = np.array(r + [1] + r[::-1])
+    c = (w[l - i - 1:2 * l - i - 1] for i in xrange(l))
+    wm = np.array([x / sum(x) for x in c])
+    return np.dot(unsmoothed, wm.transpose())
+
+
+def plot(plotdata, directory, bottom=True, side=True, legend=True,
+         save=True, filename='untitled.pdf', upperbound=0.05, factor=21,
+         fig=None, **kwargs):
+    if fig is None:
+        fig = plt.figure(None, facecolor='w', edgecolor='w')
+
+    if not directory.endswith(sep):
+        directory += sep
+
+    try:
+        mkdir(directory)
+    except OSError:
+        pass
+
+    # plotting data
+    ntvar = plotdata['nt']['var']
+    aavar = plotdata['aa']['var']
+
+    # gene models
+    starts = plotdata['aa']['starts']
+    ends = plotdata['aa']['ends']
+    counts = plotdata['aa']['count']
+
+    # smooth the data
+    snt = smoothed(ntvar, factor)
+    lnt = len(ntvar)
+    saa = smoothed(aavar, factor)
+    laa = len(aavar)
+
+    # bounding rectangle
+    bound = [0, laa, -upperbound / 6.0, upperbound]
+
+    # x-values to align nucleotide & amino acids
+    xnt = np.arange(lnt) / 3.0 + 1
+    xaa = np.arange(laa) + 1
+
+    ax = axes(bottom, side, bound, fig, **kwargs)
+    ntl = draw(xnt, snt, ax, '#0000ff', **kwargs)
+    aal = draw(xaa, saa, ax, '#00ff00', **kwargs)
+    models(starts, ends, counts, bound, ax, **kwargs)
+    report(filename, ntvar, aavar, lnt, laa)
+
+    if legend:
+        fig.legend((ntl, aal), ('Nucleotide', 'Amino acid'), 'upper right')
+    if save:
+        fig.savefig(directory + filename)
+
+
+def axes(bottom, side, bound, fig, **kwargs):
+    # create the proper sized frame, depending on
+    # how we draw the plot
+    x = 0.09 if side else 0.02
+    y = 0.09 if bottom else 0.04
+
+    xs = [bound[0], bound[1] * 1.06]
+    # construct the axes
+    ax = fig.add_axes([x, y, 0.98 - x, 0.98 - y], xlim=xs, ylim=bound[2:])
+    ax.minorticks_on()
+    ax.tick_params(axis='x', which='minor', length=3)
+
+    # hide the unwanted axis lines (typically the top & right)
+    # label the wanted axes and draw them
+    for loc, spine in ax.spines.iteritems():
+        if loc in ['right', 'top']:
+            spine.set_color('none')
+            continue
+        if loc == 'bottom':
+            ax.xaxis.set_ticks_position('bottom')
+            if bottom:
+                ax.set_xlabel("Amino acids")
+            continue
+        if loc == 'left':
+            if side:
+                ax.set_ylabel("Sequence variance")
+                ax.yaxis.set_ticks_position('left')
+            else:
+                spine.set_color('none')
+                ax.tick_params('y', which='both', color='none',
+                               labelcolor='none')
+
+    ax.hlines(ax.get_yticks(), xs[0], xs[1], color='0.75',
+              linestyle='dashed')
+    ax.hlines(0, xs[0], xs[1], color='k', linestyle='solid')
+    return ax
+
+
+def draw(x, y, ax, color, **kwargs):
+    return ax.plot(x, y, color=color, linestyle='solid')
+
+
+def models(starts, ends, counts, bound, ax, **kwargs):
+    lb, l = bound[2], len(starts)
+    scale = bound[1] / max(ends)
+    ys, i = np.arange(1, l + 1) * lb / 3.0, 0
+
+    # draw the gene models
+    ax.hlines(ys, starts, ends, colors='k', lw=4, linestyle='solid')
+    for c in counts:
+        ax.text(bound[1] + 10, lb / 3.0 * (i + 1.25), int(c))
+        i += 1
+
+
+def report(filename, ntvar, aavar, lnt, laa):
+    print(filename.center(80, '='))
+    print('Average variance: ')
+    print('\t%f per base pair' % (sum(ntvar) / lnt))
+    print('\t%f per amino acid' % (sum(aavar) / laa))
diff --git a/src/analysis/predict.py b/src/analysis/predict.py
new file mode 100644
index 0000000..7e15814
--- /dev/null
+++ b/src/analysis/predict.py
@@ -0,0 +1,230 @@
+#!/usr/bin/env python
+import biotools.IO as io
+import biotools.BLAST as BLAST
+import biotools.analysis.options as options
+from biotools.sequence import Sequence, annotation as ann
+from biotools.align import OptimalCTether as align
+from biotools.translate import translate
+from biotools.complement import complement
+try:
+    import Queue as queue
+except ImportError:
+    import queue
+import threading
+from os import sep, mkdir
+
+PIPING = True
+
+
+def ORFGenerator(sequ):
+    '''
+    Scans both strands of the given sequence and yields the longest subsequence
+    that starts with a start codon and contains no stop codon other than the
+    final codon.
+    '''
+    comp = complement(sequ[::-1])
+    seq = sequ.seq
+    cseq = comp.seq
+    slen = len(sequ)
+    starts = [-1, 0, 1, -1, 0, 1]  # locations of start codons in each frame
+    stops = [0, 1, 2,  0, 1, 2]  # locations of stop  codons in each frame
+    mlen = options.MIN_ORFLEN
+
+    for i in xrange(slen - 2):
+        fcodon, rcodon = seq[i:i + 3], cseq[i:i + 3]
+        if fcodon in options.STOP_CODONS:
+            if i - mlen >= starts[i % 3 + 3] >= stops[i % 3 + 3]:
+                yield sequ[starts[i % 3 + 3]:i + 3]
+            stops[i % 3 + 3] = i + 3
+        elif fcodon in options.START_CODONS:
+            if starts[i % 3 + 3] < stops[i % 3 + 3]:
+                starts[i % 3 + 3] = i
+        if rcodon in options.STOP_CODONS:
+            if i - mlen >= starts[i % 3] >= stops[i % 3]:
+                yield comp[starts[i % 3]:i + 3]
+            stops[i % 3] = i + 3
+        elif rcodon in options.START_CODONS:
+            if starts[i % 3] < stops[i % 3]:
+                starts[i % 3] = i
+    raise StopIteration()
+
+
+class ThreadQueue(queue.Queue):
+
+    def __init__(self, target):
+        queue.Queue.__init__(self)
+        self.threadcount = 0
+        self.target = target
+
+    def put(self, item):
+        options.lock.acquire()
+        queue.Queue.put(self, item)
+        if self.threadcount < options.NUM_THREADS - 1:
+            thread = threading.Thread(target=self.target)
+            thread.start()
+            self.threadcount += 1
+        options.lock.release()
+
+
+def GeneFromBLAST(db, sequences, pref, names):
+    '''
+    BLASTs database against sequences, and for those results that pass the
+    length and percent identity requirements, attempt to locate the full gene
+    that corresponds to that BLAST hit. Genes that are found are saved in the
+    subdirectory sequences under the given directory, divided depending on
+    whether the sequnece is amino acid or nucleotide.
+    '''
+    PIPING = True
+    wd = options.DIRECTORY + 'sequences' + sep
+
+    for d in [options.DIRECTORY, wd]:
+        try:
+            mkdir(d)
+        except OSError:
+            pass
+
+    subj = dict((s.name, s) for s in io.open(db, 'r'))
+    options.debug("Database sequences loaded from file %s." % db)
+
+    try:
+        orfs = dict((s.name, [orf for orf in ORFGenerator(s)])
+                    for s in io.open(sequences, 'r'))
+        options.debug("ORFs loaded from file %s." % sequences)
+    except IOError:
+        options.debug("No file \"" + sequences + ",\" skipping.")
+        return
+
+    def target():
+        while 1:
+            try:
+                res = qin.get(PIPING, 1)
+            except queue.Empty:
+                if not PIPING:
+                    break
+                else:
+                    continue
+
+            qname, sname = res['query']['name'], res['subject']['name']
+            start, end = res['query']['start'], res['query']['end']
+            alignments = []
+            max_match = (options.MIN_IDENTITY, None)
+
+            if subj[sname].type == 'nucl':
+                subject = translate(subj[sname])
+            else:
+                subject = subj[sname]
+
+            while qname:
+                try:
+                    o = orfs[qname]
+                    break
+                except KeyError:
+                    qname = qname[:-1]
+            if not qname:
+                qin.task_done()
+                continue
+
+            for orf in o:
+                if in_range(orf, start, end, res['frame']):
+                    orf = orf[:-3]
+                    query = translate(orf)
+                    options.debug("Aligning %33s v. %33s." % (qname, sname))
+                    alignment = align(subject.seq, query.seq)
+                    alignments.append((orf, sname, alignment))
+
+            for orf, refname, aln in alignments:
+                hitlen = aln['sublength']
+                region = orf[-3 * hitlen:]
+                identity = float(aln['identities']) / aln['length']
+                if identity >= max_match[0]:
+                    max_match = (identity, (region, sname, aln))
+
+            if max_match[1]:
+                seq, name, _ = max_match[1]
+                odl = subject.defline.split('[')[0].strip()
+                src = seq.original.name
+                start, end, strand = seq.start, seq.end, seq.step
+                defline = '%s[source=%s] [start=%d] [end=%d] [strand=%d]' % \
+                    (odl + (' ' if odl else ''), src, start, end, strand)
+
+                new = Sequence(name.strip(), seq.seq, defline=defline,
+                               original=seq.original, type=seq.type,
+                               start=seq.start, end=seq.end, step=seq.step)
+                qout.put(new)
+            qin.task_done()
+
+    def in_range(seq, start, end, frame):
+        ss, se = sorted((seq.start, seq.end))
+        os, oe = sorted((start, end))
+        frame = int(frame)
+
+        return (ss < oe and se > os and
+                (se % 3 == oe % 3 or ss % 3 == oe % 3) and
+                ((frame < 0 and seq.step < 0) or
+                 (frame > 0 and seq.step > 0)))
+
+    qout = queue.Queue()
+    qin = ThreadQueue(target)
+
+    blastopts = {
+        'evalue': options.MAX_EVALUE,
+        'num_threads': options.NUM_THREADS
+    }
+
+    for res in BLAST.run(db, sequences, **blastopts):
+        if float(res['expect']) > options.MAX_EVALUE:
+            continue
+
+        sbjl = len(subj[res['subject']['name']])
+        ident = float(res['identities'].split('(')[1][:-2]) / 100
+        lerr = float(res['subject']['length']) / sbjl
+
+        if ident >= options.MIN_IDENTITY:
+            if lerr >= (1.0 - options.LENGTH_ERR):
+                qin.put(res)
+    PIPING = False
+    options.debug("BLAST done.")
+
+    target()
+    qin.join()
+    options.debug("Done Aligning sequences.")
+
+    options.debug("Now writing sequences (%d)." % qout.qsize())
+    seqs = {}
+    nuc_file = io.open(wd + pref + '.fasta', 'w')
+    count = 0
+    while 1:
+        try:
+            seq = qout.get(False)
+            if seq.seq not in seqs:
+                seqs[seq.seq] = set()
+            seqs[seq.seq].add(seq)
+            nuc_file.write(seq)
+            count += 1
+            options.debug("Wrote %s (%d)." % (seq.name, count));
+        except queue.Empty:
+            break
+    nuc_file.close()
+    options.debug("Done Aligning sequences.")
+
+    gh = io.open(wd + pref + '.gff3', 'w')
+    names.append(wd + pref + '.fasta')
+
+    for id in seqs:
+        gh.write(ann(seqs[id].copy().pop(), pref, 'gene',
+                     homologs=','.join(s.name for s in seqs[id])))
+    gh.close()
+
+
+def run(subject, query, prefix, names):
+    GeneFromBLAST(subject, query, prefix, names)
+
+
+if __name__ == '__main__':
+    options.START_CODONS = ['TTG']
+    import sys
+    f = io.open(sys.argv[1], 'r')
+    for seq in f:
+        print(seq.name + ' ' + seq.defline)
+        for orf in ORFGenerator(seq):
+            print('%d ... %d' % (orf.start, orf.end))
diff --git a/src/analysis/renamer.py b/src/analysis/renamer.py
new file mode 100644
index 0000000..6064736
--- /dev/null
+++ b/src/analysis/renamer.py
@@ -0,0 +1,61 @@
+from __future__ import print_function
+from os import sep, rename as mv
+import biotools.IO as io
+import biotools.analysis.options as options
+
+try:
+    get_input = raw_input
+except:
+    get_input = input
+
+
+def rename(direc, db, files):
+    '''
+    This isn't really for bioinformatics, this is more for the pipeline, to
+    rename the files generated by cluster.py with a little human interaction.
+    '''
+
+    names = []
+    seqdb = dict((s.name, s) for s in io.open(db, 'r'))
+    nt_dir, aa_dir = direc + 'nt' + sep, direc + 'aa' + sep
+    for f in files:
+        seq = io.open(nt_dir + f, 'r').next()
+        ids = seq.defline.split(', ')
+        print("File\033[33;1m", f, \
+              "\033[0mis described by the following sequences:")
+        try:
+            for id in ids:
+                seqdb[id]
+                print("* " + seqdb[id].name + ': ' +
+                      seqdb[id].defline.split('[')[0])
+        except KeyError:
+            print("* (none)")
+            continue
+        pre = get_input("\033[33;1mWhat should we call this file " +
+                        "(or hit enter to skip)? \033[0m")
+        fpre = f[:f.find('.')]
+
+        if pre != "":
+            count = 0
+            while True:
+                rpre = pre + ((" (%d)" % count) if count > 0 else "")
+                try:
+                    fh = open(nt_dir + rpre + ".fasta", 'r')
+                    fh.close()
+                    count += 1
+                    continue
+                except IOError:
+                    nt_old, nt_new = nt_dir + fpre, nt_dir + rpre
+                    aa_old, aa_new = aa_dir + fpre, aa_dir + rpre
+                    print("Renaming " + fpre + ".* to " + rpre + ".*")
+                    try:
+                        mv(nt_old + ".fasta", nt_new + ".fasta")
+                        mv(aa_old + ".fasta", aa_new + ".fasta")
+                        mv(nt_old + ".clustalw", nt_new + ".clustalw")
+                        mv(aa_old + ".clustalw", aa_new + ".clustalw")
+                        names.append(nt_new + '.clustalw')
+                        names.append(aa_new + '.clustalw')
+                    except OSError:
+                        pass
+                    break
+    return names
diff --git a/src/analysis/report.py b/src/analysis/report.py
new file mode 100644
index 0000000..45d9654
--- /dev/null
+++ b/src/analysis/report.py
@@ -0,0 +1,89 @@
+import matplotlib.pyplot as plt
+import biotools.analysis.plot as bap
+from os import sep, mkdir
+
+
+# report areas of high conservation or variation
+def report(plotdata, **kwargs):
+    pass
+
+
+# wraps biotools.analysis.plot.plot()
+def plot(plotdata, directory, bottom=True, side=True, legend=True,
+         save=True, filename='untitled.pdf', upperbound=0.05, factor=21,
+         fig=plt.figure(None, facecolor='w', edgecolor='w'), **kwargs):
+
+    ranges = report(plotdata, **kwargs)
+
+    try:
+        mkdir(directory)
+    except OSError:
+        pass
+
+    lowerbound = -upperbound / 6
+
+    # smooth the data
+    snt = smoothed(snpdata['nt']['var'], factor)
+    lnt = len(snpdata['nt']['var'])
+    saa = smoothed(snpdata['aa']['var'], factor)
+    laa = len(snpdata['aa']['var'])
+
+    # generate x-ranges so that amino acids
+    # and nucleotides align
+    xnt = np.arange(lnt) * (0.0 + laa) / lnt + 1
+    xaa = np.arange(laa) + 1
+
+    # create the proper sized frame, depending on
+    # how we draw the plot
+    x = 0.09 if side else 0.02
+    y = 0.09 if bottom else 0.04
+
+    ax = fig.add_axes([x, y, 0.98 - x, 0.98 - y], xlim=[0, laa * 1.06],
+                      ylim=[lowerbound, upperbound])
+    ax.minorticks_on()
+    ax.tick_params(axis='x', which='minor', length=3)
+
+    for loc, spine in ax.spines.iteritems():
+        if loc in ['right', 'top']:
+            spine.set_color('none')
+            continue
+        if loc == 'bottom':
+            ax.xaxis.set_ticks_position('bottom')
+            if bottom:
+                ax.set_xlabel("Amino acids")
+            continue
+        if loc == 'left':
+            if side:
+                ax.set_ylabel("Sequence variance")
+                ax.yaxis.set_ticks_position('left')
+            else:
+                spine.set_color('none')
+                ax.tick_params('y', which='both', color='none',
+                               labelcolor='none')
+
+    ax.hlines(ax.get_yticks(), 0, laa * 1.06, color='0.75', linestyle='dashed')
+    ax.hlines(0, 0, laa * 1.06, color='k', linestyle='solid')
+
+    nt_lines = ax.plot(xnt, snt, color='#0000ff', linestyle='solid')
+    aa_lines = ax.plot(xaa, saa, color='#00ff00', linestyle='solid')
+
+    starts = snpdata['aa']['starts']
+    ends = snpdata['aa']['ends']
+    counts = snpdata['aa']['count']
+    scale = laa / max(ends)
+    ys = (np.arange(len(starts)) + 1) * lowerbound / 3
+
+    ax.hlines(ys, starts, ends, colors='k', lw=4, linestyle='solid')
+    for i, c in zip(xrange(len(counts)), counts):
+        ax.text(laa + 10, lowerbound / 3 * (i + 1.25), c)
+    if legend:
+        fig.legend((nt_lines, aa_lines), ('Nucleotide', 'Amino acid'),
+                   'upper right')
+
+    if save:
+        fig.savefig(directory + filename)
+
+    print '=============', filename, '============='
+    print 'Average variance: '
+    print '\t', sum(snpdata['nt']['var']) / lnt, 'per base pair'
+    print '\t', sum(snpdata['aa']['var']) / laa, 'per amino acid'
diff --git a/src/analysis/run.py b/src/analysis/run.py
new file mode 100644
index 0000000..4621ef0
--- /dev/null
+++ b/src/analysis/run.py
@@ -0,0 +1,57 @@
+#!/usr/bin/env python
+
+import biotools.analysis.predict as genepredict
+import biotools.analysis.options as options
+import threading
+from os import sep
+try:
+    import Queue as queue
+except ImportError:
+    import queue
+
+
+def run(infile, strains):
+    '''
+    Run several instances of `genepredict.run` at once.
+    '''
+
+    q = queue.Queue()
+    filenames = []
+
+    def run_predict():
+        while 1:
+            try:
+                strainf = q.get(False)
+            except queue.Empty:
+                break
+            strain = strainf.split(sep)[-1]
+            pos = strain.rfind('.')
+            if pos > 1 or (pos == 1 and strain[0] != '.'):
+                strain = strain[:pos]
+
+            options.debug("Predicting for %s." % strain)
+
+            try:
+                genepredict.run(infile, strainf, strain, filenames)
+            except RuntimeError:
+                pass
+            q.task_done()
+
+    for strain in strains:
+        q.put(strain)
+
+    for i in range(options.NUM_PROCESSES - 1):
+        curr = threading.Thread(target=run_predict)
+        curr.start()
+
+    run_predict()
+    q.join()
+
+    return filenames
+
+if __name__ == "__main__":
+    import sys
+    try:
+        run(sys.argv[1:])
+    except IndexError:
+        pass
diff --git a/src/analysis/variance.py b/src/analysis/variance.py
new file mode 100644
index 0000000..9632afd
--- /dev/null
+++ b/src/analysis/variance.py
@@ -0,0 +1,95 @@
+#!/usr/bin/env python
+
+import biotools.IO as io
+import biotools.analysis.options as options
+from biotools.translate import translate
+from os import sep
+
+
+def SaySNPs(input):
+    '''
+    Takes a clustalw alignment and will return a dictionary of data
+    relevent to plotting the sequence variance for the sequences in the
+    given clustalw alignment. These data are:
+    * `var`: the measure of sequence variation,
+    * `starts`: the starting positions for each gene model in amino acids,
+    * `ends`: the ending positions for each gene model in amino acids, and
+    * `count`: the number of sequences with a particular gene model.
+    The values given in `starts`, `ends`, and `counts` are sorted to that the 
+    nth element in starts corresponds to the nth value in ends and the nth 
+    value in counts.
+    '''
+
+    catalogue = []
+    lengths = {}
+    for seq in io.open(input, 'r'):
+        key = (seq.start, seq.end)
+        lengths[key] = lengths.get(key, 0) + 1
+        for (i, c) in zip(xrange(key[1] - key[0] + 1), seq):
+            if i >= len(catalogue):
+                catalogue.append({})
+            if c != " ":
+                catalogue[i][c] = catalogue[i].get(c, 0) + 1
+
+    calc = []
+    for s in catalogue:
+        tot = float(sum(s.values()))
+        cnt = float(len(s))
+        calc.append(1.0 - sum((s[c] / tot) ** 2 for c in s))
+    llist = sorted(list(lengths.keys()))
+
+    return {
+        'var': calc,
+        'starts': [s for s, e in llist],
+        'ends': [e for s, e in llist],
+        'count': [lengths[k] for k in llist]
+    }
+
+
+def var(files):
+    '''
+    Returns plot data and metadata for plotting later on in the pipeline.
+    '''
+    sort = {}
+    for f in files:
+        seqs = [s for s in io.open(f)]
+        type = set(s.type for s in seqs)
+        if len(type) > 1:
+            type = set(['prot'])
+        fid = (type.pop(), f)
+        seqs = [''.join(s.seq.split('-')).strip() for s in seqs]
+        seqs = [translate(s) if fid[0] == 'nucl' else s for s in seqs]
+        sset = frozenset(seqs)
+        srtr = (len(seqs), sset)
+        sort[srtr] = sort.get(srtr, set()) | set([fid])
+
+    couples = []
+    for partners in sort.values():
+        trim = lambda x: '.'.join(x.split('.')[:-1]) \
+                         if f.endswith('.clustalw') or \
+                            f.endswith('.clustal') or \
+                            f.endswith('.aln') else x
+        names = ', '.join(set(trim(f.split(sep)[-1]) for type, f in partners))
+        pair = {}
+        for type, f in partners:
+            if len(pair) == 2:
+                break
+            if type in pair:
+                continue
+            pair[type] = f
+        if 0 < len(pair) < 2:
+            raise TypeError("Unmatched clustal alignment(s): " + 
+                            ", ".join(f for type, f in partners))
+        if len(pair) == 0:
+          continue
+        couples.append((pair['nucl'], pair['prot'], names))
+
+    for nt, aa, strain in couples:
+        plotdata = {
+            'nt': SaySNPs(nt),
+            'aa': SaySNPs(aa)
+        }
+        metadata = {'strain': strain, 'filename': strain + '.pdf'}
+
+        yield {'plotdata': plotdata, 'metadata': metadata}
+    raise StopIteration
diff --git a/src/annotation.py b/src/annotation.py
new file mode 100644
index 0000000..aa950d0
--- /dev/null
+++ b/src/annotation.py
@@ -0,0 +1,112 @@
+'''
+This module is used to create annotation files (currently, only GFF files).
+The annotations can be used to create a heirarchy among the annotations (e.g.,
+genes contain exons, introns, ... etc.).
+'''
+
+
+class Annotation(object):
+    '''
+    An object to help with reading and writing GFF files.
+    '''
+    unknowns = 0
+
+    def __init__(self, ref, src, type, start, end, score, strand, phase,
+                 attr, name_token='ID', gff_token='='):
+        '''
+        Constructs an `Annotation` object with the necessary values. The
+        parameters are passed in the same order as the columns from a GFF
+        (version 3) file and the name_token and gff_token parameters are the
+        defaults for a gff version 3 file from phytozome. Just write (e.g.)
+
+        ```python
+        Annotation(*line.split('\\t'))  #(splitting on tabs)
+        ```
+
+        and the rest of the work will be done for you. Other sources may
+        require changes to `name_tokens` and `gff_token`.
+
+        Instantiating an `Annotation` will generate for it an id of the form
+        *SEQNAME*_*TYPE*[START:END], where *SEQNAME* is the name of the
+        sequence (column 1) from the GFF file, and type is like 'gene' or
+        'CDS'. If no *SEQNAME* is provided, then `X` be used in its place, and
+        if no identifier can be found in the attributes, the `Annotation` will
+        generate an identifier for itself in the form of `unknown #`.
+        '''
+
+        def parse_attrs(attr, keyvalsep='=', attrsep=';'):
+            '''
+            Creates a dictionary from the atrributes (9th column) of a gff
+            file. By default, key-value separator (`keyvalsep`) is `=`, which
+            is the separator used in gff version 3.
+
+            In other words, `attr` `"a=b;c=d;"` and `keyvalsep` `=` will
+            yield the dictionary `{'a':'b','c':'d'}`. The other separator
+            (`attrsep`) separates individual attributes and defaults to ';',
+            which is also the norm in GFF files.
+            '''
+
+            attributes = {}
+            if keyvalsep not in attr:
+                keyvalsep = ' '
+            l = len(keyvalsep)
+            attrs = [a.strip() for a in attr.strip().split(attrsep)]
+            for attribute in attrs:
+                pos = attribute.find(keyvalsep)
+                if pos > -1:
+                    var, val = attribute[:pos], attribute[pos + l:]
+                    attributes[var] = attributes.get(var, []) + [val]
+
+            for key in attributes:
+                attributes[key] = ','.join(attributes[key])
+            return attributes
+
+        start, end = int(start), int(end)
+        self.strand = strand
+        self.type = type
+        self.source = src
+        self.seq = ref
+        self.start = min(start, end)
+        self.end = max(end, start)
+        self.attr = parse_attrs(attr, gff_token)
+        self.phase = phase
+        self.score = score
+        self.ntoken = name_token
+        self.id = ((self.seq or 'X') + '_' + self.type +
+                   "[%d:%d]" % (self.start, self.end))
+        try:
+            self.name = self.attr[name_token]
+        except KeyError:
+            Annotation.unknowns += 1
+            self.name = "unknown %d" % Annotation.unknowns
+        self.parent = None
+        self.children = []
+
+    '''
+    Some things that you can do to `Annotation` objects:
+    * `len(annotation)` => length of the annotation (`end-start+1`)
+    * `dictionary[annotation]` => store annotations as keys of a dictionary or
+        as elements in a set
+    * `annA == annB` => compare two Annotations, they are the same if they have
+        the same id.
+    * `print annotation` => prints the annotation as a line of a GFF version 3
+        file.
+    '''
+
+    def __len__(self):
+        return max(self.start, self.end) - min(self.end, self.start) + 1
+
+    def __hash__(self):
+        return self.id.__hash__()
+
+    def __eq__(self, other):
+        try:
+            return self.id == other.id
+        except AttributeError:
+            return False
+
+    def __str__(self):
+        return '\t'.join((self.seq, self.source,
+                         self.type, str(self.start), str(self.end), self.score,
+                         self.strand, str(self.phase),
+                         ';'.join(k + '=' + self.attr[k] for k in self.attr)))
diff --git a/src/clustal.py b/src/clustal.py
new file mode 100644
index 0000000..18be815
--- /dev/null
+++ b/src/clustal.py
@@ -0,0 +1,53 @@
+#!/usr/bin/env python
+import biotools.IO as io
+import subprocess
+from os import remove
+
+
+def run(infile, outfile, **kwargs):
+    n = 0
+    for seq in io.open(infile, 'r'):
+        n += 1
+        if n > 1:
+            seqtype = seq.type
+            break
+
+    if n > 1:
+        cmd = "clustalw"
+
+        try:
+            ignore = open('/dev/null', 'w')
+        except IOError:
+            ignore = open('nul', 'w')
+
+        if seqtype == 'nucl':
+            defaults = {
+                'OUTORDER': 'ALIGNED',
+                'GAPOPEN': '10',
+                'GAPEXT': '0.1',
+                'DNAMATRIX': 'IUB'
+            }
+            others = ["-%s=%s" % (arg, kwargs.get(arg, defaults[arg]))
+                      for arg in set(name.upper() for name in kwargs) &
+                      set(defaults.keys())]
+            subprocess.call([cmd, "-INFILE=" + infile, "-ALIGN", "-TYPE=DNA",
+                            "-OUTFILE=" + outfile] + others, stdout=ignore)
+        else:
+            defaults = {
+                'OUTORDER': 'ALIGNED',
+                'GAPOPEN': '10',
+                'GAPEXT': '0.1',
+                'MATRIX': 'BLOSUM'
+            }
+            others = ["-%s=%s" % (arg, kwargs.get(arg, defaults[arg]))
+                      for arg in set(name.upper() for name in kwargs) &
+                      set(defaults.keys())]
+            subprocess.call([cmd, "-INFILE=" + infile, "-ALIGN",
+                            "-TYPE=PROTEIN", "-OUTFILE=" + outfile] + others,
+                            stdout=ignore)
+        pos = infile.rfind('.')
+        if pos > -1:
+            prefix = infile[:pos]
+        else:
+            prefix = infile
+        remove(prefix + '.dnd')
diff --git a/src/complement.py b/src/complement.py
new file mode 100644
index 0000000..7a9da09
--- /dev/null
+++ b/src/complement.py
@@ -0,0 +1,49 @@
+#!/usr/bin/env python
+from biotools.sequence import isprot
+
+_ref = {
+    'DNA': {
+        'A': 'T', 'T': 'A', 'a': 't', 't': 'a', 
+        'C': 'G', 'G': 'C', 'c': 'g', 'g': 'c',
+        'R': 'Y', 'Y': 'R', 'r': 'y', 'y': 'r',
+        ' ': ' ', '-': '-'
+    },
+    'RNA': {
+        'A': 'U', 'U': 'A', 'a': 'u', 'u': 'a',
+        'C': 'G', 'G': 'C', 'c': 'g', 'g': 'c',
+        'R': 'Y', 'Y': 'R', 'r': 'y', 'y': 'r',
+        ' ': ' ', '-': '-'
+    }
+}
+
+
+def complement(s):
+    '''
+    Creates the complement of a sequence, which can then be reversed by using
+    `seq[::-1]`, if it needs to be reversed. This function accepts either
+    `Sequence`s or strings.
+    '''
+
+    if isprot(s):
+        return s
+    has_u = ('U' in s or 'u' in s)
+    has_t = ('T' in s or 't' in s)
+    if has_u and not has_t:
+        repl = _ref['RNA']
+    elif has_t and not has_u:
+        repl = _ref['DNA']
+    else:
+        repl = _ref['DNA']
+    value = ''.join(repl.get(c, 'N') for c in s)
+    try:
+        return s.__class__("complement(%s)" % s.name, value,
+                           original=s.original, start=s.start,
+                           end=s.end, step=s.step, qual=s.qual)
+    except (AttributeError, TypeError):
+        return s.__class__(value)
+
+
+if __name__ == '__main__':
+    assert complement('ATCGTAGCTGATCGAT') == 'TAGCATCGACTAGCTA'
+    assert complement('AUCGUAGCUGAUCGAU') == 'UAGCAUCGACUAGCUA'
+    print(complement('AUCgu--cuGAUCGAU'))
diff --git a/src/sequence.py b/src/sequence.py
new file mode 100644
index 0000000..f4fcd0a
--- /dev/null
+++ b/src/sequence.py
@@ -0,0 +1,174 @@
+from biotools.annotation import Annotation
+
+
+def chop(seq, length=70):
+    '''
+    Yields a chunk of a sequence of no more than `length` characters,
+    it is meant to be used to print fasta files.
+    '''
+
+    while seq:
+        try:
+            piece, seq = seq[:length], seq[length:]
+        except IndexError:
+            piece, seq = seq, ''
+        yield piece
+    raise StopIteration()
+
+
+def isprot(seq, nucleotides='ATUCGNYRatucgnyr- '):
+    '''
+    Check whether the current sequence is a protein or nucleotide sequence.
+    '''
+
+    for c in seq:
+        if c not in nucleotides:
+            return True
+    else:
+        return False
+
+
+class Sequence(object):
+    '''
+    A wrapper class for sequences.
+    '''
+
+    def __init__(self, name, seq, **kwargs):
+        '''
+        Instantiates a Sequence object with a given sequence. Some other
+        useful parameters that the `Sequence` constructor can handle are:
+            * `qual` => the quality scores (an array of integers) of the
+                sequence,
+            * `type` => the type of the sequence, either prot or nucl,
+            * `start` => the starting position of the sequence within a
+                supersequence,
+            * `end` => the ending position of the sequnece within a
+                supersequence,
+            * `step` => the 'step' of the sequence, usually +1 for top-strand
+                sequences, and -1 for bottom-strand sequences, but can handle
+                other values as well,
+            * `original` => the original `Sequence` object from which this one
+                derives,
+            * `defline` => the definition line for this sequnce from a fasta
+                file.
+        If one of these are not given, they will default to the most logical
+        value that can be determined from the other values and sequence (e.g.,
+        if `end < start`, then `step` is probably -1).
+        '''
+
+        self.name = name
+        self.seq = seq
+        self.qual = kwargs.get('qual', None)
+        if 'type' in kwargs:
+            self.type = kwargs['type']
+        self.start = kwargs.get('start', 1)
+        self.end = kwargs.get('end', self.start - 1 + len(seq))
+        self.step = kwargs.get('step', -1 if self.start > self.end else 1)
+        self.original = kwargs.get('original', self)
+        self.defline = kwargs.get('defline', '')
+
+    def __getattr__(self, attr):
+        if attr == 'type':
+            self.type = 'prot' if isprot(self.seq) else 'nucl'
+            return self.type
+        raise AttributeError('%r object has no attribute %r' % 
+                             (self.__class__.__name__, attr))
+
+    def __getitem__(self, key):
+        '''
+        sequence[i] or sequence[start:end] or sequence[start:end:step]
+        constructs a new Sequence that is a subsequence as described by the
+        way this function is called. This function will automatically fill in
+        the name, sequence, start, stop, step, original, and type of the
+        subsequence. It also tries to fill in the annotations, but annotations
+        are handled pretty poorly right now, so it's probably best not to
+        worry about those, but it will work if you really want to.
+        '''
+
+        try:
+            start, stop, step = key.indices(len(self.seq))
+        except AttributeError:
+            start, stop, step = key, key + 1, 1
+
+        order = abs(self.step) / self.step
+        r = stop - (stop - start) % step - step
+        seq = ''.join(self.seq[x] for x in xrange(start, stop, step))
+        qual = self.qual and [self.qual[x] for x in xrange(start, stop, step)]
+        info = (self.name, start, stop, step)
+        self.type
+        return Sequence("subsequence(%s, %d, %d, %d)" % info, seq,
+                        qual=qual, original=self.original, type=self.type,
+                        start=self.start + start * order,
+                        end=self.start + r * order,
+                        step=step * self.step)
+
+    '''
+    Some other things you can do with a Sequence object:
+    * len(sequence) => gives the length of the sequence.
+    * for character in sequence: => allows you to loop over each character in
+        the sequence.
+    * dictionary[sequence] => allows sequences to be used as keys for
+        dictionaries and allows you to have sequences in sets. This relies on
+        the test seqA == seqB, described next.
+    * seqA == seqB => compare two sequences. The sequences are the same if
+        they have the same sequence AND name. Therefore, two sequences with
+        different names are treated as separate items in a set and separate
+        keys in a dictionary. If you need to match only the sequence, use
+        seqA.seq == seqB.seq.
+    * print sequence => print a fasta / fastq (depending on whether there are
+        any quality scores) representation of the sequence. Sequence objects
+        in any other data structure (e.g., list, dictionary) are printed as
+        (e.g., <Sequence 0x000000>). If you want to change that, you can do:
+            def __repr__(self):
+                return self.__str__()
+    '''
+
+    def upper(self):
+        return Sequence(self.name, self.seq.upper(), type=self.type,
+                        qual=self.qual, original=self.original,
+                        defline=self.defline, start=self.start, step=self.step,
+                        end=self.end)
+
+    def __iter__(self):
+        for c in self.seq:
+            yield c
+        raise StopIteration()
+
+    def __len__(self):
+        return len(self.seq)
+
+    def __hash__(self):
+        return hash(self.seq)
+
+    def __eq__(self, other):
+        try:
+            return self.seq == other.seq and self.name == other.name
+        except AttributeError:
+            return (self.seq == other)
+
+    def __str__(self):
+        if self.qual:
+            return '@%s\n%s\n+\n%s' % (self.name, self.seq,
+                                       ''.join(chr(ord('A') - 1 + q)
+                                               for q in self.qual))
+        else:
+            return '>%s %s\n%s' % (self.name, self.defline,
+                                   '\n'.join(chop(self.seq, 70)))
+
+
+def annotation(seq, source, type, **kwargs):
+    '''
+    Creates an `Annotation` object for the given sequence from a source
+    (e.g., "phytozome7.0") of a particular type (e.g., "gene").
+    '''
+    try:
+        sname = source.name
+    except AttributeError:
+        sname = source
+
+    start, end = min(seq.start, seq.end), max(seq.start, seq.end)
+    strand = '+' if seq.step == 1 else '-'
+    attrs = ';'.join('%s=%s' % (key, str(kwargs[key])) for key in kwargs)
+
+    return Annotation(seq.original.name, sname, type, start, end, '.',
+                      strand, start % 3, attrs)
diff --git a/src/translate.py b/src/translate.py
new file mode 100644
index 0000000..4da0062
--- /dev/null
+++ b/src/translate.py
@@ -0,0 +1,57 @@
+_gencode = {
+    'ATA': 'I', 'ATC': 'I', 'ATT': 'I', 'ATG': 'M',
+    'AUA': 'I', 'AUC': 'I', 'AUU': 'I', 'AUG': 'M',
+    'ACA': 'T', 'ACC': 'T', 'ACG': 'T', 'ACT': 'T',
+    'ACA': 'T', 'ACC': 'T', 'ACG': 'T', 'ACU': 'T',
+    'AAC': 'N', 'AAT': 'N', 'AAA': 'K', 'AAG': 'K',
+    'AAC': 'N', 'AAU': 'N', 'AAA': 'K', 'AAG': 'K',
+    'AGC': 'S', 'AGT': 'S', 'AGA': 'R', 'AGG': 'R',
+    'AGC': 'S', 'AGU': 'S', 'AGA': 'R', 'AGG': 'R',
+    'CTA': 'L', 'CTC': 'L', 'CTG': 'L', 'CTT': 'L',
+    'CUA': 'L', 'CUC': 'L', 'CUG': 'L', 'CUU': 'L',
+    'CCA': 'P', 'CCC': 'P', 'CCG': 'P', 'CCT': 'P',
+    'CCA': 'P', 'CCC': 'P', 'CCG': 'P', 'CCU': 'P',
+    'CAC': 'H', 'CAT': 'H', 'CAA': 'Q', 'CAG': 'Q',
+    'CAC': 'H', 'CAU': 'H', 'CAA': 'Q', 'CAG': 'Q',
+    'CGA': 'R', 'CGC': 'R', 'CGG': 'R', 'CGT': 'R',
+    'CGA': 'R', 'CGC': 'R', 'CGG': 'R', 'CGU': 'R',
+    'GTA': 'V', 'GTC': 'V', 'GTG': 'V', 'GTT': 'V',
+    'GUA': 'V', 'GUC': 'V', 'GUG': 'V', 'GUU': 'V',
+    'GCA': 'A', 'GCC': 'A', 'GCG': 'A', 'GCT': 'A',
+    'GCA': 'A', 'GCC': 'A', 'GCG': 'A', 'GCU': 'A',
+    'GAC': 'D', 'GAT': 'D', 'GAA': 'E', 'GAG': 'E',
+    'GAC': 'D', 'GAU': 'D', 'GAA': 'E', 'GAG': 'E',
+    'GGA': 'G', 'GGC': 'G', 'GGG': 'G', 'GGT': 'G',
+    'GGA': 'G', 'GGC': 'G', 'GGG': 'G', 'GGU': 'G',
+    'TCA': 'S', 'TCC': 'S', 'TCG': 'S', 'TCT': 'S',
+    'UCA': 'S', 'UCC': 'S', 'UCG': 'S', 'UCU': 'S',
+    'TTC': 'F', 'TTT': 'F', 'TTA': 'L', 'TTG': 'L',
+    'UUC': 'F', 'UUU': 'F', 'UUA': 'L', 'UUG': 'L',
+    'TAC': 'Y', 'TAT': 'Y', 'TAA': '*', 'TAG': '*',
+    'UAC': 'Y', 'UAU': 'Y', 'UAA': '*', 'UAG': '*',
+    'TGC': 'C', 'TGT': 'C', 'TGA': '*', 'TGG': 'W',
+    'UGC': 'C', 'UGU': 'C', 'UGA': '*', 'UGG': 'W'}
+
+
+def translate(sequence):
+    '''
+    Translate a nucleotide using the standard genetic code. The sequence
+    parameter can be either a string or a `Sequence` object. Stop codons are
+    denoted with an asterisk (*).
+    '''
+
+    try:
+        value = ''.join(_gencode.get(sequence.seq[i:i + 3].upper(), 'X')
+                        for i in xrange(0, int(len(sequence) / 3) * 3, 3))
+        return sequence.__class__("translate(%s)" % sequence.name, value,
+                                  original=sequence.original, type='prot',
+                                  defline=sequence.defline)
+    except AttributeError:
+        value = ''.join(_gencode.get(sequence[i:i + 3].upper(), 'X')
+                        for i in xrange(0, int(len(sequence) / 3) * 3, 3))
+        return sequence.__class__(value)
+
+
+if __name__ == '__main__':
+    import sys
+    print(translate(sys.argv[1]))

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/python-biotools.git



More information about the debian-med-commit mailing list