[med-svn] [python-biotools] 01/02: New upstream version 1.2.12
Andreas Tille
tille at debian.org
Thu Mar 2 12:01:26 UTC 2017
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository python-biotools.
commit a79b3c093765aea079251badc88b9621864f6a1c
Author: Andreas Tille <tille at debian.org>
Date: Thu Mar 2 12:43:00 2017 +0100
New upstream version 1.2.12
---
PKG-INFO | 21 ++++
prok-geneseek | 102 ++++++++++++++++
setup.py | 30 +++++
src/BLAST.py | 310 +++++++++++++++++++++++++++++++++++++++++++++++
src/IO/__init__.py | 165 +++++++++++++++++++++++++
src/IO/clustal.py | 55 +++++++++
src/IO/fasta.py | 64 ++++++++++
src/IO/fastq.py | 72 +++++++++++
src/IO/gff.py | 28 +++++
src/IO/manager.py | 77 ++++++++++++
src/IO/phylip.py | 57 +++++++++
src/__init__.py | 3 +
src/align.py | 165 +++++++++++++++++++++++++
src/analysis/__init__.py | 0
src/analysis/cluster.py | 116 ++++++++++++++++++
src/analysis/loaddata.py | 176 +++++++++++++++++++++++++++
src/analysis/options.py | 168 +++++++++++++++++++++++++
src/analysis/plot.py | 122 +++++++++++++++++++
src/analysis/predict.py | 230 +++++++++++++++++++++++++++++++++++
src/analysis/renamer.py | 61 ++++++++++
src/analysis/report.py | 89 ++++++++++++++
src/analysis/run.py | 57 +++++++++
src/analysis/variance.py | 95 +++++++++++++++
src/annotation.py | 112 +++++++++++++++++
src/clustal.py | 53 ++++++++
src/complement.py | 49 ++++++++
src/sequence.py | 174 ++++++++++++++++++++++++++
src/translate.py | 57 +++++++++
28 files changed, 2708 insertions(+)
diff --git a/PKG-INFO b/PKG-INFO
new file mode 100644
index 0000000..d39577a
--- /dev/null
+++ b/PKG-INFO
@@ -0,0 +1,21 @@
+Metadata-Version: 1.1
+Name: biotools
+Version: 1.2.12
+Summary: A bunch of bioinformatics utilities.
+Home-page: https://github.com/sonwell/biotools
+Author: Andrew Kassen
+Author-email: atkassen at gmail.com
+License: UNKNOWN
+Description: Accompanies Bart, Rebecca, *et al.* High-throughput genomic sequencing of Cassava Bacterial Blight strains identifies conserved effectors to target for durable resistance. *PNAS Plus*.
+
+ Currently depends on `clustalw <ftp://ftp.ebi.ac.uk/pub/software/clustalw2/2.1/>`_ and `BLAST <ftp://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/>`_.
+
+ You can grab the most current code from `github <https://github.com/sonwell/biotools>`_ or via git
+ git clone git://github.com/sonwell/biotools.git
+Keywords: gene prediction,prokaryotes,effectors
+Platform: UNKNOWN
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Science/Research
+Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
+Requires: numpy
+Requires: matplotlib
diff --git a/prok-geneseek b/prok-geneseek
new file mode 100755
index 0000000..ed42dca
--- /dev/null
+++ b/prok-geneseek
@@ -0,0 +1,102 @@
+#!/usr/bin/env python
+
+# 1. predict
+# 2. cluster
+# 3. rename
+# 4. snps
+
+import biotools.analysis.run as pr
+import biotools.analysis.cluster as cw
+import biotools.analysis.renamer as mv
+import biotools.analysis.variance as cs
+import biotools.analysis.options as op
+import biotools.analysis.loaddata as ld
+import sys
+import os
+
+
+if __name__ == "__main__":
+ op.parse(sys.argv[1:])
+ if len(op.args) < 2:
+ op.help()
+ exit(0)
+
+ direc = op.DIRECTORY
+ plotter = op.PLOTTER
+ database = op.args[0]
+ names = op.args[1:]
+ sep = os.sep
+
+ predict = op.predicting
+ cluster = op.clustering
+ rename = op.renaming
+ plot = op.plotting
+ report = op.reporting
+ calculate = op.calculating
+
+ rename = rename and (cluster or not predict)
+ calculate = calculate and (cluster or not predict)
+ plot = plot and (calculate or not (predict or cluster))
+ report = report and calculate
+
+ if not (predict or cluster or rename or calculate or plot or report):
+ print "I don't know what you want, I give up."
+ exit(0)
+
+ if predict:
+ names = pr.run(database, names)
+ print "Homologous sequences written to " + direc + 'sequences' + sep
+
+ if cluster:
+ names = cw.run(direc + 'clusters' + sep, names)
+ print "Clustalw files written to " + direc + "clusters" + sep
+
+ if rename:
+ names = mv.rename(direc + 'clusters' + sep, database, names)
+
+ if plot and plotter.lower() != 'none':
+ try:
+ cv = __import__(plotter, globals(), locals(), ['plot'], -1)
+ except ImportError:
+ try:
+ if not plotter.endswith('.py'):
+ plotter += '.py'
+ open(plotter, 'r')
+ except:
+ plot = False
+ else:
+ p = plotter.rfind(sep)
+ if p > -1:
+ sys.path.append(plotter[:p])
+ plotter = plotter[p + len(sep):]
+
+ try:
+ cv = __import__(plotter, globals(), locals(), ['plot'], -1)
+ except ImportError:
+ plot = False
+
+ if calculate:
+ gen = cs.var(names)
+ elif plot:
+ gen = (ld.parse(s) for s in names)
+ else:
+ gen = []
+ for entry in gen:
+ plotdata = entry['plotdata']
+ metadata = entry['metadata']
+ if plot:
+ try:
+ cv.plot(plotdata, direc + 'plots' + sep,
+ filename=metadata['filename'])
+ except Exception as e:
+ print metadata['filename'], e
+ if report:
+ try:
+ os.mkdir(direc + 'data' + sep)
+ except:
+ pass
+
+ fh = open(direc + 'data' + sep + metadata['strain'] + '.py', 'w')
+ fh.write('plotdata = ' + repr(plotdata) + '\n')
+ fh.write('metadata = ' + repr(metadata) + '\n')
+ print "Done"
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..b20bad0
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,30 @@
+#!/usr/bin/env python
+
+from distutils.core import setup
+
+setup(name="biotools",
+ version='1.2.12',
+ description="A bunch of bioinformatics utilities.",
+ long_description="""Accompanies Bart, Rebecca, *et al.* High-throughput genomic sequencing of Cassava Bacterial Blight strains identifies conserved effectors to target for durable resistance. *PNAS Plus*.
+
+Currently depends on `clustalw <ftp://ftp.ebi.ac.uk/pub/software/clustalw2/2.1/>`_ and `BLAST <ftp://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/>`_.
+
+You can grab the most current code from `github <https://github.com/sonwell/biotools>`_ or via git
+ git clone git://github.com/sonwell/biotools.git""",
+ author="Andrew Kassen",
+ maintainer="Andrew Kassen",
+ author_email="atkassen at gmail.com",
+ maintainer_email="atkassen at gmail.com",
+ url="https://github.com/sonwell/biotools",
+ requires=['numpy','matplotlib'],
+ packages=['biotools', 'biotools.analysis', 'biotools.IO'],
+ package_dir={'biotools': 'src',
+ 'biotools.analysis': 'src/analysis',
+ 'biotools.IO': 'src/IO'},
+ scripts=['prok-geneseek'],
+ keywords='gene prediction, prokaryotes, effectors',
+ classifiers=[
+ 'Development Status :: 4 - Beta',
+ 'Intended Audience :: Science/Research',
+ 'Topic :: Scientific/Engineering :: Bio-Informatics'
+ ])
diff --git a/src/BLAST.py b/src/BLAST.py
new file mode 100644
index 0000000..6b68737
--- /dev/null
+++ b/src/BLAST.py
@@ -0,0 +1,310 @@
+#!/usr/bin/env python
+
+'''
+A module to manage BLAST databases and interface with the BLAST+ standalone
+program available from NCBI.
+'''
+
+import biotools.IO as io
+import subprocess
+from os import sep, getenv, listdir
+import shutil
+
+
+def run(db, sfile, mega_blast=False, **kwargs):
+ '''
+ Takes a database and a query and runs the appropriate type of BLAST on
+ them. The database can be an existing BLAST database or a fasta/fastq
+ file. If it is a sequence file, this function will look in the places
+ where BLAST would look for an existing database created from that file and
+ use that instead. If there is no such database, this function will make
+ one for you and then use the newly created database with BLAST.
+
+ Optional named arguments can currently only be `evalue`, `num_threads`,
+ `gapopen`, or `gapextend`. The correspond to the BLAST options of the same
+ name.
+ '''
+
+ cmds = {
+ 'prot': {
+ 'prot': 'blastp',
+ 'nucl': 'tblastn'
+ },
+ 'nucl': {
+ 'nucl': 'blastn',
+ 'prot': 'blastx'
+ }
+ }
+
+ seq = io.open(sfile, 'r').next()
+ qtype = seq.type
+
+ rcloc = ''
+ for loc in (".:~:" + (getenv("NCBI") or "")).split(':'):
+ if loc and loc[-1] == sep:
+ loc += sep
+ try:
+ for line in (l.strip() for l in open(loc + '.ncbirc', 'r')):
+ pos = line.find('=')
+ if pos >= 0 and line[:pos].strip() == "BLASTDB":
+ rcloc = line[pos + 1:].strip()
+ except IOError:
+ pass
+
+ dbtype = None
+ bdbenv = getenv("BLASTDB")
+ dblocations = (":." + ((':' + bdbenv) if bdbenv else '') +
+ ((':' + rcloc) if rcloc else '')).split(':')
+ for loc in dblocations:
+ if loc and loc[-1] != sep:
+ loc += sep
+ try:
+ open(loc + db + '.pin', 'r')
+ dbtype = 'prot'
+ break
+ except IOError:
+ try:
+ open(loc + db + '.nin', 'r')
+ dbtype = 'nucl'
+ break
+ except IOError:
+ pass
+
+ if not dbtype:
+ odb = db
+ pos = db.rfind(".")
+ for seq in io.open(db, 'r'):
+ dbtype = seq.type
+ break
+ if not dbtype:
+ raise IOError("Database not found: " + odb)
+
+ ndb = None
+ sp = db.rfind(sep)
+ if sp > -1:
+ dbdir, db = db[:sp], db[sp + 1:pos]
+ else:
+ dbdir, db = '.', db[:pos]
+
+ for file in listdir(dbdir):
+ dpos = file.rfind('.')
+ if dpos >= 0 and file[dpos + 1:] == dbtype[0] + 'in':
+ fh = open(dbdir + sep + file, 'r')
+ c = ord(fh.read(12)[-1])
+ fname = fh.read(c)
+ if fname[0] in ("'", '"'):
+ fname = fname[1:-1]
+ if fname.endswith(odb):
+ ndb = dbdir + sep + file[:dpos]
+ break
+ if not ndb:
+ ndb = '_'.join(db.split())
+ try:
+ ignore = open('/dev/null', 'w')
+ except IOError:
+ ignore = open('nul', 'w')
+
+ try: # possible race condition
+ open(ndb, 'r').close()
+ except IOError:
+ subprocess.call(["makeblastdb", "-in", '"%s"' % odb,
+ "-out", ndb, "-dbtype", dbtype],
+ stdout=ignore)
+ try:
+ for suff in ['in', 'hr', 'sq']:
+ name = ndb + '.' + dbtype[0] + suff
+ shutil.move(name, dbdir + sep + name)
+ except shutil.Error:
+ pass
+ db = dbdir + sep + ndb
+ else:
+ db = ndb
+ else:
+ raise IOError("Database not found: " + db)
+ allowed = set(["evalue", "gapopen", "gapextend", "num_threads"]) & \
+ set(kwargs.keys())
+ cmd = cmds[qtype][dbtype]
+ pn = ["-db", "-query"]
+ if mega_blast:
+ cmd = "megablast"
+ pn = ["-d", "-i"]
+ allowed = ["e", "a"]
+ args = [cmd, pn[0], db, pn[1], sfile] + [arg for pair in
+ [["-" + k, str(kwargs[k])] for k in allowed] for arg in pair]
+ proc = subprocess.Popen(args, bufsize=1, stdout=subprocess.PIPE)
+ return Result(iter(proc.stdout.readline, ''))
+
+
+class Result(object):
+
+ '''
+ A class which take the raw output from BLAST and generates dictionaries
+ from the data from BLAST. This data includes the alignment, percent
+ identity, gaps, e-value, score, length of subject, length of query, and
+ start and stop positions for both sequences. This class should be used in
+ a for loop like so:
+
+ ```python
+ for res in Result(file_or_data):
+ pass
+ ```
+
+ The class instance has a single other property, `headers`, which are the
+ lines in BLAST results before the BLAST hits (e.g., citation info, etc.).
+ '''
+
+ def __init__(self, file):
+ self.file = file
+ self.headers = []
+
+ def __iter__(self):
+ try:
+ ipt = open(self.file, 'r')
+ except (IOError, TypeError):
+ try:
+ ipt = self.file.split('\n')
+ except:
+ ipt = self.file
+ mode = 0
+ headers = []
+ curr = None
+ length = 0
+
+ def sh(sn, qn, l):
+ qdl = ''
+ space = qn.find(' ')
+ if space > -1:
+ qn, qdl = qn[:space], qn[space + 1:].lstrip()
+ return {
+ 'subject': {
+ 'name': sn.lstrip(),
+ 'defline': '',
+ 'start': None,
+ 'end': None,
+ 'sequence': ''
+ },
+ 'query': {
+ 'name': qn,
+ 'defline': qdl,
+ 'start': None,
+ 'end': None,
+ 'sequence': ''
+ },
+ 'length': l
+ }
+
+ def ra(sh):
+ for res in ('subject', 'query'):
+ sh[res]['start'] = int(sh[res]['start'])
+ sh[res]['end'] = int(sh[res]['end'])
+ sh[res]['length'] = abs(sh[res]['end'] - sh[res]['start'] + 1)
+ return sh
+
+ def sh_fmt(l):
+ for pairs in (a.strip() for a in l.split(',')):
+ l, r = tuple(a.strip() for a in (pairs.split('=')[:2]
+ if '=' in pairs else pairs.split(':')[:2]))
+ subheaders[l.lower().split('(')[0]] = r
+
+ for line in ipt:
+ line = line.rstrip('\n').lstrip()
+ if not line:
+ if mode == 4:
+ mode = 5
+ continue
+
+ if mode == 0:
+ if line[:6] == 'Query=':
+ mode = 1
+ qname = line[6:].lstrip()
+ self.headers = headers
+ else:
+ headers.append(line)
+
+ elif mode == 1:
+ if line[0] == '>':
+ mode = 3
+ subheaders = sh(line[1:], qname, length)
+ elif line[:7] == 'Length=':
+ length = int(''.join(line[7:].strip().split(',')))
+ mode = 2
+ elif line[0] == '(' and line.endswith('letters)'):
+ length = int(''.join(line[1:-8].strip().split(',')))
+ mode = 2
+ elif line[:6] == 'Query=':
+ qname = line[6:].lstrip()
+ else:
+ qname += line
+
+ elif mode == 2:
+ if line[0] == '>':
+ mode = 3
+ subheaders = sh(line[1:], qname, length)
+ elif line[:6] == 'Query=':
+ qname = line[6:].lstrip()
+ mode = 1
+
+ elif mode == 3:
+ if line[:5] == 'Score':
+ snm = subheaders['subject']['name']
+ defline = ''
+ space = snm.find(' ')
+ if space > -1:
+ snm, defline = snm[:space], snm[space + 1:]
+ subheaders['subject']['name'] = snm
+ subheaders['subject']['defline'] = defline
+ sh_fmt(line)
+ mode = 4
+ elif line[:7] == 'Length=':
+ pass
+ elif line[0] == '(' and line.endswith('letters)'):
+ pass
+ else:
+ subheaders['subject']['name'] += line
+
+ elif mode == 4:
+ sh_fmt(line)
+
+ elif mode == 5:
+ if line[:6] == 'Query=':
+ mode = 1
+ qname = line[6:].lstrip()
+ yield ra(subheaders)
+ continue
+ elif line[0] == '>':
+ yield ra(subheaders)
+ subheaders = sh(line[1:], qname, length)
+ mode = 3
+ continue
+ elif line[:5] == 'Score':
+ yield ra(subheaders)
+ subheaders = sh(subheaders['subject']['name'], qname,
+ length)
+ sh_fmt(line)
+ mode = 4
+ continue
+ elif line[:5] == 'Sbjct':
+ curr = 'subject'
+ elif line[:5] == 'Query':
+ curr = 'query'
+ else:
+ continue
+
+ _, start, seq, end = line.split()
+ subheaders[curr]['start'] = subheaders[curr]['start'] or start
+ subheaders[curr]['end'] = end
+ subheaders[curr]['sequence'] += seq
+
+ try:
+ yield ra(subheaders)
+ except UnboundLocalError:
+ pass
+ raise StopIteration()
+
+if __name__ == '__main__':
+ import sys
+
+ if len(sys.argv) > 1:
+ output = open(sys.argv[1]).read()
+ for result in Result(output):
+ print(result)
diff --git a/src/IO/__init__.py b/src/IO/__init__.py
new file mode 100644
index 0000000..39ca342
--- /dev/null
+++ b/src/IO/__init__.py
@@ -0,0 +1,165 @@
+'''
+A module for reading and writing to sequence and annotation files. Currently
+supported file types are: FASTA, FASTQ, CLUSTAL alignments, and GFF3 files.
+'''
+
+from biotools.IO import fasta, fastq, gff, clustal
+from biotools.IO.manager import IOManager
+try:
+ import __builtin__
+except ImportError:
+ import builtins as __builtin__
+
+
+def get_methods():
+ methods = {}
+ nil = lambda *x: iter([])
+ for module in [fasta, fastq, gff, clustal]:
+ modname = module.__name__.split('.')[-1]
+ methods[modname] = {}
+ for method in ['read', 'write', 'rhook', 'whook', 'probe']:
+ methods[modname][method] = module.__dict__.get(method, nil)
+ return methods
+
+
+class IOBase(object):
+ '''
+ Generic IO class for sequence files.
+ '''
+ methods = IOManager(get_methods())
+
+ def __init__(self, name, mode):
+ '''
+ Opens file name with mode mode. This function will attempt to guess at
+ the filetype by 1. looking at the file extension and failing that,
+ will 2. read the first few lines to determine the file type.
+
+ Recoginized file extensions include fa, fsa, fas, fasta, fastq,
+ clustalw, clustal, aln.
+ '''
+
+ self.file = name
+ self.handle = __builtin__.open(name, mode)
+ self.method = self.methods.default
+ self.type = None
+
+ self.suffixes = {
+ 'fsa': 'fasta',
+ 'fa': 'fasta',
+ 'fs': 'fasta',
+ 'fas': 'fasta',
+ 'fna': 'fasta',
+ 'fasta': 'fasta',
+ 'clustalw': 'clustal',
+ 'clustal': 'clustal',
+ 'aln': 'clustal',
+ 'fastq': 'fastq',
+ 'gff': 'gff',
+ 'gff3': 'gff'
+ }
+
+ p = name.rfind('.')
+ if p > -1:
+ ext = name[p + 1:]
+ if ext in self.suffixes:
+ try:
+ self.format(self.suffixes[ext])
+ return
+ except ValueError:
+ pass
+ try:
+ for method in IOBase.methods:
+ try:
+ self.format(method)
+ return
+ except ValueError:
+ pass
+ except IOError:
+ raise
+
+ def format(self, fmt):
+ '''
+ Forces a file to be parsed as a particular format. By default, the
+ values for fmt can be any recognized format.
+ '''
+ if fmt in self.methods:
+ method = self.methods[fmt]
+ ret = method['probe'](__builtin__.open(self.file, 'r'))
+ if ret:
+ self.method = method
+ for key in ret:
+ object.__setattr__(self, key, ret[key])
+ return self
+ else:
+ raise ValueError("File cannot be parsed as type %s." % fmt)
+ self.method = self.methods.default
+ return self
+
+ def close(self):
+ '''
+ Close the file handle.
+ '''
+ self.handle.close()
+
+
+class Reader(IOBase):
+ '''
+ A class that wraps IOBase and restricts the ability to write.
+ '''
+ def __init__(self, filename, mode='r'):
+ IOBase.__init__(self, filename, mode)
+ self.method['rhook'](self.handle)
+ self.iter = self.method['read'](self.handle)
+
+ def read(self, n=None):
+ '''
+ If `n` is provided, the next (up to) `n` entries are parsed and
+ returned. Otherwise, all remaining entries are parsed and returned.
+ '''
+ if n is None:
+ return [s for s in self]
+ return [self.iter.next() for i in xrange(int(n))]
+
+ def __iter__(self):
+ return self.iter
+
+ def next(self):
+ '''
+ Reads a single entry in the file and returns it.
+ '''
+ try:
+ return self.read(1)[0]
+ except (StopIteration, ValueError, IndexError):
+ raise StopIteration()
+
+
+class Writer(IOBase):
+ '''
+ A class that wraps IOBase and restricts the ability to read.
+ '''
+
+ def __init__(self, filename, mode='w'):
+ IOBase.__init__(self, filename, mode)
+ self.haswritten = False
+
+ def write(self, sequence):
+ '''
+ Writes sequence as the correct format to the file.
+ '''
+ if not self.haswritten:
+ self.method['whook'](self.handle)
+ self.haswritten = True
+ self.method['write'](self.handle, sequence)
+
+
+def open(filename, mode='r'):
+ '''
+ Open a file for parsing or creation. Returns either a Reader or Writer
+ object, depending on the open mode.
+ '''
+ if mode == 'r':
+ return Reader(filename)
+ if mode == 'w':
+ return Writer(filename, mode='w')
+ if mode == 'a':
+ return Writer(filename, mode='a')
diff --git a/src/IO/clustal.py b/src/IO/clustal.py
new file mode 100644
index 0000000..40fad12
--- /dev/null
+++ b/src/IO/clustal.py
@@ -0,0 +1,55 @@
+'''
+Methods for manipulating clustalw alignment files.
+'''
+
+from biotools.sequence import Sequence
+
+
+def read(fh):
+ def clean_alignment(x):
+ i = 0
+ for c in x:
+ if c != "-":
+ break
+ i += 1
+ j = 0
+ for c in reversed(x):
+ if c != "-":
+ break
+ j += 1
+ return (' ' * i + x[(i or None):(-j or None)] + ' ' * j, i, len(x) - j)
+
+ seqs = {}
+ for line in fh:
+ if line.startswith(' '):
+ continue
+ st = line.strip()
+ if st:
+ bits = st.split()
+ if len(bits) != 2:
+ continue
+ if bits[0] not in seqs:
+ seqs[bits[0]] = ''
+ seqs[bits[0]] += bits[1]
+
+ for k in seqs:
+ seq, start, end = clean_alignment(seqs[k])
+ yield Sequence(k, seq, start=start, end=end)
+ raise StopIteration()
+
+
+def probe(fh):
+ for line in fh:
+ st = line.strip()
+ if st:
+ if st.startswith('CLUSTAL'):
+ return {'type': 'clustalw'}
+ return False
+ return {'type': 'clustalw'}
+
+
+def rhook(fh):
+ try:
+ fh.next()
+ except StopIteration:
+ pass
diff --git a/src/IO/fasta.py b/src/IO/fasta.py
new file mode 100644
index 0000000..e10982e
--- /dev/null
+++ b/src/IO/fasta.py
@@ -0,0 +1,64 @@
+'''
+Functions for manipulating FASTA files.
+'''
+
+from biotools.sequence import Sequence, chop
+
+
+def read(fh):
+ '''
+ Read sequences in FASTA format; identifiers (names and definition lines)
+ are on lines that begin with carets ('>') and sequence is on lines that
+ intervene between the carets. This function is a generator that yields
+ `Sequence` objects.
+ '''
+ name, defline, seq = '', '', ''
+ for line in fh:
+ line = line.strip()
+ if not line:
+ continue
+ if line[0] == '>':
+ if name or seq:
+ yield Sequence(name, seq, defline=defline)
+ seq = ''
+ name = line[1:].split()[0]
+ defline = line[1 + len(name):].strip()
+ continue
+ seq += line
+ if name or seq:
+ yield Sequence(name, seq, defline=defline)
+ raise StopIteration()
+
+
+def write(fh, s):
+ '''
+ Write sequences in FASTA format, i.e.,
+
+ ```
+ >name defline
+ sequence ...
+ ```
+
+ Sequences are wrapped to 70 characters by default.
+ '''
+ fh.write('>%s %s\n' % (s.name, s.defline) +
+ '\n'.join(chop(s.seq, 70)) + '\n')
+
+
+def probe(fh):
+ '''
+ Probe a file to determine whether or not it is a FASTA file. That is,
+ the first non-empty line should begin with a caret ('>'). If no caret is
+ found on the first line, then we conclude that it is not a FASTA file
+ and return False, otherwise, we return a dictionary with information
+ relevant to the FASTA file type.
+ '''
+ for line in fh:
+ st = line.strip()
+ if st:
+ fh.close()
+ if st[0] == '>':
+ return {'type': 'fasta'}
+ return False
+ fh.close()
+ return {'type': 'fasta'}
diff --git a/src/IO/fastq.py b/src/IO/fastq.py
new file mode 100644
index 0000000..2b0729c
--- /dev/null
+++ b/src/IO/fastq.py
@@ -0,0 +1,72 @@
+'''
+Functions for manipulating FASTQ files.
+'''
+
+from biotools.sequence import Sequence
+
+
+def read(fh):
+ '''
+ Read sequences in FASTQ format; identifiers are on lines that begin with
+ at symbols ('@'), sequence follows on the next line, then a line that
+ begins and sequence is with a plus sign ('+') and finally the quality
+ scores on the subsequent line. Quality scores are encoded in Phred format,
+ the type of which (either 32 or 64) is determined when the file is probed
+ for opening. The scores are decoded into a list of integers. This function
+ is a generator that yields `Sequence` objects.
+ '''
+ while 1:
+ try:
+ line = fh.next().strip()
+ if line[0] == '@':
+ name = line[1:].split()[0]
+ defline = line[1 + len(name):].strip()
+ seq = f.next().strip()
+ fh.next()
+ qual = [ord(c) - self.phred for c in fh.next().strip()]
+ yield Sequence(name, seq, qual=qual, defline=defline)
+ except StopIteration:
+ raise
+ finally:
+ fh.close()
+
+
+def write(fh, s):
+ '''
+ Write sequences in FASTA format, i.e.,
+
+ ```
+ @name
+ sequence ...
+ +
+ quality scores
+ ```
+ '''
+ fh.write('@%s %s\n%s\n+\n%s\n' % (s.name, s.defline, s.seq,
+ ''.join(q + chr('A') - 1 for q in s.qual)) + '\n')
+
+
+def probe(fh):
+ '''
+ Probe a file to determine whether or not it is a FASTQ file. That is,
+ the first non-empty line should begin with a caret ('@') and the 3rd line
+ following that first non-empty line should contain no character with
+ ordinal value less than 32. If none of the characters have ordinal value
+ less than 64, then the file is guessed to be encoded in Phred64, otherwise
+ it is encoded in Phred32. This function will return False if the file is
+ not in FASTQ format and will return a dictionary with the phred score and
+ type ('fastq') if the file is FASTQ.
+ '''
+ for line in fh:
+ st = line.strip()
+ if st:
+ fh.close()
+ if st[0] == '@':
+ fh.next()
+ fh.next()
+ qual = [ord(c) for c in fh.next().strip()]
+ phred = 32 if min(qual) < ord('A') else 64
+ qual = [q - phred for q in qual]
+ return {'type': 'fastq', 'phred': phred}
+ return False
+ return {'type': 'fastq', 'phread': 64}
diff --git a/src/IO/gff.py b/src/IO/gff.py
new file mode 100644
index 0000000..c2f11f1
--- /dev/null
+++ b/src/IO/gff.py
@@ -0,0 +1,28 @@
+from biotools.annotation import Annotation
+
+
+def read(fh):
+ for line in fh:
+ if line[0] != '#':
+ yield Annotation(*line.split('\t'))
+ raise StopIteration()
+
+
+def write(fh, a):
+ # TODO: improve this...
+ fh.write(str(a) + '\n')
+
+
+def probe(fh):
+ for line in fh:
+ line = line.strip()
+ if line:
+ bits = line.split()
+ if bits[0] == '##gff-version':
+ return {'type': 'gff', 'version': float(bits[1])}
+ return False
+ return {'type': 'gff', 'version': 3}
+
+
+def whook(fh):
+ fh.write('##gff-version 3\n')
diff --git a/src/IO/manager.py b/src/IO/manager.py
new file mode 100644
index 0000000..9046496
--- /dev/null
+++ b/src/IO/manager.py
@@ -0,0 +1,77 @@
+'''
+This module is home to the IOManager class, which manages the various input
+and output formats (specifically, FASTA, FASTQ, CLUSTAL alignments, and GFF
+files, currently).
+'''
+
+
+class IOManager(object):
+ '''
+ A class used by the `IOBase` class to manage the various input and output
+ methods for the different file types. Additional file types can be added
+ to the manager by using
+
+ ```python
+ manager[format] = methods
+ ```
+
+ From the above example, `methods` is a dictionary with keys `rhook`,
+ `read`, `whook`, `write`, and `probe`. Each of the values must be callable
+ object:
+ * `rhook` => takes a file handle opened for reading; called before reading
+ of the file has begun,
+ * `whook` => takes a file handle opened for writing; called before writing
+ to the file has begun,
+ * `read` => takes a file handle opened for reading; should be a generator
+ that yields entries,
+ * `write` => takes a file handle opened for writing and a single entry;
+ writes the entry to the file,
+ * `probe` => takes a file handle opened for reading; returns a dictionary
+ of attributes to be applied to the `IOBase` instance.
+
+ This class behaves similarly to a dictionary, except that the get method
+ will default to the default method (which does nothing) if no truthy
+ second parameter is passed.
+ '''
+
+ def __init__(self, methods=None):
+ '''
+ Instantiates an `IOManager` with methods, where the keys of methods are
+ the formats and the values are dictionaries with `rhook`, `whook`,
+ `read`, `write`, and `probe` callables.
+ '''
+ self.methods = methods or {}
+ self.protected = set(self.methods.keys())
+
+ nil = lambda *x: iter([])
+ self.default = {
+ 'rhook': nil,
+ 'read': nil,
+ 'whook': nil,
+ 'write': nil,
+ 'probe': nil
+ }
+
+ def __contains__(self, key):
+ return key in self.methods
+
+ def __getitem__(self, key):
+ return self.methods.get(key, self.default)
+
+ def get(self, key, default=None):
+ '''
+ Try to get a set of methods via format (e.g., 'fasta') or fall-back
+ to the default methods (which do nothing).
+ '''
+ try:
+ return self.methods[key]
+ except KeyError:
+ return default or self.default
+
+ def __setitem__(self, key, value):
+ if key not in self.protected:
+ self.methods[key] = value
+ return value
+
+ def __iter__(self):
+ return iter(self.methods)
diff --git a/src/IO/phylip.py b/src/IO/phylip.py
new file mode 100644
index 0000000..0489714
--- /dev/null
+++ b/src/IO/phylip.py
@@ -0,0 +1,57 @@
+'''
+Functions for manipulating PHYLIP files.
+'''
+
+from biotools.sequence import Sequence, chop
+
+
+def read(fh):
+ '''
+ Read sequences in PHYLIP format; This function is a generator that yields
+ `Sequence` objects.
+ '''
+ grid, seqs, names = None, [], []
+
+ try:
+ line = fh.next().strip()
+ while not line:
+ line = fh.next.strip()
+ except StopIteration:
+ fh.close()
+ raise StopIteration()
+ grid = [int(x) for x in line.strip()]
+ while True:
+ lines = []
+ try:
+ for i in xrange(grid[0]):
+ line = fh.next().strip()
+ while not line:
+ line = fh.next().strip()
+ lines.append(line)
+ if not names:
+ names = [l.split()[0] for l in lines]
+ try:
+ seqs = [''.join(l.split()[1:]) for l in lines]
+ except IndexError:
+ seqs = [''] * grid[0]
+ else:
+ temp = [''.join(l.split()) for l in lines]
+ seqs = [a + b for a, b in zip(seqs, temp)]
+ except StopIteration:
+ break
+ for name, seq in zip(names, seqs):
+ yield Sequence(name, seq)
+ fh.close()
+ raise StopIteration()
+
+
+def probe(fh):
+ '''
+ '''
+ for line in fh:
+ bits = line.split()
+ if len(bits) == 2 and int(bits[0]) > 0 and int(bits[1]) > 0:
+ return {'type': 'phylip'}
+ else:
+ return False
+ return {'type': 'phylip'}
diff --git a/src/__init__.py b/src/__init__.py
new file mode 100644
index 0000000..8b85d47
--- /dev/null
+++ b/src/__init__.py
@@ -0,0 +1,3 @@
+'''
+A bunch of bioinformatics utilities.
+'''
diff --git a/src/align.py b/src/align.py
new file mode 100644
index 0000000..36a086e
--- /dev/null
+++ b/src/align.py
@@ -0,0 +1,165 @@
+#!/usr/bin/env python
+
+'''
+This module is used to align sequences. Currently, there is only a single
+alignment algorithm implementented; it is a hybrid between Needleman-Wunsch
+and Smith-Waterman and is used to find the subsequence within a larger sequence
+that best aligns to a reference.
+'''
+
+from biotools.translate import translate
+import biotools.analysis.options as options
+
+DIAG_MARK, VGAP_MARK, HGAP_MARK = 3, 2, 1
+bl = {
+ '*': {'*': 0, 'A': -9, 'C': -9, 'E': -9, 'D': -9, 'G': -9, 'F': -9, 'I': -9,
+ 'H': -9, 'K': -9, 'M': -9, 'L': -9, 'N': -9, 'Q': -9, 'P': -9, 'S': -9,
+ 'R': -9, 'T': -9, 'W': -9, 'V': -9, 'Y': -9, 'X': 0},
+ 'A': {'*': -9, 'A': 4, 'C': 0, 'E': -1, 'D': -2, 'G': 0, 'F': -2, 'I': -1,
+ 'H': -2, 'K': -1, 'M': -1, 'L': -1, 'N': -1, 'Q': -1, 'P': -1, 'S': 1,
+ 'R': -1, 'T': -1, 'W': -3, 'V': -2, 'Y': -2, 'X': 0},
+ 'C': {'*': -9, 'A': 0, 'C': 9, 'E': -4, 'D': -3, 'G': -3, 'F': -2, 'I': -1,
+ 'H': -3, 'K': -3, 'M': -1, 'L': -1, 'N': -3, 'Q': -3, 'P': -3, 'S': -1,
+ 'R': -3, 'T': -1, 'W': -2, 'V': -1, 'Y': -2, 'X': 0},
+ 'E': {'*': -9, 'A': -1, 'C': -4, 'E': 5, 'D': 2, 'G': -2, 'F': -3, 'I': -3,
+ 'H': 0, 'K': 1, 'M': -2, 'L': -3, 'N': 0, 'Q': 2, 'P': -1, 'S': 0,
+ 'R': 0, 'T': 0, 'W': -3, 'V': -3, 'Y': -2, 'X': 0},
+ 'D': {'*': -9, 'A': -2, 'C': -3, 'E': 2, 'D': 6, 'G': -1, 'F': -3, 'I': -3,
+ 'H': -1, 'K': -1, 'M': -3, 'L': -4, 'N': 1, 'Q': 0, 'P': -1, 'S': 0,
+ 'R': -2, 'T': 1, 'W': -4, 'V': -3, 'Y': -3, 'X': 0},
+ 'G': {'*': -9, 'A': 0, 'C': -3, 'E': -2, 'D': -1, 'G': 6, 'F': -3, 'I': -4,
+ 'H': -2, 'K': -2, 'M': -3, 'L': -4, 'N': -2, 'Q': -2, 'P': -2, 'S': 0,
+ 'R': -2, 'T': 1, 'W': -2, 'V': 0, 'Y': -3, 'X': 0},
+ 'F': {'*': -9, 'A': -2, 'C': -2, 'E': -3, 'D': -3, 'G': -3, 'F': 6, 'I': 0,
+ 'H': -1, 'K': -3, 'M': 0, 'L': 0, 'N': -3, 'Q': -3, 'P': -4, 'S': -2,
+ 'R': -3, 'T': -2, 'W': 1, 'V': -1, 'Y': 3, 'X': 0},
+ 'I': {'*': -9, 'A': -1, 'C': -1, 'E': -3, 'D': -3, 'G': -4, 'F': 0, 'I': 4,
+ 'H': -3, 'K': -3, 'M': 1, 'L': 2, 'N': -3, 'Q': -3, 'P': -3, 'S': -2,
+ 'R': -3, 'T': -2, 'W': -3, 'V': 1, 'Y': -1, 'X': 0},
+ 'H': {'*': -9, 'A': -2, 'C': -3, 'E': 0, 'D': 1, 'G': -2, 'F': -1, 'I': -3,
+ 'H': 8, 'K': -1, 'M': -2, 'L': -3, 'N': 1, 'Q': 0, 'P': -2, 'S': -1,
+ 'R': 0, 'T': 0, 'W': -2, 'V': -2, 'Y': 2, 'X': 0},
+ 'K': {'*': -9, 'A': -1, 'C': -3, 'E': 1, 'D': -1, 'G': -2, 'F': -3, 'I': -3,
+ 'H': -1, 'K': 5, 'M': -1, 'L': -2, 'N': 0, 'Q': 1, 'P': -1, 'S': 0,
+ 'R': 2, 'T': 0, 'W': -3, 'V': -3, 'Y': -2, 'X': 0},
+ 'M': {'*': -9, 'A': -1, 'C': -1, 'E': -2, 'D': -3, 'G': -3, 'F': 0, 'I': 1,
+ 'H': -2, 'K': -1, 'M': 5, 'L': 2, 'N': -2, 'Q': 0, 'P': -2, 'S': -1,
+ 'R': -1, 'T': -1, 'W': -1, 'V': -2, 'Y': -1, 'X': 0},
+ 'L': {'*': -9, 'A': -1, 'C': -1, 'E': -3, 'D': -4, 'G': -4, 'F': 0, 'I': 2,
+ 'H': -3, 'K': -2, 'M': 2, 'L': 4, 'N': -3, 'Q': -2, 'P': -3, 'S': -2,
+ 'R': -2, 'T': -2, 'W': -2, 'V': 3, 'Y': -1, 'X': 0},
+ 'N': {'*': -9, 'A': -2, 'C': -3, 'E': 0, 'D': 1, 'G': 0, 'F': -3, 'I': -3,
+ 'H': -1, 'K': 0, 'M': -2, 'L': -3, 'N': 6, 'Q': 0, 'P': -2, 'S': 1,
+ 'R': 0, 'T': 0, 'W': -4, 'V': -3, 'Y': -2, 'X': 0},
+ 'Q': {'*': -9, 'A': -1, 'C': -3, 'E': 2, 'D': 0, 'G': -2, 'F': -3, 'I': -3,
+ 'H': 0, 'K': 1, 'M': 0, 'L': -2, 'N': 0, 'Q': 5, 'P': -1, 'S': 0,
+ 'R': 1, 'T': 0, 'W': -2, 'V': -2, 'Y': -1, 'X': 0},
+ 'P': {'*': -9, 'A': -1, 'C': -3, 'E': -1, 'D': -1, 'G': -2, 'F': -4, 'I': -3,
+ 'H': -2, 'K': -1, 'M': -2, 'L': -3, 'N': -1, 'Q': -1, 'P': 7, 'S': -1,
+ 'R': -2, 'T': 1, 'W': -4, 'V': -2, 'Y': -3, 'X': 0},
+ 'S': {'*': -9, 'A': 1, 'C': -1, 'E': 0, 'D': 0, 'G': 0, 'F': -2, 'I': -2,
+ 'H': -1, 'K': 0, 'M': -1, 'L': -2, 'N': 1, 'Q': 0, 'P': -1, 'S': 4,
+ 'R': -1, 'T': 1, 'W': -3, 'V': -2, 'Y': -2, 'X': 0},
+ 'R': {'*': -9, 'A': -1, 'C': -3, 'E': 0, 'D': -2, 'G': -2, 'F': -3, 'I': -3,
+ 'H': 0, 'K': 2, 'M': -1, 'L': -2, 'N': 0, 'Q': 1, 'P': -2, 'S': -1,
+ 'R': 5, 'T': -1, 'W': -3, 'V': -3, 'Y': -2, 'X': 0},
+ 'T': {'*': -9, 'A': -1, 'C': -1, 'E': 0, 'D': 1, 'G': 1, 'F': -2, 'I': -2,
+ 'H': 0, 'K': 0, 'M': -1, 'L': -2, 'N': 0, 'Q': 0, 'P': 1, 'S': 1,
+ 'R': -1, 'T': 4, 'W': -3, 'V': -2, 'Y': -2, 'X': 0},
+ 'W': {'*': -9, 'A': -3, 'C': -2, 'E': -3, 'D': -4, 'G': -2, 'F': 1, 'I': -3,
+ 'H': -2, 'K': -3, 'M': -1, 'L': -2, 'N': -4, 'Q': -2, 'P': -4, 'S': -3,
+ 'R': -3, 'T': -3, 'W': 11, 'V': -3, 'Y': 2, 'X': 0},
+ 'V': {'*': -9, 'A': 0, 'C': -1, 'E': -2, 'D': -3, 'G': -3, 'F': -1, 'I': 3,
+ 'H': -3, 'K': -2, 'M': 1, 'L': 1, 'N': -3, 'Q': -2, 'P': -2, 'S': -2,
+ 'R': -3, 'T': -2, 'W': -3, 'V': 4, 'Y': -1, 'X': 0},
+ 'Y': {'*': -9, 'A': -2, 'C': -2, 'E': -2, 'D': -3, 'G': -3, 'F': 3, 'I': -1,
+ 'H': 2, 'K': -2, 'M': -1, 'L': -1, 'N': -2, 'Q': -1, 'P': -3, 'S': -2,
+ 'R': -2, 'T': -2, 'W': 2, 'V': -1, 'Y': 7, 'X': 0},
+ 'X': {'*': 0, 'A': 0, 'C': 0, 'E': 0, 'D': 0, 'G': 0, 'F': 0, 'I': 0,
+ 'H': 0, 'K': 0, 'M': 0, 'L': 0, 'N': 0, 'Q': 0, 'P': 0, 'S': 0,
+ 'R': 0, 'T': 0, 'W': 0, 'V': 0, 'Y': 0, 'X': 0}
+}
+
+
+def OptimalCTether(reference, translation, extend=1, create=10):
+ '''
+ This function will take two sequences: a `reference` sequence and another
+ protein sequence (`translation`; usually, this is an open reading frame
+ that has been translated). Needleman-Wunsch alignment will be performed
+ and the substring of translation with the highest identity that begins
+ with a start codon [default: `['ATG']`] is reported.
+
+ This function returns a dictionary of relevent information from the
+ alignment; specifically, the alignments itself [keys: `query`, `subject`],
+ the score [key: `score`], the length of the alignment [key: `length`], the
+ length of the substring of translation used [key: `sublength`], the number
+ of identities [key: `identities`], and the number of gaps [key: `gaps`].
+ '''
+
+ starts = set(translate(s) for s in options.START_CODONS)
+ v, w = reference, translation
+
+ try:
+ v = v.seq
+ except AttributeError:
+ pass
+ try:
+ w = w.seq
+ except AttributeError:
+ pass
+ if not starts & set(w):
+ raise ValueError("Open reading frame does not contain a start codon.")
+
+ v, w = v[::-1], w[::-1]
+ lv, lw = len(v), len(w)
+ rv, rw = range(lv + 1), range(lw + 1)
+ gpc = [[create * int(not (i | j)) for i in rw] for j in rv]
+ mat = [[-(i + j) * extend - create * (not (i | j) and w[0] != v[0])
+ for i in rw] for j in rv]
+ pnt = [[VGAP_MARK if i > j else HGAP_MARK if j > i else DIAG_MARK
+ for i in rw] for j in rv]
+ ids = [[0 for i in rw] for j in rv]
+ optimal = [None, 0, 0]
+
+ for i in range(lv):
+ for j in range(lw):
+ vals = [[mat[i][j] + bl[v[i]][w[j]], DIAG_MARK],
+ [mat[i + 1][j] - extend - gpc[i + 1][j], VGAP_MARK],
+ [mat[i][j + 1] - extend - gpc[i][j + 1], HGAP_MARK]]
+ mat[i + 1][j + 1], pnt[i + 1][j + 1] = max(vals)
+ gpc[i + 1][j + 1] = create * int(pnt[i + 1][j + 1] == DIAG_MARK)
+ if (optimal[0] is None or mat[i + 1][j + 1] > optimal[0]) and \
+ abs(lv - i) / float(lv) <= options.LENGTH_ERR and \
+ w[j] in starts:
+ optimal = [mat[i + 1][j + 1], i + 1, j + 1]
+
+ i, j = optimal[1], optimal[2]
+ seq, ids = ['', ''], 0
+ gapcount, length, sublen = 0, 0, 0
+ methods = {
+ VGAP_MARK:
+ lambda s, i, j, l, g, n:
+ (['-' + s[0], w[j - 1] + s[1]], i, j - 1, l + 1, g + 1, n),
+ DIAG_MARK:
+ lambda s, i, j, l, g, n:
+ ([v[i - 1] + s[0], w[j - 1] + s[1]], i - 1, j - 1,
+ l + 1, g, n + (w[j - 1] == v[i - 1])),
+ HGAP_MARK:
+ lambda s, i, j, l, g, n:
+ ([v[i - 1] + s[0], '-' + s[1]], i - 1, j, l, g + 1, n)
+ }
+
+ while [i, j] != [0, 0]:
+ length += 1
+ state = (seq, i, j, sublen, gapcount, ids)
+ seq, i, j, sublen, gapcount, ids = methods[pnt[i][j]](*state)
+
+ return {
+ 'subject': seq[0][::-1],
+ 'query': seq[1][::-1],
+ 'score': optimal[0],
+ 'gaps': gapcount,
+ 'length': length,
+ 'sublength': sublen,
+ 'identities': ids
+ }
diff --git a/src/analysis/__init__.py b/src/analysis/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/analysis/cluster.py b/src/analysis/cluster.py
new file mode 100644
index 0000000..8e53d79
--- /dev/null
+++ b/src/analysis/cluster.py
@@ -0,0 +1,116 @@
+#!/usr/bin/env python
+import biotools.sequence as sequ
+import biotools.IO as io
+import biotools.translate as tran
+import biotools.clustal as clustal
+import biotools.analysis.options as options
+try:
+ import Queue as queue
+except ImportError:
+ import queue
+import hashlib
+import subprocess
+import threading
+from os import sep, mkdir
+
+
+def run(direc, inputs):
+ '''
+ Takes a collection of files generated by gene prediction, creates clusters
+ based off of the genes that have homology to those predicted genes, and
+ creates new fasta files in the clusters sub directory under the given
+ directory and separated according to whether they are nucleotide or amino
+ acid sequnces. These new fasta files are then used to create clustalw
+ alignments of the genes if more than 1 sequence exists in the fasta file.
+ '''
+
+ clusters = {}
+ all_ids = set()
+ ids = {}
+ q = queue.Queue()
+ filenames = []
+
+ def run_clustal():
+ while not q.empty():
+ cid = q.get()
+ dig = hashlib.md5()
+ dig.update(' '.join(cid))
+ dig = dig.hexdigest()
+
+ fpre = direc + 'nt' + sep + dig
+ apre = direc + 'aa' + sep + dig
+ fname = fpre + ".fasta"
+ aname = apre + ".fasta"
+
+ fh = io.open(fname, 'w')
+ ah = io.open(aname, 'w')
+ for ipt in clusters:
+ counter = 0
+ name = '_'.join(ipt.split(sep)[-1].split('.')[0].split())
+ for cluster in clusters[ipt]:
+ if cid & cluster[0]:
+ nm = name + '_' + str(counter)
+ seq = cluster[1]
+ curr = sequ.Sequence(nm, seq, defline=', '.join(cid))
+ tr = tran.translate(curr)
+ tr.name = curr.name
+ fh.write(curr)
+ ah.write(tr)
+ counter += 1
+ fh.close()
+ ah.close()
+
+ try:
+ clustal.run(fname, fpre + '.clustalw')
+ clustal.run(aname, apre + '.clustalw')
+ filenames.append(dig + '.fasta')
+ except ValueError:
+ pass
+
+ q.task_done()
+
+ if direc:
+ for d in [direc, direc + 'nt' + sep, direc + 'aa' + sep]:
+ try:
+ mkdir(d)
+ except OSError:
+ pass
+
+ for ipt in inputs:
+ seqs = {}
+ ids[ipt] = set()
+ for seq in io.open(ipt, 'r'):
+ ids[ipt].add(seq.name)
+ all_ids.add(seq.name)
+ if seq.seq not in seqs:
+ seqs[seq.seq] = set()
+ seqs[seq.seq].add(seq.name)
+ clusters[ipt] = [(seqs[k], k) for k in seqs]
+ del seqs
+
+ sub_ids = []
+ while all_ids:
+ cid = all_ids.pop()
+ subcluster = (all_ids | set([cid])) & \
+ set(i for ipt in clusters for cluster in clusters[ipt]
+ for i in cluster[0] if cid in cluster[0])
+
+ for ipt in clusters:
+ for cluster in clusters[ipt]:
+ if cid in cluster[0]:
+ subcluster = (subcluster & cluster[0]) | \
+ (subcluster - ids[ipt])
+ sub_ids.append(subcluster)
+ all_ids -= subcluster
+
+ for cid in sub_ids:
+ q.put(cid)
+
+ threads = []
+ for i in xrange(options.NUM_PROCESSES - 1):
+ curr = threading.Thread(target=run_clustal)
+ threads.append(curr)
+ curr.start()
+ run_clustal()
+ q.join()
+ return filenames
diff --git a/src/analysis/loaddata.py b/src/analysis/loaddata.py
new file mode 100644
index 0000000..a267912
--- /dev/null
+++ b/src/analysis/loaddata.py
@@ -0,0 +1,176 @@
+'''
+This is a pretty simple JSON-like parser. Specifically, it can load Python-like
+object, list, and other literals, i.e., the sort of stuff you'd get it you
+dumped the the string representation of some data into a file.
+
+The real difference is that you must specify a variable name, e.g.:
+
+```python
+my_stuff = { ... }
+```
+
+These variable names don't need to be on a newline or anything like that, you
+should be able to omit any and all whitespace. The result of a successful
+parse is a dictionary:
+
+```python
+{'my_stuff': { ... }}
+```
+
+This function really only works for `None`, `True`, `False`, numbers, strings,
+dictionaries, and lists.
+'''
+
+def parse(ipt):
+ par = {'char': '', 'pos': -1}
+ leng = 0
+ val = ""
+ ret = {}
+
+ def advance(test=None):
+ curr = par['char']
+ if test and test != curr:
+ raise ValueError("Expected %s, saw %s." % (test, curr))
+
+ par['pos'] += 1
+ if par['pos'] < leng:
+ par['char'] = val[par['pos']]
+ else:
+ par['char'] = ''
+
+ def whitespace():
+ while par['char'] and ord(par['char']) <= ord(' '):
+ advance()
+
+ def variable():
+ whitespace()
+ start = par['pos']
+ while \
+ ('0' <= par['char'] <= '9') or \
+ ('A' <= par['char'] <= 'Z') or \
+ ('a' <= par['char'] <= 'z'):
+ advance()
+ stop = par['pos']
+ whitespace()
+ advance('=')
+
+ return (val[start:stop], value())
+
+ def value():
+ whitespace()
+ if par['char'] == '{':
+ return dictionary()
+ if par['char'] == '[':
+ return array()
+ if par['char'] in ("'", '"'):
+ return string()
+ if '0' <= par['char'] <= '9' or par['char'] == '-':
+ return number()
+ at = par['pos']
+ if par['char'] == 'T':
+ advance('r')
+ advance('u')
+ advance('e')
+ return True
+ if par['char'] == 'F':
+ advance('a')
+ advance('l')
+ advance('s')
+ advance('e')
+ return False
+ if par['char'] == 'N':
+ advance('o')
+ advance('n')
+ advance('e')
+ return None
+ prefix = val[at:par['pos']]
+ raise ValueError("Unexpected value starting with %s." % prefix)
+
+ def dictionary():
+ ret = {}
+ advance('{')
+ whitespace()
+ while par['char'] != '}':
+ s = value()
+ advance(':')
+ v = value()
+ ret[s] = v
+ if par['char'] == ',':
+ advance(',')
+ else:
+ break
+ advance('}')
+ whitespace()
+ return ret
+
+ def array():
+ ret = []
+ advance('[')
+ whitespace()
+ while par['char'] != ']':
+ ret.append(value())
+ if par['char'] == ',':
+ advance(',')
+ else:
+ break
+ advance(']')
+ whitespace()
+ return ret
+
+ def string():
+ ret = ""
+ q = par['char']
+ advance(q)
+ while par['char'] != q:
+ if par['char'] == '\\':
+ advance('\\')
+ if par['char'] == 'n':
+ ret += '\n'
+ elif par['char'] == 't':
+ ret += '\t'
+ else:
+ ret += par['char']
+ advance()
+ continue
+ ret += par['char']
+ advance()
+ advance(q)
+ whitespace()
+ return ret
+
+ def number():
+ ret = ""
+ if par['char'] == '-':
+ ret += '-'
+ advance('-')
+ while '0' <= par['char'] <= '9':
+ ret += par['char']
+ advance()
+ if par['char'] == '.':
+ ret += '.'
+ advance('.')
+ while '0' <= par['char'] <= '9':
+ ret += par['char']
+ advance()
+ if par['char'] in ('e', 'E'):
+ ret += par['char']
+ advance()
+ if par['char'] == '-':
+ ret += '-'
+ advance('-')
+ while '0' <= par['char'] <= '9':
+ ret += par['char']
+ advance()
+ whitespace()
+ return float(ret)
+
+ for line in ipt:
+ val += line
+ leng += len(line)
+
+ advance()
+ while par['pos'] < leng:
+ k, v = variable()
+ ret[k] = v
+
+ return ret
diff --git a/src/analysis/options.py b/src/analysis/options.py
new file mode 100644
index 0000000..9f8a79d
--- /dev/null
+++ b/src/analysis/options.py
@@ -0,0 +1,168 @@
+from optparse import OptionParser
+from threading import Lock
+from os import sep, makedirs
+from sys import stderr
+
+LENGTH_ERR = 0.2
+MIN_IDENTITY = 0.45
+MAX_EVALUE = 1e-30
+MIN_ORFLEN = 300
+NUM_THREADS = 16
+NUM_PROCESSES = 2
+DIRECTORY = '.' + sep
+PLOTTER = 'biotools.analysis.plot'
+
+START_CODONS = ["ATG"]
+STOP_CODONS = ["TAG", "TAA", "TGA"]
+args = tuple()
+
+predicting = True
+clustering = True
+renaming = True
+calculating = True
+reporting = True
+plotting = True
+verbose = False
+
+lock = Lock()
+
+parser = OptionParser(usage="Usage: %prog [options] " +
+ "<database> <sequences ...>")
+parser.add_option("-S", "--start", action="append", dest="start",
+ default=START_CODONS, type="string",
+ help="define a start codon [default: %s]" %
+ ' '.join("-S " + s for s in START_CODONS))
+parser.add_option("-E", "--stop", action="append", dest="stop",
+ default=STOP_CODONS, type="string",
+ help="define a stop codon [default: %s]" %
+ ' '.join("-E " + s for s in STOP_CODONS))
+parser.add_option("-j", "--threads", action="store", dest="threads",
+ default=NUM_THREADS, type="int",
+ help="number of threads [default: %default]")
+parser.add_option("-p", "--processes", action="store", dest="processes",
+ default=NUM_PROCESSES, type="int",
+ help="number of parallel processes to run " +
+ "[default: %default]")
+parser.add_option("-e", "--evalue", action="store", dest="evalue",
+ default=MAX_EVALUE, type="float",
+ help="maximum e-value [default: %default]")
+parser.add_option("-I", "--identity", action="store", dest="identity",
+ default=MIN_IDENTITY, type="float",
+ help="minimum percent identity [default: %default]")
+parser.add_option("-L", "--length", action="store", dest="fraction",
+ default=LENGTH_ERR, type="float",
+ help="allowable relative error in hit length " +
+ "[default: %default]")
+parser.add_option("-O", "--orflen", action="store", dest="orflen",
+ metavar="bases", default=MIN_ORFLEN, type="int",
+ help="minimum allowable length for ORFs [default: %default]")
+parser.add_option("-d", "--directory", action="store", dest="directory",
+ default=DIRECTORY, type="string",
+ help="set working directory [default: current]")
+parser.add_option("-P", "--plotter", action="store", dest="plotter",
+ default=PLOTTER, type="string",
+ help="plotting module [default: %default]")
+parser.add_option("-v", "--verbose", action="store_true", dest="verbose",
+ default=verbose,
+ help="print debug messages [default: False]")
+
+parser.add_option("--no-plots", action="store_false", dest="plotting",
+ default=plotting,
+ help="suppress the drawing of plots [default: False]")
+parser.add_option("--no-predict", action="store_false", dest="predicting",
+ default=predicting,
+ help="don't predict genes, instead treat the input files " +
+ "as predicted genes [default: False]")
+parser.add_option("--no-cluster", action="store_false", dest="clustering",
+ default=True, help="don't cluster the sequences, " +
+ "instead treat the input files as alignments " +
+ "[default: False]")
+parser.add_option("--no-rename", action="store_false", dest="renaming",
+ default=True,
+ help="don't rename the fasta and clustal files " +
+ "[default: False]")
+parser.add_option("--no-reports", action="store_false", dest="reporting",
+ default=True,
+ help="don't generate files for variance data " +
+ "[default: False]")
+parser.add_option("--no-calculation", action="store_false",
+ dest="calculating", default=True,
+ help="don't calculate sequence variance [default: False]")
+
+
+def debug(msg):
+ if verbose:
+ lock.acquire(True)
+ stderr.write(str(msg) + '\n')
+ lock.release()
+
+
+def parse(pargs):
+ '''
+ Parses `pargs` and sets global variables to be accessible to other
+ modules.
+
+ These variables are:
+ * `LENGTH_ERR`
+ * `MIN_IDENTITY`
+ * `MAX_EVALUE`
+ * `NUM_THREADS`
+ * `NUM_PROCESSES`
+ * `START_CODONS`
+ * `START_CODONS`
+ * `DIRECTORY`
+ * `PLOTTER`
+ * `args`
+ '''
+ global \
+ LENGTH_ERR, MIN_IDENTITY, MAX_EVALUE, MIN_ORFLEN, \
+ NUM_THREADS, NUM_PROCESSES, START_CODONS, STOP_CODONS, \
+ DIRECTORY, PLOTTER, args, predicting, clustering, \
+ renaming, calculating, reporting, plotting, verbose
+
+ opts, largs = parser.parse_args(pargs)
+
+ if opts.directory[-1] != sep:
+ opts.directory += sep
+ try:
+ makedirs(opts.directory)
+ except OSError:
+ pass
+
+ if '*' in opts.start:
+ DNA = 'ATCG'
+ opts.start = set(i + j + k for i in DNA for j in DNA for k in DNA) - \
+ set(opts.stop)
+
+ opts.processes = min(opts.processes, len(largs))
+ if not (0 <= opts.fraction <= 1):
+ raise RuntimeError("Allowable length error must be between 0 and 1.")
+
+ LENGTH_ERR = opts.fraction
+ MIN_IDENTITY = opts.identity
+ MAX_EVALUE = opts.evalue
+ MIN_ORFLEN = opts.orflen
+ NUM_THREADS = opts.threads
+ NUM_PROCESSES = opts.processes
+ STOP_CODONS = opts.stop
+ START_CODONS = opts.start
+ DIRECTORY = opts.directory
+ PLOTTER = opts.plotter
+
+ predicting = opts.predicting
+ clustering = opts.clustering
+ renaming = opts.renaming
+ calculating = opts.calculating
+ reporting = opts.reporting
+ plotting = opts.plotting
+
+ verbose = opts.verbose
+
+ args = largs
+
+
+def help():
+ '''
+ Prints the usage.
+ '''
+ parser.print_help()
diff --git a/src/analysis/plot.py b/src/analysis/plot.py
new file mode 100644
index 0000000..1854aba
--- /dev/null
+++ b/src/analysis/plot.py
@@ -0,0 +1,122 @@
+import numpy as np
+import matplotlib.pyplot as plt
+import math
+from os import sep, mkdir
+
+
+def smoothed(unsmoothed, factor):
+ l = len(unsmoothed)
+ r = [math.exp(-(i - l) ** 2 / float(factor)) for i in xrange(1, l)]
+ w = np.array(r + [1] + r[::-1])
+ c = (w[l - i - 1:2 * l - i - 1] for i in xrange(l))
+ wm = np.array([x / sum(x) for x in c])
+ return np.dot(unsmoothed, wm.transpose())
+
+
+def plot(plotdata, directory, bottom=True, side=True, legend=True,
+ save=True, filename='untitled.pdf', upperbound=0.05, factor=21,
+ fig=None, **kwargs):
+ if fig is None:
+ fig = plt.figure(None, facecolor='w', edgecolor='w')
+
+ if not directory.endswith(sep):
+ directory += sep
+
+ try:
+ mkdir(directory)
+ except OSError:
+ pass
+
+ # plotting data
+ ntvar = plotdata['nt']['var']
+ aavar = plotdata['aa']['var']
+
+ # gene models
+ starts = plotdata['aa']['starts']
+ ends = plotdata['aa']['ends']
+ counts = plotdata['aa']['count']
+
+ # smooth the data
+ snt = smoothed(ntvar, factor)
+ lnt = len(ntvar)
+ saa = smoothed(aavar, factor)
+ laa = len(aavar)
+
+ # bounding rectangle
+ bound = [0, laa, -upperbound / 6.0, upperbound]
+
+ # x-values to align nucleotide & amino acids
+ xnt = np.arange(lnt) / 3.0 + 1
+ xaa = np.arange(laa) + 1
+
+ ax = axes(bottom, side, bound, fig, **kwargs)
+ ntl = draw(xnt, snt, ax, '#0000ff', **kwargs)
+ aal = draw(xaa, saa, ax, '#00ff00', **kwargs)
+ models(starts, ends, counts, bound, ax, **kwargs)
+ report(filename, ntvar, aavar, lnt, laa)
+
+ if legend:
+ fig.legend((ntl, aal), ('Nucleotide', 'Amino acid'), 'upper right')
+ if save:
+ fig.savefig(directory + filename)
+
+
+def axes(bottom, side, bound, fig, **kwargs):
+ # create the proper sized frame, depending on
+ # how we draw the plot
+ x = 0.09 if side else 0.02
+ y = 0.09 if bottom else 0.04
+
+ xs = [bound[0], bound[1] * 1.06]
+ # construct the axes
+ ax = fig.add_axes([x, y, 0.98 - x, 0.98 - y], xlim=xs, ylim=bound[2:])
+ ax.minorticks_on()
+ ax.tick_params(axis='x', which='minor', length=3)
+
+ # hide the unwanted axis lines (typically the top & right)
+ # label the wanted axes and draw them
+ for loc, spine in ax.spines.iteritems():
+ if loc in ['right', 'top']:
+ spine.set_color('none')
+ continue
+ if loc == 'bottom':
+ ax.xaxis.set_ticks_position('bottom')
+ if bottom:
+ ax.set_xlabel("Amino acids")
+ continue
+ if loc == 'left':
+ if side:
+ ax.set_ylabel("Sequence variance")
+ ax.yaxis.set_ticks_position('left')
+ else:
+ spine.set_color('none')
+ ax.tick_params('y', which='both', color='none',
+ labelcolor='none')
+
+ ax.hlines(ax.get_yticks(), xs[0], xs[1], color='0.75',
+ linestyle='dashed')
+ ax.hlines(0, xs[0], xs[1], color='k', linestyle='solid')
+ return ax
+
+
+def draw(x, y, ax, color, **kwargs):
+ return ax.plot(x, y, color=color, linestyle='solid')
+
+
+def models(starts, ends, counts, bound, ax, **kwargs):
+ lb, l = bound[2], len(starts)
+ scale = bound[1] / max(ends)
+ ys, i = np.arange(1, l + 1) * lb / 3.0, 0
+
+ # draw the gene models
+ ax.hlines(ys, starts, ends, colors='k', lw=4, linestyle='solid')
+ for c in counts:
+ ax.text(bound[1] + 10, lb / 3.0 * (i + 1.25), int(c))
+ i += 1
+
+
+def report(filename, ntvar, aavar, lnt, laa):
+ print(filename.center(80, '='))
+ print('Average variance: ')
+ print('\t%f per base pair' % (sum(ntvar) / lnt))
+ print('\t%f per amino acid' % (sum(aavar) / laa))
diff --git a/src/analysis/predict.py b/src/analysis/predict.py
new file mode 100644
index 0000000..7e15814
--- /dev/null
+++ b/src/analysis/predict.py
@@ -0,0 +1,230 @@
+#!/usr/bin/env python
+import biotools.IO as io
+import biotools.BLAST as BLAST
+import biotools.analysis.options as options
+from biotools.sequence import Sequence, annotation as ann
+from biotools.align import OptimalCTether as align
+from biotools.translate import translate
+from biotools.complement import complement
+try:
+ import Queue as queue
+except ImportError:
+ import queue
+import threading
+from os import sep, mkdir
+
+PIPING = True
+
+
+def ORFGenerator(sequ):
+ '''
+ Scans both strands of the given sequence and yields the longest subsequence
+ that starts with a start codon and contains no stop codon other than the
+ final codon.
+ '''
+ comp = complement(sequ[::-1])
+ seq = sequ.seq
+ cseq = comp.seq
+ slen = len(sequ)
+ starts = [-1, 0, 1, -1, 0, 1] # locations of start codons in each frame
+ stops = [0, 1, 2, 0, 1, 2] # locations of stop codons in each frame
+ mlen = options.MIN_ORFLEN
+
+ for i in xrange(slen - 2):
+ fcodon, rcodon = seq[i:i + 3], cseq[i:i + 3]
+ if fcodon in options.STOP_CODONS:
+ if i - mlen >= starts[i % 3 + 3] >= stops[i % 3 + 3]:
+ yield sequ[starts[i % 3 + 3]:i + 3]
+ stops[i % 3 + 3] = i + 3
+ elif fcodon in options.START_CODONS:
+ if starts[i % 3 + 3] < stops[i % 3 + 3]:
+ starts[i % 3 + 3] = i
+ if rcodon in options.STOP_CODONS:
+ if i - mlen >= starts[i % 3] >= stops[i % 3]:
+ yield comp[starts[i % 3]:i + 3]
+ stops[i % 3] = i + 3
+ elif rcodon in options.START_CODONS:
+ if starts[i % 3] < stops[i % 3]:
+ starts[i % 3] = i
+ raise StopIteration()
+
+
+class ThreadQueue(queue.Queue):
+
+ def __init__(self, target):
+ queue.Queue.__init__(self)
+ self.threadcount = 0
+ self.target = target
+
+ def put(self, item):
+ options.lock.acquire()
+ queue.Queue.put(self, item)
+ if self.threadcount < options.NUM_THREADS - 1:
+ thread = threading.Thread(target=self.target)
+ thread.start()
+ self.threadcount += 1
+ options.lock.release()
+
+
+def GeneFromBLAST(db, sequences, pref, names):
+ '''
+ BLASTs database against sequences, and for those results that pass the
+ length and percent identity requirements, attempt to locate the full gene
+ that corresponds to that BLAST hit. Genes that are found are saved in the
+ subdirectory sequences under the given directory, divided depending on
+ whether the sequnece is amino acid or nucleotide.
+ '''
+ PIPING = True
+ wd = options.DIRECTORY + 'sequences' + sep
+
+ for d in [options.DIRECTORY, wd]:
+ try:
+ mkdir(d)
+ except OSError:
+ pass
+
+ subj = dict((s.name, s) for s in io.open(db, 'r'))
+ options.debug("Database sequences loaded from file %s." % db)
+
+ try:
+ orfs = dict((s.name, [orf for orf in ORFGenerator(s)])
+ for s in io.open(sequences, 'r'))
+ options.debug("ORFs loaded from file %s." % sequences)
+ except IOError:
+ options.debug("No file \"" + sequences + ",\" skipping.")
+ return
+
+ def target():
+ while 1:
+ try:
+ res = qin.get(PIPING, 1)
+ except queue.Empty:
+ if not PIPING:
+ break
+ else:
+ continue
+
+ qname, sname = res['query']['name'], res['subject']['name']
+ start, end = res['query']['start'], res['query']['end']
+ alignments = []
+ max_match = (options.MIN_IDENTITY, None)
+
+ if subj[sname].type == 'nucl':
+ subject = translate(subj[sname])
+ else:
+ subject = subj[sname]
+
+ while qname:
+ try:
+ o = orfs[qname]
+ break
+ except KeyError:
+ qname = qname[:-1]
+ if not qname:
+ qin.task_done()
+ continue
+
+ for orf in o:
+ if in_range(orf, start, end, res['frame']):
+ orf = orf[:-3]
+ query = translate(orf)
+ options.debug("Aligning %33s v. %33s." % (qname, sname))
+ alignment = align(subject.seq, query.seq)
+ alignments.append((orf, sname, alignment))
+
+ for orf, refname, aln in alignments:
+ hitlen = aln['sublength']
+ region = orf[-3 * hitlen:]
+ identity = float(aln['identities']) / aln['length']
+ if identity >= max_match[0]:
+ max_match = (identity, (region, sname, aln))
+
+ if max_match[1]:
+ seq, name, _ = max_match[1]
+ odl = subject.defline.split('[')[0].strip()
+ src = seq.original.name
+ start, end, strand = seq.start, seq.end, seq.step
+ defline = '%s[source=%s] [start=%d] [end=%d] [strand=%d]' % \
+ (odl + (' ' if odl else ''), src, start, end, strand)
+
+ new = Sequence(name.strip(), seq.seq, defline=defline,
+ original=seq.original, type=seq.type,
+ start=seq.start, end=seq.end, step=seq.step)
+ qout.put(new)
+ qin.task_done()
+
+ def in_range(seq, start, end, frame):
+ ss, se = sorted((seq.start, seq.end))
+ os, oe = sorted((start, end))
+ frame = int(frame)
+
+ return (ss < oe and se > os and
+ (se % 3 == oe % 3 or ss % 3 == oe % 3) and
+ ((frame < 0 and seq.step < 0) or
+ (frame > 0 and seq.step > 0)))
+
+ qout = queue.Queue()
+ qin = ThreadQueue(target)
+
+ blastopts = {
+ 'evalue': options.MAX_EVALUE,
+ 'num_threads': options.NUM_THREADS
+ }
+
+ for res in BLAST.run(db, sequences, **blastopts):
+ if float(res['expect']) > options.MAX_EVALUE:
+ continue
+
+ sbjl = len(subj[res['subject']['name']])
+ ident = float(res['identities'].split('(')[1][:-2]) / 100
+ lerr = float(res['subject']['length']) / sbjl
+
+ if ident >= options.MIN_IDENTITY:
+ if lerr >= (1.0 - options.LENGTH_ERR):
+ qin.put(res)
+ PIPING = False
+ options.debug("BLAST done.")
+
+ target()
+ qin.join()
+ options.debug("Done Aligning sequences.")
+
+ options.debug("Now writing sequences (%d)." % qout.qsize())
+ seqs = {}
+ nuc_file = io.open(wd + pref + '.fasta', 'w')
+ count = 0
+ while 1:
+ try:
+ seq = qout.get(False)
+ if seq.seq not in seqs:
+ seqs[seq.seq] = set()
+ seqs[seq.seq].add(seq)
+ nuc_file.write(seq)
+ count += 1
+ options.debug("Wrote %s (%d)." % (seq.name, count));
+ except queue.Empty:
+ break
+ nuc_file.close()
+ options.debug("Done Aligning sequences.")
+
+ gh = io.open(wd + pref + '.gff3', 'w')
+ names.append(wd + pref + '.fasta')
+
+ for id in seqs:
+ gh.write(ann(seqs[id].copy().pop(), pref, 'gene',
+ homologs=','.join(s.name for s in seqs[id])))
+ gh.close()
+
+
+def run(subject, query, prefix, names):
+ GeneFromBLAST(subject, query, prefix, names)
+
+
+if __name__ == '__main__':
+ options.START_CODONS = ['TTG']
+ import sys
+ f = io.open(sys.argv[1], 'r')
+ for seq in f:
+ print(seq.name + ' ' + seq.defline)
+ for orf in ORFGenerator(seq):
+ print('%d ... %d' % (orf.start, orf.end))
diff --git a/src/analysis/renamer.py b/src/analysis/renamer.py
new file mode 100644
index 0000000..6064736
--- /dev/null
+++ b/src/analysis/renamer.py
@@ -0,0 +1,61 @@
+from __future__ import print_function
+from os import sep, rename as mv
+import biotools.IO as io
+import biotools.analysis.options as options
+
+try:
+ get_input = raw_input
+except:
+ get_input = input
+
+
+def rename(direc, db, files):
+ '''
+ This isn't really for bioinformatics, this is more for the pipeline, to
+ rename the files generated by cluster.py with a little human interaction.
+ '''
+
+ names = []
+ seqdb = dict((s.name, s) for s in io.open(db, 'r'))
+ nt_dir, aa_dir = direc + 'nt' + sep, direc + 'aa' + sep
+ for f in files:
+ seq = io.open(nt_dir + f, 'r').next()
+ ids = seq.defline.split(', ')
+ print("File\033[33;1m", f, \
+ "\033[0mis described by the following sequences:")
+ try:
+ for id in ids:
+ seqdb[id]
+ print("* " + seqdb[id].name + ': ' +
+ seqdb[id].defline.split('[')[0])
+ except KeyError:
+ print("* (none)")
+ continue
+ pre = get_input("\033[33;1mWhat should we call this file " +
+ "(or hit enter to skip)? \033[0m")
+ fpre = f[:f.find('.')]
+
+ if pre != "":
+ count = 0
+ while True:
+ rpre = pre + ((" (%d)" % count) if count > 0 else "")
+ try:
+ fh = open(nt_dir + rpre + ".fasta", 'r')
+ fh.close()
+ count += 1
+ continue
+ except IOError:
+ nt_old, nt_new = nt_dir + fpre, nt_dir + rpre
+ aa_old, aa_new = aa_dir + fpre, aa_dir + rpre
+ print("Renaming " + fpre + ".* to " + rpre + ".*")
+ try:
+ mv(nt_old + ".fasta", nt_new + ".fasta")
+ mv(aa_old + ".fasta", aa_new + ".fasta")
+ mv(nt_old + ".clustalw", nt_new + ".clustalw")
+ mv(aa_old + ".clustalw", aa_new + ".clustalw")
+ names.append(nt_new + '.clustalw')
+ names.append(aa_new + '.clustalw')
+ except OSError:
+ pass
+ break
+ return names
diff --git a/src/analysis/report.py b/src/analysis/report.py
new file mode 100644
index 0000000..45d9654
--- /dev/null
+++ b/src/analysis/report.py
@@ -0,0 +1,89 @@
+import matplotlib.pyplot as plt
+import biotools.analysis.plot as bap
+from os import sep, mkdir
+
+
+# report areas of high conservation or variation
+def report(plotdata, **kwargs):
+ pass
+
+
+# wraps biotools.analysis.plot.plot()
+def plot(plotdata, directory, bottom=True, side=True, legend=True,
+ save=True, filename='untitled.pdf', upperbound=0.05, factor=21,
+ fig=plt.figure(None, facecolor='w', edgecolor='w'), **kwargs):
+
+ ranges = report(plotdata, **kwargs)
+
+ try:
+ mkdir(directory)
+ except OSError:
+ pass
+
+ lowerbound = -upperbound / 6
+
+ # smooth the data
+ snt = smoothed(snpdata['nt']['var'], factor)
+ lnt = len(snpdata['nt']['var'])
+ saa = smoothed(snpdata['aa']['var'], factor)
+ laa = len(snpdata['aa']['var'])
+
+ # generate x-ranges so that amino acids
+ # and nucleotides align
+ xnt = np.arange(lnt) * (0.0 + laa) / lnt + 1
+ xaa = np.arange(laa) + 1
+
+ # create the proper sized frame, depending on
+ # how we draw the plot
+ x = 0.09 if side else 0.02
+ y = 0.09 if bottom else 0.04
+
+ ax = fig.add_axes([x, y, 0.98 - x, 0.98 - y], xlim=[0, laa * 1.06],
+ ylim=[lowerbound, upperbound])
+ ax.minorticks_on()
+ ax.tick_params(axis='x', which='minor', length=3)
+
+ for loc, spine in ax.spines.iteritems():
+ if loc in ['right', 'top']:
+ spine.set_color('none')
+ continue
+ if loc == 'bottom':
+ ax.xaxis.set_ticks_position('bottom')
+ if bottom:
+ ax.set_xlabel("Amino acids")
+ continue
+ if loc == 'left':
+ if side:
+ ax.set_ylabel("Sequence variance")
+ ax.yaxis.set_ticks_position('left')
+ else:
+ spine.set_color('none')
+ ax.tick_params('y', which='both', color='none',
+ labelcolor='none')
+
+ ax.hlines(ax.get_yticks(), 0, laa * 1.06, color='0.75', linestyle='dashed')
+ ax.hlines(0, 0, laa * 1.06, color='k', linestyle='solid')
+
+ nt_lines = ax.plot(xnt, snt, color='#0000ff', linestyle='solid')
+ aa_lines = ax.plot(xaa, saa, color='#00ff00', linestyle='solid')
+
+ starts = snpdata['aa']['starts']
+ ends = snpdata['aa']['ends']
+ counts = snpdata['aa']['count']
+ scale = laa / max(ends)
+ ys = (np.arange(len(starts)) + 1) * lowerbound / 3
+
+ ax.hlines(ys, starts, ends, colors='k', lw=4, linestyle='solid')
+ for i, c in zip(xrange(len(counts)), counts):
+ ax.text(laa + 10, lowerbound / 3 * (i + 1.25), c)
+ if legend:
+ fig.legend((nt_lines, aa_lines), ('Nucleotide', 'Amino acid'),
+ 'upper right')
+
+ if save:
+ fig.savefig(directory + filename)
+
+ print '=============', filename, '============='
+ print 'Average variance: '
+ print '\t', sum(snpdata['nt']['var']) / lnt, 'per base pair'
+ print '\t', sum(snpdata['aa']['var']) / laa, 'per amino acid'
diff --git a/src/analysis/run.py b/src/analysis/run.py
new file mode 100644
index 0000000..4621ef0
--- /dev/null
+++ b/src/analysis/run.py
@@ -0,0 +1,57 @@
+#!/usr/bin/env python
+
+import biotools.analysis.predict as genepredict
+import biotools.analysis.options as options
+import threading
+from os import sep
+try:
+ import Queue as queue
+except ImportError:
+ import queue
+
+
+def run(infile, strains):
+ '''
+ Run several instances of `genepredict.run` at once.
+ '''
+
+ q = queue.Queue()
+ filenames = []
+
+ def run_predict():
+ while 1:
+ try:
+ strainf = q.get(False)
+ except queue.Empty:
+ break
+ strain = strainf.split(sep)[-1]
+ pos = strain.rfind('.')
+ if pos > 1 or (pos == 1 and strain[0] != '.'):
+ strain = strain[:pos]
+
+ options.debug("Predicting for %s." % strain)
+
+ try:
+ genepredict.run(infile, strainf, strain, filenames)
+ except RuntimeError:
+ pass
+ q.task_done()
+
+ for strain in strains:
+ q.put(strain)
+
+ for i in range(options.NUM_PROCESSES - 1):
+ curr = threading.Thread(target=run_predict)
+ curr.start()
+
+ run_predict()
+ q.join()
+
+ return filenames
+
+if __name__ == "__main__":
+ import sys
+ try:
+ run(sys.argv[1:])
+ except IndexError:
+ pass
diff --git a/src/analysis/variance.py b/src/analysis/variance.py
new file mode 100644
index 0000000..9632afd
--- /dev/null
+++ b/src/analysis/variance.py
@@ -0,0 +1,95 @@
+#!/usr/bin/env python
+
+import biotools.IO as io
+import biotools.analysis.options as options
+from biotools.translate import translate
+from os import sep
+
+
+def SaySNPs(input):
+ '''
+ Takes a clustalw alignment and will return a dictionary of data
+ relevent to plotting the sequence variance for the sequences in the
+ given clustalw alignment. These data are:
+ * `var`: the measure of sequence variation,
+ * `starts`: the starting positions for each gene model in amino acids,
+ * `ends`: the ending positions for each gene model in amino acids, and
+ * `count`: the number of sequences with a particular gene model.
+ The values given in `starts`, `ends`, and `counts` are sorted to that the
+ nth element in starts corresponds to the nth value in ends and the nth
+ value in counts.
+ '''
+
+ catalogue = []
+ lengths = {}
+ for seq in io.open(input, 'r'):
+ key = (seq.start, seq.end)
+ lengths[key] = lengths.get(key, 0) + 1
+ for (i, c) in zip(xrange(key[1] - key[0] + 1), seq):
+ if i >= len(catalogue):
+ catalogue.append({})
+ if c != " ":
+ catalogue[i][c] = catalogue[i].get(c, 0) + 1
+
+ calc = []
+ for s in catalogue:
+ tot = float(sum(s.values()))
+ cnt = float(len(s))
+ calc.append(1.0 - sum((s[c] / tot) ** 2 for c in s))
+ llist = sorted(list(lengths.keys()))
+
+ return {
+ 'var': calc,
+ 'starts': [s for s, e in llist],
+ 'ends': [e for s, e in llist],
+ 'count': [lengths[k] for k in llist]
+ }
+
+
+def var(files):
+ '''
+ Returns plot data and metadata for plotting later on in the pipeline.
+ '''
+ sort = {}
+ for f in files:
+ seqs = [s for s in io.open(f)]
+ type = set(s.type for s in seqs)
+ if len(type) > 1:
+ type = set(['prot'])
+ fid = (type.pop(), f)
+ seqs = [''.join(s.seq.split('-')).strip() for s in seqs]
+ seqs = [translate(s) if fid[0] == 'nucl' else s for s in seqs]
+ sset = frozenset(seqs)
+ srtr = (len(seqs), sset)
+ sort[srtr] = sort.get(srtr, set()) | set([fid])
+
+ couples = []
+ for partners in sort.values():
+ trim = lambda x: '.'.join(x.split('.')[:-1]) \
+ if f.endswith('.clustalw') or \
+ f.endswith('.clustal') or \
+ f.endswith('.aln') else x
+ names = ', '.join(set(trim(f.split(sep)[-1]) for type, f in partners))
+ pair = {}
+ for type, f in partners:
+ if len(pair) == 2:
+ break
+ if type in pair:
+ continue
+ pair[type] = f
+ if 0 < len(pair) < 2:
+ raise TypeError("Unmatched clustal alignment(s): " +
+ ", ".join(f for type, f in partners))
+ if len(pair) == 0:
+ continue
+ couples.append((pair['nucl'], pair['prot'], names))
+
+ for nt, aa, strain in couples:
+ plotdata = {
+ 'nt': SaySNPs(nt),
+ 'aa': SaySNPs(aa)
+ }
+ metadata = {'strain': strain, 'filename': strain + '.pdf'}
+
+ yield {'plotdata': plotdata, 'metadata': metadata}
+ raise StopIteration
diff --git a/src/annotation.py b/src/annotation.py
new file mode 100644
index 0000000..aa950d0
--- /dev/null
+++ b/src/annotation.py
@@ -0,0 +1,112 @@
+'''
+This module is used to create annotation files (currently, only GFF files).
+The annotations can be used to create a heirarchy among the annotations (e.g.,
+genes contain exons, introns, ... etc.).
+'''
+
+
+class Annotation(object):
+ '''
+ An object to help with reading and writing GFF files.
+ '''
+ unknowns = 0
+
+ def __init__(self, ref, src, type, start, end, score, strand, phase,
+ attr, name_token='ID', gff_token='='):
+ '''
+ Constructs an `Annotation` object with the necessary values. The
+ parameters are passed in the same order as the columns from a GFF
+ (version 3) file and the name_token and gff_token parameters are the
+ defaults for a gff version 3 file from phytozome. Just write (e.g.)
+
+ ```python
+ Annotation(*line.split('\\t')) #(splitting on tabs)
+ ```
+
+ and the rest of the work will be done for you. Other sources may
+ require changes to `name_tokens` and `gff_token`.
+
+ Instantiating an `Annotation` will generate for it an id of the form
+ *SEQNAME*_*TYPE*[START:END], where *SEQNAME* is the name of the
+ sequence (column 1) from the GFF file, and type is like 'gene' or
+ 'CDS'. If no *SEQNAME* is provided, then `X` be used in its place, and
+ if no identifier can be found in the attributes, the `Annotation` will
+ generate an identifier for itself in the form of `unknown #`.
+ '''
+
+ def parse_attrs(attr, keyvalsep='=', attrsep=';'):
+ '''
+ Creates a dictionary from the atrributes (9th column) of a gff
+ file. By default, key-value separator (`keyvalsep`) is `=`, which
+ is the separator used in gff version 3.
+
+ In other words, `attr` `"a=b;c=d;"` and `keyvalsep` `=` will
+ yield the dictionary `{'a':'b','c':'d'}`. The other separator
+ (`attrsep`) separates individual attributes and defaults to ';',
+ which is also the norm in GFF files.
+ '''
+
+ attributes = {}
+ if keyvalsep not in attr:
+ keyvalsep = ' '
+ l = len(keyvalsep)
+ attrs = [a.strip() for a in attr.strip().split(attrsep)]
+ for attribute in attrs:
+ pos = attribute.find(keyvalsep)
+ if pos > -1:
+ var, val = attribute[:pos], attribute[pos + l:]
+ attributes[var] = attributes.get(var, []) + [val]
+
+ for key in attributes:
+ attributes[key] = ','.join(attributes[key])
+ return attributes
+
+ start, end = int(start), int(end)
+ self.strand = strand
+ self.type = type
+ self.source = src
+ self.seq = ref
+ self.start = min(start, end)
+ self.end = max(end, start)
+ self.attr = parse_attrs(attr, gff_token)
+ self.phase = phase
+ self.score = score
+ self.ntoken = name_token
+ self.id = ((self.seq or 'X') + '_' + self.type +
+ "[%d:%d]" % (self.start, self.end))
+ try:
+ self.name = self.attr[name_token]
+ except KeyError:
+ Annotation.unknowns += 1
+ self.name = "unknown %d" % Annotation.unknowns
+ self.parent = None
+ self.children = []
+
+ '''
+ Some things that you can do to `Annotation` objects:
+ * `len(annotation)` => length of the annotation (`end-start+1`)
+ * `dictionary[annotation]` => store annotations as keys of a dictionary or
+ as elements in a set
+ * `annA == annB` => compare two Annotations, they are the same if they have
+ the same id.
+ * `print annotation` => prints the annotation as a line of a GFF version 3
+ file.
+ '''
+
+ def __len__(self):
+ return max(self.start, self.end) - min(self.end, self.start) + 1
+
+ def __hash__(self):
+ return self.id.__hash__()
+
+ def __eq__(self, other):
+ try:
+ return self.id == other.id
+ except AttributeError:
+ return False
+
+ def __str__(self):
+ return '\t'.join((self.seq, self.source,
+ self.type, str(self.start), str(self.end), self.score,
+ self.strand, str(self.phase),
+ ';'.join(k + '=' + self.attr[k] for k in self.attr)))
diff --git a/src/clustal.py b/src/clustal.py
new file mode 100644
index 0000000..18be815
--- /dev/null
+++ b/src/clustal.py
@@ -0,0 +1,53 @@
+#!/usr/bin/env python
+import biotools.IO as io
+import subprocess
+from os import remove
+
+
+def run(infile, outfile, **kwargs):
+ n = 0
+ for seq in io.open(infile, 'r'):
+ n += 1
+ if n > 1:
+ seqtype = seq.type
+ break
+
+ if n > 1:
+ cmd = "clustalw"
+
+ try:
+ ignore = open('/dev/null', 'w')
+ except IOError:
+ ignore = open('nul', 'w')
+
+ if seqtype == 'nucl':
+ defaults = {
+ 'OUTORDER': 'ALIGNED',
+ 'GAPOPEN': '10',
+ 'GAPEXT': '0.1',
+ 'DNAMATRIX': 'IUB'
+ }
+ others = ["-%s=%s" % (arg, kwargs.get(arg, defaults[arg]))
+ for arg in set(name.upper() for name in kwargs) &
+ set(defaults.keys())]
+ subprocess.call([cmd, "-INFILE=" + infile, "-ALIGN", "-TYPE=DNA",
+ "-OUTFILE=" + outfile] + others, stdout=ignore)
+ else:
+ defaults = {
+ 'OUTORDER': 'ALIGNED',
+ 'GAPOPEN': '10',
+ 'GAPEXT': '0.1',
+ 'MATRIX': 'BLOSUM'
+ }
+ others = ["-%s=%s" % (arg, kwargs.get(arg, defaults[arg]))
+ for arg in set(name.upper() for name in kwargs) &
+ set(defaults.keys())]
+ subprocess.call([cmd, "-INFILE=" + infile, "-ALIGN",
+ "-TYPE=PROTEIN", "-OUTFILE=" + outfile] + others,
+ stdout=ignore)
+ pos = infile.rfind('.')
+ if pos > -1:
+ prefix = infile[:pos]
+ else:
+ prefix = infile
+ remove(prefix + '.dnd')
diff --git a/src/complement.py b/src/complement.py
new file mode 100644
index 0000000..7a9da09
--- /dev/null
+++ b/src/complement.py
@@ -0,0 +1,49 @@
+#!/usr/bin/env python
+from biotools.sequence import isprot
+
+_ref = {
+ 'DNA': {
+ 'A': 'T', 'T': 'A', 'a': 't', 't': 'a',
+ 'C': 'G', 'G': 'C', 'c': 'g', 'g': 'c',
+ 'R': 'Y', 'Y': 'R', 'r': 'y', 'y': 'r',
+ ' ': ' ', '-': '-'
+ },
+ 'RNA': {
+ 'A': 'U', 'U': 'A', 'a': 'u', 'u': 'a',
+ 'C': 'G', 'G': 'C', 'c': 'g', 'g': 'c',
+ 'R': 'Y', 'Y': 'R', 'r': 'y', 'y': 'r',
+ ' ': ' ', '-': '-'
+ }
+}
+
+
+def complement(s):
+ '''
+ Creates the complement of a sequence, which can then be reversed by using
+ `seq[::-1]`, if it needs to be reversed. This function accepts either
+ `Sequence`s or strings.
+ '''
+
+ if isprot(s):
+ return s
+ has_u = ('U' in s or 'u' in s)
+ has_t = ('T' in s or 't' in s)
+ if has_u and not has_t:
+ repl = _ref['RNA']
+ elif has_t and not has_u:
+ repl = _ref['DNA']
+ else:
+ repl = _ref['DNA']
+ value = ''.join(repl.get(c, 'N') for c in s)
+ try:
+ return s.__class__("complement(%s)" % s.name, value,
+ original=s.original, start=s.start,
+ end=s.end, step=s.step, qual=s.qual)
+ except (AttributeError, TypeError):
+ return s.__class__(value)
+
+
+if __name__ == '__main__':
+ assert complement('ATCGTAGCTGATCGAT') == 'TAGCATCGACTAGCTA'
+ assert complement('AUCGUAGCUGAUCGAU') == 'UAGCAUCGACUAGCUA'
+ print(complement('AUCgu--cuGAUCGAU'))
diff --git a/src/sequence.py b/src/sequence.py
new file mode 100644
index 0000000..f4fcd0a
--- /dev/null
+++ b/src/sequence.py
@@ -0,0 +1,174 @@
+from biotools.annotation import Annotation
+
+
+def chop(seq, length=70):
+ '''
+ Yields a chunk of a sequence of no more than `length` characters,
+ it is meant to be used to print fasta files.
+ '''
+
+ while seq:
+ try:
+ piece, seq = seq[:length], seq[length:]
+ except IndexError:
+ piece, seq = seq, ''
+ yield piece
+ raise StopIteration()
+
+
+def isprot(seq, nucleotides='ATUCGNYRatucgnyr- '):
+ '''
+ Check whether the current sequence is a protein or nucleotide sequence.
+ '''
+
+ for c in seq:
+ if c not in nucleotides:
+ return True
+ else:
+ return False
+
+
+class Sequence(object):
+ '''
+ A wrapper class for sequences.
+ '''
+
+ def __init__(self, name, seq, **kwargs):
+ '''
+ Instantiates a Sequence object with a given sequence. Some other
+ useful parameters that the `Sequence` constructor can handle are:
+ * `qual` => the quality scores (an array of integers) of the
+ sequence,
+ * `type` => the type of the sequence, either prot or nucl,
+ * `start` => the starting position of the sequence within a
+ supersequence,
+ * `end` => the ending position of the sequnece within a
+ supersequence,
+ * `step` => the 'step' of the sequence, usually +1 for top-strand
+ sequences, and -1 for bottom-strand sequences, but can handle
+ other values as well,
+ * `original` => the original `Sequence` object from which this one
+ derives,
+ * `defline` => the definition line for this sequnce from a fasta
+ file.
+ If one of these are not given, they will default to the most logical
+ value that can be determined from the other values and sequence (e.g.,
+ if `end < start`, then `step` is probably -1).
+ '''
+
+ self.name = name
+ self.seq = seq
+ self.qual = kwargs.get('qual', None)
+ if 'type' in kwargs:
+ self.type = kwargs['type']
+ self.start = kwargs.get('start', 1)
+ self.end = kwargs.get('end', self.start - 1 + len(seq))
+ self.step = kwargs.get('step', -1 if self.start > self.end else 1)
+ self.original = kwargs.get('original', self)
+ self.defline = kwargs.get('defline', '')
+
+ def __getattr__(self, attr):
+ if attr == 'type':
+ self.type = 'prot' if isprot(self.seq) else 'nucl'
+ return self.type
+ raise AttributeError('%r object has no attribute %r' %
+ (self.__class__.__name__, attr))
+
+ def __getitem__(self, key):
+ '''
+ sequence[i] or sequence[start:end] or sequence[start:end:step]
+ constructs a new Sequence that is a subsequence as described by the
+ way this function is called. This function will automatically fill in
+ the name, sequence, start, stop, step, original, and type of the
+ subsequence. It also tries to fill in the annotations, but annotations
+ are handled pretty poorly right now, so it's probably best not to
+ worry about those, but it will work if you really want to.
+ '''
+
+ try:
+ start, stop, step = key.indices(len(self.seq))
+ except AttributeError:
+ start, stop, step = key, key + 1, 1
+
+ order = abs(self.step) / self.step
+ r = stop - (stop - start) % step - step
+ seq = ''.join(self.seq[x] for x in xrange(start, stop, step))
+ qual = self.qual and [self.qual[x] for x in xrange(start, stop, step)]
+ info = (self.name, start, stop, step)
+ self.type
+ return Sequence("subsequence(%s, %d, %d, %d)" % info, seq,
+ qual=qual, original=self.original, type=self.type,
+ start=self.start + start * order,
+ end=self.start + r * order,
+ step=step * self.step)
+
+ '''
+ Some other things you can do with a Sequence object:
+ * len(sequence) => gives the length of the sequence.
+ * for character in sequence: => allows you to loop over each character in
+ the sequence.
+ * dictionary[sequence] => allows sequences to be used as keys for
+ dictionaries and allows you to have sequences in sets. This relies on
+ the test seqA == seqB, described next.
+ * seqA == seqB => compare two sequences. The sequences are the same if
+ they have the same sequence AND name. Therefore, two sequences with
+ different names are treated as separate items in a set and separate
+ keys in a dictionary. If you need to match only the sequence, use
+ seqA.seq == seqB.seq.
+ * print sequence => print a fasta / fastq (depending on whether there are
+ any quality scores) representation of the sequence. Sequence objects
+ in any other data structure (e.g., list, dictionary) are printed as
+ (e.g., <Sequence 0x000000>). If you want to change that, you can do:
+ def __repr__(self):
+ return self.__str__()
+ '''
+
+ def upper(self):
+ return Sequence(self.name, self.seq.upper(), type=self.type,
+ qual=self.qual, original=self.original,
+ defline=self.defline, start=self.start, step=self.step,
+ end=self.end)
+
+ def __iter__(self):
+ for c in self.seq:
+ yield c
+ raise StopIteration()
+
+ def __len__(self):
+ return len(self.seq)
+
+ def __hash__(self):
+ return hash(self.seq)
+
+ def __eq__(self, other):
+ try:
+ return self.seq == other.seq and self.name == other.name
+ except AttributeError:
+ return (self.seq == other)
+
+ def __str__(self):
+ if self.qual:
+ return '@%s\n%s\n+\n%s' % (self.name, self.seq,
+ ''.join(chr(ord('A') - 1 + q)
+ for q in self.qual))
+ else:
+ return '>%s %s\n%s' % (self.name, self.defline,
+ '\n'.join(chop(self.seq, 70)))
+
+
+def annotation(seq, source, type, **kwargs):
+ '''
+ Creates an `Annotation` object for the given sequence from a source
+ (e.g., "phytozome7.0") of a particular type (e.g., "gene").
+ '''
+ try:
+ sname = source.name
+ except AttributeError:
+ sname = source
+
+ start, end = min(seq.start, seq.end), max(seq.start, seq.end)
+ strand = '+' if seq.step == 1 else '-'
+ attrs = ';'.join('%s=%s' % (key, str(kwargs[key])) for key in kwargs)
+
+ return Annotation(seq.original.name, sname, type, start, end, '.',
+ strand, start % 3, attrs)
diff --git a/src/translate.py b/src/translate.py
new file mode 100644
index 0000000..4da0062
--- /dev/null
+++ b/src/translate.py
@@ -0,0 +1,57 @@
+_gencode = {
+ 'ATA': 'I', 'ATC': 'I', 'ATT': 'I', 'ATG': 'M',
+ 'AUA': 'I', 'AUC': 'I', 'AUU': 'I', 'AUG': 'M',
+ 'ACA': 'T', 'ACC': 'T', 'ACG': 'T', 'ACT': 'T',
+ 'ACA': 'T', 'ACC': 'T', 'ACG': 'T', 'ACU': 'T',
+ 'AAC': 'N', 'AAT': 'N', 'AAA': 'K', 'AAG': 'K',
+ 'AAC': 'N', 'AAU': 'N', 'AAA': 'K', 'AAG': 'K',
+ 'AGC': 'S', 'AGT': 'S', 'AGA': 'R', 'AGG': 'R',
+ 'AGC': 'S', 'AGU': 'S', 'AGA': 'R', 'AGG': 'R',
+ 'CTA': 'L', 'CTC': 'L', 'CTG': 'L', 'CTT': 'L',
+ 'CUA': 'L', 'CUC': 'L', 'CUG': 'L', 'CUU': 'L',
+ 'CCA': 'P', 'CCC': 'P', 'CCG': 'P', 'CCT': 'P',
+ 'CCA': 'P', 'CCC': 'P', 'CCG': 'P', 'CCU': 'P',
+ 'CAC': 'H', 'CAT': 'H', 'CAA': 'Q', 'CAG': 'Q',
+ 'CAC': 'H', 'CAU': 'H', 'CAA': 'Q', 'CAG': 'Q',
+ 'CGA': 'R', 'CGC': 'R', 'CGG': 'R', 'CGT': 'R',
+ 'CGA': 'R', 'CGC': 'R', 'CGG': 'R', 'CGU': 'R',
+ 'GTA': 'V', 'GTC': 'V', 'GTG': 'V', 'GTT': 'V',
+ 'GUA': 'V', 'GUC': 'V', 'GUG': 'V', 'GUU': 'V',
+ 'GCA': 'A', 'GCC': 'A', 'GCG': 'A', 'GCT': 'A',
+ 'GCA': 'A', 'GCC': 'A', 'GCG': 'A', 'GCU': 'A',
+ 'GAC': 'D', 'GAT': 'D', 'GAA': 'E', 'GAG': 'E',
+ 'GAC': 'D', 'GAU': 'D', 'GAA': 'E', 'GAG': 'E',
+ 'GGA': 'G', 'GGC': 'G', 'GGG': 'G', 'GGT': 'G',
+ 'GGA': 'G', 'GGC': 'G', 'GGG': 'G', 'GGU': 'G',
+ 'TCA': 'S', 'TCC': 'S', 'TCG': 'S', 'TCT': 'S',
+ 'UCA': 'S', 'UCC': 'S', 'UCG': 'S', 'UCU': 'S',
+ 'TTC': 'F', 'TTT': 'F', 'TTA': 'L', 'TTG': 'L',
+ 'UUC': 'F', 'UUU': 'F', 'UUA': 'L', 'UUG': 'L',
+ 'TAC': 'Y', 'TAT': 'Y', 'TAA': '*', 'TAG': '*',
+ 'UAC': 'Y', 'UAU': 'Y', 'UAA': '*', 'UAG': '*',
+ 'TGC': 'C', 'TGT': 'C', 'TGA': '*', 'TGG': 'W',
+ 'UGC': 'C', 'UGU': 'C', 'UGA': '*', 'UGG': 'W'}
+
+
+def translate(sequence):
+ '''
+ Translate a nucleotide using the standard genetic code. The sequence
+ parameter can be either a string or a `Sequence` object. Stop codons are
+ denoted with an asterisk (*).
+ '''
+
+ try:
+ value = ''.join(_gencode.get(sequence.seq[i:i + 3].upper(), 'X')
+ for i in xrange(0, int(len(sequence) / 3) * 3, 3))
+ return sequence.__class__("translate(%s)" % sequence.name, value,
+ original=sequence.original, type='prot',
+ defline=sequence.defline)
+ except AttributeError:
+ value = ''.join(_gencode.get(sequence[i:i + 3].upper(), 'X')
+ for i in xrange(0, int(len(sequence) / 3) * 3, 3))
+ return sequence.__class__(value)
+
+
+if __name__ == '__main__':
+ import sys
+ print(translate(sys.argv[1]))
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/python-biotools.git
More information about the debian-med-commit
mailing list