[med-svn] [python-burrito-fillings] 10/13: New upstream version 0.1.1
Andreas Tille
tille at debian.org
Tue Dec 26 22:22:15 UTC 2017
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository python-burrito-fillings.
commit 119ac5c7f010d061d771d83971619e9d2040b213
Author: Andreas Tille <tille at debian.org>
Date: Tue Dec 26 23:15:53 2017 +0100
New upstream version 0.1.1
---
.gitignore | 39 +
CHANGELOG.md | 11 +
COPYING.txt | 27 +
README.md | 11 +
bfillings/__init__.py | 11 +
bfillings/align.py | 40 +
bfillings/blast.py | 1243 +++++++++++++
bfillings/blat.py | 422 +++++
bfillings/bwa.py | 762 ++++++++
bfillings/cd_hit.py | 343 ++++
bfillings/clearcut.py | 401 ++++
bfillings/clustalw.py | 724 ++++++++
bfillings/denoiser.py | 25 +
bfillings/fastq_join.py | 229 +++
bfillings/fasttree.py | 162 ++
bfillings/fasttree_v1.py | 145 ++
bfillings/formatdb.py | 239 +++
bfillings/infernal.py | 1571 ++++++++++++++++
bfillings/mafft.py | 470 +++++
bfillings/mothur.py | 589 ++++++
bfillings/muscle_v38.py | 777 ++++++++
bfillings/parsinsert.py | 92 +
bfillings/pplacer.py | 201 ++
bfillings/raxml_v730.py | 875 +++++++++
bfillings/rdp_classifier.py | 589 ++++++
bfillings/rtax.py | 293 +++
bfillings/seqprep.py | 351 ++++
bfillings/sortmerna_v2.py | 544 ++++++
bfillings/sumaclust_v1.py | 173 ++
bfillings/swarm_v127.py | 299 +++
bfillings/tests/__init__.py | 9 +
bfillings/tests/test_blast.py | 256 +++
bfillings/tests/test_blat.py | 346 ++++
bfillings/tests/test_bwa.py | 319 ++++
bfillings/tests/test_cd_hit.py | 214 +++
bfillings/tests/test_clearcut.py | 255 +++
bfillings/tests/test_clustalw.py | 627 +++++++
bfillings/tests/test_fasttree.py | 182 ++
bfillings/tests/test_fasttree_v1.py | 174 ++
bfillings/tests/test_formatdb.py | 233 +++
bfillings/tests/test_infernal.py | 620 +++++++
bfillings/tests/test_mafft.py | 132 ++
bfillings/tests/test_mothur.py | 315 ++++
bfillings/tests/test_muscle_v38.py | 286 +++
bfillings/tests/test_parsinsert.py | 138 ++
bfillings/tests/test_pplacer.py | 254 +++
bfillings/tests/test_raxml_v730.py | 236 +++
bfillings/tests/test_rdp_classifier.py | 398 ++++
bfillings/tests/test_rtax.py | 228 +++
bfillings/tests/test_sortmerna_v2.py | 855 +++++++++
bfillings/tests/test_sumaclust_v1.py | 259 +++
bfillings/tests/test_swarm_v127.py | 190 ++
bfillings/tests/test_uclust.py | 758 ++++++++
bfillings/tests/test_usearch.py | 2000 ++++++++++++++++++++
bfillings/tests/test_vsearch.py | 1686 +++++++++++++++++
bfillings/uclust.py | 606 ++++++
bfillings/usearch.py | 2547 ++++++++++++++++++++++++++
bfillings/vsearch.py | 575 ++++++
debian/changelog | 35 -
debian/compat | 1 -
debian/control | 67 -
debian/copyright | 35 -
debian/patches/cd_hit_leaves_no_bak_file | 32 -
debian/patches/handle_renamed_binaries | 168 --
debian/patches/mothur_skip_list_header | 63 -
debian/patches/no_set_blastmat | 12 -
debian/patches/rdp_classifier_2.10 | 106 --
debian/patches/series | 7 -
debian/patches/test_raxml_accept_new_version | 32 -
debian/patches/test_usearch_known_failures | 39 -
debian/rules | 47 -
debian/source/format | 1 -
debian/watch | 3 -
setup.py | 46 +
74 files changed, 26402 insertions(+), 648 deletions(-)
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..d7ef01f
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,39 @@
+*.py[cod]
+
+# C extensions
+*.so
+
+# Packages
+*.egg
+*.egg-info
+dist
+build
+eggs
+parts
+bin
+var
+sdist
+develop-eggs
+.installed.cfg
+lib
+lib64
+__pycache__
+
+# Installer logs
+pip-log.txt
+
+# Unit test / coverage reports
+.coverage
+.tox
+nosetests.xml
+
+# Translations
+*.mo
+
+# Mr Developer
+.mr.developer.cfg
+.project
+.pydevproject
+
+# vi
+.*.swp
diff --git a/CHANGELOG.md b/CHANGELOG.md
new file mode 100644
index 0000000..8203cf1
--- /dev/null
+++ b/CHANGELOG.md
@@ -0,0 +1,11 @@
+# burrito-fillings changelog
+
+## Version 0.1.1 (2015-05-22)
+
+* Updated handling of temporary files to make better use of python ``tempfile.gettempdir()`` for some of the most widely used burrito fillings ([#61](https://github.com/biocore/burrito-fillings/pull/61), [#64](https://github.com/biocore/burrito-fillings/pull/64)).
+* Fixed bug where swarm wrapper would silently ignore ``swarm`` failures ([#67](https://github.com/biocore/burrito-fillings/pull/67), [biocore/qiime#2014](https://github.com/biocore/qiime/issues/2014)).
+* Added ``__version__`` to ``bfillings/__init__.py`` so that other python packages have access to the version number ([#54](https://github.com/biocore/burrito-fillings/issues/54)).
+
+## Version 0.1.0 (2014-11-12)
+
+Initial release.
diff --git a/COPYING.txt b/COPYING.txt
new file mode 100644
index 0000000..b6785a9
--- /dev/null
+++ b/COPYING.txt
@@ -0,0 +1,27 @@
+Copyright (c) 2013--, biocore development team.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice, this
+ list of conditions and the following disclaimer in the documentation and/or
+ other materials provided with the distribution.
+
+* Neither the name of the {organization} nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..ffdd95f
--- /dev/null
+++ b/README.md
@@ -0,0 +1,11 @@
+burrito-fillings
+================
+
+burrito-fillings (canonically pronounced *boar-ee-toe phil-ings*; python package name ``bfillings``) contains [burrito](https://github.com/biocore/burrito) [CommandLineApplication](https://github.com/biocore/burrito/blob/master/burrito/util.py#L161) subclasses (i.e., *application controllers*) for bioinformatics applications. This is intended to be a temporary package for the application controllers that are used in QIIME as we figure out which of these we will continue to support.
+
+**Note:** burrito fillings is currently under active development and its API is not stable. Major compatibility-breaking API changes will likely happen as development progresses.
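+
+For example, once the package is installed, the version is exposed at the top
+level (a minimal sketch):
+
+    >>> import bfillings
+    >>> bfillings.__version__
+    '0.1.1'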
+
+The pre-history of burrito-fillings
+-----------------------------------
+
+burrito-fillings derives from code in [PyCogent](http://www.pycogent.org) and [QIIME](http://www.qiime.org), and the contributors and/or copyright holders have agreed to make the code they wrote for PyCogent and/or QIIME available under the BSD license. The contributors to PyCogent and/or QIIME modules that have been ported to bfillings are: Rob Knight (@rob-knight), Gavin Huttley (@gavin-huttley), Daniel McDonald (@wasade), Micah Hamady, Antonio Gonzalez (@antgonza), Sandra Smit, Greg C [...]
diff --git a/bfillings/__init__.py b/bfillings/__init__.py
new file mode 100644
index 0000000..0510013
--- /dev/null
+++ b/bfillings/__init__.py
@@ -0,0 +1,11 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+
+__version__ = '0.1.1'
diff --git a/bfillings/align.py b/bfillings/align.py
new file mode 100644
index 0000000..3a1d448
--- /dev/null
+++ b/bfillings/align.py
@@ -0,0 +1,40 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2014--, biocore development team
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+# ----------------------------------------------------------------------------
+
+from cogent import DNA as DNA_cogent, LoadSeqs
+from cogent.align.align import make_dna_scoring_dict, local_pairwise
+
+def pair_hmm_align_unaligned_seqs(seqs, moltype=DNA_cogent, params={}):
+ """
+ Checks parameters for pairwise alignment, returns alignment.
+
+ Code from Greg Caporaso.
+ """
+
+ seqs = LoadSeqs(data=seqs, moltype=moltype, aligned=False)
+ try:
+ s1, s2 = seqs.values()
+ except ValueError:
+ raise ValueError(
+ "Pairwise aligning of seqs requires exactly two seqs.")
+
+ try:
+ gap_open = params['gap_open']
+ except KeyError:
+ gap_open = 5
+ try:
+ gap_extend = params['gap_extend']
+ except KeyError:
+ gap_extend = 2
+ try:
+ score_matrix = params['score_matrix']
+ except KeyError:
+ score_matrix = make_dna_scoring_dict(
+ match=1, transition=-1, transversion=-1)
+
+ return local_pairwise(s1, s2, score_matrix, gap_open, gap_extend)
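+
+# A minimal usage sketch (hypothetical sequences; PyCogent must be installed):
+#
+#     seqs = {'seq1': 'ATGCTAGGAC', 'seq2': 'ATGCAAGGAC'}
+#     aln = pair_hmm_align_unaligned_seqs(seqs)
+#
+# gap_open, gap_extend, and score_matrix may be overridden through params,
+# e.g. params={'gap_open': 10}.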
diff --git a/bfillings/blast.py b/bfillings/blast.py
new file mode 100644
index 0000000..3030a40
--- /dev/null
+++ b/bfillings/blast.py
@@ -0,0 +1,1243 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+
+from string import strip
+from os import remove, access, F_OK, environ, path
+from random import choice
+from copy import copy
+import tempfile
+
+from burrito.parameters import FlagParameter, ValuedParameter, MixedParameter
+from burrito.util import (CommandLineApplication, ResultPath,
+ get_tmp_filename, guess_input_handler,
+ ApplicationNotFoundError)
+from skbio.parse.sequences.fasta import FastaFinder, is_fasta_label
+from skbio.parse.record_finder import LabeledRecordFinder
+
+from cogent.parse.blast import (LastProteinIds9, QMEBlast9, QMEPsiBlast9,
+ BlastResult)
+from cogent.util.misc import app_path
+
+
+class Blast(CommandLineApplication):
+ """BLAST generic application controller"""
+
+ _common_options ={
+ # defaults to non-redundant database
+ #WARNING: This will only work if BLASTDB environment variable is set
+ '-d':ValuedParameter('-',Name='d',Delimiter=' ', Value="nr"),
+
+ # query file
+ '-i':ValuedParameter('-',Name='i',Delimiter=' '),
+
+ # Multiple Hits window size [Integer]
+ '-A':ValuedParameter('-',Name='A',Delimiter=' '),
+
+ # Threshold for extending hits [Integer]
+ '-f':ValuedParameter('-',Name='f',Delimiter=' '),
+
+ # Expectation value (E) [Real]
+ '-e':ValuedParameter('-',Name='e',Delimiter=' ', Value="10.0"),
+
+ # alignment view options:
+ # 0 = pairwise,
+ # 1 = query-anchored showing identities,
+ # 2 = query-anchored no identities,
+ # 3 = flat query-anchored, show identities,
+ # 4 = flat query-anchored, no identities,
+ # 5 = query-anchored no identities and blunt ends,
+ # 6 = flat query-anchored, no identities and blunt ends,
+ # 7 = XML Blast output,
+ # 8 = Tabular output,
+ # 9 = Tabular output with comments
+ # 10 = ASN, text
+ # 11 = ASN, binary [Integer]
+ '-m':ValuedParameter('-',Name='m',Delimiter=' ', Value="9"),
+
+ # Output File for Alignment [File Out] Optional
+ '-o':ValuedParameter('-',Name='o',Delimiter=' '),
+
+ # Filter query sequence with SEG [String]
+ '-F':ValuedParameter('-',Name='F',Delimiter=' '),
+
+ # Cost to open a gap [Integer]
+ '-G':ValuedParameter('-',Name='G',Delimiter=' '),
+
+ # Cost to extend a gap [Integer]
+ '-E':ValuedParameter('-',Name='E',Delimiter=' '),
+
+ # X dropoff value for gapped alignment (in bits) [Integer]
+ # blastn 30, megablast 20, tblastx 0, all others 15 [Integer]
+ '-X':ValuedParameter('-',Name='X',Delimiter=' '),
+
+ # Show GI's in deflines [T/F]
+ '-I':ValuedParameter('-',Name='I',Delimiter=' '),
+
+    # Number of database seqs to show one-line descriptions for [Integer]
+ '-v':ValuedParameter('-',Name='v',Delimiter=' '),
+
+    # Number of database sequences to show alignments for (B) [Integer]
+ '-b':ValuedParameter('-',Name='b',Delimiter=' '),
+
+ # Perform gapped alignment (not available with tblastx) [T/F]
+ '-g':ValuedParameter('-',Name='g',Delimiter=' '),
+
+ # Number of processors to use [Integer]
+ '-a':ValuedParameter('-',Name='a',Delimiter=' ', Value="1"),
+
+ # Believe the query defline [T/F]
+ '-J':ValuedParameter('-',Name='J',Delimiter=' '),
+
+ # SeqAlign file ('Believe the query defline' must be TRUE) [File Out]
+ # Optional
+ '-O':ValuedParameter('-',Name='O',Delimiter=' '),
+
+ # Matrix [String]
+ '-M':ValuedParameter('-',Name='M',Delimiter=' ', Value="BLOSUM62"),
+
+ # Word size [Integer] (blastn 11, megablast 28, all others 3)
+ '-W':ValuedParameter('-',Name='W',Delimiter=' '),
+
+ # Effective length of the database (use zero for the real size) [Real]
+ '-z':ValuedParameter('-',Name='z',Delimiter=' '),
+
+ # Number of best hits from a region to keep [Integer]
+ '-K':ValuedParameter('-',Name='K',Delimiter=' '),
+
+ # 0 for multiple hit, 1 for single hit [Integer]
+ '-P':ValuedParameter('-',Name='P',Delimiter=' '),
+
+ # Effective length of the search space (use zero for real size) [Real]
+ '-Y':ValuedParameter('-',Name='Y',Delimiter=' '),
+
+ # Produce HTML output [T/F]
+ '-T':ValuedParameter('-',Name='T',Delimiter=' ', Value="F"),
+
+ # Restrict search of database to list of GI's [String] Optional
+ '-l':ValuedParameter('-',Name='l',Delimiter=' '),
+
+ # Use lower case filtering of FASTA sequence [T/F] Optional
+ '-U':ValuedParameter('-',Name='U',Delimiter=' '),
+
+ # Dropoff (X) for blast extensions in bits (default if zero) [Real]
+ # blastn 20, megablast 10, all others 7
+ '-y':ValuedParameter('-',Name='y',Delimiter=' '),
+
+ # X dropoff value for final gapped alignment (in bits) [Integer]
+ # blastn/megablast 50, tblastx 0, all others 25
+ '-Z':ValuedParameter('-',Name='Z',Delimiter=' '),
+
+ # Input File for PSI-BLAST Restart [File In] Optional
+ '-R':ValuedParameter('-',Name='R',Delimiter=' '),
+
+ }
+
+ _executable = 'blastall'
+
+ _parameters = {}
+ _parameters.update(_common_options)
+
+ def __init__(self, cur_options, command, blast_mat_root=None,
+ extra_env="",
+ params=None,InputHandler=None,
+ SuppressStderr=None, SuppressStdout=None,WorkingDir=None,\
+ HALT_EXEC=False):
+ """ Initialize blast """
+ # update options
+ self._parameters.update(cur_options)
+
+ # check if need to set env variable (for cgi calls)
+ if blast_mat_root:
+ self._command = "export BLASTMAT=%s;%s%s" % (blast_mat_root,
+ extra_env, command)
+ else:
+ # Determine if blast is installed and raise an ApplicationError
+ # if not -- this is done here so the user will get the most
+ # informative error message available.
+ self._error_on_missing_application(params)
+
+ # Otherwise raise error about $BLASTMAT not being set
+ if not ('BLASTMAT' in environ or \
+ access(path.expanduser("~/.ncbirc"), F_OK) or \
+ access(".ncbirc", F_OK)):
+ ## SHOULD THIS BE CHANGED TO RAISE AN ApplicationError?
+ raise RuntimeError, blastmat_error_message
+ self._command = command
+
+ super(Blast, self).__init__(params=params,
+ InputHandler=InputHandler,SuppressStderr=SuppressStderr,
+ SuppressStdout=SuppressStdout,WorkingDir=WorkingDir,\
+ HALT_EXEC=HALT_EXEC)
+
+ def _error_on_missing_application(self,params):
+ """ Raise an ApplicationNotFoundError if the app is not accessible
+ """
+ if not app_path('blastall'):
+ raise ApplicationNotFoundError,\
+ "Cannot find blastall. Is it installed? Is it in your path?"
+
+ def _input_as_seqs(self,data):
+ lines = []
+ for i,s in enumerate(data):
+ #will number the sequences 1,2,3,etc.
+ lines.append(''.join(['>',str(i+1)]))
+ lines.append(s)
+ return self._input_as_lines(lines)
+
+ def _input_as_seq_id_seq_pairs(self,data):
+ lines = []
+ for seq_id,seq in data:
+ lines.append(''.join(['>',str(seq_id)]))
+ lines.append(seq)
+ return self._input_as_lines(lines)
+
+ def _input_as_lines(self,data):
+ if data:
+ self.Parameters['-i']\
+ .on(super(Blast,self)._input_as_lines(data))
+
+ return ''
+
+ def _input_as_string(self,data):
+ """Makes data the value of a specific parameter
+
+ This method returns the empty string. The parameter will be printed
+ automatically once set.
+ """
+ if data:
+ self.Parameters['-i'].on(str(data))
+ return ''
+
+ def _input_as_multiline_string(self, data):
+ if data:
+ self.Parameters['-i']\
+ .on(super(Blast,self)._input_as_multiline_string(data))
+ return ''
+
+ def _align_out_filename(self):
+
+ if self.Parameters['-o'].isOn():
+ aln_filename = self._absolute(str(self.Parameters['-o'].Value))
+ else:
+ raise ValueError, "No output file specified."
+ return aln_filename
+
+ def _get_result_paths(self,data):
+
+ result = {}
+ if self.Parameters['-o'].isOn():
+ out_name = self._align_out_filename()
+ result['BlastOut'] = ResultPath(Path=out_name,IsWritten=True)
+ return result
+
+blastmat_error_message =\
+"""BLAST cannot run if the BLASTMAT environment variable is not set.
+
+Usually, the BLASTMAT environment variable points to the NCBI data directory,
+which contains matrices like PAM30 and PAM70, etc.
+
+Alternatively, you may create a .ncbirc file to define these variables.
+
+From help file:
+
+2) Create a .ncbirc file. In order for Standalone BLAST to operate, you
+will need to have a .ncbirc file that contains the following lines:
+
+[NCBI]
+Data="path/data/"
+
+Where "path/data/" is the path to the location of the Standalone BLAST
+"data" subdirectory. For Example:
+
+Data=/root/blast/data
+
+The data subdirectory should automatically appear in the directory where
+the downloaded file was extracted. Please note that in many cases it may
+be necessary to delimit the entire path, including the machine name and/or
+the network you are located on. Your systems administrator can help
+you if you do not know the entire path to the data subdirectory.
+
+Make sure that your .ncbirc file is either in the directory that you
+call the Standalone BLAST program from or in your root directory.
+"""
+
+class PsiBlast(Blast):
+ """PSI-BLAST application controller - Prototype"""
+ _options ={
+
+ # ASN.1 Scoremat input of checkpoint data:
+ # 0: no scoremat input
+ # 1: Restart is from ASCII scoremat checkpoint file,
+ # 2: Restart is from binary scoremat checkpoint file [Integer] Optional
+ '-q':ValuedParameter('-',Name='q',Delimiter=' '),
+
+ # Output File for PSI-BLAST Matrix in ASCII [File Out] Optional
+ '-Q':ValuedParameter('-',Name='Q',Delimiter=' '),
+
+ # Start of required region in query [Integer]
+ '-S':ValuedParameter('-',Name='S',Delimiter=' ', Value="1"),
+
+ # ASN.1 Scoremat output of checkpoint data:
+ # 0: no scoremat output
+ # 1: Output is ASCII scoremat checkpoint file (requires -J),
+ # 2: Output is binary scoremat checkpoint file (requires -J) Optional
+ '-u':ValuedParameter('-',Name='u',Delimiter=' '),
+
+ # Cost to decline alignment (disabled when 0) [Integer]
+ '-L':ValuedParameter('-',Name='L',Delimiter=' ', Value="0"),
+
+ # program option for PHI-BLAST [String]
+ '-p':ValuedParameter('-',Name='p',Delimiter=' ', Value="blastpgp"),
+
+ # Use composition based statistics [T/F]
+ '-t':ValuedParameter('-',Name='t',Delimiter=' ', Value="T"),
+
+ # Input Alignment File for PSI-BLAST Restart [File In] Optional
+ '-B':ValuedParameter('-',Name='B',Delimiter=' '),
+
+ # Number of bits to trigger gapping [Real]
+ '-N':ValuedParameter('-',Name='N',Delimiter=' ', Value="22.0"),
+
+ # End of required region in query (-1 indicates end of query) [Integer]
+ '-H':ValuedParameter('-',Name='H',Delimiter=' ', Value="-1"),
+
+ # e-value threshold for inclusion in multipass model [Real]
+ '-h':ValuedParameter('-',Name='h',Delimiter=' ', Value="0.001"),
+
+ # Constant in pseudocounts for multipass version [Integer]
+ '-c':ValuedParameter('-',Name='c',Delimiter=' ', Value="9"),
+
+ # Maximum number of passes to use in multipass version [Integer]
+ '-j':ValuedParameter('-',Name='j',Delimiter=' ', Value="1"),
+
+ # Output File for PSI-BLAST Checkpointing [File Out] Optional
+ '-C':ValuedParameter('-',Name='C',Delimiter=' '),
+
+ # Compute locally optimal Smith-Waterman alignments [T/F]
+ '-s':ValuedParameter('-',Name='s',Delimiter=' ', Value="F"),
+
+ # Hit File for PHI-BLAST [File In]
+ '-k':ValuedParameter('-',Name='k',Delimiter=' '),
+
+ }
+
+ def __init__(self, blast_mat_root=None, params=None,
+ extra_env="",
+ InputHandler=None,SuppressStderr=None,
+ SuppressStdout=None,WorkingDir=None,
+ HALT_EXEC=False):
+ """ Initialize the Psi-Blast"""
+ super(PsiBlast, self).__init__(self._options,
+ "blastpgp",
+ extra_env=extra_env,
+ blast_mat_root=blast_mat_root,
+ params=params,
+ InputHandler=InputHandler,SuppressStderr=SuppressStderr,
+ SuppressStdout=SuppressStdout,WorkingDir=WorkingDir,
+ HALT_EXEC=HALT_EXEC)
+
+
+# should probably go into blastall superclass. it's late, works for now
+BLASTALL_OPTIONS ={
+ # Use lower case filtering of FASTA sequence [T/F] Optional
+ '-U':ValuedParameter('-',Name='U',Delimiter=' '),
+
+ # Penalty for a nucleotide mismatch (blastn only) [Integer]
+ # default = -3
+ '-q':ValuedParameter('-',Name='q',Delimiter=' '),
+
+ # Reward for a nucleotide match (blastn only) [Integer]
+ '-r':ValuedParameter('-',Name='r',Delimiter=' '),
+
+ # Query Genetic code to use [Integer] default = 1
+ '-Q':ValuedParameter('-',Name='Q',Delimiter=' '),
+
+ # DB Genetic code (for tblast[nx] only) [Integer]
+ '-D':ValuedParameter('-',Name='D',Delimiter=' '),
+
+ # Query strands to search against database (for blast[nx], and tblastx)
+ # 3 is both, 1 is top, 2 is bottom [Integer]
+ '-S':ValuedParameter('-',Name='S',Delimiter=' '),
+
+ # Program Name
+ '-p':ValuedParameter('-',Name='p',Delimiter=' '),
+
+ # MegaBlast search [T/F]
+ '-n':ValuedParameter('-',Name='n',Delimiter=' '),
+
+    # Location on query sequence [String] Optional
+ '-L':ValuedParameter('-',Name='L',Delimiter=' '),
+
+ # Frame shift penalty (OOF algorithm for blastx) [Integer]
+ '-w':ValuedParameter('-',Name='w',Delimiter=' '),
+
+ # Length of the largest intron allowed in tblastn for linking HSPs
+ #(0 disables linking) [Integer]
+ '-t':ValuedParameter('-',Name='t',Delimiter=' '),
+
+ # Number of concatenated queries, for blastn and tblastn [Integer]
+ '-B':ValuedParameter('-',Name='B',Delimiter=' '),
+ }
+
+
+class Blastall(Blast):
+ """blastall application controller - Prototype """
+
+ def __init__(self, blast_mat_root=None, params=None,
+ extra_env="",
+ InputHandler=None,SuppressStderr=None,
+ SuppressStdout=None,WorkingDir=None,
+ HALT_EXEC=False):
+ """ Initialize the blastall"""
+ super(Blastall, self).__init__(BLASTALL_OPTIONS,
+ "blastall",
+ blast_mat_root=blast_mat_root,
+ extra_env=extra_env,
+ params=params,
+ InputHandler=InputHandler,SuppressStderr=SuppressStderr,
+ SuppressStdout=SuppressStdout,WorkingDir=WorkingDir,
+ HALT_EXEC=HALT_EXEC)
+class MpiBlast(Blast):
+ """mpblast application controller - Prototype """
+
+ _mpi_options ={
+ # Produces verbose debugging output for each node, optionally logs the
+ # output to a file
+ '--debug':ValuedParameter('-',Name='--debug',Delimiter='='),
+
+ # Set the scheduler process' MPI Rank (default is 1). Because the
+ # scheduler uses very little CPU it can be useful to force the
+ # scheduler to run on the same physical machine as the writer (rank 0).
+ '--scheduler-rank':ValuedParameter('-',Name='--scheduler-rank',
+ Delimiter='='),
+
+    # Print the Altschul et al. 1997 paper reference instead of the
+ # mpiBLAST paper reference. With this option mpiblast output is nearly
+ # identical to NCBI-BLAST output.
+ '--altschul-reference':FlagParameter(Prefix='--',
+ Name='altschul-reference'),
+
+ #Removes the local copy of the database from each node before
+ # terminating execution
+ '--removedb':FlagParameter(Prefix='--', Name='removedb'),
+
+ # Sets the method of copying files that each worker will use.
+ # Default = "cp"
+ # * cp : use standard file system "cp" command.
+ # Additional option is --concurrent.
+    # * rcp : use rsh "rcp" command. Additional option is --concurrent.
+ # * scp : use ssh "scp" command. Additional option is --concurrent.
+ # * mpi : use MPI_Send/MPI_Recv to copy files.
+ # Additional option is --mpi-size.
+    # * none : do not copy files, instead use shared storage as local storage
+ '--copy-via':ValuedParameter('-',Name='--copy-via', Delimiter='='),
+
+
+ # set the number of concurrent accesses to shared storage. Default = 1
+ '--concurrent':ValuedParameter('-',Name='--concurrent', Delimiter='='),
+
+
+ # in bytes, set the maximum buffer size that MPI will use to send data
+ # when transferring files. Default = 65536
+ '--mpi-size':ValuedParameter('-',Name='--mpi-size', Delimiter='='),
+
+
+ # set whether file locking should be used to manage local fragment
+ # lists. Defaults to off. When --concurrency > 1 defaults to on
+ # [on|off]
+ '--lock':ValuedParameter('-',Name='--lock', Delimiter='='),
+
+ # When set, the writer will use the database on shared storage for
+ # sequence lookup. Can drastically reduce overhead for some blastn
+ # searches.
+ '--disable-mpi-db':FlagParameter(Prefix='--', Name='disable-mpi-db'),
+
+ # Under unix, sets the nice value for each mpiblast process.
+ '--nice':ValuedParameter('-',Name='--nice', Delimiter='='),
+
+    # Path of the mpiblast configuration file to use.
+ '--config-file':ValuedParameter('--',Name='config-file', Delimiter='='),
+
+
+ # Experimental. When set, mpiblast will read the output file and
+ # attempt to continue a previously aborted run where it left off
+ '--resume-run':FlagParameter(Prefix='--', Name='resume-run'),
+
+ # print the mpiBLAST version
+ '--version':FlagParameter(Prefix='--', Name='version'),
+ }
+
+ _mpi_options.update(BLASTALL_OPTIONS)
+
+ def __init__(self, blast_mat_root=None, params=None,
+ mpiblast_root="/usr/local/bin/",
+ local_root="/var/scratch/mpiblastdata/",
+ shared_root="/quicksand/hamady/data/blast/mpidb/",
+ config_file="/quicksand2/downloads2/mpiblast/mpiblast.conf",
+ num_db_frags=40,
+ InputHandler=None,SuppressStderr=None,
+ SuppressStdout=None,WorkingDir=None,
+ HALT_EXEC=False):
+ """ Initialize mpiblast"""
+ if config_file:
+ params["--config-file"] = config_file
+ super(MpiBlast, self).__init__(self._mpi_options,
+ "mpirun -np %d %smpiblast" % ((num_db_frags + 2),
+ mpiblast_root),
+ blast_mat_root=blast_mat_root,
+ extra_env="export Local=%s; export Shared=%s;" %(local_root,
+ shared_root),
+ params=params,
+ InputHandler=InputHandler,SuppressStderr=SuppressStderr,
+ SuppressStdout=SuppressStdout,WorkingDir=WorkingDir,
+ HALT_EXEC=HALT_EXEC)
+
+class FastaCmd(CommandLineApplication):
+ """FastaCmd application controller - Prototype"""
+
+ _options ={
+ # Database [String] Optional
+ '-d':ValuedParameter('-',Name='d',Delimiter=' '),
+
+ # Type of file
+ # G - guess mode (look for protein, then nucleotide)
+ # T - protein
+ # F - nucleotide [String] Optional
+ '-p':ValuedParameter('-',Name='p',Delimiter=' ', Value="G"),
+
+    # Search string: GIs, accessions, and loci may be used, comma-delimited
+ '-s':ValuedParameter('-',Name='s',Delimiter=' '),
+
+    # Input file with GIs/accessions/loci for batch retrieval Optional
+ '-i':ValuedParameter('-',Name='i',Delimiter=' '),
+
+ # Retrieve duplicate accessions [T/F] Optional
+ '-a':ValuedParameter('-',Name='a',Delimiter=' ', Value='F'),
+
+ # Line length for sequence [Integer] Optional
+ '-l':ValuedParameter('-',Name='l',Delimiter=' '),
+
+ # Definition line should contain target gi only [T/F] Optional
+ '-t':ValuedParameter('-',Name='t',Delimiter=' '),
+
+ # Output file [File Out] Optional
+ '-o':ValuedParameter('-',Name='o',Delimiter=' '),
+
+ # Use Ctrl-A's as non-redundant defline separator [T/F] Optional
+ '-c':ValuedParameter('-',Name='c',Delimiter=' '),
+
+ # Dump the entire database in fasta format [T/F] Optional
+ '-D':ValuedParameter('-',Name='D',Delimiter=' '),
+
+ # Range of sequence to extract (Format: start,stop)
+ # 0 in 'start' refers to the beginning of the sequence
+ # 0 in 'stop' refers to the end of the sequence [String] Optional
+ '-L':ValuedParameter('-',Name='L',Delimiter=' '),
+
+ # Strand on subsequence (nucleotide only): 1 is top, 2 is bottom [Int]
+ '-S':ValuedParameter('-',Name='S',Delimiter=' '),
+
+ # Print taxonomic information for requested sequence(s) [T/F]
+ '-T':ValuedParameter('-',Name='T',Delimiter=' '),
+
+ # Print database information only (overrides all other options) [T/F]
+ '-I':ValuedParameter('-',Name='I',Delimiter=' '),
+
+ # Retrieve sequences with this PIG [Integer] Optional
+ '-P':ValuedParameter('-',Name='P',Delimiter=' '),
+
+ }
+ _parameters = {}
+ _parameters.update(_options)
+ _command = 'fastacmd'
+
+ def _input_as_lines(self,data):
+ if data:
+ self.Parameters['-i']\
+ .on(super(FastaCmd,self)._input_as_lines(data))
+ return ''
+
+ def _input_as_seqs(self,data):
+ lines = []
+ for i,s in enumerate(data):
+ #will number the sequences 1,2,3,etc.
+ lines.append(''.join(['>',str(i+1)]))
+ lines.append(s)
+ return self._input_as_lines(lines)
+
+ def _input_as_string(self,data):
+ """Makes data the value of a specific parameter
+
+ This method returns the empty string. The parameter will be printed
+ automatically once set.
+ """
+ if data:
+ self.Parameters['-s'].on(data)
+ return ''
+
+ def _out_filename(self):
+
+ if self.Parameters['-o'].isOn():
+ aln_filename = self._absolute(str(self.Parameters['-o'].Value))
+ else:
+ raise ValueError, "No output file specified."
+ return aln_filename
+
+ def _get_result_paths(self,data):
+
+ result = {}
+ if self.Parameters['-o'].isOn():
+ out_name = self._out_filename()
+ result['FastaOut'] = ResultPath(Path=out_name,IsWritten=True)
+ return result
+
+def seqs_to_stream(seqs, ih):
+ """Converts seqs into stream of FASTA records, depending on input handler.
+
+ Each FASTA record will be a list of lines.
+ """
+ if ih == '_input_as_multiline_string':
+ recs = FastaFinder(seqs.split('\n'))
+ elif ih == '_input_as_string':
+ recs = FastaFinder(open(seqs))
+ elif ih == '_input_as_seqs':
+ recs = [['>'+str(i), s] for i, s in enumerate(seqs)]
+ elif ih == '_input_as_lines':
+ recs = FastaFinder(seqs)
+ else:
+ raise TypeError, "Unknown input handler %s" % ih
+ return recs
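+
+# For example (a sketch), with ih='_input_as_seqs' and seqs=['ACGT', 'GGCC'],
+# the resulting stream is [['>0', 'ACGT'], ['>1', 'GGCC']].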
+
+#SOME FUNCTIONS TO EXECUTE THE MOST COMMON TASKS
+def blast_seqs(seqs,
+ blast_constructor,
+ blast_db=None,
+ blast_mat_root=None,
+ params={},
+ add_seq_names=True,
+ out_filename=None,
+ WorkingDir=None,
+ SuppressStderr=None,
+ SuppressStdout=None,
+ input_handler=None,
+ HALT_EXEC=False
+ ):
+ """Blast list of sequences.
+
+ seqs: either file name or list of sequence objects or list of strings or
+ single multiline string containing sequences.
+
+ WARNING: DECISION RULES FOR INPUT HANDLING HAVE CHANGED. Decision rules
+    for data are as follows. If it's a list, treat as lines, unless
+ add_seq_names is true (in which case treat as list of seqs). If it's a
+ string, test whether it has newlines. If it doesn't have newlines, assume
+ it's a filename. If it does have newlines, it can't be a filename, so
+ assume it's a multiline string containing sequences.
+
+ If you want to skip the detection and force a specific type of input
+ handler, use input_handler='your_favorite_handler'.
+
+ add_seq_names: boolean. if True, sequence names are inserted in the list
+ of sequences. if False, it assumes seqs is a list of lines of some
+ proper format that the program can handle
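+
+    A usage sketch (hypothetical file and database names; blastall must be
+    installed and the database formatted with formatdb):
+
+        res = blast_seqs('queries.fasta', Blastall, blast_db='eco',
+                         params={'-p': 'blastp', '-m': '9'})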
+ """
+
+ # set num keep
+
+ if blast_db:
+ params["-d"] = blast_db
+
+ if out_filename:
+ params["-o"] = out_filename
+
+ ih = input_handler or guess_input_handler(seqs, add_seq_names)
+
+ blast_app = blast_constructor(
+ params=params,
+ blast_mat_root=blast_mat_root,
+ InputHandler=ih,
+ WorkingDir=WorkingDir,
+ SuppressStderr=SuppressStderr,
+ SuppressStdout=SuppressStdout,
+ HALT_EXEC=HALT_EXEC)
+
+ return blast_app(seqs)
+
+
+def fasta_cmd_get_seqs(acc_list,
+ blast_db=None,
+ is_protein=None,
+ out_filename=None,
+ params={},
+ WorkingDir=tempfile.gettempdir(),
+ SuppressStderr=None,
+ SuppressStdout=None):
+ """Retrieve sequences for list of accessions """
+
+ if is_protein is None:
+ params["-p"] = 'G'
+ elif is_protein:
+ params["-p"] = 'T'
+ else:
+ params["-p"] = 'F'
+
+ if blast_db:
+ params["-d"] = blast_db
+
+ if out_filename:
+ params["-o"] = out_filename
+
+ # turn off duplicate accessions
+ params["-a"] = "F"
+
+    # create FastaCmd application controller
+ fasta_cmd = FastaCmd(params=params,
+ InputHandler='_input_as_string',
+ WorkingDir=WorkingDir,
+ SuppressStderr=SuppressStderr,
+ SuppressStdout=SuppressStdout)
+
+ # return results
+ return fasta_cmd("\"%s\"" % ','.join(acc_list))
+
+def fastacmd_is_crap(line):
+ """Handles missing ids..."""
+ return (not line) or line.isspace() or line.startswith('[')
+
+FastaCmdFinder = LabeledRecordFinder(is_fasta_label, ignore=fastacmd_is_crap)
+
+def seqs_from_fastacmd(acc_list, blast_db,is_protein=True):
+ """Get dict of description:seq from fastacmd."""
+ fasta_cmd_res = fasta_cmd_get_seqs(acc_list, blast_db=blast_db, \
+ is_protein=is_protein)
+ recs = FastaCmdFinder(fasta_cmd_res['StdOut'])
+ result = {}
+ for rec in recs:
+ try:
+ result[rec[0][1:].strip()] = ''.join(map(strip, rec[1:]))
+ except IndexError: #maybe we didn't get a sequence?
+ pass
+ fasta_cmd_res.cleanUp()
+ return result
+
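+# A usage sketch mirroring the debug example at the bottom of this module
+# (the accession and database are placeholders; fastacmd must be installed):
+#
+#     seqs = seqs_from_fastacmd(['16766313'], 'nr', is_protein=True)
+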
+def psiblast_n_neighbors(seqs,
+ n=100,
+ blast_db=None,
+ core_threshold=1e-50,
+ extra_threshold=1e-10,
+ lower_threshold=1e-6,
+ step=100,
+ method="two-step",
+ blast_mat_root=None,
+ params={},
+ add_seq_names=False,
+ WorkingDir=None,
+ SuppressStderr=None,
+ SuppressStdout=None,
+ input_handler=None,
+ scorer=3, #shotgun with 3 hits needed to keep
+ second_db=None
+ ):
+ """PsiBlasts sequences, stopping when n neighbors are reached.
+
+ core_threshold: threshold for the core profile (default: 1e-50)
+ extra_threshold: threshold for pulling in additional seqs (default:1e-10)
+ lower_threshold: threshold for seqs in final round (default:1e-6)
+
+ seqs: either file name or list of sequence objects or list of strings or
+ single multiline string containing sequences.
+ If you want to skip the detection and force a specific type of input
+ handler, use input_handler='your_favorite_handler'.
+
+ add_seq_names: boolean. if True, sequence names are inserted in the list
+ of sequences. if False, it assumes seqs is a list of lines of some
+ proper format that the program can handle
+ """
+ if blast_db:
+ params["-d"] = blast_db
+
+ ih = input_handler or guess_input_handler(seqs, add_seq_names)
+ recs = seqs_to_stream(seqs, ih) #checkpointing can only handle one seq...
+
+ #set up the parameters for the core and additional runs
+ max_iterations = params['-j']
+ params['-j'] = 2 #won't checkpoint with single iteration
+
+ app = PsiBlast(params=params,
+ blast_mat_root=blast_mat_root,
+ InputHandler='_input_as_lines',
+ WorkingDir=WorkingDir,
+ SuppressStderr=SuppressStderr,
+ SuppressStdout=SuppressStdout,
+ )
+ result = {}
+ for seq in recs:
+ query_id = seq[0][1:].split(None,1)[0]
+ if method == "two-step":
+ result[query_id] = ids_from_seq_two_step(seq, n, max_iterations, \
+ app, core_threshold, extra_threshold, lower_threshold, second_db)
+ elif method == "lower_threshold":
+ result[query_id] = ids_from_seq_lower_threshold(seq, n, \
+ max_iterations, app, core_threshold, lower_threshold, step)
+ elif method == "iterative":
+ result[query_id] = ids_from_seqs_iterative(seq, app, \
+ QMEPsiBlast9, scorer, params['-j'], n)
+ else:
+ raise TypeError, "Got unknown method %s" % method
+
+ params['-j'] = max_iterations
+ return result
+
+def ids_from_seq_two_step(seq, n, max_iterations, app, core_threshold, \
+ extra_threshold, lower_threshold, second_db=None):
+ """Returns ids that match a seq, using a 2-tiered strategy.
+
+ Optionally uses a second database for the second search.
+ """
+ #first time through: reset 'h' and 'e' to core
+ #-h is the e-value threshold for including seqs in the score matrix model
+ app.Parameters['-h'].on(core_threshold)
+ #-e is the e-value threshold for the final blast
+ app.Parameters['-e'].on(core_threshold)
+ checkpoints = []
+ ids = []
+ last_num_ids = None
+ for i in range(max_iterations):
+ if checkpoints:
+ app.Parameters['-R'].on(checkpoints[-1])
+ curr_check = 'checkpoint_%s.chk' % i
+ app.Parameters['-C'].on(curr_check)
+
+ output = app(seq)
+ #if we didn't write a checkpoint, bail out
+ if not access(curr_check, F_OK):
+ break
+ #if we got here, we wrote a checkpoint file
+ checkpoints.append(curr_check)
+ result = list(output.get('BlastOut', output['StdOut']))
+ output.cleanUp()
+ if result:
+ ids = LastProteinIds9(result,keep_values=True,filter_identity=False)
+ num_ids = len(ids)
+ if num_ids >= n:
+ break
+ if num_ids == last_num_ids:
+ break
+ last_num_ids = num_ids
+
+ #if we didn't write any checkpoints, second run won't work, so return ids
+ if not checkpoints:
+ return ids
+
+    #if we already got enough ids and don't have a second database, return them
+ if (not second_db) and num_ids >= n:
+ return ids
+
+ #second time through: reset 'h' and 'e' to get extra hits, and switch the
+ #database if appropriate
+ app.Parameters['-h'].on(extra_threshold)
+ app.Parameters['-e'].on(lower_threshold)
+ if second_db:
+ app.Parameters['-d'].on(second_db)
+ for i in range(max_iterations): #will always have last_check if we get here
+ app.Parameters['-R'].on(checkpoints[-1])
+ curr_check = 'checkpoint_b_%s.chk' % i
+ app.Parameters['-C'].on(curr_check)
+ output = app(seq)
+ #bail out if we couldn't write a checkpoint
+ if not access(curr_check, F_OK):
+ break
+ #if we got here, the checkpoint worked
+ checkpoints.append(curr_check)
+ result = list(output.get('BlastOut', output['StdOut']))
+ if result:
+ ids = LastProteinIds9(result,keep_values=True,filter_identity=False)
+ num_ids = len(ids)
+ if num_ids >= n:
+ break
+ if num_ids == last_num_ids:
+ break
+ last_num_ids = num_ids
+ #return the ids we got. may not be as many as we wanted.
+ for c in checkpoints:
+ remove(c)
+ return ids
+
+class ThresholdFound(Exception): pass
+
+def ids_from_seq_lower_threshold(seq, n, max_iterations, app, core_threshold, \
+ lower_threshold, step=100):
+ """Returns ids that match a seq, decreasing the sensitivity."""
+ last_num_ids = None
+ checkpoints = []
+ cp_name_base = make_unique_str()
+
+    # cache ids for each iteration
+ # store { iteration_num:(core_threshold, [list of matching ids]) }
+ all_ids = {}
+ try:
+ i=0
+ while 1:
+ #-h is the e-value threshold for inclusion in the score matrix model
+ app.Parameters['-h'].on(core_threshold)
+ app.Parameters['-e'].on(core_threshold)
+ if core_threshold > lower_threshold:
+ raise ThresholdFound
+ if checkpoints:
+ #-R restarts from a previously stored file
+ app.Parameters['-R'].on(checkpoints[-1])
+ #store the score model from this iteration
+ curr_check = 'checkpoint_' + cp_name_base + '_' + str(i) + \
+ '.chk'
+ app.Parameters['-C'].on(curr_check)
+ output = app(seq)
+ result = list(output.get('BlastOut', output['StdOut']))
+ #sometimes fails on first try -- don't know why, but this seems
+ #to fix problem
+ while not result:
+ output = app(seq)
+ result = list(output.get('BlastOut', output['StdOut']))
+
+ ids = LastProteinIds9(result,keep_values=True,filter_identity=False)
+ output.cleanUp()
+ all_ids[i + 1] = (core_threshold, copy(ids))
+ if not access(curr_check, F_OK):
+ raise ThresholdFound
+ checkpoints.append(curr_check)
+ num_ids = len(ids)
+ if num_ids >= n:
+ raise ThresholdFound
+ last_num_ids = num_ids
+ core_threshold *= step
+ if i >= max_iterations - 1: #because max_iterations is 1-based
+ raise ThresholdFound
+ i += 1
+ except ThresholdFound:
+ for c in checkpoints:
+ remove(c)
+ #turn app.Parameters['-R'] off so that for the next file it does not
+ #try and read in a checkpoint file that is not there
+ app.Parameters['-R'].off()
+ return ids, i + 1, all_ids
+
+def make_unique_str(num_chars=20):
+ """make a random string of characters for a temp filename"""
+    chars = 'abcdefghijklmnopqrstuvwxyz'
+    all_chars = chars + chars.upper() + '0123456789'
+ picks = list(all_chars)
+ return ''.join([choice(picks) for i in range(num_chars)])
+
+def make_subject_match_scorer(count):
+ def subject_match_scorer(checked_ids):
+ """From {subject:{query:score}} returns subject ids w/ >= count hits.
+
+        Useful for eliminating subjects with few homologs.
+ """
+ return [key for key, val in checked_ids.items() if len(val) >= count]
+ return subject_match_scorer
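+
+# For example (hypothetical data): with
+# checked_ids = {'s1': {'q1': 1e-5, 'q2': 1e-4}, 's2': {'q1': 1e-3}},
+# make_subject_match_scorer(2)(checked_ids) returns ['s1'].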
+
+def make_shotgun_scorer(count):
+ def shotgun_scorer(checked_ids):
+ """From {subject:{query:score}} returns any ids w/ >= count hits.
+
+ A hit counts towards a sequence's score if it was either the subject
+ or the query, but we don't double-count (subject, query) pairs, i.e.
+ if A hits B and B hits A, only one (A,B) hit will be counted, although
+ it will be counted as both (A,B) and (B,A) (i.e. it will help preserve
+ both A and B).
+ """
+ result = {}
+ for subject, val in checked_ids.items():
+ for query in val.keys():
+ if subject not in result:
+ result[subject] = {}
+ result[subject][query] = True
+ if query not in result:
+ result[query] = {}
+ result[query][subject] = True
+ return [key for key, val in result.items() if len(val) >= count]
+ return shotgun_scorer
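+
+# For example (hypothetical data): with checked_ids = {'A': {'B': 1e-5}},
+# make_shotgun_scorer(1)(checked_ids) returns both 'A' and 'B', because the
+# single A->B hit is credited to the subject and the query alike.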
+
+def keep_everything_scorer(checked_ids):
+ """Returns every query and every match in checked_ids, with best score."""
+ result = checked_ids.keys()
+ for i in checked_ids.values():
+ result.extend(i.keys())
+ return dict.fromkeys(result).keys()
+
+def ids_from_seqs_iterative(seqs, app, query_parser, \
+ scorer=keep_everything_scorer, max_iterations=None, blast_db=None,\
+ max_seqs=None, ):
+ """Gets the ids from each seq, then does each additional id until all done.
+
+ If scorer is passed in as an int, uses shotgun scorer with that # hits.
+ """
+ if isinstance(scorer, int):
+ scorer = make_shotgun_scorer(scorer)
+ seqs_to_check = list(seqs)
+ checked_ids = {}
+ curr_iteration = 0
+ while seqs_to_check:
+ unchecked_ids = {}
+ #pass seqs to command
+ all_output = app(seqs_to_check)
+ output = all_output.get('BlastOut', all_output['StdOut'])
+
+ for query_id, match_id, match_score in query_parser(output):
+ if query_id not in checked_ids:
+ checked_ids[query_id] = {}
+ checked_ids[query_id][match_id] = match_score
+ if match_id not in checked_ids:
+ unchecked_ids[match_id] = True
+ all_output.cleanUp()
+        if unchecked_ids:
+            #fetch the new ids once and stream records from the result
+            seq_file = fasta_cmd_get_seqs(unchecked_ids.keys(),
+                app.Parameters['-d'].Value)['StdOut']
+            seqs_to_check = []
+            for s in FastaCmdFinder(seq_file):
+                seqs_to_check.extend(s)
+ else:
+ seqs_to_check = []
+ #bail out if max iterations or max seqs was defined and we've reached it
+ curr_iteration += 1
+ if max_iterations and (curr_iteration >= max_iterations):
+ break
+ if max_seqs:
+ curr = scorer(checked_ids)
+ if len(curr) >= max_seqs:
+ return curr
+ return scorer(checked_ids) #scorer should return list of good ids
+
+
+def blastp(seqs, blast_db="nr", e_value="1e-20", max_hits=200,
+ working_dir=tempfile.gettempdir(), blast_mat_root=None,
+ extra_params={}):
+ """
+ Returns BlastResult from input seqs, using blastp.
+
+ Need to add doc string
+ """
+
+ # set up params to use with blastp
+ params = {
+ # matrix
+ "-M":"BLOSUM62",
+
+ # max procs
+ "-a":"1",
+
+ # expectation
+ "-e":e_value,
+
+ # max seqs to show
+ "-b":max_hits,
+
+ # max one line descriptions
+ "-v":max_hits,
+
+ # program
+ "-p":"blastp"
+ }
+ params.update(extra_params)
+
+ # blast
+ blast_res = blast_seqs(seqs,
+ Blastall,
+ blast_mat_root=blast_mat_root,
+ blast_db=blast_db,
+ params=params,
+ add_seq_names=False,
+ WorkingDir=working_dir
+ )
+
+    # parse tabular blast output into a BlastResult
+ if blast_res['StdOut']:
+ lines = [x for x in blast_res['StdOut']]
+ return BlastResult(lines)
+
+ return None
+
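+# A usage sketch (hypothetical file; requires blastall and a formatted
+# protein database, e.g. reachable via the BLASTDB environment variable):
+#
+#     res = blastp('queries.fasta', blast_db='nr', e_value='1e-30')
+#     # res is a cogent BlastResult, or None if there was no output
+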
+def blastn(seqs, blast_db="nt", e_value="1e-20", max_hits=200,
+ working_dir=tempfile.gettempdir(), blast_mat_root=None,
+ extra_params={}):
+ """
+ Returns BlastResult from input seqs, using blastn.
+
+ Need to add doc string
+ """
+
+    # set up params to use with blastn
+ params = {
+ # matrix
+ "-M":"BLOSUM62",
+
+ # max procs
+ "-a":"1",
+
+ # expectation
+ "-e":e_value,
+
+ # max seqs to show
+ "-b":max_hits,
+
+ # max one line descriptions
+ "-v":max_hits,
+
+ # program
+ "-p":"blastn"
+ }
+ params.update(extra_params)
+
+ # blast
+ blast_res = blast_seqs(seqs,
+ Blastall,
+ blast_mat_root=blast_mat_root,
+ blast_db=blast_db,
+ params=params,
+ add_seq_names=False,
+ WorkingDir=working_dir
+ )
+
+    # parse tabular blast output into a BlastResult
+ if blast_res['StdOut']:
+ lines = [x for x in blast_res['StdOut']]
+ return BlastResult(lines)
+
+ return None
+
+
+
+def blastx(seqs, params=None):
+ """Returns BlastResults from input seqs, using blastx."""
+ raise NotImplementedError
+
+def tblastx(seqs, params=None):
+ """Returns BlastResults from input seqs, using tblastx."""
+ raise NotImplementedError
+
+def psiblast(seqs, params=None):
+ """Returns BlastResults from input seqs, using psiblast."""
+ raise NotImplementedError
+
+def reciprocal_best_blast_hit(query_id, db_1, db_2, exclude_self_hits=True,\
+ params=None):
+ """Returns best hit in db_2 that maps back to query_id in db_1, or None.
+
+ exclude_self_hits: if True (the default), returns the best hit that
+ doesn't have the same id. Otherwise, will return the same id if it is in
+ both databases (assuming it's the same sequence in both).
+ """
+ raise NotImplementedError
+
+ #make with factory functions for the blast hits
+
+
+if __name__ == "__main__":
+
+ print "Debug. examples of how i've been using."
+
+ print "Example of straightforward BLAST"
+
+# WARNING: I changed a bunch of stuff to make testing easier, since nr doesn't
+# fit in memory on my laptop. I created a database 'eco' using formatdb on the
+# E. coli K12 fasta file from this URL:
+# ftp://ftp.ncbi.nlm.nih.gov/genomes/Bacteria/Escherichia_coli_K12/NC_000913.faa
+# Because we're blasting an archaeal sequence against one bacterial genome, I
+# relaxed the inclusion thresholds substantially. DO NOT USE THESE AGAINST NR!
+
+ in_filename = "test_seq.fasta"
+ out_filename = "test.out"
+ # if blast env variable set, can just say 'nr'
+ #BLAST_DB = "/home/hamady/quicksand/data/blast/db/nr"
+ BLAST_DB = 'nr' #'nr'
+ BLAST_MAT_ROOT="/home/hamady/apps/blast-2.2.9/data"
+ #BLAST_MAT_ROOT='/Users/rob/ncbi/data'
+ # set up params to use with iterative
+
+ #print seqs_from_fastacmd(['16766313'], 'nr', True)
+ #raise ValueError, "dbug"
+ params = {
+
+ # matrix
+ "-M":"PAM70",
+ # max procs
+ "-a":2,
+ # expect
+ "-e":1e-15,
+
+# blastall
+# # program
+# "-p":"blastp",
+
+# psi-blast
+ # max iterations
+ "-j":2,
+
+ # max seqs to show
+ "-b":50,
+ # inclusion
+ "-h":1e-2,
+ }
+
+ in_seqs = """>stm:STMabcdef thrA; aspartokinase I
+ MRVLKFGGTSVANAERFLRVADILESNARQGQVATVLSAPAKITNHLVAMIEKTIGGQDA
+ LPNISDAERIFSDLLAGLASAQPGFPLARLKMVVEQEFAQIKHVLHGISLLGQCPDSINA
+ ALICRGEKMSIAIMAGLLEARGHRVTVIDPVEKLLAVGHYLESTVDIAESTRRIAASQIP
+ ADHMILMAGFTAGNEKGELVVLGRNGSDYSAAVLAACLRADCCEIWTDVDGVYTCDPRQV
+ PDARLLKSMSYQEAMELSYFGAKVLHPRTITPIAQFQIPCLIKNTGNPQAPGTLIGASDS
+ DDNLPVKGISNLNNMAMFSVSGPGMKGMIGMAARVFAAMSRAGISVVLITQSSSEYSISF
+ CVPQSDCARARRAMQDEFYLELKEGLLEPLAVTERLAIISVVGDGMRTLRGISAKFFAAL
+ ARANINIVAIAQGSSERSISVVVNNDDATTGVRVTHQMLFNTDQVIEVFVIGVGGVGGAL"""
+
+# The following should now give the same output:
+#
+# in_seqs = 'tiny.faa' #tiny.faa in cwd contains the sequence above
+#
+# in_seqs = """>gi|2501594|sp|Q57997|Y577_METJA PROTEIN MJ0577
+#MSVMYKKILYPTDFSETAEIALKHVKAFKTLKAEEVILLHVIDEREIKKRDIFSLLLGVAGLNKSVEEFE
+#NELKNKLTEEAKNKMENIKKELEDVGFKVKDIIVVGIPHEEIVKIAEDEGVDIIIMGSHGKTNLKEILLG
+#SVTENVIKKSNKPVLVVKRKNS""".split() #lines instead of multiline string
+#
+ blast_res = blast_seqs(in_seqs, Blastall,
+ blast_mat_root=BLAST_MAT_ROOT,
+ add_seq_names=False,
+ blast_db=BLAST_DB,
+ params={'-p': 'blastp','-e': '1','-m': 9},
+ out_filename=out_filename)
+
+ print [x for x in blast_res['StdOut']]
+ print [x for x in blast_res['StdErr']]
+ print blast_res
+ #for x in blast_res['BlastOut']:
+ # print x.rstrip()
+ blast_res.cleanUp()
+ #print '\n\n'
+ #print "Example of psiblast_n_neighbors"
+ #print "Method 1: two-step with high- and low-confidence matches"
+ #print psiblast_n_neighbors(in_seqs, n=10, blast_db=BLAST_DB, \
+ # method="two-step", blast_mat_root=BLAST_MAT_ROOT,params=params,\
+ # core_threshold=1e-5, extra_threshold=1e-2, lower_threshold=1e-1)
+ #print
+ #print "Method 2: keep lowering threshold"
+ #print psiblast_n_neighbors(in_seqs, n=10, blast_db=BLAST_DB, \
+ # method="lower_threshold", blast_mat_root=BLAST_MAT_ROOT,params=params,
+ # core_threshold=1e-6, lower_threshold=1e-2)
+ #print
+ #print "Method 3: psi-blast shotgun"
+ #print psiblast_n_neighbors(in_seqs, n=10, blast_db=BLAST_DB, \
+ # method="iterative", blast_mat_root=BLAST_MAT_ROOT,params=params,
+ # core_threshold=1e-5, lower_threshold=1e-2)
+ #print
+ #print "Method 4: two-step with high- and low-confidence matches, diff dbs"
+ #print psiblast_n_neighbors(in_seqs, n=10, blast_db=BLAST_DB, \
+ # method="two-step", blast_mat_root=BLAST_MAT_ROOT,params=params,\
+ # core_threshold=1e-5, extra_threshold=1e-2, lower_threshold=1e-1, second_db='stm')
+ #print
diff --git a/bfillings/blat.py b/bfillings/blat.py
new file mode 100644
index 0000000..424a091
--- /dev/null
+++ b/bfillings/blat.py
@@ -0,0 +1,422 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+
+"""Application controller for BLAT v34"""
+
+from os import remove
+from os.path import isabs
+from tempfile import mkstemp
+
+from cogent import DNA
+from cogent.core.genetic_code import GeneticCodes
+from cogent.parse.blast import MinimalBlatParser9
+
+from skbio.parse.sequences import parse_fasta
+from burrito.util import (CommandLineApplication, ResultPath,
+ ApplicationError)
+from burrito.parameters import FlagParameter, ValuedParameter, FilePath
+
+
+class Blat(CommandLineApplication):
+
+ """BLAT generic application controller"""
+
+ _command = 'blat'
+ _input_handler = "_input_as_list"
+
+ _database_types = ['dna', 'prot', 'dnax']
+ _query_types = ['dna', 'rna', 'prot', 'dnax', 'rnax']
+ _mask_types = ['lower', 'upper', 'out', 'file.out']
+ _out_types = ['psl', 'pslx', 'axt', 'maf', 'sim4', 'wublast', 'blast',
+ 'blast8', 'blast9']
+ _valid_combinations = [('dna', 'dna'), ('dna', 'rna'), ('prot', 'prot'),
+ ('dnax', 'prot'), ('dnax', 'dnax'),
+ ('dnax', 'rnax')]
+ _database = None
+ _query = None
+ _output = None
+
+ _parameters = {
+ # database type (dna, prot, or dnax, where dnax is DNA sequence
+        # translated in six frames to protein)
+ '-t': ValuedParameter('-', Delimiter='=', Name='t'),
+
+ # query type (dna, rna, prot, dnax, rnax, where rnax is DNA sequence
+        # translated in three frames to protein)
+ '-q': ValuedParameter('-', Delimiter='=', Name='q'),
+
+ # Use overused tile file N.ooc, and N should correspond to the tileSize
+ '-ooc': ValuedParameter('-', Delimiter='=', Name='ooc', IsPath=True),
+
+        # Sets the size of a match that triggers an alignment
+ '-tileSize': ValuedParameter('-', Delimiter='=', Name='tileSize'),
+
+ # Spacing between tiles.
+ '-stepSize': ValuedParameter('-', Delimiter='=', Name='stepSize'),
+
+ # If set to 1, allows one mismatch in the tile and still triggers
+ # an alignment.
+ '-oneOff': ValuedParameter('-', Delimiter='=', Name='oneOff'),
+
+ # sets the number of tile matches
+ '-minMatch': ValuedParameter('-', Delimiter='=', Name='minMatch'),
+
+ # sets the minimum score
+ '-minScore': ValuedParameter('-', Delimiter='=', Name='minScore'),
+
+ # sets the minimum sequence identity in percent
+ '-minIdentity':
+ ValuedParameter('-', Delimiter='=', Name='minIdentity'),
+
+        # sets the size of the maximum gap between tiles in a clump
+ '-maxGap': ValuedParameter('-', Delimiter='=', Name='maxGap'),
+
+ # make an overused tile file. Target needs to be complete genome.
+ '-makeOoc': ValuedParameter('-', Delimiter='=', Name='makeOoc',
+ IsPath=True),
+
+ # sets the number of repetitions of a tile allowed before it is marked
+ # as overused
+ '-repMatch': ValuedParameter('-', Delimiter='=', Name='repMatch'),
+
+ # mask out repeats. Alignments won't be started in masked region but
+ # may extend through it in nucleotide searches. Masked areas are
+ # ignored entirely in protein or translated searches. Types are:
+ # lower, upper, out, file.out (file.out - mask database according to
+        # RepeatMasker file.out)
+ '-mask': ValuedParameter('-', Delimiter='=', Name='mask'),
+
+ # Mask out repeats in query sequence. similar to -mask but for query
+ # rather than target sequence
+ '-qMask': ValuedParameter('-', Delimiter='=', Name='qMask'),
+
+ # repeat bases will not be masked in any way, but matches in
+ # repeat areas will be reported separately from matches in other
+        # areas in the psl output
+ '-repeats': ValuedParameter('-', Delimiter='=', Name='repeats'),
+
+ # minimum percent divergence of repeats to allow them to be unmasked
+ '-minRepDivergence': ValuedParameter('-', Delimiter='=',
+ Name='minRepDivergence'),
+
+ # output dot every N sequences to show program's progress
+ '-dots': ValuedParameter('-', Delimiter='=', Name='dots'),
+
+ # controls output file format. One of:
+ # psl - Default. Tab separated format, no sequence
+ # pslx - Tab separated format with sequence
+ # axt - blastz-associated axt format
+ # maf - multiz-associated maf format
+ # sim4 - similar to sim4 format
+ # wublast - similar to wublast format
+ # blast - similar to NCBI blast format
+        # blast8 - NCBI blast tabular format
+ # blast9 - NCBI blast tabular format with comments
+ '-out': ValuedParameter('-', Delimiter='=', Name='out'),
+
+ # sets maximum intron size
+ '-maxIntron': ValuedParameter('-', Delimiter='=', Name='maxIntron'),
+
+ # suppress column headers in psl output
+ '-noHead': FlagParameter('-', Name='noHead'),
+
+ # trim leading poly-T
+ '-trimT': FlagParameter('-', Name='trimT'),
+
+ # do not trim trailing poly-A
+ '-noTrimA': FlagParameter('-', Name='noTrimA'),
+
+ # Remove poly-A tail from qSize as well as alignments in psl output
+ '-trimHardA': FlagParameter('-', Name='trimHardA'),
+
+ # run for fast DNA/DNA remapping - not allowing introns,
+ # requiring high %ID
+ '-fastMap': FlagParameter('-', Name='fastMap'),
+
+ # for high quality mRNAs, look harder for small initial and terminal
+ # exons
+ '-fine': FlagParameter('-', Name='fine'),
+
+ # Allows extension of alignment through large blocks of N's
+ '-extendThroughN': FlagParameter('-', Name='extendThroughN')
+ }
+
+ def _get_result_paths(self, data):
+ """Returns the file location for result output
+ """
+
+ return {'output': ResultPath(data[2], IsWritten=True)}
+
+ def _get_base_command(self):
+ """Gets the command that will be run when the app controller is
+ called.
+ """
+ command_parts = []
+ cd_command = ''.join(['cd ', str(self.WorkingDir), ';'])
+ if self._command is None:
+ raise ApplicationError('_command has not been set.')
+ command = self._command
+ parameters = sorted([str(x) for x in self.Parameters.values()
+ if str(x)])
+
+ synonyms = self._synonyms
+
+ command_parts.append(cd_command)
+ command_parts.append(command)
+ command_parts.append(self._database) # Positional argument
+ command_parts.append(self._query) # Positional argument
+ command_parts += parameters
+ if self._output:
+ command_parts.append(self._output.Path) # Positional
+
+ return (
+ self._command_delimiter.join(filter(None, command_parts)).strip()
+ )
+
+ BaseCommand = property(_get_base_command)
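+    # For data=['/tmp/q.fna', '/tmp/db.fna', '/tmp/out.psl'] with -out=blast9
+    # set, BaseCommand resembles (a sketch; note the database precedes the
+    # query on the blat command line):
+    #   cd /tmp/; blat /tmp/db.fna /tmp/q.fna -out=blast9 /tmp/out.psl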
+
+ def _input_as_list(self, data):
+ '''Takes the positional arguments as input in a list.
+
+ The list input here should be [query_file_path, database_file_path,
+ output_file_path]'''
+ query, database, output = data
+ if (not isabs(database)) \
+ or (not isabs(query)) \
+ or (not isabs(output)):
+ raise ApplicationError("Only absolute paths allowed.\n%s" %
+ ', '.join(data))
+
+ self._database = FilePath(database)
+ self._query = FilePath(query)
+ self._output = ResultPath(output, IsWritten=True)
+
+ # check parameters that can only take a particular set of values
+        # check combination of database and query types
+ if self.Parameters['-t'].isOn() and self.Parameters['-q'].isOn() and \
+ (self.Parameters['-t'].Value, self.Parameters['-q'].Value) not in \
+ self._valid_combinations:
+ error_message = "Invalid combination of database and query " + \
+ "types ('%s', '%s').\n" % \
+                            (self.Parameters['-t'].Value,
+                             self.Parameters['-q'].Value)
+
+ error_message += "Must be one of: %s\n" % \
+ repr(self._valid_combinations)
+
+ raise ApplicationError(error_message)
+
+ # check database type
+ if self.Parameters['-t'].isOn() and \
+ self.Parameters['-t'].Value not in self._database_types:
+ error_message = "Invalid database type %s\n" % \
+ self.Parameters['-t'].Value
+
+ error_message += "Allowed values: %s\n" % \
+ ', '.join(self._database_types)
+
+ raise ApplicationError(error_message)
+
+ # check query type
+ if self.Parameters['-q'].isOn() and \
+ self.Parameters['-q'].Value not in self._query_types:
+ error_message = "Invalid query type %s\n" % \
+ self.Parameters['-q'].Value
+
+ error_message += "Allowed values: %s\n" % \
+ ', '.join(self._query_types)
+
+ raise ApplicationError(error_message)
+
+ # check mask type
+ if self.Parameters['-mask'].isOn() and \
+ self.Parameters['-mask'].Value not in self._mask_types:
+ error_message = "Invalid mask type %s\n" % \
+ self.Parameters['-mask']
+
+ error_message += "Allowed Values: %s\n" % \
+ ', '.join(self._mask_types)
+
+ raise ApplicationError(error_message)
+
+ # check qmask type
+ if self.Parameters['-qMask'].isOn() and \
+ self.Parameters['-qMask'].Value not in self._mask_types:
+ error_message = "Invalid qMask type %s\n" % \
+ self.Parameters['-qMask'].Value
+
+ error_message += "Allowed values: %s\n" % \
+ ', '.join(self._mask_types)
+
+ raise ApplicationError(error_message)
+
+ # check repeat type
+ if self.Parameters['-repeats'].isOn() and \
+ self.Parameters['-repeats'].Value not in self._mask_types:
+ error_message = "Invalid repeat type %s\n" % \
+                self.Parameters['-repeats'].Value
+
+ error_message += "Allowed values: %s\n" % \
+ ', '.join(self._mask_types)
+
+ raise ApplicationError(error_message)
+
+ # check output format
+ if self.Parameters['-out'].isOn() and \
+ self.Parameters['-out'].Value not in self._out_types:
+ error_message = "Invalid output type %s\n" % \
+                self.Parameters['-out'].Value
+
+ error_message += "Allowed values: %s\n" % \
+ ', '.join(self._out_types)
+
+ raise ApplicationError(error_message)
+
+ return ''
+
+
+def assign_reads_to_database(query_fasta_fp, database_fasta_fp, output_fp,
+ params=None):
+ """Assign a set of query sequences to a reference database
+
+ query_fasta_fp : absolute file path to query sequences
+ database_fasta_fp : absolute file path to the reference database
+ output_fp : absolute file path of the output file to write
+ params : dict of BLAT specific parameters.
+
+ This method returns an open file object. The output format
+ defaults to blast9 and should be parsable by the PyCogent BLAST parsers.
+ """
+ if params is None:
+ params = {}
+ if '-out' not in params:
+ params['-out'] = 'blast9'
+ blat = Blat(params=params)
+
+ result = blat([query_fasta_fp, database_fasta_fp, output_fp])
+ return result['output']
+
+
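+# Example usage of assign_reads_to_database (a minimal sketch; the file paths
+# are hypothetical and the blat executable must be on the PATH):
+#
+#     out = assign_reads_to_database('/tmp/reads.fna', '/tmp/db.fna',
+#                                    '/tmp/hits.blast9')
+#     hits = out.readlines()  # blast9-format lines
+#     out.close()
+
+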
+def assign_dna_reads_to_dna_database(query_fasta_fp, database_fasta_fp,
+ output_fp, params=None):
+ """Assign DNA reads to a database fasta of DNA sequences.
+
+ Wraps assign_reads_to_database, setting database and query types. All
+ parameters are set to default unless params is passed.
+
+ query_fasta_fp: absolute path to the query fasta file containing DNA
+ sequences.
+ database_fasta_fp: absolute path to the database fasta file containing
+ DNA sequences.
+ output_fp: absolute path where the output file will be generated.
+ params: optional. dict containing parameter settings to be used
+ instead of default values. Cannot change database or query
+ file types from dna and dna, respectively.
+
+ This method returns an open file object. The output format
+ defaults to blast9 and should be parsable by the PyCogent BLAST parsers.
+ """
+ if params is None:
+ params = {}
+
+ my_params = {'-t': 'dna',
+ '-q': 'dna'
+ }
+
+ # if the user specified parameters other than default, then use them.
+ # However, if they try to change the database or query types, raise an
+    # application error.
+ if '-t' in params or '-q' in params:
+ raise ApplicationError("Cannot change database or query types when " +
+ "using assign_dna_reads_to_dna_database. " +
+ "Use assign_reads_to_database instead.\n")
+
+ my_params.update(params)
+
+ result = assign_reads_to_database(query_fasta_fp, database_fasta_fp,
+ output_fp, my_params)
+
+ return result
+
+
+def assign_dna_reads_to_protein_database(query_fasta_fp, database_fasta_fp,
+ output_fp, temp_dir="/tmp", params=None):
+ """Assign DNA reads to a database fasta of protein sequences.
+
+ Wraps assign_reads_to_database, setting database and query types. All
+ parameters are set to default unless params is passed. A temporary
+ file must be written containing the translated sequences from the input
+ query fasta file because BLAT cannot do this automatically.
+
+ query_fasta_fp: absolute path to the query fasta file containing DNA
+ sequences.
+ database_fasta_fp: absolute path to the database fasta file containing
+ protein sequences.
+ output_fp: absolute path where the output file will be generated.
+ temp_dir: optional. Change the location where the translated sequences
+ will be written before being used as the query. Defaults to
+ /tmp.
+ params: optional. dict containing parameter settings to be used
+ instead of default values. Cannot change database or query
+ file types from protein and dna, respectively.
+
+ This method returns an open file object. The output format
+ defaults to blast9 and should be parsable by the PyCogent BLAST parsers.
+ """
+ if params is None:
+ params = {}
+
+ my_params = {'-t': 'prot', '-q': 'prot'}
+
+ # make sure temp_dir specifies an absolute path
+ if not isabs(temp_dir):
+ raise ApplicationError("temp_dir must be an absolute path.")
+
+ # if the user specified parameters other than default, then use them.
+ # However, if they try to change the database or query types, raise an
+    # application error.
+ if '-t' in params or '-q' in params:
+ raise ApplicationError("Cannot change database or query types "
+ "when using assign_dna_reads_to_dna_database. Use "
+ "assign_reads_to_database instead.")
+
+ if 'genetic_code' in params:
+ my_genetic_code = GeneticCodes[params['genetic_code']]
+ del params['genetic_code']
+ else:
+ my_genetic_code = GeneticCodes[1]
+
+ my_params.update(params)
+
+ # get six-frame translation of the input DNA sequences and write them to
+ # temporary file.
+ _, tmp = mkstemp(dir=temp_dir)
+ tmp_out = open(tmp, 'w')
+
+ for label, sequence in parse_fasta(open(query_fasta_fp)):
+ seq_id = label.split()[0]
+
+ s = DNA.makeSequence(sequence)
+ translations = my_genetic_code.sixframes(s)
+ frames = [1, 2, 3, -1, -2, -3]
+ translations = dict(zip(frames, translations))
+
+ for frame, translation in sorted(translations.iteritems()):
+ entry = '>{seq_id}_frame_{frame}\n{trans}\n'
+ entry = entry.format(seq_id=seq_id, frame=frame, trans=translation)
+ tmp_out.write(entry)
+
+ tmp_out.close()
+ result = assign_reads_to_database(tmp, database_fasta_fp, output_fp,
+ params=my_params)
+
+ remove(tmp)
+
+ return result
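+
+
+# Example usage of assign_dna_reads_to_protein_database (a minimal sketch; the
+# paths are hypothetical and blat must be on the PATH). The reads are
+# six-frame translated before searching, so hits are reported against query
+# ids of the form <seq_id>_frame_<frame>:
+#
+#     out = assign_dna_reads_to_protein_database('/tmp/reads.fna',
+#                                                '/tmp/prot_db.fna',
+#                                                '/tmp/hits.blast9',
+#                                                params={'genetic_code': 11})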
diff --git a/bfillings/bwa.py b/bfillings/bwa.py
new file mode 100644
index 0000000..7fd36e2
--- /dev/null
+++ b/bfillings/bwa.py
@@ -0,0 +1,762 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+
+"""Application controller for BWA 0.6.2 (release 19 June 2012)"""
+
+from os.path import isabs
+from tempfile import mkstemp
+
+from burrito.parameters import FlagParameter, ValuedParameter
+from burrito.util import (CommandLineApplication, ResultPath,
+ ApplicationError)
+
+__author__ = "Adam Robbins-Pianka"
+__copyright__ = "Copyright 2007-2012, The Cogent Project"
+__credits__ = ["Adam Robbins-Pianka", "Jai Ram Rideout"]
+__license__ = "GPL"
+__version__ = "1.5.3-dev"
+__maintainer__ = "Adam Robbins-Pianka"
+__email__ = "adam.robbinspianka at colorado.edu"
+__status__ = "Production"
+
+# helper functions for argument checking
+
+
+def is_int(x):
+ # return true if it's an int
+ return ((isinstance(x, int)) or
+ # or it's a string that is all digits
+ (isinstance(x, str) and x.isdigit()) or
+ # otherwise return False
+ False)
+
+
+def is_float(x):
+ return (is_int(x) or
+ # or if it's a float
+ (isinstance(x, float)) or
+ # or it's a string with exactly one decimal and all digits on both sides of
+ # the decimal
+ (isinstance(x, str)
+ and '.' in x and all(map(str.isdigit, x.split('.', 1))))
+ # otherwise return False
+ or False)
+
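+# For example (illustrative values): is_int(3) and is_int('42') are True,
+# while is_int('4.2') is False. is_float accepts ints plus floats and
+# decimal strings, so is_float('4.2') is True but is_float('abc') is False.
+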
+# Exceptions
+
+
+class InvalidArgumentApplicationError(Exception):
+ pass
+
+
+class MissingRequiredArgumentApplicationError(Exception):
+ pass
+
+# Base class
+
+
+class BWA(CommandLineApplication):
+
+ """BWA generic application controller. Do not instantiate directly.
+
+ Instead of instantiating this class, instantiate a subclass for each
+ subcommand. Available subclasses are:
+ BWA_index
+ BWA_aln
+ BWA_samse
+ BWA_sampe
+ BWA_bwasw
+ """
+
+ # all subclasses will accept dictionaries as input that specify input
+ # and output files. The required (and optional) types of input and output
+ # files differ by subcommand.
+ _input_handler = "_input_as_dict"
+
+ # the main command. The program bwa should be in the PATH
+ _command = "bwa"
+
+ # holds the values of the dict handled by the input handler
+ _input = {}
+
+    # Each subclass can have a dictionary (keys = option names, e.g., -a,
+    # and values = boolean functions) called _valid_arguments
+    # that specifies checks to be made on the parameters.
+ def check_arguments(self):
+ """Sanity check the arguments passed in.
+
+ Uses the boolean functions specified in the subclasses in the
+ _valid_arguments dictionary to determine if an argument is valid
+ or invalid.
+ """
+ for k, v in self.Parameters.iteritems():
+ if self.Parameters[k].isOn():
+ if k in self._valid_arguments:
+ if not self._valid_arguments[k](v.Value):
+ error_message = 'Invalid argument (%s) ' % v.Value
+ error_message += 'for parameter %s\n' % k
+ raise InvalidArgumentApplicationError(error_message)
+
+ def _get_base_command(self):
+ """ Returns the full command string
+
+ Overridden here because there are positional arguments (specifically
+ the input and output files).
+ """
+ command_parts = []
+ # Append a change directory to the beginning of the command to change
+ # to self.WorkingDir before running the command
+ # WorkingDir should be in quotes -- filenames might contain spaces
+ cd_command = ''.join(['cd ', str(self.WorkingDir), ';'])
+ if self._command is None:
+ raise ApplicationError('_command has not been set.')
+ command = self._command
+ # also make sure there's a subcommand!
+ if self._subcommand is None:
+ raise ApplicationError('_subcommand has not been set.')
+ subcommand = self._subcommand
+ # sorting makes testing easier, since the options will be written out
+ # in alphabetical order. Could of course use option parsing scripts
+ # in cogent for this, but this works as well.
+ parameters = sorted([str(x) for x in self.Parameters.values()
+ if str(x)])
+
+ command_parts.append(cd_command)
+ command_parts.append(command)
+ # add in subcommand
+ command_parts.append(subcommand)
+ command_parts += parameters
+ # add in the positional arguments in the correct order
+ for k in self._input_order:
+ # this check is necessary to account for optional positional
+ # arguments, such as the mate file for bwa bwasw
+ # Note that the input handler will ensure that all required
+ # parameters have valid values
+ if k in self._input:
+ command_parts.append(self._input[k])
+
+ return self._command_delimiter.join(command_parts).strip()
+
+ BaseCommand = property(_get_base_command)
+
+ def _input_as_dict(self, data):
+ """Takes dictionary that sets input and output files.
+
+ Valid keys for the dictionary are specified in the subclasses. File
+ paths must be absolute.
+ """
+ # clear self._input; ready to receive new input and output files
+ self._input = {}
+ # Check that the arguments to the
+ # subcommand-specific parameters are valid
+ self.check_arguments()
+
+ # Ensure that we have all required input (file I/O)
+ for k in self._input_order:
+ # N.B.: optional positional arguments begin with underscore (_)!
+ # (e.g., see _mate_in for bwa bwasw)
+ if k[0] != '_' and k not in data:
+ raise MissingRequiredArgumentApplicationError("Missing "
+ "required "
+ "input %s" % k)
+
+ # Set values for input and output files
+ for k in data:
+ # check for unexpected keys in the dict
+ if k not in self._input_order:
+ error_message = "Invalid input arguments (%s)\n" % k
+ error_message += "Valid keys are: %s" % repr(self._input_order)
+ raise InvalidArgumentApplicationError(error_message + '\n')
+
+ # check for absolute paths
+            if not isabs(data[k]):
+ raise InvalidArgumentApplicationError("Only absolute paths "
+ "allowed.\n%s" %
+ repr(data))
+ self._input[k] = data[k]
+
+ # if there is a -f option to specify an output file, force the user to
+        # use it (otherwise output goes to stdout)
+ if '-f' in self.Parameters and not self.Parameters['-f'].isOn():
+ raise InvalidArgumentApplicationError("Please specify an output "
+ "file with -f")
+
+ return ''
+
+
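+# Example of the dict-style input taken by the BWA subclasses (a minimal
+# sketch; the paths are hypothetical and bwa must be on the PATH). Keys name
+# the positional file arguments of the subcommand and values are absolute
+# paths:
+#
+#     index = BWA_index()
+#     index({'fasta_in': '/tmp/db.fna'})
+#
+#     aln = BWA_aln(params={'-f': '/tmp/reads.sai'})
+#     aln({'prefix': '/tmp/db.fna', 'fastq_in': '/tmp/reads.fastq'})
+
+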
+class BWA_index(BWA):
+
+ """Controls the "index" subcommand of the bwa application.
+
+ Valid input keys are: fasta_in
+ """
+
+ # the subcommand for bwa index
+ _subcommand = "index"
+
+ _parameters = {
+ # which algorithm to use.
+ # is
+ # IS linear-time algorithm for constructing suffix array. It requires
+ # 5.37N memory where N is the size of the database. IS is moderately
+        # fast, but does not work with databases larger than 2GB. IS is the
+ # default algorithm due to its simplicity. The current codes for IS
+ # algorithm are reimplemented by Yuta Mori.
+ #
+ # bwtsw
+ # Algorithm implemented in BWT-SW. This method works with the whole
+        # human genome, but it does not work with databases smaller than 10MB
+ # and it is usually slower than IS.
+ #
+ # DEFAULTs to auto-select (based on input fasta file size)
+ '-a': ValuedParameter('-', Delimiter=' ', Name='a'),
+
+ # prefix for the output index.
+ # DEFAULTs to the base name of the input fasta file
+ '-p': ValuedParameter('-', Delimiter=' ', Name='p'),
+
+ # index files named as <in.fasta>.64.* instead of <in.fasta>.*
+ '-6': FlagParameter('-', Name='6')
+ }
+
+    # The -a command can take one of only two possible values
+    # the -p command allows the user to specify a prefix; for our purposes,
+    # this prefix should be an absolute path
+ _valid_arguments = {
+ '-a': lambda x: x in ['is', 'bwtsw'],
+ '-p': isabs
+ }
+
+ # For the position specific arguments, this is the order that they will
+ # be written in the base command
+ # input file keys beginning with _ are optional inputs
+ _input_order = ['fasta_in']
+
+ def _get_result_paths(self, data):
+ """Gets the results for a run of bwa index.
+
+ bwa index outputs 5 files when the index is created. The filename
+ prefix will be the same as the input fasta, unless overridden with
+ the -p option, and the 5 extensions are listed below:
+
+ .amb
+ .ann
+ .bwt
+ .pac
+ .sa
+
+        and these extensions (including the period) are the keys to the
+ dictionary that is returned.
+ """
+
+ # determine the names of the files. The name will be the same as the
+ # input fasta file unless overridden with the -p option
+ if self.Parameters['-p'].isOn():
+ prefix = self.Parameters['-p'].Value
+ else:
+ prefix = data['fasta_in']
+
+ # the 5 output file suffixes
+ suffixes = ['.amb', '.ann', '.bwt', '.pac', '.sa']
+ out_files = {}
+ for suffix in suffixes:
+ out_files[suffix] = ResultPath(prefix + suffix, IsWritten=True)
+
+ return out_files
+
+
+class BWA_aln(BWA):
+
+ """Controls the "aln" subcommand of the bwa application.
+
+ Valid input keys are: prefix, fastq_in
+ """
+ _parameters = {
+ # max #diff (int) or missing prob under 0.02 err rate (float) [0.04]
+ '-n': ValuedParameter('-', Delimiter=' ', Name='n'),
+ # maximum number or fraction of gap opens [1]
+ '-o': ValuedParameter('-', Delimiter=' ', Name='o'),
+
+ # maximum number of gap extensions, -1 for disabling long gaps
+ # [-1]
+ '-e': ValuedParameter('-', Delimiter=' ', Name='e'),
+
+ # do not put an indel within bp towards the ends [5]
+ '-i': ValuedParameter('-', Delimiter=' ', Name='i'),
+
+ # maximum occurrences for extending a long deletion [10]
+ '-d': ValuedParameter('-', Delimiter=' ', Name='d'),
+
+ # seed length [32]
+ '-l': ValuedParameter('-', Delimiter=' ', Name='l'),
+
+ # maximum differences in the seed [2]
+ '-k': ValuedParameter('-', Delimiter=' ', Name='k'),
+
+ # maximum entries in the queue [2000000]
+ '-m': ValuedParameter('-', Delimiter=' ', Name='m'),
+
+ # number of threads [1]
+ '-t': ValuedParameter('-', Delimiter=' ', Name='t'),
+
+ # mismatch penalty [3]
+ '-M': ValuedParameter('-', Delimiter=' ', Name='M'),
+
+ # gap open penalty [11]
+ '-O': ValuedParameter('-', Delimiter=' ', Name='O'),
+
+ # gap extension penalty [4]
+ '-E': ValuedParameter('-', Delimiter=' ', Name='E'),
+
+ # stop searching when there are > equally best hits [30]
+ '-R': ValuedParameter('-', Delimiter=' ', Name='R'),
+
+ # quality threshold for read trimming down to 35bp [0]
+ '-q': ValuedParameter('-', Delimiter=' ', Name='q'),
+
+ # file to write output to instead of stdout
+ '-f': ValuedParameter('-', Delimiter=' ', Name='f'),
+
+ # length of barcode
+ '-B': ValuedParameter('-', Delimiter=' ', Name='B'),
+
+ # log-scaled gap penalty for long deletions
+ '-L': FlagParameter('-', Name='L'),
+
+ # non-iterative mode: search for all n-difference hits (slooow)
+ '-N': FlagParameter('-', Name='N'),
+
+ # the input is in the Illumina 1.3+ FASTQ-like format
+ '-I': FlagParameter('-', Name='I'),
+
+ # the input read file is in the BAM format
+ '-b': FlagParameter('-', Name='b'),
+
+ # use single-end reads only (effective with -b)
+ '-0': FlagParameter('-', Name='0'),
+
+ # use the 1st read in a pair (effective with -b)
+ '-1': FlagParameter('-', Name='1'),
+
+ # use the 2nd read in a pair (effective with -b)
+ '-2': FlagParameter('-', Name='2'),
+
+ # filter Casava-filtered sequences
+ '-Y': FlagParameter('-', Name='Y')
+ }
+
+ # the subcommand for bwa aln
+ _subcommand = 'aln'
+
+ _valid_arguments = {
+ # check to see if this is decimal numbers
+ '-n': is_float,
+
+ # check to see if these are integers
+ '-o': is_int,
+ '-e': is_int,
+ '-i': is_int,
+ '-d': is_int,
+ '-l': is_int,
+ '-k': is_int,
+ '-m': is_int,
+ '-t': is_int,
+ '-M': is_int,
+ '-O': is_int,
+ '-E': is_int,
+ '-R': is_int,
+ '-q': is_int,
+ '-B': is_int,
+
+ # check to see if this is an absolute file path
+ '-f': isabs
+ }
+
+ # input file keys beginning with _ are optional inputs
+ _input_order = ['prefix', 'fastq_in']
+
+ def _get_result_paths(self, data):
+ """Gets the result file for a bwa aln run.
+
+ There is only one output file of a bwa aln run, a .sai file
+ and it can be retrieved with the key 'output'.
+ """
+ return {'output': ResultPath(self.Parameters['-f'].Value,
+ IsWritten=True)}
+
+
+class BWA_samse(BWA):
+
+ """Controls the "samse" subcommand of the bwa application.
+
+ Valid input keys are: prefix, sai_in, fastq_in
+ """
+ _parameters = {
+ # Maximum number of alignments to output in the XA tag for reads
+ # paired properly. If a read has more than this number of hits, the
+ # XA tag will not be written
+ '-n': ValuedParameter('-', Delimiter=' ', Name='n'),
+
+ # file to write output to instead of stdout
+ '-f': ValuedParameter('-', Delimiter=' ', Name='f'),
+
+ # Specify the read group in a format like '@RG\tID:foo\tSM:bar'
+ '-r': ValuedParameter('-', Delimiter=' ', Name='r')
+ }
+
+ # the subcommand for samse
+ _subcommand = 'samse'
+
+ _valid_arguments = {
+ # make sure that this is an int
+ '-n': is_int,
+
+ # check to see if this is an absolute file path
+ '-f': isabs
+ }
+
+ # input file keys beginning with _ are optional inputs
+ _input_order = ['prefix', 'sai_in', 'fastq_in']
+
+ def _get_result_paths(self, data):
+ """Gets the result file for a bwa samse run.
+
+ There is only one output file of a bwa samse run, a .sam file
+ and it can be retrieved with the key 'output'.
+ """
+ return {'output': ResultPath(self.Parameters['-f'].Value,
+ IsWritten=True)}
+
+
+class BWA_sampe(BWA):
+
+ """Controls the "sampe" subcommand of the bwa application.
+
+ Valid input keys are: prefix, sai1_in, sai2_in, fastq1_in,
+ fastq2_in
+ """
+ _parameters = {
+ # Maximum insert size for a read pair to be considered being mapped
+ # properly
+ '-a': ValuedParameter('-', Delimiter=' ', Name='a'),
+
+ # Maximum occurrences of a read for pairing
+ '-o': ValuedParameter('-', Delimiter=' ', Name='o'),
+
+ # Load the entire FM-index into memory to reduce disk operations
+ '-P': FlagParameter('-', Name='P'),
+
+ # maximum hits to output for paired reads [3]
+ '-n': ValuedParameter('-', Delimiter=' ', Name='n'),
+
+ # maximum hits to output for discordant pairs [10]
+ '-N': ValuedParameter('-', Delimiter=' ', Name='N'),
+
+ # file to write output to instead of stdout
+ '-f': ValuedParameter('-', Delimiter=' ', Name='f'),
+
+ # Specify the read group in a format like '@RG\tID:foo\tSM:bar'
+ '-r': ValuedParameter('-', Delimiter=' ', Name='r'),
+
+ # disable Smith-Waterman for the unmapped mate
+ '-s': FlagParameter('-', Name='s'),
+
+ # prior of chimeric rate (lower bound) [1.0e-05]
+ '-c': ValuedParameter('-', Delimiter=' ', Name='c'),
+
+ # disable insert size estimate (force -s)
+ '-A': FlagParameter('-', Name='A')
+ }
+
+ # the subcommand for sampe
+ _subcommand = 'sampe'
+
+ _valid_arguments = {
+ # make sure this is a float
+ '-c': is_float,
+
+ # make sure these are all ints
+ '-a': is_int,
+ '-o': is_int,
+ '-n': is_int,
+ '-N': is_int,
+
+ # check to see if this is an absolute file path
+ '-f': isabs
+ }
+
+ # input file keys beginning with _ are optional inputs
+ _input_order = ['prefix', 'sai1_in', 'sai2_in',
+ 'fastq1_in', 'fastq2_in']
+
+ def _get_result_paths(self, data):
+ """Gets the result file for a bwa sampe run.
+
+ There is only one output file of a bwa sampe run, a .sam file,
+ and it can be retrieved with the key 'output'.
+ """
+ return {'output': ResultPath(self.Parameters['-f'].Value,
+ IsWritten=True)}
+
+
+class BWA_bwasw(BWA):
+
+ """Controls the "bwasw" subcommand of the bwa application.
+
+    Valid input keys are: prefix, query_fasta, _query_fasta_2
+ input keys beginning with an underscore are optional.
+ """
+ _parameters = {
+ # Score of a match [1]
+ '-a': ValuedParameter('-', Delimiter=' ', Name='a'),
+
+ # Mismatch penalty [3]
+ '-b': ValuedParameter('-', Delimiter=' ', Name='b'),
+
+ # Gap open penalty [5]
+ '-q': ValuedParameter('-', Delimiter=' ', Name='q'),
+
+ # Gap extension penalty.
+ '-r': ValuedParameter('-', Delimiter=' ', Name='r'),
+
+ # mask level [0.50]
+ '-m': ValuedParameter('-', Delimiter=' ', Name='m'),
+
+ # Number of threads in the multi-threading mode [1]
+ '-t': ValuedParameter('-', Delimiter=' ', Name='t'),
+
+ # file to output results to instead of stdout
+ '-f': ValuedParameter('-', Delimiter=' ', Name='f'),
+
+ # Band width in the banded alignment [33]
+ '-w': ValuedParameter('-', Delimiter=' ', Name='w'),
+
+ # Minimum score threshold divided by a [30]
+ '-T': ValuedParameter('-', Delimiter=' ', Name='T'),
+
+ # Coefficient for threshold adjustment according to query length.
+ # Given an l-long query, the threshold for a hit to be retained is
+ # a*max{T,c*log(l)}. [5.5]
+ '-c': ValuedParameter('-', Delimiter=' ', Name='c'),
+
+ # Z-best heuristics. Higher -z increases accuracy at the cost
+ # of speed. [1]
+ '-z': ValuedParameter('-', Delimiter=' ', Name='z'),
+
+ # Maximum SA interval size for initiating a seed. Higher -s increases
+ # accuracy at the cost of speed. [3]
+ '-s': ValuedParameter('-', Delimiter=' ', Name='s'),
+
+ # Minimum number of seeds supporting the resultant alignment to
+ # trigger reverse alignment. [5]
+ '-N': ValuedParameter('-', Delimiter=' ', Name='N'),
+
+ # in SAM output, use hard clipping instead of soft clipping
+ '-H': FlagParameter('-', Name='H'),
+
+ # mark multi-part alignments as secondary
+ '-M': FlagParameter('-', Name='M'),
+
+        # skip Smith-Waterman read pairing
+ '-S': FlagParameter('-', Name='S'),
+
+        # ignore pairs with insert >= INT for inferring the size
+        # distribution [20000]
+ '-I': ValuedParameter('-', Delimiter=' ', Name='I')
+ }
+
+    # the subcommand for bwasw
+ _subcommand = 'bwasw'
+
+ # input file keys beginning with _ are optional inputs
+ _input_order = ['prefix', 'query_fasta', '_query_fasta_2']
+
+ _valid_arguments = {
+ # Make sure this is a float
+ '-c': is_float,
+ '-m': is_float,
+
+ # Make sure these are ints
+ '-a': is_int,
+ '-b': is_int,
+ '-q': is_int,
+ '-r': is_int,
+ '-t': is_int,
+ '-w': is_int,
+ '-T': is_int,
+ '-z': is_int,
+ '-s': is_int,
+ '-N': is_int,
+ '-I': is_int,
+
+ # make sure this is an absolute path
+ '-f': isabs
+ }
+
+ def _get_result_paths(self, data):
+ """Gets the result file for a bwa bwasw run.
+
+ There is only one output file of a bwa bwasw run, a .sam file,
+ and it can be retrieved with the key 'output'.
+ """
+ return {'output': ResultPath(self.Parameters['-f'].Value,
+ IsWritten=True)}
+
+
+def create_bwa_index_from_fasta_file(fasta_in, params=None):
+ """Create a BWA index from an input fasta file.
+
+ fasta_in: the input fasta file from which to create the index
+    params: dict of bwa index specific parameters
+
+ This method returns a dictionary where the keys are the various
+ output suffixes (.amb, .ann, .bwt, .pac, .sa) and the values
+ are open file objects.
+
+ The index prefix will be the same as fasta_in, unless the -p parameter
+ is passed in params.
+ """
+ if params is None:
+ params = {}
+
+ # Instantiate the app controller
+ index = BWA_index(params)
+
+ # call the application, passing the fasta file in
+ results = index({'fasta_in': fasta_in})
+ return results
+
+
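+# Example usage of create_bwa_index_from_fasta_file (a minimal sketch; the
+# path is hypothetical and bwa must be on the PATH):
+#
+#     index_files = create_bwa_index_from_fasta_file('/tmp/db.fna')
+#     bwt_fp = index_files['.bwt'].name  # path to the .bwt index file
+
+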
+def assign_reads_to_database(query, database_fasta, out_path, params=None):
+ """Assign a set of query sequences to a reference database
+
+    query: absolute file path to query sequences
+    database_fasta: absolute file path to the reference database
+    out_path: absolute file path of the file to be output
+ params: dict of BWA specific parameters.
+ * Specify which algorithm to use (bwa-short or bwasw) using the
+ dict key "algorithm"
+ * if algorithm is bwasw, specify params for the bwa bwasw
+ subcommand
+ * if algorithm is bwa-short, specify params for the bwa samse
+ subcommand
+ * if algorithm is bwa-short, must also specify params to use with
+ bwa aln, which is used to get the sai file necessary to run samse.
+ bwa aln params should be passed in using dict key "aln_params" and
+ the associated value should be a dict of params for the bwa aln
+ subcommand
+ * if a temporary directory is not specified in params using dict
+ key "temp_dir", it will be assumed to be /tmp
+
+ This method returns an open file object (SAM format).
+ """
+ if params is None:
+ params = {}
+
+ # set the output path
+ params['-f'] = out_path
+
+ # if the algorithm is not specified in the params dict, or the algorithm
+ # is not recognized, raise an exception
+ if 'algorithm' not in params:
+ raise InvalidArgumentApplicationError("Must specify which algorithm to"
+ " use ('bwa-short' or 'bwasw')")
+ elif params['algorithm'] not in ('bwa-short', 'bwasw'):
+ raise InvalidArgumentApplicationError("Unknown algorithm '%s' Please "
+ "enter either 'bwa-short' or "
+ "'bwasw'." % params['algorithm'])
+
+ # if the temp directory is not specified, assume /tmp
+ if 'temp_dir' not in params:
+ params['temp_dir'] = '/tmp'
+
+    # if the algorithm is bwa-short, we must first run bwa aln to get a .sai
+ # file before calling bwa samse on that sai file, so we need to know how
+ # to run bwa aln. Therefore, we must ensure there's an entry containing
+ # those parameters
+ if params['algorithm'] == 'bwa-short':
+ if 'aln_params' not in params:
+ raise InvalidArgumentApplicationError("With bwa-short, need to "
+ "specify a key 'aln_params' "
+ "and its value, a dictionary"
+ " to pass to bwa aln, since"
+ " bwa aln is an intermediate"
+ " step when doing "
+ "bwa-short.")
+
+    # the params dict contains keys such as "algorithm" and "temp_dir" that
+    # are not meant for any of the subcommands, so make a new params dict
+    # that is the same as the original minus these extra keys
+ subcommand_params = {}
+ for k, v in params.iteritems():
+ if k not in ('algorithm', 'temp_dir', 'aln_params'):
+ subcommand_params[k] = v
+
+ # build index from database_fasta
+ # get a temporary file name that is not in use
+ _, index_prefix = mkstemp(dir=params['temp_dir'], suffix='')
+
+ create_bwa_index_from_fasta_file(database_fasta, {'-p': index_prefix})
+
+ # if the algorithm is bwasw, things are pretty simple. Just instantiate
+ # the proper controller and set the files
+ if params['algorithm'] == 'bwasw':
+ bwa = BWA_bwasw(params=subcommand_params)
+ files = {'prefix': index_prefix, 'query_fasta': query}
+
+ # if the algorithm is bwa-short, it's not so simple
+ elif params['algorithm'] == 'bwa-short':
+ # we have to call bwa_aln to get the sai file needed for samse
+ # use the aln_params we ensured we had above
+ bwa_aln = BWA_aln(params=params['aln_params'])
+ aln_files = {'prefix': index_prefix, 'fastq_in': query}
+ # get the path to the sai file
+ sai_file_path = bwa_aln(aln_files)['output'].name
+
+ # we will use that sai file to run samse
+ bwa = BWA_samse(params=subcommand_params)
+ files = {'prefix': index_prefix, 'sai_in': sai_file_path,
+ 'fastq_in': query}
+
+    # run whichever app controller we decided was correct on the files
+ # we set up
+ result = bwa(files)
+
+ # they both return a SAM file, so return that
+ return result['output']
+
+
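+# Example usage of assign_reads_to_database (a minimal sketch; the paths are
+# hypothetical and bwa must be on the PATH). The bwa-short algorithm needs a
+# nested 'aln_params' dict for the intermediate bwa aln step:
+#
+#     sam = assign_reads_to_database(
+#         '/tmp/reads.fastq', '/tmp/db.fna', '/tmp/out.sam',
+#         params={'algorithm': 'bwa-short',
+#                 'aln_params': {'-f': '/tmp/reads.sai'}})
+
+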
+def assign_dna_reads_to_dna_database(query_fasta_fp, database_fasta_fp, out_fp,
+                                     params=None):
+ """Wraps assign_reads_to_database, setting various parameters.
+
+ The default settings are below, but may be overwritten and/or added to
+ using the params dict:
+
+ algorithm: bwasw
+ """
+    if params is None:
+        params = {}
+    my_params = {'algorithm': 'bwasw'}
+    my_params.update(params)
+
+ result = assign_reads_to_database(query_fasta_fp, database_fasta_fp,
+ out_fp, my_params)
+
+ return result
+
+
+def assign_dna_reads_to_protein_database(query_fasta_fp, database_fasta_fp,
+                                         out_fp, temp_dir='/tmp',
+                                         params=None):
+ """Wraps assign_reads_to_database, setting various parameters.
+
+ Not yet implemented, as BWA can only align DNA reads to DNA databases.
+ """
+ raise NotImplementedError("BWA cannot at this point align DNA to protein")
diff --git a/bfillings/cd_hit.py b/bfillings/cd_hit.py
new file mode 100644
index 0000000..943f511
--- /dev/null
+++ b/bfillings/cd_hit.py
@@ -0,0 +1,343 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+
+"""Application controller for CD-HIT v3.1.1"""
+
+import shutil
+from os import remove
+from tempfile import mkstemp, mkdtemp
+
+from skbio.parse.sequences import parse_fasta
+from burrito.parameters import ValuedParameter
+from burrito.util import CommandLineApplication, ResultPath
+
+from cogent.core.moltype import RNA, DNA, PROTEIN
+from cogent.core.alignment import SequenceCollection
+
+__author__ = "Daniel McDonald"
+__copyright__ = "Copyright 2007-2012, The Cogent Project"
+__credits__ = ["Daniel McDonald"]
+__license__ = "GPL"
+__version__ = "1.5.3-dev"
+__maintainer__ = "Daniel McDonald"
+__email__ = "mcdonadt at colorado.edu"
+__status__ = "Development"
+
+
+class CD_HIT(CommandLineApplication):
+ """cd-hit Application Controller
+
+ Use this version of CD-HIT if your MolType is PROTEIN
+ """
+
+ _command = 'cd-hit'
+ _input_handler = '_input_as_multiline_string'
+ _parameters = {
+ # input input filename in fasta format, required
+ '-i':ValuedParameter('-',Name='i',Delimiter=' ',IsPath=True),
+
+ # output filename, required
+ '-o':ValuedParameter('-',Name='o',Delimiter=' ',IsPath=True),
+
+ # sequence identity threshold, default 0.9
+        # this is cd-hit's default "global sequence identity", calculated as:
+ # number of identical amino acids in alignment
+ # divided by the full length of the shorter sequence
+ '-c':ValuedParameter('-',Name='c',Delimiter=' '),
+
+        # use global sequence identity, default 1
+        # if set to 0, then use local sequence identity, calculated as:
+        # number of identical amino acids in alignment
+        # divided by the length of the alignment
+        # NOTE!!! don't use -G 0 unless you use alignment coverage controls
+        # see options -aL, -AL, -aS, -AS
+        '-G':ValuedParameter('-',Name='G',Delimiter=' '),
+
+ # band_width of alignment, default 20
+ '-b':ValuedParameter('-',Name='b',Delimiter=' '),
+
+ # max available memory (Mbyte), default 400
+ '-M':ValuedParameter('-',Name='M',Delimiter=' '),
+
+ # word_length, default 8, see user's guide for choosing it
+ '-n':ValuedParameter('-',Name='n',Delimiter=' '),
+
+ # length of throw_away_sequences, default 10
+ '-l':ValuedParameter('-',Name='l',Delimiter=' '),
+
+ # tolerance for redundance, default 2
+ '-t':ValuedParameter('-',Name='t',Delimiter=' '),
+
+ # length of description in .clstr file, default 20
+ # if set to 0, it takes the fasta defline and stops at first space
+ '-d':ValuedParameter('-',Name='d',Delimiter=' '),
+
+ # length difference cutoff, default 0.0
+ # if set to 0.9, the shorter sequences need to be
+ # at least 90% length of the representative of the cluster
+ '-s':ValuedParameter('-',Name='s',Delimiter=' '),
+
+ # length difference cutoff in amino acid, default 999999
+        # if set to 60, the length difference between the shorter sequences
+        # and the representative of the cluster cannot be bigger than 60
+ '-S':ValuedParameter('-',Name='S',Delimiter=' '),
+
+ # alignment coverage for the longer sequence, default 0.0
+        # if set to 0.9, the alignment must cover 90% of the sequence
+ '-aL':ValuedParameter('-',Name='aL',Delimiter=' '),
+
+ # alignment coverage control for the longer sequence, default 99999999
+ # if set to 60, and the length of the sequence is 400,
+ # then the alignment must be >= 340 (400-60) residues
+ '-AL':ValuedParameter('-',Name='AL',Delimiter=' '),
+
+ # alignment coverage for the shorter sequence, default 0.0
+        # if set to 0.9, the alignment must cover 90% of the sequence
+ '-aS':ValuedParameter('-',Name='aS',Delimiter=' '),
+
+ # alignment coverage control for the shorter sequence, default 99999999
+ # if set to 60, and the length of the sequence is 400,
+ # then the alignment must be >= 340 (400-60) residues
+ '-AS':ValuedParameter('-',Name='AS',Delimiter=' '),
+
+ # 1 or 0, default 0, by default, sequences are stored in RAM
+ # if set to 1, sequence are stored on hard drive
+ # it is recommended to use -B 1 for huge databases
+ '-B':ValuedParameter('-',Name='B',Delimiter=' '),
+
+ # 1 or 0, default 0
+ # if set to 1, print alignment overlap in .clstr file
+ '-p':ValuedParameter('-',Name='p',Delimiter=' '),
+
+ # 1 or 0, default 0
+ # by cd-hit's default algorithm, a sequence is clustered to the first
+        # cluster that meets the threshold (fast mode). If set to 1, the
+        # program will cluster it into the most similar cluster that meets
+        # the threshold (accurate but slow mode)
+ # but either 1 or 0 won't change the representatives of final clusters
+ '-g':ValuedParameter('-',Name='g',Delimiter=' '),
+
+ # print this help
+ '-h':ValuedParameter('-',Name='h',Delimiter=' ')
+ }
+ _synonyms = {'Similarity':'-c'}
+
+ def getHelp(self):
+ """Method that points to documentation"""
+ help_str =\
+ """
+ CD-HIT is hosted as an open source project at:
+ http://www.bioinformatics.org/cd-hit/
+
+ The following papers should be cited if this resource is used:
+
+        "Clustering of highly homologous sequences to reduce the size of large
+        protein database", Weizhong Li, Lukasz Jaroszewski & Adam Godzik
+        Bioinformatics, (2001) 17:282-283
+
+        "Tolerating some redundancy significantly speeds up clustering of large
+        protein databases", Weizhong Li, Lukasz Jaroszewski & Adam Godzik
+ Bioinformatics, (2002) 18:77-82
+ """
+ return help_str
+
+ def _input_as_multiline_string(self, data):
+ """Writes data to tempfile and sets -i parameter
+
+ data -- list of lines
+ """
+ if data:
+ self.Parameters['-i']\
+ .on(super(CD_HIT,self)._input_as_multiline_string(data))
+ return ''
+
+ def _input_as_lines(self, data):
+ """Writes data to tempfile and sets -i parameter
+
+ data -- list of lines, ready to be written to file
+ """
+ if data:
+ self.Parameters['-i']\
+ .on(super(CD_HIT,self)._input_as_lines(data))
+ return ''
+
+ def _input_as_seqs(self, data):
+ """Creates a list of seqs to pass to _input_as_lines
+
+ data -- list like object of sequences
+ """
+ lines = []
+ for i,s in enumerate(data):
+ # will number the sequences 1,2,3, etc...
+ lines.append(''.join(['>',str(i+1)]))
+ lines.append(s)
+ return self._input_as_lines(lines)
+
+ def _input_as_string(self, data):
+ """Makes data the value of a specific parameter"""
+ if data:
+ self.Parameters['-i'].on(str(data))
+ return ''
+
+ def _get_seqs_outfile(self):
+ """Returns the absolute path to the seqs outfile"""
+ if self.Parameters['-o'].isOn():
+ return self.Parameters['-o'].Value
+ else:
+ raise ValueError, "No output file specified"
+
+ def _get_clstr_outfile(self):
+ """Returns the absolute path to the clstr outfile"""
+ if self.Parameters['-o'].isOn():
+ return ''.join([self.Parameters['-o'].Value, '.clstr'])
+ else:
+ raise ValueError, "No output file specified"
+
+ def _get_result_paths(self, data):
+ """Return dict of {key: ResultPath}"""
+ result = {}
+ result['FASTA'] = ResultPath(Path=self._get_seqs_outfile())
+ result['CLSTR'] = ResultPath(Path=self._get_clstr_outfile())
+ return result
+
+class CD_HIT_EST(CD_HIT):
+ """cd-hit Application Controller
+
+ Use this version of CD-HIT if your MolType is PROTEIN
+ """
+
+ _command = 'cd-hit-est'
+ _input_handler = '_input_as_multiline_string'
+    # copy so that updating here does not mutate CD_HIT._parameters
+    _parameters = CD_HIT._parameters.copy()
+ _parameters.update({\
+ # 1 or 0, default 0, by default only +/+ strand alignment
+ # if set to 1, do both +/+ & +/- alignments
+ '-r':ValuedParameter('-',Name='r',Delimiter=' ')
+ })
+
+def cdhit_clusters_from_seqs(seqs, moltype=DNA, params=None):
+ """Returns the CD-HIT clusters given seqs
+
+ seqs : dict like collection of sequences
+ moltype : cogent.core.moltype object
+ params : cd-hit parameters
+
+    NOTE: This method will call CD_HIT if moltype is PROTEIN,
+ CD_HIT_EST if moltype is RNA/DNA, and raise if any other
+ moltype is passed.
+ """
+    # ids are remapped to short integer-based ids for cd-hit and restored
+    # from int_keys after clustering
+ seqs = SequenceCollection(seqs, MolType=moltype)
+ #Create mapping between abbreviated IDs and full IDs
+ int_map, int_keys = seqs.getIntMap()
+ #Create SequenceCollection from int_map.
+ int_map = SequenceCollection(int_map,MolType=moltype)
+
+ # setup params and make sure the output argument is set
+ if params is None:
+ params = {}
+ if '-o' not in params:
+ _, params['-o'] = mkstemp()
+
+    # call the correct version of cd-hit based on moltype
+ working_dir = mkdtemp()
+ if moltype is PROTEIN:
+ app = CD_HIT(WorkingDir=working_dir, params=params)
+ elif moltype is RNA:
+ app = CD_HIT_EST(WorkingDir=working_dir, params=params)
+ elif moltype is DNA:
+ app = CD_HIT_EST(WorkingDir=working_dir, params=params)
+ else:
+ raise ValueError, "Moltype must be either PROTEIN, RNA, or DNA"
+
+ # grab result
+ res = app(int_map.toFasta())
+ clusters = parse_cdhit_clstr_file(res['CLSTR'])
+
+ remapped_clusters = []
+ for c in clusters:
+ curr = [int_keys[i] for i in c]
+ remapped_clusters.append(curr)
+
+ # perform cleanup
+ res.cleanUp()
+ shutil.rmtree(working_dir)
+ remove(params['-o'] + '.bak.clstr')
+
+ return remapped_clusters
+
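+# Example usage of cdhit_clusters_from_seqs (a minimal sketch; the sequences
+# are hypothetical and cd-hit must be on the PATH):
+#
+#     seqs = {'s1': 'ACGGCTAGCTAGGCTAGCTACGTA',
+#             's2': 'ACGGCTAGCTAGGCTAGCTACGTT',
+#             's3': 'TTGGACCAGTTGACCATTGGACCA'}
+#     clusters = cdhit_clusters_from_seqs(seqs, moltype=DNA)
+#     # clusters is a list of lists of the original sequence ids
+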
+def cdhit_from_seqs(seqs, moltype, params=None):
+ """Returns the CD-HIT results given seqs
+
+ seqs : dict like collection of sequences
+ moltype : cogent.core.moltype object
+ params : cd-hit parameters
+
+    NOTE: This method will call CD_HIT if moltype is PROTEIN,
+ CD_HIT_EST if moltype is RNA/DNA, and raise if any other
+ moltype is passed.
+ """
+ # keys are not remapped. Tested against seq_ids of 100char length
+ seqs = SequenceCollection(seqs, MolType=moltype)
+
+ # setup params and make sure the output argument is set
+ if params is None:
+ params = {}
+ if '-o' not in params:
+ _, params['-o'] = mkstemp()
+
+    # call the correct version of cd-hit based on moltype
+ working_dir = mkdtemp()
+ if moltype is PROTEIN:
+ app = CD_HIT(WorkingDir=working_dir, params=params)
+ elif moltype is RNA:
+ app = CD_HIT_EST(WorkingDir=working_dir, params=params)
+ elif moltype is DNA:
+ app = CD_HIT_EST(WorkingDir=working_dir, params=params)
+ else:
+ raise ValueError, "Moltype must be either PROTEIN, RNA, or DNA"
+
+ # grab result
+ res = app(seqs.toFasta())
+ new_seqs = dict(parse_fasta(res['FASTA']))
+
+ # perform cleanup
+ res.cleanUp()
+ shutil.rmtree(working_dir)
+ remove(params['-o'] + '.bak.clstr')
+
+ return SequenceCollection(new_seqs, MolType=moltype)
+
+def clean_cluster_seq_id(id):
+ """Returns a cleaned cd-hit sequence id
+
+ The cluster file has sequence ids in the form of:
+ >some_id...
+ """
+ return id[1:-3]
+
+def parse_cdhit_clstr_file(lines):
+ """Returns a list of list of sequence ids representing clusters"""
+ clusters = []
+ curr_cluster = []
+
+ for l in lines:
+ if l.startswith('>Cluster'):
+ if not curr_cluster:
+ continue
+ clusters.append(curr_cluster)
+ curr_cluster = []
+ else:
+ curr_cluster.append(clean_cluster_seq_id(l.split()[2]))
+
+ if curr_cluster:
+ clusters.append(curr_cluster)
+
+ return clusters
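+
+
+# Example of the .clstr format consumed by parse_cdhit_clstr_file (a minimal
+# sketch; the ids are hypothetical):
+#
+#     >Cluster 0
+#     0   24nt, >s1... *
+#     1   24nt, >s2... at +/95.83%
+#     >Cluster 1
+#     0   24nt, >s3... *
+#
+# Parsing these lines yields [['s1', 's2'], ['s3']].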
diff --git a/bfillings/clearcut.py b/bfillings/clearcut.py
new file mode 100644
index 0000000..5f53a90
--- /dev/null
+++ b/bfillings/clearcut.py
@@ -0,0 +1,401 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+
+"""Provides an application controller for the commandline version of:
+Clearcut v1.0.8
+"""
+from burrito.parameters import (FlagParameter, ValuedParameter,
+ MixedParameter)
+from burrito.util import (CommandLineApplication, ResultPath,
+ get_tmp_filename)
+
+from cogent.core.alignment import SequenceCollection, Alignment
+from cogent.core.moltype import DNA, RNA, PROTEIN
+from cogent.parse.tree import DndParser
+from cogent.core.tree import PhyloNode
+from cogent.util.dict2d import Dict2D
+from cogent.format.table import phylipMatrix
+
+
+MOLTYPE_MAP = {'DNA':'-D',
+ 'RNA':'-D',
+ 'PROTEIN':'-P',
+ }
+
+
+class Clearcut(CommandLineApplication):
+ """ clearcut application controller
+
+ The parameters are organized by function to give some idea of how the
+ program works. However, no restrictions are put on any combinations
+ of parameters. Misuse of parameters can lead to errors or otherwise
+ strange results.
+ """
+ #General options.
+ _general = {\
+ # --verbose. More Output. (Default:OFF)
+ '-v':FlagParameter('-',Name='v'),
+ # --quiet. Silent operation. (Default: ON)
+ '-q':FlagParameter('-',Name='q',Value=True),
+ # --seed=<seed>. Explicitly set the PRNG seed to a specific value.
+ '-s':ValuedParameter('-',Name='s',Delimiter='='),
+ # --norandom. Attempt joins deterministically. (Default: OFF)
+ '-r':FlagParameter('-',Name='r'),
+ # --shuffle. Randomly shuffle the distance matrix. (Default: OFF)
+ '-S':FlagParameter('-',Name='S'),
+ #--neighbor. Use traditional Neighbor-Joining algorithm. (Default: OFF)
+ '-N':FlagParameter('-',Name='N'),
+
+ }
+
+
+ # Input file is distance matrix or alignment. Default expects distance
+ # matrix. Output file is tree created by clearcut.
+ _input = {\
+ # --in=<infilename>. Input file
+ '--in':ValuedParameter('--',Name='in',Delimiter='=',IsPath=True),
+ # --stdin. Read input from STDIN.
+ '-I':FlagParameter('-',Name='I'),
+ # --distance. Input file is a distance matrix. (Default: ON)
+ '-d':FlagParameter('-',Name='d',Value=True),
+ # --alignment. Input file is a set of aligned sequences.
+ # (Default: OFF)
+ '-a':FlagParameter('-',Name='a'),
+ # --DNA. Input alignment are DNA sequences.
+ '-D':FlagParameter('-',Name='D'),
+ # --protein. Input alignment are protein sequences.
+ '-P':FlagParameter('-',Name='P'),
+ }
+
+
+ #Correction model for computing distance matrix (Default: NO Correction):
+ _correction={\
+ # --jukes. Use Jukes-Cantor correction for computing distance matrix.
+ '-j':FlagParameter('-',Name='j'),
+ # --kimura. Use Kimura correction for distance matrix.
+ '-k':FlagParameter('-',Name='k'),
+
+ }
+
+ _output={\
+ # --out=<outfilename>. Output file
+ '--out':ValuedParameter('--',Name='out',Delimiter='=',IsPath=True),
+ # --stdout. Output tree to STDOUT.
+ '-O':FlagParameter('-',Name='O'),
+ # --matrixout=<file> Output distance matrix to specified file.
+ '-m':ValuedParameter('-',Name='m',Delimiter='='),
+ # --ntrees=<n>. Output n trees. (Default: 1)
+ '-n':ValuedParameter('-',Name='n',Delimiter='='),
+ # --expblen. Exponential notation for branch lengths. (Default: OFF)
+ '-e':FlagParameter('-',Name='e'),
+ # --expdist. Exponential notation in distance output. (Default: OFF)
+ '-E':FlagParameter('-',Name='E'),
+
+ }
+
+
+ #NOT SUPPORTED
+ #'-h':FlagParameter('-','h'), #Help
+ #'-V':FlagParameter('-','V'), #Version
+
+
+ _parameters = {}
+ _parameters.update(_general)
+ _parameters.update(_input)
+ _parameters.update(_correction)
+ _parameters.update(_output)
+
+ _command = 'clearcut'
+
+ def getHelp(self):
+ """Method that points to the Clearcut documentation."""
+ help_str =\
+ """
+ See Clearcut homepage at:
+ http://bioinformatics.hungry.com/clearcut/
+ """
+ return help_str
+
+ def _input_as_multiline_string(self, data):
+ """Writes data to tempfile and sets -infile parameter
+
+ data -- list of lines
+ """
+ if data:
+ self.Parameters['--in']\
+ .on(super(Clearcut,self)._input_as_multiline_string(data))
+ return ''
+
+ def _input_as_lines(self,data):
+ """Writes data to tempfile and sets -infile parameter
+
+ data -- list of lines, ready to be written to file
+ """
+ if data:
+ self.Parameters['--in']\
+ .on(super(Clearcut,self)._input_as_lines(data))
+ return ''
+
+ def _input_as_seqs(self,data):
+ """writes sequences to tempfile and sets -infile parameter
+
+ data -- list of sequences
+
+ Adds numbering to the sequences: >1, >2, etc.
+ """
+ lines = []
+ for i,s in enumerate(data):
+ #will number the sequences 1,2,3,etc.
+ lines.append(''.join(['>',str(i+1)]))
+ lines.append(s)
+ return self._input_as_lines(lines)
+
+ def _input_as_string(self,data):
+ """Makes data the value of a specific parameter
+
+ This method returns the empty string. The parameter will be printed
+ automatically once set.
+ """
+ if data:
+ self.Parameters['--in'].on(data)
+ return ''
+
+    def _tree_filename(self):
+        """Return the name of the tree output file
+        """
+        if self.Parameters['--out'].isOn():
+            tree_filename = self._absolute(self.Parameters['--out'].Value)
+        else:
+            raise ValueError("No tree output file specified.")
+        return tree_filename
+
+ def _get_result_paths(self,data):
+ """Return dict of {key: ResultPath}
+ """
+ result = {}
+ if self.Parameters['--out'].isOn():
+ out_name = self._tree_filename()
+ result['Tree'] = ResultPath(Path=out_name,IsWritten=True)
+ return result
+
+
+
+#SOME FUNCTIONS TO EXECUTE THE MOST COMMON TASKS
+
+
+def align_unaligned_seqs(seqs, moltype=DNA, params=None):
+ """Returns an Alignment object from seqs.
+
+ seqs: SequenceCollection object, or data that can be used to build one.
+
+ moltype: a MolType object. DNA, RNA, or PROTEIN.
+
+ params: dict of parameters to pass in to the Clearcut app controller.
+
+ Result will be an Alignment object.
+ """
+ #Clearcut does not support alignment
+    raise NotImplementedError("Clearcut does not support alignment.")
+
+def align_and_build_tree(seqs, moltype, best_tree=False, params=None):
+ """Returns an alignment and a tree from Sequences object seqs.
+
+ seqs: SequenceCollection object, or data that can be used to build one.
+
+ best_tree: if True (default:False), uses a slower but more accurate
+ algorithm to build the tree.
+
+ params: dict of parameters to pass in to the Clearcut app controller.
+
+ The result will be a tuple containing an Alignment object and a
+ cogent.core.tree.PhyloNode object (or None for the alignment and/or tree
+ if either fails).
+ """
+ #Clearcut does not support alignment
+    raise NotImplementedError("Clearcut does not support alignment.")
+
+def build_tree_from_alignment(aln, moltype=DNA, best_tree=False, params=None,
+                              working_dir='/tmp'):
+ """Returns a tree from Alignment object aln.
+
+    aln: a cogent.core.alignment.Alignment object, or data that can be used
+ to build one.
+ - Clearcut only accepts aligned sequences. Alignment object used to
+ handle unaligned sequences.
+
+ moltype: a cogent.core.moltype object.
+ - NOTE: If moltype = RNA, we must convert to DNA since Clearcut v1.0.8
+ gives incorrect results if RNA is passed in. 'U' is treated as an
+ incorrect character and is excluded from distance calculations.
+
+ best_tree: if True (default:False), uses a slower but more accurate
+ algorithm to build the tree.
+
+ params: dict of parameters to pass in to the Clearcut app controller.
+
+    The result will be a cogent.core.tree.PhyloNode object, or None if tree
+ fails.
+ """
+    if params is None:
+        params = {}
+    params['--out'] = get_tmp_filename(working_dir)
+
+ # Create instance of app controller, enable tree, disable alignment
+ app = Clearcut(InputHandler='_input_as_multiline_string', params=params, \
+ WorkingDir=working_dir, SuppressStdout=True,\
+ SuppressStderr=True)
+ #Input is an alignment
+ app.Parameters['-a'].on()
+ #Turn off input as distance matrix
+ app.Parameters['-d'].off()
+
+ #If moltype = RNA, we must convert to DNA.
+ if moltype == RNA:
+ moltype = DNA
+
+ if best_tree:
+ app.Parameters['-N'].on()
+
+ #Turn on correct moltype
+ moltype_string = moltype.label.upper()
+ app.Parameters[MOLTYPE_MAP[moltype_string]].on()
+
+ # Setup mapping. Clearcut clips identifiers. We will need to remap them.
+ # Clearcut only accepts aligned sequences. Let Alignment object handle
+ # unaligned sequences.
+ seq_aln = Alignment(aln,MolType=moltype)
+ #get int mapping
+ int_map, int_keys = seq_aln.getIntMap()
+ #create new Alignment object with int_map
+ int_map = Alignment(int_map)
+
+ # Collect result
+ result = app(int_map.toFasta())
+
+ # Build tree
+ tree = DndParser(result['Tree'].read(), constructor=PhyloNode)
+ for node in tree.tips():
+ node.Name = int_keys[node.Name]
+
+ # Clean up
+ result.cleanUp()
+ del(seq_aln, app, result, int_map, int_keys, params)
+
+ return tree
+
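+# Example usage of build_tree_from_alignment (a minimal sketch; the alignment
+# is hypothetical and clearcut must be on the PATH):
+#
+#     aln = {'a': 'ACGT-ACGT', 'b': 'ACGT-ACGA', 'c': 'ACGTTACGA'}
+#     tree = build_tree_from_alignment(aln, moltype=DNA)
+#     newick = tree.getNewick(with_distances=True)
+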
+def add_seqs_to_alignment(seqs, aln, params=None):
+ """Returns an Alignment object from seqs and existing Alignment.
+
+    seqs: a cogent.core.sequence.Sequence object, or data that can be used
+ to build one.
+
+    aln: a cogent.core.alignment.Alignment object, or data that can be used
+ to build one
+
+ params: dict of parameters to pass in to the Clearcut app controller.
+ """
+ #Clearcut does not support alignment
+    raise NotImplementedError("Clearcut does not support alignment.")
+
+def align_two_alignments(aln1, aln2, params=None):
+ """Returns an Alignment object from two existing Alignments.
+
+ aln1, aln2: cogent.core.alignment.Alignment objects, or data that can be
+ used to build them.
+
+ params: dict of parameters to pass in to the Clearcut app controller.
+ """
+ #Clearcut does not support alignment
+    raise NotImplementedError("Clearcut does not support alignment.")
+
+
+def build_tree_from_distance_matrix(matrix, best_tree=False, params=None,
+ working_dir='/tmp'):
+ """Returns a tree from a distance matrix.
+
+ matrix: a square Dict2D object (cogent.util.dict2d)
+
+ best_tree: if True (default:False), uses a slower but more accurate
+ algorithm to build the tree.
+
+ params: dict of parameters to pass in to the Clearcut app controller.
+
+    The result will be a cogent.core.tree.PhyloNode object, or None if tree
+ fails.
+ """
+    if params is None:
+        params = {}
+    params['--out'] = get_tmp_filename(working_dir)
+
+ # Create instance of app controller, enable tree, disable alignment
+ app = Clearcut(InputHandler='_input_as_multiline_string', params=params, \
+ WorkingDir=working_dir, SuppressStdout=True,\
+ SuppressStderr=True)
+ #Turn off input as alignment
+ app.Parameters['-a'].off()
+ #Input is a distance matrix
+ app.Parameters['-d'].on()
+
+ if best_tree:
+ app.Parameters['-N'].on()
+
+ # Turn the dict2d object into the expected input format
+ matrix_input, int_keys = _matrix_input_from_dict2d(matrix)
+
+ # Collect result
+ result = app(matrix_input)
+
+ # Build tree
+ tree = DndParser(result['Tree'].read(), constructor=PhyloNode)
+
+ # reassign to original names
+ for node in tree.tips():
+ node.Name = int_keys[node.Name]
+
+ # Clean up
+ result.cleanUp()
+ del(app, result, params)
+
+ return tree
+
+def _matrix_input_from_dict2d(matrix):
+ """makes input for running clearcut on a matrix from a dict2D object"""
+ #clearcut truncates names to 10 char- need to rename before and
+ #reassign after
+
+ #make a dict of env_index:full name
+ int_keys = dict([('env_' + str(i), k) for i,k in \
+ enumerate(sorted(matrix.keys()))])
+ #invert the dict
+ int_map = {}
+ for i in int_keys:
+ int_map[int_keys[i]] = i
+
+ #make a new dict2D object with the integer keys mapped to values instead of
+ #the original names
+ new_dists = []
+ for env1 in matrix:
+ for env2 in matrix[env1]:
+ new_dists.append((int_map[env1], int_map[env2], matrix[env1][env2]))
+ int_map_dists = Dict2D(new_dists)
+
+    #names will be fed into the phylipMatrix function - it is the int map names
+ names = sorted(int_map_dists.keys())
+ rows = []
+    #populate rows with values based on the order of names
+    #the following code will work for a square matrix only
+    for key1 in names:
+ row = []
+ for key2 in names:
+ row.append(str(int_map_dists[key1][key2]))
+ rows.append(row)
+ input_matrix = phylipMatrix(rows, names)
+    #input needs a trailing newline or it will fail!
+ input_matrix += '\n'
+
+ return input_matrix, int_keys
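+
+
+# Example of the renaming performed by _matrix_input_from_dict2d (a minimal
+# sketch; the matrix is hypothetical):
+#
+#     matrix = Dict2D({'seqA': {'seqA': 0.0, 'seqB': 0.1},
+#                      'seqB': {'seqA': 0.1, 'seqB': 0.0}})
+#     phylip_str, int_keys = _matrix_input_from_dict2d(matrix)
+#     # int_keys maps 'env_0' -> 'seqA' and 'env_1' -> 'seqB'; phylip_str is
+#     # a PHYLIP-format distance matrix using the env_* names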
diff --git a/bfillings/clustalw.py b/bfillings/clustalw.py
new file mode 100644
index 0000000..b195e15
--- /dev/null
+++ b/bfillings/clustalw.py
@@ -0,0 +1,724 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+
+"""Provides an application controller for the commandline version of:
+CLUSTALW v1.83
+"""
+from numpy.random import randint
+from burrito.parameters import (FlagParameter, ValuedParameter,
+ MixedParameter, FilePath)
+from burrito.util import CommandLineApplication, ResultPath, remove
+
+from cogent.core.alignment import SequenceCollection, Alignment
+from cogent.parse.tree import DndParser
+from cogent.parse.clustal import ClustalParser
+from cogent.core.tree import PhyloNode
+from cogent.core.moltype import RNA, DNA, PROTEIN
+
+
+class Clustalw(CommandLineApplication):
+ """ clustalw application controller
+
+ The parameters are organized by function to give some idea of how the
+ program works. However, no restrictions are put on any combinations
+ of parameters. Misuse of parameters can lead to errors or otherwise
+ strange results.
+
+    You are supposed to choose one action for the program to perform (align,
+    profile, sequences, tree, or bootstrap). If you choose multiple, only the
+    dominant action (see order above) will be executed. By DEFAULT, the -align
+    parameter is turned on. If you decide to turn another one on, you should
+    also explicitly turn '-align' off.
+
+ Some references to help pages are available in the 'getHelp' method.
+ Some might be useful to you.
+ """
+ _actions = {\
+ '-align':FlagParameter('-','align',Value=True),
+ '-profile':FlagParameter('-','profile'),
+ '-sequences':FlagParameter('-','sequences'),
+ '-tree':FlagParameter('-','tree'),
+ '-bootstrap':MixedParameter('-','bootstrap',Delimiter='=')}
+
+ #sequence file for alignment, or alignment file for bootstrap and tree
+ #actions
+ _input = {'-infile':ValuedParameter('-','infile',Delimiter='=',IsPath=True)}
+
+ # matrix and dnamatrix can be filenames as well, but not always.
+ # They won't be treated as filenames and thus not quoted.
+ # Therefore filepaths containing spaces might result in errors.
+ _multiple_alignment={\
+ '-quicktree':FlagParameter('-','quicktree'),
+ '-type':ValuedParameter('-','type',Delimiter='='),
+ '-matrix':ValuedParameter('-','matrix',Delimiter='='),
+ '-dnamatrix':ValuedParameter('-','dnamatrix',Delimiter='='),
+ '-gapopen':ValuedParameter('-','gapopen',Delimiter='='),
+ '-gapext':ValuedParameter('-','gapext',Delimiter='='),
+ '-endgaps':FlagParameter('-','endgaps'),
+ '-gapdist':ValuedParameter('-',Name='gapdist',Delimiter='='),
+ '-nopgap':FlagParameter('-','nopgap'),
+ '-nohgap':FlagParameter('-','nohgap'),
+ '-hgapresidues':ValuedParameter('-','hgapresidues',Delimiter='='),
+ '-maxdiv':ValuedParameter('-',Name='maxdiv',Delimiter='='),
+ '-negative':FlagParameter('-','negative'),
+ '-transweight':ValuedParameter('-',Name='transweight',Delimiter='='),
+ '-newtree':ValuedParameter('-','newtree',Delimiter='=',IsPath=True),
+ '-usetree':ValuedParameter('-','usetree',Delimiter='=',IsPath=True)}
+
+ _fast_pairwise={\
+ '-ktuple':ValuedParameter('-',Name='ktuple',Delimiter='='),
+ '-topdiags':ValuedParameter('-',Name='topdiags',Delimiter='='),
+ '-window':ValuedParameter('-',Name='window',Delimiter='='),
+ '-pairgap':ValuedParameter('-',Name='pairgap',Delimiter='='),
+ '-score':ValuedParameter('-',Name='score',Delimiter='=')}
+
+ # pwmatrix and pwdnamatrix can be filenames as well, but not always.
+ # They won't be treated as filenames and thus not quoted.
+ # Therefore filepaths containing spaces might result in errors.
+ _slow_pairwise={\
+ '-pwmatrix':ValuedParameter('-',Name='pwmatrix',Delimiter='='),
+ '-pwdnamatrix':ValuedParameter('-',Name='pwdnamatrix',Delimiter='='),
+ '-pwgapopen':ValuedParameter('-',Name='pwgapopen',Delimiter='='),
+ '-pwgapext':ValuedParameter('-',Name='pwgapext',Delimiter='=')}
+
+ #plus -bootstrap
+ _tree={\
+ '-kimura':FlagParameter('-',Name='kimura'),
+ '-tossgaps':FlagParameter('-',Name='tossgaps'),
+ '-bootlabels':ValuedParameter('-',Name='bootlabels',Delimiter='='),
+ '-seed':ValuedParameter('-',Name='seed',Delimiter='='),
+ '-outputtree':ValuedParameter('-',Name='outputtree',Delimiter='=')}
+
+ _output={\
+ '-outfile':ValuedParameter('-',Name='outfile',Delimiter='=',\
+ IsPath=True),
+ '-output':ValuedParameter('-',Name='output',Delimiter='='),
+ '-case':ValuedParameter('-',Name='case',Delimiter='='),
+ '-outorder':ValuedParameter('-',Name='outorder',Delimiter='='),
+ '-seqnos':ValuedParameter('-',Name='seqnos',Delimiter='=')}
+
+ _profile_alignment={\
+ '-profile1':ValuedParameter('-','profile1',Delimiter='=',IsPath=True),
+ '-profile2':ValuedParameter('-','profile2',Delimiter='=',IsPath=True),
+ '-usetree1':ValuedParameter('-','usetree1',Delimiter='=',IsPath=True),
+ '-usetree2':ValuedParameter('-','usetree2',Delimiter='=',IsPath=True),
+ '-newtree1':ValuedParameter('-','newtree1',Delimiter='=',IsPath=True),
+ '-newtree2':ValuedParameter('-','newtree2',Delimiter='=',IsPath=True)}
+
+ _structure_alignment={\
+ '-nosecstr1':FlagParameter('-',Name='nosecstr1'),
+ '-nosecstr2':FlagParameter('-',Name='nosecstr2'),
+ '-helixgap':ValuedParameter('-',Name='helixgap',Delimiter='='),
+ '-strandgap':ValuedParameter('-',Name='strandgap',Delimiter='='),
+ '-loopgap':ValuedParameter('-',Name='loopgap',Delimiter='='),
+ '-terminalgap':ValuedParameter('-',Name='terminalgap',Delimiter='='),
+ '-helixendin':ValuedParameter('-',Name='helixendin',Delimiter='='),
+ '-helixendout':ValuedParameter('-',Name='helixendout',Delimiter='='),
+ '-strandendin':ValuedParameter('-',Name='strandendin',Delimiter='='),
+ '-strandendout':ValuedParameter('-',Name='strandendout',Delimiter='='),
+ '-secstrout':ValuedParameter('-',Name='secstrout',Delimiter='=')}
+
+ #NOT SUPPORTED
+ #'-help':FlagParameter('-','help'),
+ #'-check':FlagParameter('-','check'),
+ #'-options':FlagParameter('-','options'),
+ #'-convert':FlagParameter('-','convert'),
+ #'-batch':FlagParameter('-','batch'),
+ #'-noweights':FlagParameter('-','noweights'),
+ #'-novgap':FlagParameter('-','novgap'),
+ #'-debug':ValuedParameter('-',Name='debug',Delimiter='='),
+
+ _parameters = {}
+ _parameters.update(_actions)
+ _parameters.update(_input)
+ _parameters.update(_multiple_alignment)
+ _parameters.update(_fast_pairwise)
+ _parameters.update(_slow_pairwise)
+ _parameters.update(_tree)
+ _parameters.update(_output)
+ _parameters.update(_profile_alignment)
+ _parameters.update(_structure_alignment)
+
+ _command = 'clustalw'
+
+ def getHelp(self):
+ """Method that points to the documentation"""
+ help_str =\
+ """
+ There are several help pages available online. For example:
+ http://searchlauncher.bcm.tmc.edu/multi-align/Help/
+ clustalw_help_1.8.html
+ http://hypernig.nig.ac.jp/homology/clustalw-e_help.html
+ http://www.genebee.msu.su/clustal/help.html
+
+ A page that gives reasonable insight into the use of the parameters:
+ http://bioweb.pasteur.fr/seqanal/interfaces/clustalw.html
+ """
+ return help_str
+
+ def _input_as_multiline_string(self, data):
+ """Writes data to tempfile and sets -infile parameter
+
+ data -- a multiline string
+ """
+ if data:
+ self.Parameters['-infile']\
+ .on(super(Clustalw,self)._input_as_multiline_string(data))
+ return ''
+
+ def _input_as_lines(self,data):
+ """Writes data to tempfile and sets -infile parameter
+
+ data -- list of lines, ready to be written to file
+ """
+ if data:
+ self.Parameters['-infile']\
+ .on(super(Clustalw,self)._input_as_lines(data))
+ return ''
+
+ def _input_as_seqs(self,data):
+ """writes sequences to tempfile and sets -infile parameter
+
+ data -- list of sequences
+
+ Adds numbering to the sequences: >1, >2, etc.
+ """
+ lines = []
+ for i,s in enumerate(data):
+ #will number the sequences 1,2,3,etc.
+ lines.append(''.join(['>',str(i+1)]))
+ lines.append(s)
+ return self._input_as_lines(lines)
+
+ def _input_as_string(self,data):
+ """Makes data the value of a specific parameter
+
+ This method returns the empty string. The parameter will be printed
+ automatically once set.
+ """
+ if data:
+ self.Parameters['-infile'].on(data)
+ return ''
+
+ def _suffix(self):
+ """Return appropriate suffix for alignment file"""
+ _output_formats={'GCG':'.msf',
+ 'GDE':'.gde',
+ 'PHYLIP':'.phy',
+ 'PIR':'.pir',
+ 'NEXUS':'.nxs'}
+
+ if self.Parameters['-output'].isOn():
+ return _output_formats[self.Parameters['-output'].Value]
+ else:
+ return '.aln'
+
+ def _aln_filename(self,prefix):
+ """Return name of file containing the alignment
+
+ prefix -- str, prefix of alignment file.
+ """
+ if self.Parameters['-outfile'].isOn():
+ aln_filename = self._absolute(self.Parameters['-outfile'].Value)
+ else:
+ aln_filename = prefix + self._suffix()
+ return aln_filename
+
+ def _tempfile_as_multiline_string(self, data):
+ """Write a multiline string to a temp file and return the filename.
+
+ data: a multiline string to be written to a file.
+
+ * Note: the result will be the filename as a FilePath object
+ (which is a string subclass).
+
+ """
+ filename = FilePath(self.getTmpFilename(self.TmpDir))
+ data_file = open(filename,'w')
+ data_file.write(data)
+ data_file.close()
+ return filename
+
+ def _get_result_paths(self,data):
+ """Return dict of {key: ResultPath}
+ """
+
+ #clustalw uses .aln when no or an unknown output type is specified
+ _treeinfo_formats = {'nj':'.nj',
+ 'dist':'.dst',
+ 'nexus':'.tre'}
+
+ result = {}
+ par = self.Parameters
+ abs = self._absolute
+
+ if par['-align'].isOn():
+ prefix = par['-infile'].Value.rsplit('.', 1)[0]
+ #prefix = par['-infile'].Value.split('.')[0]
+ aln_filename = self._aln_filename(prefix)
+ if par['-newtree'].isOn():
+ dnd_filename = abs(par['-newtree'].Value)
+ elif par['-usetree'].isOn():
+ dnd_filename = abs(par['-usetree'].Value)
+ else:
+ dnd_filename = abs(prefix + '.dnd')
+ result['Align'] = ResultPath(Path=aln_filename,IsWritten=True)
+ result['Dendro'] = ResultPath(Path=dnd_filename,IsWritten=True)
+ elif par['-profile'].isOn():
+ prefix1 = par['-profile1'].Value.rsplit('.', 1)[0]
+ prefix2 = par['-profile2'].Value.rsplit('.', 1)[0]
+ #prefix1 = par['-profile1'].Value.split('.')[0]
+ #prefix2 = par['-profile2'].Value.split('.')[0]
+ aln_filename = ''; aln_written = True
+ dnd1_filename = ''; tree1_written = True
+ dnd2_filename = ''; tree2_written = True
+ aln_filename = self._aln_filename(prefix1)
+ #usetree1
+ if par['-usetree1'].isOn():
+ tree1_written = False
+ #usetree2
+ if par['-usetree2'].isOn():
+ tree2_written = False
+ if par['-newtree1'].isOn():
+ dnd1_filename = abs(par['-newtree1'].Value)
+ aln_written=False
+ else:
+ dnd1_filename = abs(prefix1 + '.dnd')
+ if par['-newtree2'].isOn():
+ dnd2_filename = abs(par['-newtree2'].Value)
+ aln_written=False
+ else:
+ dnd2_filename = abs(prefix2 + '.dnd')
+ result['Align'] = ResultPath(Path=aln_filename,
+ IsWritten=aln_written)
+ result['Dendro1'] = ResultPath(Path=dnd1_filename,
+ IsWritten=tree1_written)
+ result['Dendro2'] = ResultPath(Path=dnd2_filename,
+ IsWritten=tree2_written)
+ elif par['-sequences'].isOn():
+ prefix1 = par['-profile1'].Value.rsplit('.', 1)[0]
+ prefix2 = par['-profile2'].Value.rsplit('.', 1)[0]
+ #prefix1 = par['-profile1'].Value.split('.')[0] #alignment
+ #prefix2 = par['-profile2'].Value.split('.')[0] #sequences
+ aln_filename = ''; aln_written = True
+ dnd_filename = ''; dnd_written = True
+
+ aln_filename = self._aln_filename(prefix2)
+ if par['-usetree'].isOn():
+ dnd_written = False
+ elif par['-newtree'].isOn():
+ aln_written = False
+ dnd_filename = abs(par['-newtree'].Value)
+ else:
+ dnd_filename = prefix2 + '.dnd'
+ result['Align'] = ResultPath(Path=aln_filename,\
+ IsWritten=aln_written)
+ result['Dendro'] = ResultPath(Path=dnd_filename,\
+ IsWritten=dnd_written)
+ elif par['-tree'].isOn():
+ prefix = par['-infile'].Value.rsplit('.', 1)[0]
+ #prefix = par['-infile'].Value.split('.')[0]
+ tree_filename = ''; tree_written = True
+ treeinfo_filename = ''; treeinfo_written = False
+ tree_filename = prefix + '.ph'
+ if par['-outputtree'].isOn() and\
+ par['-outputtree'].Value != 'phylip':
+ treeinfo_filename = prefix +\
+ _treeinfo_formats[par['-outputtree'].Value]
+ treeinfo_written = True
+ result['Tree'] = ResultPath(Path=tree_filename,\
+ IsWritten=tree_written)
+ result['TreeInfo'] = ResultPath(Path=treeinfo_filename,\
+ IsWritten=treeinfo_written)
+
+ elif par['-bootstrap'].isOn():
+ prefix = par['-infile'].Value.rsplit('.', 1)[0]
+ #prefix = par['-infile'].Value.split('.')[0]
+ boottree_filename = prefix + '.phb'
+ result['Tree'] = ResultPath(Path=boottree_filename,IsWritten=True)
+
+ return result
+
+
+#SOME FUNCTIONS TO EXECUTE THE MOST COMMON TASKS
+def alignUnalignedSeqs(seqs,add_seq_names=True,WorkingDir=None,\
+ SuppressStderr=None,SuppressStdout=None):
+ """Aligns unaligned sequences
+
+ seqs: either list of sequence objects or list of strings
+ add_seq_names: boolean. if True, sequence names are inserted in the list
+ of sequences. if False, it assumes seqs is a list of lines of some
+ proper format that the program can handle
+ """
+ if add_seq_names:
+ app = Clustalw(InputHandler='_input_as_seqs',\
+ WorkingDir=WorkingDir,SuppressStderr=SuppressStderr,\
+ SuppressStdout=SuppressStdout)
+ else:
+ app = Clustalw(InputHandler='_input_as_lines',\
+ WorkingDir=WorkingDir,SuppressStderr=SuppressStderr,\
+ SuppressStdout=SuppressStdout)
+ return app(seqs)
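+
+# Minimal usage sketch (for illustration only; the sequences are
+# hypothetical). With add_seq_names=True a plain list of sequence strings
+# is accepted; names >1, >2, ... are generated by _input_as_seqs.
+def _example_alignUnalignedSeqs():
+    seqs = ['ACUGCUAGCUAGUAGCGUACGUA', 'GCUACGUAGCUACGUAGCGUACG']
+    return alignUnalignedSeqs(seqs, WorkingDir='/tmp')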
+
+def alignUnalignedSeqsFromFile(filename,WorkingDir=None,SuppressStderr=None,\
+ SuppressStdout=None):
+ """Aligns unaligned sequences from some file (file should be in the right format)
+
+ filename: string, the filename of the file containing the sequences
+ to be aligned in a valid format.
+ """
+ app = Clustalw(WorkingDir=WorkingDir,SuppressStderr=SuppressStderr,\
+ SuppressStdout=SuppressStdout)
+ return app(filename)
+
+def alignTwoAlignments(aln1,aln2,outfile,WorkingDir=None,SuppressStderr=None,\
+ SuppressStdout=None):
+ """Aligns two alignments. Individual sequences are not realigned
+
+ aln1: string, name of file containing the first alignment
+ aln2: string, name of file containing the second alignment
+ outfile: you're forced to specify an outfile name, because if you don't
+ aln1 will be overwritten. So, if you want aln1 to be overwritten, you
+ should specify the same filename.
+ WARNING: a .dnd file is created with the same prefix as aln1. So an
+ existing dendrogram might get overwritten.
+ """
+ app = Clustalw({'-profile':None,'-profile1':aln1,\
+ '-profile2':aln2,'-outfile':outfile},SuppressStderr=\
+ SuppressStderr,WorkingDir=WorkingDir,SuppressStdout=SuppressStdout)
+ app.Parameters['-align'].off()
+ return app()
+
+def addSeqsToAlignment(aln1,seqs,outfile,WorkingDir=None,SuppressStderr=None,\
+ SuppressStdout=None):
+ """Aligns sequences from second profile against first profile
+
+ aln1: string, name of file containing the alignment
+ seqs: string, name of file containing the sequences that should be added
+ to the alignment.
+ outfile: string, name of the output file (the new alignment)
+ """
+ app = Clustalw({'-sequences':None,'-profile1':aln1,\
+ '-profile2':seqs,'-outfile':outfile},SuppressStderr=\
+ SuppressStderr,WorkingDir=WorkingDir, SuppressStdout=SuppressStdout)
+
+ app.Parameters['-align'].off()
+ return app()
+
+def buildTreeFromAlignment(filename,WorkingDir=None,SuppressStderr=None):
+ """Builds a new tree from an existing alignment
+
+ filename: string, name of file containing the seqs or alignment
+ """
+ app = Clustalw({'-tree':None,'-infile':filename},SuppressStderr=\
+ SuppressStderr,WorkingDir=WorkingDir)
+ app.Parameters['-align'].off()
+ return app()
+
+def align_and_build_tree(seqs, moltype, best_tree=False, params=None):
+ """Returns an alignment and a tree from Sequences object seqs.
+
+ seqs: a cogent.core.alignment.SequenceCollection object, or data that can
+ be used to build one.
+
+ moltype: cogent.core.moltype.MolType object
+
+ best_tree: if True (default:False), uses a slower but more accurate
+ algorithm to build the tree.
+
+ params: dict of parameters to pass in to the Clustal app controller.
+
+ The result will be a dict with keys 'Align' (a
+ cogent.core.alignment.Alignment) and 'Tree' (a cogent.core.tree.PhyloNode);
+ either value may be None if that step fails.
+ """
+ aln = align_unaligned_seqs(seqs, moltype=moltype, params=params)
+ tree = build_tree_from_alignment(aln, moltype, best_tree, params)
+ return {'Align':aln,'Tree':tree}
+
+def build_tree_from_alignment(aln, moltype=DNA, best_tree=False, params=None):
+ """Returns a tree from Alignment object aln.
+
+ aln: a cogent.core.alignment.Alignment object, or data that can be used
+ to build one.
+
+ moltype: cogent.core.moltype.MolType object
+
+ best_tree: if True (default:False), uses a slower but more accurate
+ algorithm to build the tree.
+
+ params: dict of parameters to pass in to the Clustal app controller.
+
+ The result will be a cogent.core.tree.PhyloNode object, or None if tree
+ building fails.
+ """
+ #Set params to empty dict if None, and pick the sequence type *before*
+ #building the app controller; changes made to params afterwards would
+ #never reach the application.
+ if params is None:
+ params={}
+
+ if moltype == DNA or moltype == RNA:
+ params['-type'] = 'd'
+ elif moltype == PROTEIN:
+ params['-type'] = 'p'
+ else:
+ raise ValueError, "moltype must be DNA, RNA, or PROTEIN"
+
+ # Create instance of app controller, enable tree, disable alignment
+ app = Clustalw(InputHandler='_input_as_multiline_string', params=params, \
+ WorkingDir='/tmp')
+ app.Parameters['-align'].off()
+
+ # best_tree -> bootstrap
+ if best_tree:
+ if '-bootstrap' not in params:
+ app.Parameters['-bootstrap'].on(1000)
+ if '-seed' not in params:
+ app.Parameters['-seed'].on(randint(0,1000))
+ if '-bootlabels' not in params:
+ app.Parameters['-bootlabels'].on('nodes')
+ else:
+ app.Parameters['-tree'].on()
+
+ # Setup mapping. Clustalw clips identifiers. We will need to remap them.
+ seq_collection = SequenceCollection(aln)
+ int_map, int_keys = seq_collection.getIntMap()
+ int_map = SequenceCollection(int_map)
+
+ # Collect result
+ result = app(int_map.toFasta())
+
+ # Build tree
+ tree = DndParser(result['Tree'].read(), constructor=PhyloNode)
+ for node in tree.tips():
+ node.Name = int_keys[node.Name]
+
+ # Clean up
+ result.cleanUp()
+ del(seq_collection, app, result, int_map, int_keys)
+
+ return tree
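+
+# Minimal usage sketch (for illustration only; the two-sequence alignment
+# is hypothetical). best_tree=True would instead produce a 1000-replicate
+# bootstrapped tree, as implemented above.
+def _example_build_tree_from_alignment():
+    aln = {'1': 'ACUGCUAGCUAGUAGCGUACGUA',
+           '2': 'ACUGCUAGCUAGUAGCGAACGUA'}
+    return build_tree_from_alignment(aln, moltype=RNA, best_tree=False)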
+
+def bootstrap_tree_from_alignment(aln, seed=None, num_trees=None, params=None):
+ """Returns a tree from Alignment object aln with bootstrap support values.
+
+ aln: a cogent.core.alignment.Alignment object, or data that can be used
+ to build one.
+
+ seed: an integer, seed value to use
+
+ num_trees: an integer, number of trees to bootstrap against
+
+ params: dict of parameters to pass in to the Clustal app controller.
+
+ The result will be a cogent.core.tree.PhyloNode object, or None if tree
+ building fails.
+
+ If seed is not specified in params, a random integer between 0 and 1000 is used.
+ """
+ # Create instance of app controller, enable bootstrap, disable alignment and tree
+ app = Clustalw(InputHandler='_input_as_multiline_string', params=params, \
+ WorkingDir='/tmp')
+ app.Parameters['-align'].off()
+ app.Parameters['-tree'].off()
+
+ if app.Parameters['-bootstrap'].isOff():
+ if num_trees is None:
+ num_trees = 1000
+
+ app.Parameters['-bootstrap'].on(num_trees)
+
+ if app.Parameters['-seed'].isOff():
+ if seed is None:
+ seed = randint(0,1000)
+
+ app.Parameters['-seed'].on(seed)
+
+ if app.Parameters['-bootlabels'].isOff():
+ app.Parameters['-bootlabels'].on("node")
+
+ # Setup mapping. Clustalw clips identifiers. We will need to remap them.
+ seq_collection = SequenceCollection(aln)
+ int_map, int_keys = seq_collection.getIntMap()
+ int_map = SequenceCollection(int_map)
+
+ # Collect result
+ result = app(int_map.toFasta())
+
+ # Build tree
+ tree = DndParser(result['Tree'].read(), constructor=PhyloNode)
+ for node in tree.tips():
+ node.Name = int_keys[node.Name]
+
+ # Clean up
+ result.cleanUp()
+ del(seq_collection, app, result, int_map, int_keys)
+
+ return tree
+
+def align_unaligned_seqs(seqs, moltype=DNA, params=None):
+ """Returns an Alignment object from seqs.
+
+ seqs: cogent.core.alignment.SequenceCollection object, or data that can be
+ used to build one.
+
+ moltype: a MolType object. DNA, RNA, or PROTEIN.
+
+ params: dict of parameters to pass in to the Clustal app controller.
+
+ Result will be a cogent.core.alignment.Alignment object.
+ """
+ #create SequenceCollection object from seqs
+ seq_collection = SequenceCollection(seqs,MolType=moltype)
+ #Create mapping between abbreviated IDs and full IDs
+ int_map, int_keys = seq_collection.getIntMap()
+ #Create SequenceCollection from int_map.
+ int_map = SequenceCollection(int_map,MolType=moltype)
+ #Create Clustalw app.
+ app = Clustalw(InputHandler='_input_as_multiline_string',params=params)
+ #Get results using int_map as input to app
+ res = app(int_map.toFasta())
+ #Get alignment as dict out of results
+ alignment = dict(ClustalParser(res['Align'].readlines()))
+ #Make new dict mapping original IDs
+ new_alignment = {}
+ for k,v in alignment.items():
+ new_alignment[int_keys[k]]=v
+ #Create an Alignment object from alignment dict
+ new_alignment = Alignment(new_alignment,MolType=moltype)
+ #Clean up
+ res.cleanUp()
+ del(seq_collection,int_map,int_keys,app,res,alignment)
+
+ return new_alignment
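+
+# Minimal usage sketch (for illustration only; the sequences are
+# hypothetical). The int-map renaming above is invisible to the caller:
+# the returned Alignment is keyed by the original sequence names.
+def _example_align_unaligned_seqs():
+    seqs = {'seq_1': 'ACUGCUAGCUAGUAGCGUACGUA',
+            'seq_2': 'GCUACGUAGCUAC'}
+    return align_unaligned_seqs(seqs, moltype=RNA)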
+
+def add_seqs_to_alignment(seqs, aln, moltype, params=None):
+ """Returns an Alignment object from seqs and existing Alignment.
+
+ seqs: a cogent.core.alignment.SequenceCollection object, or data that can
+ be used to build one.
+
+ aln: a cogent.core.alignment.Alignment object, or data that can be used to
+ build one
+
+ params: dict of parameters to pass in to the Clustal app controller.
+ """
+ #create SequenceCollection object from seqs
+ seq_collection = SequenceCollection(seqs,MolType=moltype)
+ #Create mapping between abbreviated IDs and full IDs
+ seq_int_map, seq_int_keys = seq_collection.getIntMap()
+ #Create SequenceCollection from int_map.
+ seq_int_map = SequenceCollection(seq_int_map,MolType=moltype)
+
+ #create Alignment object from aln
+ aln = Alignment(aln,MolType=moltype)
+ #Create mapping between abbreviated IDs and full IDs
+ aln_int_map, aln_int_keys = aln.getIntMap(prefix='seqn_')
+ #Create Alignment from aln_int_map.
+ aln_int_map = Alignment(aln_int_map,MolType=moltype)
+
+ #Update seq_int_keys with aln_int_keys
+ seq_int_keys.update(aln_int_keys)
+
+ #Create Clustalw app.
+ app = Clustalw(InputHandler='_input_as_multiline_string',\
+ params=params,
+ SuppressStderr=True)
+ app.Parameters['-align'].off()
+ app.Parameters['-infile'].off()
+ app.Parameters['-sequences'].on()
+
+ #Add aln_int_map as profile1
+ app.Parameters['-profile1'].on(\
+ app._tempfile_as_multiline_string(aln_int_map.toFasta()))
+
+ #Add seq_int_map as profile2
+ app.Parameters['-profile2'].on(\
+ app._tempfile_as_multiline_string(seq_int_map.toFasta()))
+ #Get results using int_map as input to app
+ res = app()
+
+ #Get alignment as dict out of results
+ alignment = dict(ClustalParser(res['Align'].readlines()))
+
+ #Make new dict mapping original IDs
+ new_alignment = {}
+ for k,v in alignment.items():
+ new_alignment[seq_int_keys[k]]=v
+ #Create an Alignment object from alignment dict
+ new_alignment = Alignment(new_alignment,MolType=moltype)
+ #Clean up
+ res.cleanUp()
+ remove(app.Parameters['-profile1'].Value)
+ remove(app.Parameters['-profile2'].Value)
+ del(seq_collection,seq_int_map,seq_int_keys,\
+ aln,aln_int_map,aln_int_keys,app,res,alignment)
+
+ return new_alignment
+
+def align_two_alignments(aln1, aln2, moltype, params=None):
+ """Returns an Alignment object from two existing Alignments.
+
+ aln1, aln2: cogent.core.alignment.Alignment objects, or data that can be
+ used to build them.
+
+ params: dict of parameters to pass in to the Clustal app controller.
+ """
+ #create Alignment object from aln1
+ aln1 = Alignment(aln1,MolType=moltype)
+ #Create mapping between abbreviated IDs and full IDs
+ aln1_int_map, aln1_int_keys = aln1.getIntMap()
+ #Create Alignment from aln1_int_map.
+ aln1_int_map = Alignment(aln1_int_map,MolType=moltype)
+
+ #create Alignment object from aln2
+ aln2 = Alignment(aln2,MolType=moltype)
+ #Create mapping between abbreviated IDs and full IDs
+ aln2_int_map, aln2_int_keys = aln2.getIntMap(prefix='seqn_')
+ #Create Alignment from aln2_int_map.
+ aln2_int_map = Alignment(aln2_int_map,MolType=moltype)
+
+ #Update aln1_int_keys with aln2_int_keys
+ aln1_int_keys.update(aln2_int_keys)
+
+ #Create Clustalw app.
+ app = Clustalw(InputHandler='_input_as_multiline_string',\
+ params=params,
+ SuppressStderr=True)
+ app.Parameters['-align'].off()
+ app.Parameters['-infile'].off()
+ app.Parameters['-profile'].on()
+
+ #Add aln1_int_map as profile1
+ app.Parameters['-profile1'].on(\
+ app._tempfile_as_multiline_string(aln1_int_map.toFasta()))
+
+ #Add aln2_int_map as profile2
+ app.Parameters['-profile2'].on(\
+ app._tempfile_as_multiline_string(aln2_int_map.toFasta()))
+ #Get results using int_map as input to app
+ res = app()
+
+ #Get alignment as dict out of results
+ alignment = dict(ClustalParser(res['Align'].readlines()))
+
+ #Make new dict mapping original IDs
+ new_alignment = {}
+ for k,v in alignment.items():
+ new_alignment[aln1_int_keys[k]]=v
+ #Create an Alignment object from alignment dict
+ new_alignment = Alignment(new_alignment,MolType=moltype)
+ #Clean up
+ res.cleanUp()
+ remove(app.Parameters['-profile1'].Value)
+ remove(app.Parameters['-profile2'].Value)
+ del(aln1,aln1_int_map,aln1_int_keys,\
+ aln2,aln2_int_map,aln2_int_keys,app,res,alignment)
+
+ return new_alignment
diff --git a/bfillings/denoiser.py b/bfillings/denoiser.py
new file mode 100644
index 0000000..98336da
--- /dev/null
+++ b/bfillings/denoiser.py
@@ -0,0 +1,25 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+
+"""
+This module provides pass-through access to PyCogent's denoiser code. It's a
+bit of a hack, but it allows us to remove the direct dependency on PyCogent by
+centralizing the denoiser code with all of the other PyCogent code that is
+targeted either for complete re-write or removal pending benchmarks. The basic
+idea is that it's not worth porting this code anywhere now because its days
+are numbered, but we still need to be able to access it for the time being.
+
+"""
+
+from cogent.parse.flowgram import (Flowgram, build_averaged_flowgram,
+ seq_to_flow)
+from cogent.parse.flowgram_parser import lazy_parse_sff_handle, get_header_info
+from cogent.parse.flowgram_collection import (FlowgramCollection, parse_sff)
+from cogent.util.trie import build_prefix_map
diff --git a/bfillings/fastq_join.py b/bfillings/fastq_join.py
new file mode 100644
index 0000000..d296aa2
--- /dev/null
+++ b/bfillings/fastq_join.py
@@ -0,0 +1,229 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+
+# Application controller for ea-utils v1.1.2-537
+# fastq processing utilities
+# http://code.google.com/p/ea-utils/
+#
+
+import os
+import tempfile
+import shutil
+
+from burrito.parameters import ValuedParameter
+from burrito.util import CommandLineApplication, ResultPath
+
+
+class FastqJoin(CommandLineApplication):
+
+ """fastq-join (v1.1.2) application controller for joining paired-end reads."""
+
+ _command = 'fastq-join'
+
+ _parameters = {
+ # Description copied from 'fastq-join'
+ # Usage: fastq-join [options] <read1.fq> <read2.fq> [mate.fq] -o
+ # <read.%.fq>
+
+ # Output:
+ # You can supply 3 -o arguments, for un1, un2, join files, or one
+ # argument as a file name template. The suffix 'un1, un2, or join' is
+ # appended to the file, or they replace a %-character if present.
+ # If a 'mate' input file is present (barcode read), then the files
+ # 'un3' and 'join2' are also created.
+
+ # we'll only handle one output base path / file name
+ # -o FIL: See 'Output' above
+ '-o': ValuedParameter(Prefix='-', Delimiter=' ', Name='o'),
+
+ # -v C: Verifies that the 2 files probe id's match up to char C
+ # use ' ' (space) for Illumina reads
+ '-v': ValuedParameter(Prefix='-', Delimiter=' ', Name='v'),
+
+ # -p N: N-percent maximum difference (8)
+ '-p': ValuedParameter(Prefix='-', Delimiter=' ', Name='p'),
+
+ # -m N: N-minimum overlap (6)
+ '-m': ValuedParameter(Prefix='-', Delimiter=' ', Name='m'),
+
+ # -r FIL: Verbose stitch length report
+ '-r': ValuedParameter(Prefix='-', Delimiter=' ', Name='r')}
+
+ _input_handler = '_input_as_paths'
+
+ def _get_output_path(self):
+ """Checks if a base file label / path is set. Returns absolute path."""
+ if self.Parameters['-o'].isOn():
+ output_path = self._absolute(str(self.Parameters['-o'].Value))
+ else:
+ raise ValueError("No output path specified.")
+ return output_path
+
+ def _get_stitch_report_path(self):
+ """Checks if stitch report label / path is set. Returns absolute path."""
+ if self.Parameters['-r'].isOn():
+ stitch_path = self._absolute(str(self.Parameters['-r'].Value))
+ return stitch_path
+ elif self.Parameters['-r'].isOff():
+ return None
+
+ def _get_result_paths(self, data):
+ """Capture fastq-join output.
+
+ Three output files are produced, in the form of
+ outputjoin : assembled paired reads
+ outputun1 : unassembled reads_1
+ outputun2 : unassembled reads_2
+
+ If a barcode / mate-pairs file is also provided then the following
+ additional files are output:
+ outputjoin2
+ outputun3
+
+ If a verbose stitch length report (-r) is requested by the user, the
+ user-specified filename is used.
+ """
+ output_path = self._get_output_path()
+
+ result = {}
+
+ # always output:
+ result['Assembled'] = ResultPath(Path=output_path + 'join',
+ IsWritten=True)
+ result['UnassembledReads1'] = ResultPath(Path=output_path + 'un1',
+ IsWritten=True)
+ result['UnassembledReads2'] = ResultPath(Path=output_path + 'un2',
+ IsWritten=True)
+
+ # check if stitch report is requested:
+ stitch_path = self._get_stitch_report_path()
+ if stitch_path:
+ result['Report'] = ResultPath(Path=stitch_path,
+ IsWritten=True)
+
+ # Check if mate file / barcode file is present.
+ # If not, return result
+ # We need to check this way because there are no infile parameters.
+ mate_path_string = output_path + 'join2'
+ mate_unassembled_path_string = output_path + 'un3'
+ if os.path.exists(mate_path_string) and \
+ os.path.exists(mate_unassembled_path_string):
+ result['Mate'] = ResultPath(Path=mate_path_string,
+ IsWritten=True)
+ result['MateUnassembled'] = ResultPath(Path=
+ mate_unassembled_path_string,
+ IsWritten=True)
+ else:
+ pass
+ return result
+
+ def getHelp(self):
+ """fastq-join (v1.1.2) help"""
+ help_str = """
+ For issues with the actual program 'fastq-join', see the following:
+
+ For basic help, type the following at the command line:
+ 'fastq-join'
+
+ Website:
+ http://code.google.com/p/ea-utils/
+
+ For questions / comments submit an issue to:
+ http://code.google.com/p/ea-utils/issues/list
+ """
+ return help_str
+
+
+def join_paired_end_reads_fastqjoin(
+ reads1_infile_path,
+ reads2_infile_path,
+ perc_max_diff=None, # typical default is 8
+ min_overlap=None, # typical default is 6
+ outfile_label='fastqjoin',
+ params={},
+ working_dir=tempfile.gettempdir(),
+ SuppressStderr=True,
+ SuppressStdout=True,
+ HALT_EXEC=False):
+ """ Runs fastq-join to assemble paired-end reads.
+ Returns a dict of output file path strings.
+
+ -reads1_infile_path : reads1.fastq infile path
+ -reads2_infile_path : reads2.fastq infile path
+ -perc_max_diff : maximum percent difference allowed within the overlap
+ -min_overlap : minimum allowed overlap required to assemble reads
+ -outfile_label : base name for output files.
+ -params : dictionary of application controller parameters
+
+ """
+ abs_r1_path = os.path.abspath(reads1_infile_path)
+ abs_r2_path = os.path.abspath(reads2_infile_path)
+
+ infile_paths = [abs_r1_path, abs_r2_path]
+
+ # check / make absolute infile paths
+ for p in infile_paths:
+ if not os.path.exists(p):
+ raise IOError('File not found at: %s' % p)
+
+ fastq_join_app = FastqJoin(params=params,
+ WorkingDir=working_dir,
+ SuppressStderr=SuppressStderr,
+ SuppressStdout=SuppressStdout,
+ HALT_EXEC=HALT_EXEC)
+
+ # Set parameters. Defaulting these values to None helps with QIIME
+ # integration: we do not have to worry about changes in the default
+ # behaviour of the wrapped application.
+ if perc_max_diff is not None:
+ if isinstance(perc_max_diff, int) and 0 <= perc_max_diff <= 100:
+ fastq_join_app.Parameters['-p'].on(perc_max_diff)
+ else:
+ raise ValueError("perc_max_diff must be an int between 0 and 100!")
+
+ if min_overlap is not None:
+ if isinstance(min_overlap, int) and 0 < min_overlap:
+ fastq_join_app.Parameters['-m'].on(min_overlap)
+ else:
+ raise ValueError("min_overlap must be an int > 0!")
+
+ if outfile_label is not None:
+ if isinstance(outfile_label, str):
+ fastq_join_app.Parameters['-o'].on(outfile_label + '.')
+ else:
+ raise ValueError("outfile_label must be a string!")
+ else:
+ pass
+
+ # run assembler
+ result = fastq_join_app(infile_paths)
+
+ # Store output file path data to dict
+ path_dict = {}
+ path_dict['Assembled'] = result['Assembled'].name
+ path_dict['UnassembledReads1'] = result['UnassembledReads1'].name
+ path_dict['UnassembledReads2'] = result['UnassembledReads2'].name
+
+ # sanity check that files actually exist at the listed paths
+ for path in path_dict.values():
+ if not os.path.exists(path):
+ raise IOError('Output file not found at: %s' % path)
+
+ # fastq-join automatically appends: 'join', 'un1', or 'un2'
+ # to the end of the file names. But we want to rename them so
+ # they end in '.fastq'. So, we iterate through path_dict to
+ # rename the files and overwrite the dict values.
+ for key, file_path in path_dict.items():
+ new_file_path = file_path + '.fastq'
+ shutil.move(file_path, new_file_path)
+ path_dict[key] = new_file_path
+
+ return path_dict
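+
+# Minimal usage sketch (for illustration only; the read file paths are
+# hypothetical). The returned dict maps 'Assembled', 'UnassembledReads1'
+# and 'UnassembledReads2' to the renamed *.fastq output paths.
+def _example_join_paired_end_reads_fastqjoin():
+    return join_paired_end_reads_fastqjoin('reads1.fastq', 'reads2.fastq',
+                                           perc_max_diff=8, min_overlap=6,
+                                           outfile_label='fastqjoin')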
diff --git a/bfillings/fasttree.py b/bfillings/fasttree.py
new file mode 100644
index 0000000..4f752ca
--- /dev/null
+++ b/bfillings/fasttree.py
@@ -0,0 +1,162 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+"""Application controller for FastTree
+
+Designed for FastTree v1.1.0. Also works with v2.0.1, v2.1.0, and v2.1.3,
+though only with basic functionality."""
+
+from burrito.parameters import (ValuedParameter, FlagParameter,
+ MixedParameter)
+from burrito.util import (CommandLineApplication, FilePath, system,
+ CommandLineAppResult, ResultPath, remove,
+ ApplicationError)
+
+from cogent.core.tree import PhyloNode
+from cogent.parse.tree import DndParser
+from cogent.core.moltype import DNA, RNA, PROTEIN
+from cogent.core.alignment import SequenceCollection
+
+
+class FastTree(CommandLineApplication):
+ """FastTree application Controller"""
+
+ _command = 'FastTree'
+ _input_handler = '_input_as_multiline_string'
+ _parameters = {
+ '-quiet':FlagParameter('-',Name='quiet'),
+ '-boot':ValuedParameter('-',Delimiter=' ',Name='boot'),
+ '-seed':ValuedParameter('-',Delimiter=' ',Name='seed'),
+ '-nni':ValuedParameter('-',Delimiter=' ',Name='nni'),
+ '-slow':FlagParameter('-',Name='slow'),
+ '-fastest':FlagParameter('-',Name='fastest'),
+ '-top':FlagParameter('-',Name='top'),
+ '-notop':FlagParameter('-',Name='notop'),
+ '-topm':ValuedParameter('-',Delimiter=' ',Name='topm'),
+ '-close':ValuedParameter('-',Delimiter=' ',Name='close'),
+ '-refresh':ValuedParameter('-',Delimiter=' ',Name='refresh'),
+ '-matrix':ValuedParameter('-',Delimiter=' ',Name='matrix'),
+ '-nomatrix':FlagParameter('-',Name='nomatrix'),
+ '-nj':FlagParameter('-',Name='nj'),
+ '-bionj':FlagParameter('-',Name='bionj'),
+ '-nt':FlagParameter('-',Name='nt'),
+ '-n':ValuedParameter('-',Delimiter=' ',Name='n'),
+ '-pseudo':MixedParameter('-',Delimiter=' ', Name='pseudo'),
+ '-intree':ValuedParameter('-',Delimiter=' ',Name='intree'),
+ '-spr':ValuedParameter('-',Delimiter=' ',Name='spr'),
+ '-constraints':ValuedParameter('-',Delimiter=' ',\
+ Name='constraints'),
+ '-constraintWeight':ValuedParameter('-',Delimiter=' ',\
+ Name='constraintWeight'),\
+ '-makematrix':ValuedParameter('-',Delimiter=' ',Name='makematrix')}
+
+ def __call__(self,data=None, remove_tmp=True):
+ """Run the application with the specified kwargs on data
+
+ data: anything that can be cast into a string or written out to
+ a file. Usually either a list of things or a single string or
+ number. input_handler will be called on this data before it
+ is passed as part of the command-line argument, so by creating
+ your own input handlers you can customize what kind of data
+ you want your application to accept
+
+ remove_tmp: if True, removes tmp files
+
+ NOTE: Override of the base class to handle redirected output
+ """
+ input_handler = self.InputHandler
+ suppress_stderr = self.SuppressStderr
+
+ outfile = self.getTmpFilename(self.TmpDir)
+ self._outfile = outfile
+
+ if suppress_stderr:
+ errfile = FilePath('/dev/null')
+ else:
+ errfile = FilePath(self.getTmpFilename(self.TmpDir))
+ if data is None:
+ input_arg = ''
+ else:
+ input_arg = getattr(self,input_handler)(data)
+
+ # Build up the command, consisting of a BaseCommand followed by
+ # input and output (file) specifications
+ command = self._command_delimiter.join(filter(None,\
+ [self.BaseCommand,str(input_arg),'>',str(outfile),'2>',\
+ str(errfile)]))
+ if self.HaltExec:
+ raise AssertionError, "Halted exec with command:\n" + command
+ # The return value of system is a 16-bit number containing the signal
+ # number that killed the process, and then the exit status.
+ # We only want to keep the exit status so do a right bitwise shift to
+ # get rid of the signal number byte
+ exit_status = system(command) >> 8
+
+ # Determine if an error should be raised due to the exit status of
+ # the application
+ if not self._accept_exit_status(exit_status):
+ raise ApplicationError, \
+ 'Unacceptable application exit status: %s, command: %s'\
+ % (str(exit_status),command)
+
+ out = open(outfile,"r")
+
+ err = None
+ if not suppress_stderr:
+ err = open(errfile,"r")
+
+ result = CommandLineAppResult(out,err,exit_status,\
+ result_paths=self._get_result_paths(data))
+
+ # Clean up the input file if one was created
+ if remove_tmp:
+ if self._input_filename:
+ remove(self._input_filename)
+ self._input_filename = None
+
+ return result
+
+ def _get_result_paths(self, data):
+ result = {}
+ result['Tree'] = ResultPath(Path=self._outfile)
+ return result
+
+def build_tree_from_alignment(aln, moltype=DNA, best_tree=False, params=None):
+ """Returns a tree from alignment
+
+ Will check MolType of aln object
+ """
+ if params is None:
+ params = {}
+
+ if moltype == DNA or moltype == RNA:
+ params['-nt'] = True
+ elif moltype == PROTEIN:
+ params['-nt'] = False
+ else:
+ raise ValueError, \
+ "FastTree does not support moltype: %s" % moltype.label
+
+ if best_tree:
+ params['-slow'] = True
+
+ #Create mapping between abbreviated IDs and full IDs
+ int_map, int_keys = aln.getIntMap()
+ #Create SequenceCollection from int_map.
+ int_map = SequenceCollection(int_map,MolType=moltype)
+
+ app = FastTree(params=params)
+
+ result = app(int_map.toFasta())
+ tree = DndParser(result['Tree'].read(), constructor=PhyloNode)
+ #remap tip names
+ for tip in tree.tips():
+ tip.Name = int_keys[tip.Name]
+
+ return tree
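+
+# Minimal usage sketch (for illustration only; the alignment is
+# hypothetical). The aln object must provide getIntMap() and toFasta(),
+# e.g. a cogent Alignment.
+def _example_build_tree_from_alignment():
+    from cogent.core.alignment import Alignment
+    aln = Alignment({'a': 'ACGTACGTACGT', 'b': 'ACGTACGAACGT'}, MolType=DNA)
+    return build_tree_from_alignment(aln, moltype=DNA)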
diff --git a/bfillings/fasttree_v1.py b/bfillings/fasttree_v1.py
new file mode 100644
index 0000000..a887b6d
--- /dev/null
+++ b/bfillings/fasttree_v1.py
@@ -0,0 +1,145 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+
+"""Application controller for FastTree v1.0"""
+
+from burrito.parameters import ValuedParameter, FlagParameter
+from burrito.util import (CommandLineApplication, FilePath, system,
+ CommandLineAppResult, ResultPath, remove,
+ ApplicationError)
+
+from cogent.core.tree import PhyloNode
+from cogent.parse.tree import DndParser
+from cogent.core.moltype import DNA, RNA, PROTEIN
+
+
+class FastTree(CommandLineApplication):
+ """FastTree application Controller"""
+
+ _command = 'FastTree'
+ _input_handler = '_input_as_multiline_string'
+ _parameters = {
+ '-quiet':FlagParameter('-',Name='quiet'),
+ '-boot':ValuedParameter('-',Delimiter=' ',Name='boot'),
+ '-seed':ValuedParameter('-',Delimiter=' ',Name='seed'),
+ '-nni':ValuedParameter('-',Delimiter=' ',Name='nni'),
+ '-slow':FlagParameter('-',Name='slow'),
+ '-fastest':FlagParameter('-',Name='fastest'),
+ '-top':FlagParameter('-',Name='top'),
+ '-notop':FlagParameter('-',Name='notop'),
+ '-topm':ValuedParameter('-',Delimiter=' ',Name='topm'),
+ '-close':ValuedParameter('-',Delimiter=' ',Name='close'),
+ '-refresh':ValuedParameter('-',Delimiter=' ',Name='refresh'),
+ '-matrix':ValuedParameter('-',Delimiter=' ',Name='matrix'),
+ '-nomatrix':FlagParameter('-',Name='nomatrix'),
+ '-nj':FlagParameter('-',Name='nj'),
+ '-bionj':FlagParameter('-',Name='bionj'),
+ '-nt':FlagParameter('-',Name='nt'),
+ '-n':ValuedParameter('-',Delimiter=' ',Name='n')}
+
+ #FastTree [-quiet] [-boot 1000] [-seed 1253] [-nni 10] [-slow | -fastest]
+ # [-top | -notop] [-topm 1.0 [-close 0.75] [-refresh 0.8]]
+ # [-matrix Matrix | -nomatrix] [-nj | -bionj]
+ # [-nt] [-n 100] [alignment] > newick_tree
+
+ def __call__(self,data=None, remove_tmp=True):
+ """Run the application with the specified kwargs on data
+
+ data: anything that can be cast into a string or written out to
+ a file. Usually either a list of things or a single string or
+ number. input_handler will be called on this data before it
+ is passed as part of the command-line argument, so by creating
+ your own input handlers you can customize what kind of data
+ you want your application to accept
+
+ remove_tmp: if True, removes tmp files
+
+ NOTE: Override of the base class to handle redirected output
+ """
+ input_handler = self.InputHandler
+ suppress_stderr = self.SuppressStderr
+
+ outfile = self.getTmpFilename(self.TmpDir)
+ self._outfile = outfile
+
+ if suppress_stderr:
+ errfile = FilePath('/dev/null')
+ else:
+ errfile = FilePath(self.getTmpFilename(self.TmpDir))
+ if data is None:
+ input_arg = ''
+ else:
+ input_arg = getattr(self,input_handler)(data)
+
+ # Build up the command, consisting of a BaseCommand followed by
+ # input and output (file) specifications
+ command = self._command_delimiter.join(filter(None,\
+ [self.BaseCommand,str(input_arg),'>',str(outfile),'2>',\
+ str(errfile)]))
+ if self.HaltExec:
+ raise AssertionError, "Halted exec with command:\n" + command
+ # The return value of system is a 16-bit number containing the signal
+ # number that killed the process, and then the exit status.
+ # We only want to keep the exit status so do a right bitwise shift to
+ # get rid of the signal number byte
+ exit_status = system(command) >> 8
+
+ # Determine if an error should be raised due to the exit status of
+ # the application
+ if not self._accept_exit_status(exit_status):
+ raise ApplicationError, \
+ 'Unacceptable application exit status: %s, command: %s'\
+ % (str(exit_status),command)
+
+ out = open(outfile,"r")
+
+ err = None
+ if not suppress_stderr:
+ err = open(errfile,"r")
+
+ result = CommandLineAppResult(out,err,exit_status,\
+ result_paths=self._get_result_paths(data))
+
+ # Clean up the input file if one was created
+ if remove_tmp:
+ if self._input_filename:
+ remove(self._input_filename)
+ self._input_filename = None
+
+ return result
+
+ def _get_result_paths(self, data):
+ result = {}
+ result['Tree'] = ResultPath(Path=self._outfile)
+ return result
+
+def build_tree_from_alignment(aln, moltype=DNA, best_tree=False, params=None):
+ """Returns a tree from alignment
+
+ Will check MolType of aln object
+ """
+ if params is None:
+ params = {}
+
+ if moltype == DNA or moltype == RNA:
+ params['-nt'] = True
+ elif moltype == PROTEIN:
+ params['-nt'] = False
+ else:
+ raise ValueError, \
+ "FastTree does not support moltype: %s" % moltype.label
+
+ app = FastTree(params=params)
+
+ if best_tree:
+ raise NotImplementedError, "best_tree not implemented yet"
+ result = app(aln.toFasta())
+ tree = DndParser(result['Tree'].read(), constructor=PhyloNode)
+ return tree
diff --git a/bfillings/formatdb.py b/bfillings/formatdb.py
new file mode 100755
index 0000000..e089a4b
--- /dev/null
+++ b/bfillings/formatdb.py
@@ -0,0 +1,239 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+"""Application controller for formatdb (BLAST database formatting).
+
+File created on 16 Sep 2009.
+"""
+from __future__ import division
+from optparse import OptionParser
+from os.path import split, splitext
+from os import remove
+from glob import glob
+from tempfile import mkstemp
+
+from burrito.util import CommandLineApplication, ResultPath
+from burrito.parameters import ValuedParameter, FilePath
+
+
+class FormatDb(CommandLineApplication):
+ """ ApplicationController for formatting blast databases
+
+ Currently contains a minimal parameter set.
+ """
+
+ _command = 'formatdb'
+ _parameters = {
+ '-i': ValuedParameter(Prefix='-', Name='i', Delimiter=' ',
+ IsPath=True),
+ '-l': ValuedParameter(Prefix='-', Name='l', Delimiter=' ',
+ IsPath=True),
+ '-o': ValuedParameter(Prefix='-', Name='o', Delimiter=' ', Value='T'),
+ '-p': ValuedParameter(Prefix='-', Name='p', Delimiter=' ', Value='F'),
+ '-n': ValuedParameter(Prefix='-', Name='n', Delimiter=' ')
+ }
+ _input_handler = '_input_as_parameter'
+ _suppress_stdout = True
+ _suppress_stderr = True
+
+ def _input_as_parameter(self, data):
+ """ Set the input path and log path based on data (a fasta filepath)
+ """
+ self.Parameters['-i'].on(data)
+ # access data through self.Parameters so we know it's been cast
+ # to a FilePath
+ input_filepath = self.Parameters['-i'].Value
+ input_file_dir, input_filename = split(input_filepath)
+ input_file_base, input_file_ext = splitext(input_filename)
+ # FIXME: handle the remaining options; formatdb ignores the working
+ # directory if no name is passed.
+ self.Parameters['-l'].on(FilePath('%s.log' % input_filename))
+ self.Parameters['-n'].on(FilePath(input_filename))
+ return ''
+
+ def _get_result_paths(self, data):
+ """ Build the dict of result filepaths
+ """
+ # access data through self.Parameters so we know it's been cast
+ # to a FilePath
+ wd = self.WorkingDir
+ db_name = self.Parameters['-n'].Value
+ log_name = self.Parameters['-l'].Value
+ result = {}
+ result['log'] = ResultPath(Path=wd + log_name, IsWritten=True)
+ if self.Parameters['-p'].Value == 'F':
+ extensions = ['nhr', 'nin', 'nsq', 'nsd', 'nsi']
+ else:
+ extensions = ['phr', 'pin', 'psq', 'psd', 'psi']
+ for extension in extensions:
+ for file_path in glob(wd + (db_name + '*' + extension)):
+ # this will match e.g. nr.01.psd and nr.psd
+ key = file_path.split(db_name + '.')[1]
+ result_path = ResultPath(Path=file_path, IsWritten=True)
+ result[key] = result_path
+ return result
+
+ def _accept_exit_status(self, exit_status):
+ """ Return True when the exit status was 0
+ """
+ return exit_status == 0
+
+
+def build_blast_db_from_fasta_path(fasta_path, is_protein=False,
+ output_dir=None, HALT_EXEC=False):
+ """Build blast db from fasta_path; return db name and list of files created
+
+ **If using to create temporary blast databases, you can call
+ cogent.util.misc.remove_files(db_filepaths) to clean up all the
+ files created by formatdb when you're done with the database.
+
+ fasta_path: path to fasta file of sequences to build database from
+ is_protein: True if working on protein seqs (default: False)
+ output_dir: directory where output should be written
+ (default: directory containing fasta_path)
+ HALT_EXEC: halt just before running the formatdb command and
+ print the command -- useful for debugging
+ """
+ fasta_dir, fasta_filename = split(fasta_path)
+ if not output_dir:
+ output_dir = fasta_dir or '.'
+ # Will cd to this directory, so just pass the filename
+ # so the app is not confused by relative paths
+ fasta_path = fasta_filename
+
+ if not output_dir.endswith('/'):
+ db_name = output_dir + '/' + fasta_filename
+ else:
+ db_name = output_dir + fasta_filename
+
+ # instantiate the object
+ fdb = FormatDb(WorkingDir=output_dir, HALT_EXEC=HALT_EXEC)
+ if is_protein:
+ fdb.Parameters['-p'].on('T')
+ else:
+ fdb.Parameters['-p'].on('F')
+ app_result = fdb(fasta_path)
+ db_filepaths = []
+ for v in app_result.values():
+ try:
+ db_filepaths.append(v.name)
+ except AttributeError:
+ # not a file object, so no path to return
+ pass
+ return db_name, db_filepaths
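+
+# Minimal usage sketch (for illustration only; 'seqs.fasta' is a
+# hypothetical path). The returned db_filepaths list can be handed to
+# cogent.util.misc.remove_files for cleanup, as noted in the docstring.
+def _example_build_blast_db_from_fasta_path():
+    db_name, db_filepaths = build_blast_db_from_fasta_path('seqs.fasta',
+                                                           is_protein=False)
+    return db_name, db_filepaths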
+
+
+def build_blast_db_from_fasta_file(fasta_file, is_protein=False,
+ output_dir=None, HALT_EXEC=False):
+ """Build blast db from fasta_file; return db name and list of files created
+
+ **If using to create temporary blast databases, you can call
+ cogent.util.misc.remove_files(db_filepaths) to clean up all the
+ files created by formatdb when you're done with the database.
+
+ fasta_file: open file (or iterable of lines) of sequences to build the
+ database from
+ is_protein: True if working on protein seqs (default: False)
+ output_dir: directory where output should be written
+ (default: current directory)
+ HALT_EXEC: halt just before running the formatdb command and
+ print the command -- useful for debugging
+ """
+ output_dir = output_dir or '.'
+ _, fasta_path = mkstemp(dir=output_dir, prefix="BLAST_temp_db_",
+ suffix=".fasta")
+
+ fasta_f = open(fasta_path, 'w')
+ for line in fasta_file:
+ fasta_f.write('%s\n' % line.strip())
+ fasta_f.close()
+
+ blast_db, db_filepaths = build_blast_db_from_fasta_path(fasta_path,
+ is_protein=is_protein,
+ output_dir=None,
+ HALT_EXEC=HALT_EXEC
+ )
+
+ db_filepaths.append(fasta_path)
+
+ return blast_db, db_filepaths
+
+
+def build_blast_db_from_seqs(seqs, is_protein=False, output_dir='./',
+ HALT_EXEC=False):
+ """Build blast db from seqs; return db name and list of files created
+
+ **If using to create temporary blast databases, you can call
+ cogent.util.misc.remove_files(db_filepaths) to clean up all the
+ files created by formatdb when you're done with the database.
+
+ seqs: sequence collection or alignment object
+ is_protein: True if working on protein seqs (default: False)
+ output_dir: directory where output should be written
+ (default: current directory)
+ HALT_EXEC: halt just before running the formatdb command and
+ print the command -- useful for debugging
+ """
+
+ # Build a temp filepath
+ _, tmp_fasta_filepath = mkstemp(prefix='Blast_tmp_db', suffix='.fasta')
+ # open the temp file
+ tmp_fasta_file = open(tmp_fasta_filepath, 'w')
+ # write the sequence collection to file
+ tmp_fasta_file.write(seqs.toFasta())
+ tmp_fasta_file.close()
+
+ # build the blast database
+ db_name, db_filepaths = build_blast_db_from_fasta_path(tmp_fasta_filepath,
+ is_protein=is_protein,
+ output_dir=output_dir,
+ HALT_EXEC=HALT_EXEC)
+
+ # clean-up the temporary file
+ remove(tmp_fasta_filepath)
+
+ # return the results
+ return db_name, db_filepaths
+
+
+def parse_command_line_parameters():
+ """ Parses command line arguments """
+ usage = 'usage: %prog [options] fasta_filepath'
+ version = 'Version: %prog 0.1'
+ parser = OptionParser(usage=usage, version=version)
+
+ # A binary 'verbose' flag
+ parser.add_option('-p', '--is_protein', action='store_true',
+ dest='is_protein', default=False,
+ help='Pass if building db of protein sequences [default:'
+ ' False, nucleotide db]')
+
+ parser.add_option('-o', '--output_dir', action='store', type='string',
+ dest='output_dir', default=None,
+ help='the output directory [default: directory '
+ 'containing input fasta_filepath]')
+
+ opts, args = parser.parse_args()
+ num_args = 1
+ if len(args) != num_args:
+ parser.error('Must provide single filepath to build database from.')
+
+ return opts, args
+
+
+if __name__ == "__main__":
+ opts, args = parse_command_line_parameters()
+
+ fasta_filepath = args[0]
+ is_protein = opts.is_protein
+ output_dir = opts.output_dir
+
+ db_name, db_filepaths = build_blast_db_from_fasta_path(fasta_filepath,
+ is_protein=is_protein,
+ output_dir=output_dir
+ )
diff --git a/bfillings/infernal.py b/bfillings/infernal.py
new file mode 100644
index 0000000..788d074
--- /dev/null
+++ b/bfillings/infernal.py
@@ -0,0 +1,1571 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+"""
+Provides an application controller for the commandline version of:
+Infernal 1.0 and 1.0.2 only.
+"""
+from os import remove
+from tempfile import mkstemp
+
+from burrito.parameters import FlagParameter, ValuedParameter, FilePath
+from burrito.util import CommandLineApplication, ResultPath
+
+from cogent.core.alignment import SequenceCollection, Alignment, DataError
+from cogent.parse.rfam import (MinimalRfamParser, ChangedSequence,
+ ChangedRnaSequence, ChangedDnaSequence)
+from cogent.format.stockholm import stockholm_from_alignment
+from cogent.parse.infernal import CmsearchParser
+from cogent.core.moltype import DNA, RNA
+from cogent.struct.rna2d import ViennaStructure, wuss_to_vienna
+
+MOLTYPE_MAP = {'DNA':'--dna',\
+ DNA:'--dna',\
+ 'RNA':'--rna',\
+ RNA:'--rna',\
+ }
+
+SEQ_CONSTRUCTOR_MAP = {'DNA':ChangedDnaSequence,\
+ DNA:ChangedDnaSequence,\
+ 'RNA':ChangedRnaSequence,\
+ RNA:ChangedRnaSequence,\
+ }
+
+class Cmalign(CommandLineApplication):
+ """cmalign application controller."""
+ _options = {
+
+ # -o <f> Save the alignment in Stockholm format to a file <f>. The default
+ # is to write it to standard output.
+ '-o':ValuedParameter(Prefix='-',Name='o',Delimiter=' '),\
+
+ # -l Turn on the local alignment algorithm. Default is global.
+ '-l':FlagParameter(Prefix='-',Name='l'),\
+
+ # -p Annotate the alignment with posterior probabilities calculated using
+ # the Inside and Outside algorithms.
+ '-p':FlagParameter(Prefix='-',Name='p'),\
+
+ # -q Quiet; suppress the verbose banner, and only print the resulting
+ # alignment to stdout.
+ '-q':FlagParameter(Prefix='-',Name='q'),\
+
+ # --informat <s> Assert that the input seqfile is in format <s>. Do not run
+ # Babelfish format autodetection. Acceptable formats are: FASTA, EMBL,
+ # UNIPROT, GENBANK, and DDBJ. <s> is case-insensitive.
+ '--informat':ValuedParameter(Prefix='--',Name='informat',Delimiter=' '),\
+
+ # --mpi Run as an MPI parallel program. (see User's Guide for details).
+ '--mpi':FlagParameter(Prefix='--',Name='mpi'),\
+
+ # Expert Options
+
+ # --optacc Align sequences using the Durbin/Holmes optimal accuracy
+ # algorithm. This is default behavior, so this option is probably useless.
+ '--optacc':FlagParameter(Prefix='--',Name='optacc'),\
+
+ # --cyk Do not use the Durbin/Holmes optimal accuracy alignment to align the
+ # sequences, instead use the CYK algorithm which determines the optimally
+ # scoring alignment of the sequence to the model.
+ '--cyk':FlagParameter(Prefix='--',Name='cyk'),\
+
+ # --sample Sample an alignment from the posterior distribution of
+ # alignments.
+ '--sample':FlagParameter(Prefix='--',Name='sample'),\
+
+ # -s <n> Set the random number generator seed to <n>, where <n> is a
+ # positive integer. This option can only be used in combination with
+ # --sample. The default is to use time() to generate a different seed for
+ # each run, which means that two different runs of cmalign --sample on the
+ # same alignment will give slightly different results. You can use this
+ # option to generate reproducible results.
+ '-s':ValuedParameter(Prefix='-',Name='s',Delimiter=' '),\
+
+ # --viterbi Do not use the CM to align the sequences, instead use the HMM
+ # Viterbi algorithm to align with a CM Plan 9 HMM.
+ '--viterbi':FlagParameter(Prefix='--',Name='viterbi'),\
+
+ # --sub Turn on the sub model construction and alignment procedure.
+ '--sub':FlagParameter(Prefix='--',Name='sub'),\
+
+ # --small Use the divide and conquer CYK alignment algorithm described in
+ # SR Eddy, BMC Bioinformatics 3:18, 2002.
+ '--small':FlagParameter(Prefix='--',Name='small'),\
+
+ # --hbanded This option is turned on by default. Accelerate alignment by
+ # pruning away regions of the CM DP matrix that are deemed negligible by
+ # an HMM.
+ '--hbanded':FlagParameter(Prefix='--',Name='hbanded'),\
+
+ # --nonbanded Turns off HMM banding.
+ '--nonbanded':FlagParameter(Prefix='--',Name='nonbanded'),\
+
+ # --tau <x> Set the tail loss probability used during HMM band calculation
+ # to <x>.
+ '--tau':ValuedParameter(Prefix='--',Name='tau',Delimiter=' '),\
+
+ # --mxsize <x> Set the maximum allowable DP matrix size to <x> megabytes.
+ '--mxsize':ValuedParameter(Prefix='--',Name='mxsize',Delimiter=' '),\
+
+ # --rna Output the alignments as RNA sequence alignments. This is true by
+ # default.
+ '--rna':FlagParameter(Prefix='--',Name='rna'),\
+
+ # --dna Output the alignments as DNA sequence alignments.
+ '--dna':FlagParameter(Prefix='--',Name='dna'),\
+
+ # --matchonly Only include match columns in the output alignment, do not
+ # include any insertions relative to the consensus model.
+ '--matchonly':FlagParameter(Prefix='--',Name='matchonly'),\
+
+ # --resonly Only include match columns in the output alignment that have at
+ # least 1 residue (non-gap character) in them.
+ '--resonly':FlagParameter(Prefix='--',Name='resonly'),\
+
+ # --fins Change the behavior of how insert emissions are placed in the
+ # alignment.
+ '--fins':FlagParameter(Prefix='--',Name='fins'),\
+
+ # --onepost Modifies behavior of the -p option. Use only one character
+ # instead of two to annotate the posterior probability of each aligned
+ # residue.
+ '--onepost':FlagParameter(Prefix='--',Name='onepost'),\
+
+ # --withali <f> Reads an alignment from file <f> and aligns it as a single
+ # object to the CM; e.g. the alignment in <f> is held fixed.
+ '--withali':ValuedParameter(Prefix='--',Name='withali',Delimiter=' '),\
+
+ # --withpknots Must be used in combination with --withali <f>. Propogate
+ # structural information for any pseudoknots that exist in <f> to the
+ # output alignment.
+ '--withpknots':FlagParameter(Prefix='--',Name='withpknots'),\
+
+ # --rf Must be used in combination with --withali <f>. Specify that the
+ # alignment in <f> has the same "#=GC RF" annotation as the alignment file
+ # the CM was built from using cmbuild and further that the --rf option was
+ # supplied to cmbuild when the CM was constructed.
+ '--rf':FlagParameter(Prefix='--',Name='rf'),\
+
+ # --gapthresh <x> Must be used in combination with --withali <f>. Specify
+ # that the --gapthresh <x> option was supplied to cmbuild when the CM was
+ # constructed from the alignment file <f>.
+ '--gapthresh':ValuedParameter(Prefix='--',Name='gapthresh',Delimiter=' '),\
+
+ # --tfile <f> Dump tabular sequence tracebacks for each individual sequence
+ # to a file <f>. Primarily useful for debugging.
+ '--tfile':ValuedParameter(Prefix='--',Name='tfile',Delimiter=' '),\
+
+
+ }
+ _parameters = {}
+ _parameters.update(_options)
+ _command = "cmalign"
+ _suppress_stderr=True
+
+ def getHelp(self):
+ """Method that points to the Infernal documentation."""
+
+ help_str = \
+ """
+ See Infernal documentation at:
+ http://infernal.janelia.org/
+ """
+ return help_str
+
+ def _tempfile_as_multiline_string(self, data):
+ """Write a multiline string to a temp file and return the filename.
+
+ data: a multiline string to be written to a file.
+
+ * Note: the result will be the filename as a FilePath object
+ (which is a string subclass).
+
+ """
+ filename = FilePath(self.getTmpFilename(self.TmpDir))
+ data_file = open(filename,'w')
+ data_file.write(data)
+ data_file.close()
+ return filename
+
+ def _alignment_out_filename(self):
+
+ if self.Parameters['-o'].isOn():
+ refined_filename = self._absolute(str(\
+ self.Parameters['-o'].Value))
+ else:
+ raise ValueError, 'No alignment output file specified.'
+ return refined_filename
+
+ def _get_result_paths(self,data):
+ result = {}
+ if self.Parameters['-o'].isOn():
+ out_name = self._alignment_out_filename()
+ result['Alignment'] = ResultPath(Path=out_name,IsWritten=True)
+
+ return result
+
+class Cmbuild(CommandLineApplication):
+ """cmbuild application controller."""
+ _options = {
+
+ # -n <s> Name the covariance model <s>. (Does not work if alifile contains
+ # more than one alignment).
+ '-n':ValuedParameter(Prefix='-',Name='n',Delimiter=' '),\
+
+ # -A Append the CM to cmfile, if cmfile already exists.
+ '-A':FlagParameter(Prefix='-',Name='A'),\
+
+ # -F Allow cmfile to be overwritten. Normally, if cmfile already exists,
+ # cmbuild exits with an error unless the -A or -F option is set.
+ '-F':FlagParameter(Prefix='-',Name='F'),\
+
+ # -v Run in verbose output mode instead of using the default single line
+ # tabular format. This output format is similar to that used by older
+ # versions of Infernal.
+ '-v':FlagParameter(Prefix='-',Name='v'),\
+
+ # --iins Allow informative insert emissions for the CM. By default, all CM
+ # insert emission scores are set to 0.0 bits.
+ '--iins':FlagParameter(Prefix='--',Name='iins'),\
+
+ # --Wbeta<x> Set the beta tail loss probability for query-dependent banding
+ # (QDB) to <x> The QDB algorithm is used to determine the maximium length
+ # of a hit to the model. For more information on QDB see (Nawrocki and
+ # Eddy, PLoS Computational Biology 3(3): e56).
+ '--Wbeta':ValuedParameter(Prefix='--',Name='Wbeta',Delimiter=' '),\
+
+ # Expert Options
+
+ # --rsearch <f> Parameterize emission scores a la RSEARCH, using the
+ # RIBOSUM matrix in file <f>. For more information see the RSEARCH
+ # publication (Klein and Eddy, BMC Bioinformatics 4:44, 2003). Actually,
+ # the emission scores will not exactly With --rsearch enabled, all
+ # alignments in alifile must contain exactly one sequence or the --call
+ # option must also be enabled.
+ '--rsearch':ValuedParameter(Prefix='--',Name='rsearch',Delimiter=' '),\
+
+ # --binary Save the model in a compact binary format. The default is a more
+ # readable ASCII text format.
+ '--binary':FlagParameter(Prefix='--',Name='binary'),\
+
+ # --rf Use reference coordinate annotation (#=GC RF line, in Stockholm) to
+ # determine which columns are consensus, and which are inserts.
+ '--rf':FlagParameter(Prefix='--',Name='rf'),\
+
+ # --gapthresh <x> Set the gap threshold (used for determining which columns
+ # are insertions versus consensus; see --rf above) to <x>. The default is
+ # 0.5.
+ '--gapthresh':ValuedParameter(Prefix='--',Name='gapthresh',Delimiter=' '),\
+
+ # --ignorant Strip all base pair secondary structure information from all
+ # input alignments in alifile before building the CM(s).
+ '--ignorant':FlagParameter(Prefix='--',Name='ignorant'),\
+
+ # --wgsc Use the Gerstein/Sonnhammer/Chothia (GSC) weighting algorithm.
+ # This is the default unless the number of sequences in the alignment
+ # exceeds a cutoff (see --pbswitch), in which case the default becomes
+ # the faster Henikoff position-based weighting scheme.
+ '--wgsc':FlagParameter(Prefix='--',Name='wgsc'),\
+
+ # --wblosum Use the BLOSUM filtering algorithm to weight the sequences,
+ # instead of the default GSC weighting.
+ '--wblosum':FlagParameter(Prefix='--',Name='wblosum'),\
+
+ # --wpb Use the Henikoff position-based weighting scheme. This weighting
+ # scheme is automatically used (overriding --wgsc and --wblosum) if the
+ # number of sequences in the alignment exceeds a cutoff (see --pbswitch).
+ '--wpb':FlagParameter(Prefix='--',Name='wpb'),\
+
+ # --wnone Turn sequence weighting off; e.g. explicitly set all sequence
+ # weights to 1.0.
+ '--wnone':FlagParameter(Prefix='--',Name='wnone'),\
+
+ # --wgiven Use sequence weights as given in annotation in the input
+ # alignment file. If no weights were given, assume they are all 1.0.
+ # The default is to determine new sequence weights by the Gerstein/
+ # Sonnhammer/Chothia algorithm, ignoring any annotated weights.
+ '--wgiven':FlagParameter(Prefix='--',Name='wgiven'),\
+
+ # --pbswitch <n> Set the cutoff for automatically switching the weighting
+ # method to the Henikoff position-based weighting scheme to <n>. If the
+ # number of sequences in the alignment exceeds <n> Henikoff weighting is
+ # used. By default <n> is 5000.
+ '--pbswitch':ValuedParameter(Prefix='--',Name='pbswitch',Delimiter=' '),\
+
+ # --wid <x> Controls the behavior of the --wblosum weighting option by
+ # setting the percent identity for clustering the alignment to <x>.
+ '--wid':ValuedParameter(Prefix='--',Name='wid',Delimiter=' '),\
+
+ # --eent Use the entropy weighting strategy to determine the effective
+ # sequence number that gives a target mean match state relative entropy.
+ '--wgiven':FlagParameter(Prefix='--',Name='wgiven'),\
+
+ # --enone Turn off the entropy weighting strategy. The effective sequence
+ # number is just the number of sequences in the alignment.
+ '--wgiven':FlagParameter(Prefix='--',Name='wgiven'),\
+
+ # --ere <x> Set the target mean match state entropy as <x>. By default the
+ # target entropy 1.46 bits.
+ '--ere':ValuedParameter(Prefix='--',Name='ere',Delimiter=' '),\
+
+ # --null <f> Read a null model from <f>. The null model defines the
+ # probability of each RNA nucleotide in background sequence, the default
+ # is to use 0.25 for each nucleotide.
+ '--null':ValuedParameter(Prefix='--',Name='null',Delimiter=' '),\
+
+ # --prior <f> Read a Dirichlet prior from <f>, replacing the default mixture
+ # Dirichlet.
+ '--prior':ValuedParameter(Prefix='--',Name='prior',Delimiter=' '),\
+
+ # --ctarget <n> Cluster each alignment in alifile by percent identity.
+ # find a cutoff percent id threshold that gives exactly <n> clusters and
+ # build a separate CM from each cluster. If <n> is greater than the number
+ # of sequences in the alignment the program will not complain, and each
+ # sequence in the alignment will be its own cluster. Each CM will have a
+ # positive integer appended to its name indicating the order in which it
+ # was built.
+ '--ctarget':ValuedParameter(Prefix='--',Name='ctarget',Delimiter=' '),\
+
+ # --cmaxid <x> Cluster each sequence alignment in alifile by percent
+ # identity. Define clusters at the cutoff fractional id similarity of <x>
+ # and build a separate CM from each cluster.
+ '--cmaxid':ValuedParameter(Prefix='--',Name='cmaxid',Delimiter=' '),\
+
+ # --call Build a separate CM from each sequence in each alignment in
+ # alifile. Naming of CMs takes place as described above for --ctarget.
+ '--call':FlagParameter(Prefix='--',Name='call'),\
+
+ # --corig After building multiple CMs using --ctarget, --cmindiff or --call
+ # as described above, build a final CM using the complete original
+ # alignment from alifile.
+ '--corig':FlagParameter(Prefix='--',Name='corig'),\
+
+ # --cdump<f> Dump the multiple alignments of each cluster to <f> in
+ # Stockholm format. This option only works in combination with --ctarget,
+ # --cmindiff or --call.
+ '--cdump':ValuedParameter(Prefix='--',Name='cdump',Delimiter=' '),\
+
+ # --refine <f> Attempt to refine the alignment before building the CM using
+ # expectation-maximization (EM). The final alignment (the alignment used
+ # to build the CM that gets written to cmfile) is written to <f>.
+ '--refine':ValuedParameter(Prefix='--',Name='refine',Delimiter=' '),\
+
+ # --gibbs Modifies the behavior of --refine so Gibbs sampling is used
+ # instead of EM.
+ '--gibbs':FlagParameter(Prefix='--',Name='gibbs'),\
+
+ # -s <n> Set the random seed to <n>, where <n> is a positive integer.
+ # This option can only be used in combination with --gibbs. The default is
+ # to use time() to generate a different seed for each run, which means
+ # that two different runs of cmbuild --refine <f> --gibbs on the same
+ # alignment will give slightly different results. You can use this option
+ # to generate reproducible results.
+ '-s':ValuedParameter(Prefix='-',Name='s',Delimiter=' '),\
+
+ # -l With --refine, turn on the local alignment algorithm, which allows the
+ # alignment to span two or more subsequences if necessary (e.g. if the
+ # structures of the query model and target sequence are only partially
+ # shared), allowing certain large insertions and deletions in the
+ # structure to be penalized differently than normal indels. The default is
+ # to globally align the query model to the target sequences.
+ '-l':ValuedParameter(Prefix='-',Name='l',Delimiter=' '),\
+
+ # -a With --refine, print the scores of each individual sequence alignment.
+ '-a':ValuedParameter(Prefix='-',Name='a',Delimiter=' '),\
+
+ # --cyk With --refine, align with the CYK algorithm.
+ '--cyk':FlagParameter(Prefix='--',Name='cyk'),\
+
+ # --sub With --refine, turn on the sub model construction and alignment
+ # procedure.
+ '--sub':FlagParameter(Prefix='--',Name='sub'),\
+
+ # --nonbanded With --refine, do not use HMM bands to accelerate alignment.
+ # Use the full CYK algorithm which is guaranteed to give the optimal
+ # alignment. This will slow down the run significantly, especially for
+ # large models.
+ '--nonbanded':FlagParameter(Prefix='--',Name='nonbanded'),\
+
+ # --tau <x> With --refine, set the tail loss probability used during HMM
+ # band calculation to <f>. This is the amount of probability mass within
+ # the HMM posterior probabilities that is considered negligible. The
+ # default value is 1E-7. In general, higher values will result in greater
+ # acceleration, but increase the chance of missing the optimal alignment
+ # due to the HMM bands.
+ '--tau':ValuedParameter(Prefix='--',Name='tau',Delimiter=' '),\
+
+ # --fins With --refine, change the behavior of how insert emissions are
+ # placed in the alignment.
+ '--fins':FlagParameter(Prefix='--',Name='fins'),\
+
+ # --mxsize <x> With --refine, set the maximum allowable matrix size for
+ # alignment to <x> megabytes.
+ '--mxsize':ValuedParameter(Prefix='--',Name='mxsize',Delimiter=' '),\
+
+ # --rdump<x> With --refine, output the intermediate alignments at each
+ # iteration of the refinement procedure (as described above for --refine )
+ # to file <f>.
+ '--rdump':ValuedParameter(Prefix='--',Name='rdump',Delimiter=' '),\
+
+ }
+ _parameters = {}
+ _parameters.update(_options)
+ _command = "cmbuild"
+ _suppress_stderr=True
+
+ def getHelp(self):
+ """Method that points to the Infernal documentation."""
+
+ help_str = \
+ """
+ See Infernal documentation at:
+ http://infernal.janelia.org/
+ """
+ return help_str
+
+ def _refine_out_filename(self):
+
+ if self.Parameters['--refine'].isOn():
+ refined_filename = self._absolute(str(\
+ self.Parameters['--refine'].Value))
+ else:
+ raise ValueError, 'No refine output file specified.'
+ return refined_filename
+
+ def _cm_out_filename(self):
+
+ if self.Parameters['-n'].isOn():
+ refined_filename = self._absolute(str(\
+ self.Parameters['-n'].Value))
+ else:
+ raise ValueError, 'No cm output file specified.'
+ return refined_filename
+
+ def _tempfile_as_multiline_string(self, data):
+ """Write a multiline string to a temp file and return the filename.
+
+ data: a multiline string to be written to a file.
+
+ * Note: the result will be the filename as a FilePath object
+ (which is a string subclass).
+
+ """
+ filename = FilePath(self.getTmpFilename(self.TmpDir))
+ data_file = open(filename,'w')
+ data_file.write(data)
+ data_file.close()
+ return filename
+
+ def _get_result_paths(self,data):
+ result = {}
+ if self.Parameters['--refine'].isOn():
+ out_name = self._refine_out_filename()
+ result['Refined'] = ResultPath(Path=out_name,IsWritten=True)
+ if self.Parameters['-n'].isOn():
+ cm_name = self._cm_out_filename()
+ result['CmFile'] = ResultPath(Path=cm_name,IsWritten=True)
+
+ return result
+
+
+class Cmcalibrate(CommandLineApplication):
+ """cmcalibrate application controller."""
+ _options = {
+
+ # -s <n> Set the random number generator seed to <n>, where <n> is a
+ # positive integer. The default is to use time() to generate a different
+ # seed for each run, which means that two different runs of cmcalibrate on
+ # the same CM will give slightly different E-value and HMM filter
+ # threshold parameters. You can use this option to generate reproducible
+ # results.
+ '-s':ValuedParameter(Prefix='-',Name='s',Delimiter=' '),\
+
+ # --forecast <n> Predict the running time of the calibration for cmfile and
+ # provided options and exit, DO NOT perform the calibration.
+ '--forecast':ValuedParameter(Prefix='--',Name='forecast',Delimiter=' '),\
+
+ # --mpi Run as an MPI parallel program.
+ '--mpi':FlagParameter(Prefix='--',Name='mpi'),\
+
+ # Expert Options
+
+ # --exp-cmL-glc <x> Set the length of random sequence to search for the CM
+ # glocal exponential tail fits to <x> megabases (Mb).
+ '--exp-cmL-glc':ValuedParameter(Prefix='--',Name='exp-cmL-glc',\
+ Delimiter=' '),\
+
+ # --exp-cmL-loc <x> Set the length of random sequence to search for the CM
+ # local exponential tail fits to <x> megabases (Mb).
+ '--exp-cmL-loc':ValuedParameter(Prefix='--',Name='exp-cmL-loc',\
+ Delimiter=' '),\
+
+ # --exp-hmmLn-glc <x> Set the minimum random sequence length to search for
+ # the HMM glocal exponential tail fits to <x> megabases (Mb).
+ '--exp-hmmLn-glc':ValuedParameter(Prefix='--',Name='exp-hmmLn-glc',\
+ Delimiter=' '),\
+
+ # --exp-hmmLn-loc <x> Set the minimum random sequence length to search for
+ # the HMM local exponential tail fits to <x> megabases (Mb).
+ '--exp-hmmLn-loc':ValuedParameter(Prefix='--',Name='exp-hmmLn-loc',\
+ Delimiter=' '),\
+
+ # --exp-hmmLx <x> Set the maximum random sequence length to search when
+ # determining HMM E-values to <x> megabases (Mb).
+ '--exp-hmmLx':ValuedParameter(Prefix='--',Name='exp-hmmLx',Delimiter=' '),\
+
+ # --exp-fract <x> Set the HMM/CM fraction of dynamic programming
+ # calculations to <x>.
+ '--exp-fract':ValuedParameter(Prefix='--',Name='exp-fract',Delimiter=' '),\
+
+ # --exp-tailn-cglc <x> During E-value calibration of glocal CM search modes
+ # fit the exponential tail to the high scores in the histogram tail that
+ # includes <x> hits per Mb searched.
+ '--exp-tailn-cglc':ValuedParameter(Prefix='--',Name='exp-tailn-cglc',\
+ Delimiter=' '),\
+
+ # --exp-tailn-cloc <x> During E-value calibration of local CM search modes
+ # fit the exponential tail to the high scores in the histogram tail that
+ # includes <x> hits per Mb searched.
+ '--exp-tailn-cloc':ValuedParameter(Prefix='--',Name='exp-tailn-cloc',\
+ Delimiter=' '),\
+
+ # --exp-tailn-hglc <x> During E-value calibration of glocal HMM search modes
+ # fit the exponential tail to the high scores in the histogram tail that
+ # includes <x> hits per Mb searched.
+ '--exp-tailn-hglc':ValuedParameter(Prefix='--',Name='exp-tailn-hglc',\
+ Delimiter=' '),\
+
+ # --exp-tailn-hloc <x> During E-value calibration of local HMM search modes
+ # fit the exponential tail to the high scores in the histogram tail that
+ # includes <x> hits per Mb searched.
+ '--exp-tailn-hloc':ValuedParameter(Prefix='--',Name='exp-tailn-hloc',\
+ Delimiter=' '),\
+
+ # --exp-tailp <x> Ignore the --exp-tailn prefixed options and fit the <x>
+ # fraction right tail of the histogram to exponential tails, for all
+ # search modes.
+ '--exp-tailp':ValuedParameter(Prefix='--',Name='exp-tailp',Delimiter=' '),\
+
+ # --exp-tailxn <n> With --exp-tailp enforce that the maximum number of hits
+ # in the tail that is fit is <n>.
+ '--exp-tailxn':ValuedParameter(Prefix='--',Name='exp-tailxn',\
+ Delimiter=' '),\
+
+ # --exp-beta <x> During E-value calibration, by default query-dependent
+ # banding (QDB) is used to accelerate the CM search algorithms with a beta
+ # tail loss probability of 1E-15.
+ '--exp-beta':ValuedParameter(Prefix='--',Name='exp-beta',Delimiter=' '),\
+
+ # --exp-no-qdb Turn of QDB during E-value calibration. This will slow down
+ # calibration, and is not recommended unless you plan on using --no-qdb in
+ # cmsearch.
+ '--exp-no-qdb':FlagParameter(Prefix='--',Name='exp-no-qdb'),\
+
+ # --exp-hfile <f> Save the histograms fit for the E-value calibration to
+ # file <f>. The format of this file is two tab delimited columns.
+ '--exp-hfile':ValuedParameter(Prefix='--',Name='exp-hfile',Delimiter=' '),\
+
+ # --exp-sfile <f> Save a survival plot for the E-value calibration to file
+ # <f>. The format of this file is two tab delimited columns.
+ '--exp-sfile':ValuedParameter(Prefix='--',Name='exp-sfile',Delimiter=' '),\
+
+ # --exp-qqfile <f> Save a quantile-quantile plot for the E-value calibration
+ # to file <f>. The format of this file is two tab delimited columns.
+ '--exp-qqfile':ValuedParameter(Prefix='--',Name='exp-qqfile',\
+ Delimiter=' '),\
+
+ # --exp-ffile <f> Save statistics on the exponential tail statistics to file
+ # <f>. The file will contain the lambda and mu values for exponential
+ # tails fit to tails of different sizes.
+ '--exp-ffile':ValuedParameter(Prefix='--',Name='exp-ffile',Delimiter=' '),\
+
+ # --fil-N <n> Set the number of sequences sampled and searched for the HMM
+ # filter threshold calibration to <n>. By default, <n> is 10,000.
+ '--fil-N':ValuedParameter(Prefix='--',Name='fil-N',Delimiter=' '),\
+
+ # --fil-F <x> Set the fraction of sample sequences the HMM filter must be
+ # able to recognize, and allow to survive, to <x>, where <x> is a positive
+ # real number less than or equal to 1.0. By default, <x> is 0.995.
+ '--fil-F':ValuedParameter(Prefix='--',Name='fil-F',Delimiter=' '),\
+
+ # --fil-xhmm <x> Set the target number of dynamic programming calculations
+ # for a HMM filtered CM QDB search with beta = 1E-7 to <x> times the
+ # number of calculations required to do an HMM search. By default, <x> is
+ # 2.0.
+ '--fil-xhmm':ValuedParameter(Prefix='--',Name='fil-xhmm',Delimiter=' '),\
+
+ # --fil-tau <x> Set the tail loss probability during HMM band calculation
+ # for HMM filter threshold calibration to <x>.
+ '--fil-tau':ValuedParameter(Prefix='--',Name='fil-tau',Delimiter=' '),\
+
+ # --fil-gemit During HMM filter calibration, always sample sequences from a
+ # globally configured CM, even when calibrating local modes.
+ '--fil-gemit':FlagParameter(Prefix='--',Name='fil-gemit'),\
+
+ # --fil-dfile <f> Save statistics on filter threshold calibration, including
+ # HMM and CM scores for all sampled sequences, to file <f>.
+ '--fil-dfile':ValuedParameter(Prefix='--',Name='fil-dfile',Delimiter=' '),\
+
+ # --mxsize <x> Set the maximum allowable DP matrix size to <x> megabytes.
+ '--mxsize':ValuedParameter(Prefix='--',Name='mxsize',Delimiter=' '),\
+
+ }
+
+ _parameters = {}
+ _parameters.update(_options)
+ _command = "cmcalibrate"
+ _suppress_stderr=True
+
+ def getHelp(self):
+ """Method that points to the Infernal documentation."""
+
+ help_str = \
+ """
+ See Infernal documentation at:
+ http://infernal.janelia.org/
+ """
+ return help_str
+
+class Cmemit(CommandLineApplication):
+ """cmemit application controller."""
+ _options = {
+
+ # -o <f> Save the synthetic sequences to file <f> rather than writing them
+ # to stdout.
+ '-o':ValuedParameter(Prefix='-',Name='o',Delimiter=' '),\
+
+ # -n <n> Generate <n> sequences. Default is 10.
+ '-n':ValuedParameter(Prefix='-',Name='n',Delimiter=' '),\
+
+ # -u Write the generated sequences in unaligned format (FASTA). This is the
+ # default, so this option is probably useless.
+ '-u':FlagParameter(Prefix='-',Name='u'),\
+
+ # -a Write the generated sequences in an aligned format (STOCKHOLM) with
+ # consensus structure annotation rather than FASTA.
+ '-a':FlagParameter(Prefix='-',Name='a'),\
+
+ # -c Predict a single majority-rule consensus sequence instead of sampling
+ # sequences from the CM's probability distribution.
+ '-c':FlagParameter(Prefix='-',Name='c'),\
+
+ # -l Configure the CMs into local mode before emitting sequences. See the
+ # User's Guide for more information on locally configured CMs.
+ '-l':FlagParameter(Prefix='-',Name='l'),\
+
+ # -s <n> Set the random seed to <n>, where <n> is a positive integer. The
+ # default is to use time() to generate a different seed for each run,
+ # which means that two different runs of cmemit on the same CM will give
+ # different results. You can use this option to generate reproducible
+ # results.
+ '-s':ValuedParameter(Prefix='-',Name='s',Delimiter=' '),\
+
+ # --rna Specify that the emitted sequences be output as RNA sequences. This
+ # is true by default.
+ '--rna':FlagParameter(Prefix='--',Name='rna'),\
+
+ # --dna Specify that the emitted sequences be output as DNA sequences. By
+ # default, the output alphabet is RNA.
+ '--dna':FlagParameter(Prefix='--',Name='dna'),\
+
+ # --tfile <f> Dump tabular sequence parsetrees (tracebacks) for each emitted
+ # sequence to file <f>. Primarily useful for debugging.
+ '--tfile':ValuedParameter(Prefix='--',Name='tfile',Delimiter=' '),\
+
+ # --exp <x> Exponentiate the emission and transition probabilities of the CM
+ # by <x> and then renormalize those distributions before emitting
+ # sequences.
+ '--exp':ValuedParameter(Prefix='--',Name='exp',Delimiter=' '),\
+
+ # --begin <n> Truncate the resulting alignment by removing all residues
+ # before consensus column <n>, where <n> is a positive integer no greater
+ # than the consensus length of the CM. Must be used in combination with
+ # --end and either -a or --shmm (a developer option).
+ '--begin':ValuedParameter(Prefix='--',Name='begin',Delimiter=' '),\
+
+ # --end <n> Truncate the resulting alignment by removing all residues after
+ # consensus column <n>, where <n> is a positive integer no greater than
+ # the consensus length of the CM. Must be used in combination with --begin
+ # and either -a or --shmm (a developer option).
+ '--end':ValuedParameter(Prefix='--',Name='end',Delimiter=' '),\
+
+ }
+ _parameters = {}
+ _parameters.update(_options)
+ _command = "cmemit"
+ _suppress_stderr=True
+
+ def getHelp(self):
+ """Method that points to the Infernal documentation."""
+
+ help_str = \
+ """
+ See Infernal documentation at:
+ http://infernal.janelia.org/
+ """
+ return help_str
+
+class Cmscore(CommandLineApplication):
+ """cmscore application controller."""
+ _options = {
+
+ # -n <n> Set the number of sequences to generate and align to <n>. This
+ # option is incompatible with the --infile option.
+ '-n':ValuedParameter(Prefix='-',Name='n',Delimiter=' '),\
+
+ # -l Turn on the local alignment algorithm, which allows the alignment to
+ # span two or more subsequences if necessary (e.g. if the structures of
+ # the query model and target sequence are only partially shared), allowing
+ # certain large insertions and deletions in the structure to be penalized
+ # differently than normal indels. The default is to globally align the
+ # query model to the target sequences.
+ '-l':FlagParameter(Prefix='-',Name='l'),\
+
+ # -s <n> Set the random seed to <n>, where <n> is a positive integer. The
+ # default is to use time() to generate a different seed for each run,
+ # which means that two different runs of cmscore on the same CM will give
+ # different results. You can use this option to generate reproducible
+ # results. The random number generator is used to generate sequences to
+ # score, so -s is incompatible with the --infile option which supplies
+ # the sequences to score in an input file.
+ '-s':ValuedParameter(Prefix='-',Name='s',Delimiter=' '),\
+
+ # -a Print individual timings and score comparisons for each sequence in
+ # seqfile. By default only summary statistics are printed.
+ '-a':FlagParameter(Prefix='-',Name='a'),\
+
+ # --sub Turn on the sub model construction and alignment procedure.
+ '--sub':FlagParameter(Prefix='--',Name='sub'),\
+
+ # --mxsize <x> Set the maximum allowable DP matrix size to <x> megabytes.
+ '--mxsize':ValuedParameter(Prefix='--',Name='mxsize',Delimiter=' '),\
+
+ # --mpi Run as an MPI parallel program.
+ '--mpi':FlagParameter(Prefix='--',Name='mpi'),\
+
+ # Expert Options
+
+ # --emit Generate sequences to score by sampling from the CM.
+ '--emit':FlagParameter(Prefix='--',Name='emit'),\
+
+ # --random Generate sequences to score by sampling from the CMs null
+ # distribution. This option turns the --emit option off.
+ '--random':FlagParameter(Prefix='--',Name='random'),\
+
+ # --infile <f> Sequences to score are read from the file <f>. All the
+ # sequences from <f> are read and scored, the -n and -s options are
+ # incompatible with --infile.
+ '--infile':ValuedParameter(Prefix='--',Name='infile',Delimiter=' '),\
+
+ # --outfile <f> Save generated sequences that are scored to the file <f> in
+ # FASTA format. This option is incompatible with the --infile option.
+ '--outfile':ValuedParameter(Prefix='--',Name='outfile',Delimiter=' '),\
+
+ # --Lmin <n1> Must be used in combination with --random and --Lmax <n2>.
+ '--Lmin':ValuedParameter(Prefix='--',Name='Lmin',Delimiter=' '),\
+
+ # --pad Must be used in combination with --emit and --search. Add <n> cm->W
+ # (max hit length) minus L (sequence <x> length) residues to the 5' and 3'
+ # end of each emitted sequence <x>.
+ '--pad':FlagParameter(Prefix='--',Name='pad'),\
+
+ # --hbanded Specify that the second stage alignment algorithm be HMM banded
+ # CYK. This option is on by default.
+ '--hbanded':FlagParameter(Prefix='--',Name='hbanded'),\
+
+ # --tau <x> For stage 2 alignment, set the tail loss probability used during
+ # HMM band calculation to <x>.
+ '--tau':ValuedParameter(Prefix='--',Name='tau',Delimiter=' '),\
+
+ # --aln2bands With --search, when calculating HMM bands, use an HMM
+ # alignment algorithm instead of an HMM search algorithm.
+ '--aln2bands':FlagParameter(Prefix='--',Name='aln2bands'),\
+
+ # --hsafe For stage 2 HMM banded alignment, realign any sequences with a
+ # negative alignment score using non-banded CYK to guarantee finding the
+ # optimal alignment.
+ '--hsafe':FlagParameter(Prefix='--',Name='hsafe'),\
+
+ # --nonbanded Specify that the second stage alignment algorithm be standard,
+ # non-banded, non-D&C CYK. When --nonbanded is enabled, the program fails
+ # with a non-zero exit code and prints an error message if the parsetree
+ # score for any sequence from stage 1 D&C alignment and stage 2 alignment
+ # differs by more than 0.01 bits. In theory, this should never happen as
+ # both algorithms are guaranteed to determine the optimal parsetree. For
+ # larger RNAs (more than 300 residues) if memory is limiting, --nonbanded
+ # should be used in combination with --scoreonly.
+ '--nonbanded':FlagParameter(Prefix='--',Name='nonbanded'),\
+
+ # --scoreonly With --nonbanded during the second stage standard non-banded
+ # CYK alignment, use the "score only" variant of the algorithm to save
+ # memory, and don't recover a parse tree.
+ '--scoreonly':FlagParameter(Prefix='--',Name='scoreonly'),\
+
+ # --viterbi Specify that the second stage alignment algorithm be Viterbi to
+ # a CM Plan 9 HMM.
+ '--viterbi':FlagParameter(Prefix='--',Name='viterbi'),\
+
+ # --search Run all algorithms in scanning mode, not alignment mode.
+ '--search':FlagParameter(Prefix='--',Name='search'),\
+
+ # --inside With --search Compare the non-banded scanning Inside algorithm to
+ # the HMM banded scanning Inside algorith, instead of using CYK versions.
+ '--inside':FlagParameter(Prefix='--',Name='inside'),\
+
+ # --forward With --search Compare the scanning Forward scoring algorithm
+ # against CYK.
+ '--forward':FlagParameter(Prefix='--',Name='forward'),\
+
+ # --taus <n> Specify the first alignment algorithm as non-banded D&C CYK,
+ # and multiple stages of HMM banded CYK alignment. The first HMM banded
+ # alignment will use tau=1E-<x>, which will be the highest value of tau
+ # used. Must be used in combination with --taue.
+ '--taus':ValuedParameter(Prefix='--',Name='taus',Delimiter=' '),\
+
+ # --taue <n> Specify the first alignment algorithm as non-banded D&C CYK,
+ # and multiple stages of HMM banded CYK alignment. The final HMM banded
+ # alignment will use tau=1E-<x>, which will be the lowest value of tau
+ # used. Must be used in combination with --taus.
+ '--taue':ValuedParameter(Prefix='--',Name='taue',Delimiter=' '),\
+
+ # --tfile <f> Print the parsetrees for each alignment of each sequence to
+ # file <f>.
+ '--tfile':ValuedParameter(Prefix='--',Name='tfile',Delimiter=' '),\
+
+ }
+ _parameters = {}
+ _parameters.update(_options)
+ _command = "cmscore"
+ _suppress_stderr=True
+
+ def getHelp(self):
+ """Method that points to the Infernal documentation."""
+
+ help_str = \
+ """
+ See Infernal documentation at:
+ http://infernal.janelia.org/
+ """
+ return help_str
+
+class Cmsearch(CommandLineApplication):
+ """cmsearch application controller."""
+ _options = {
+
+ # -o <f> Save the high-scoring alignments of hits to a file <f>. The default
+ # is to write them to standard output.
+ '-o':ValuedParameter(Prefix='-',Name='o',Delimiter=' '),\
+
+ # -g <f> Turn on the 'glocal' alignment algorithm, local with respect to the
+ # target database, and global with respect to the model. By default, the
+ # local alignment algorithm is used which is local with respect to both
+ # the target sequence and the model.
+ '-g':ValuedParameter(Prefix='-',Name='g',Delimiter=' '),\
+
+ # -p Append posterior probabilities to alignments of hits.
+ '-p':FlagParameter(Prefix='-',Name='p'),\
+
+ # -x Annotate non-compensatory basepairs and basepairs that include a gap in
+ # the left and/or right half of the pair with x's in the alignments of
+ # hits.
+ '-x':FlagParameter(Prefix='-',Name='x'),\
+
+ # -Z <x> Calculate E-values as if the target database size was <x> megabases
+ # (Mb). Ignore the actual size of the database. This option is only valid
+ # if the CM file has been calibrated. Warning: the predictions for timings
+ # and survival fractions will be calculated as if the database was of size
+ # <x> Mb, which means they will be inaccurate.
+ '-Z':ValuedParameter(Prefix='-',Name='Z',Delimiter=' '),\
+
+ # --toponly Only search the top (Watson) strand of the sequences in seqfile.
+ # By default, both strands are searched.
+ '--toponly':FlagParameter(Prefix='--',Name='toponly'),\
+
+ # --bottomonly Only search the bottom (Crick) strand of the sequences in
+ # seqfile. By default, both strands are searched.
+ '--bottomonly':FlagParameter(Prefix='--',Name='bottomonly'),\
+
+ # --forecast <n> Predict the running time of the search with provided files
+ # and options and exit, DO NOT perform the search. This option is only
+ # available with calibrated CM files.
+ '--forecast':ValuedParameter(Prefix='--',Name='forecast',Delimiter=' '),\
+
+ # --informat <s> Assert that the input seqfile is in format <s>. Do not run
+ # Babelfish format autodection. This increases the reliability of the
+ # program somewhat, because the Babelfish can make mistakes; particularly
+ # recommended for unattended, high-throughput runs of @PACKAGE at . <s> is
+ # case-insensitive. Acceptable formats are: FASTA, EMBL, UNIPROT, GENBANK,
+ # and DDBJ. <s> is case-insensitive.
+ '--informat':ValuedParameter(Prefix='--',Name='informat',Delimiter=' '),\
+
+ # --mxsize <x> Set the maximum allowable DP matrix size to <x> megabytes.
+ '--mxsize':ValuedParameter(Prefix='--',Name='mxsize',Delimiter=' '),\
+
+ # --mpi Run as an MPI parallel program.
+ '--mpi':FlagParameter(Prefix='--',Name='mpi'),\
+
+ # Expert Options
+
+ # --inside Use the Inside algorithm for the final round of searching. This
+ # is true by default.
+ '--inside':FlagParameter(Prefix='--',Name='inside'),\
+
+ # --cyk Use the CYK algorithm for the final round of searching.
+ '--cyk':FlagParameter(Prefix='--',Name='cyk'),\
+
+ # --viterbi Search only with an HMM. This is much faster but less sensitive
+ # than a CM search. Use the Viterbi algorithm for the HMM search.
+ '--viterbi':FlagParameter(Prefix='--',Name='viterbi'),\
+
+ # --forward Search only with an HMM. This is much faster but less sensitive
+ # than a CM search. Use the Forward algorithm for the HMM search.
+ '--forward':FlagParameter(Prefix='--',Name='forward'),\
+
+ # -E <x> Set the E-value cutoff for the per-sequence/strand ranked hit list
+ # to <x>, where <x> is a positive real number.
+ '-E':ValuedParameter(Prefix='-',Name='E',Delimiter=' '),\
+
+ # -T <x> Set the bit score cutoff for the per-sequence ranked hit list to
+ # <x>, where <x> is a positive real number.
+ '-T':ValuedParameter(Prefix='-',Name='T',Delimiter=' '),\
+
+ # --nc Set the bit score cutoff as the NC cutoff value used by Rfam curators
+ # as the noise cutoff score.
+ '--nc':FlagParameter(Prefix='--',Name='nc'),\
+
+ # --ga Set the bit score cutoff as the GA cutoff value used by Rfam curators
+ # as the gathering threshold.
+ '--ga':FlagParameter(Prefix='--',Name='ga'),\
+
+ # --tc Set the bit score cutoff as the TC cutoff value used by Rfam curators
+ # as the trusted cutoff.
+ '--tc':FlagParameter(Prefix='--',Name='tc'),\
+
+ # --no-qdb Do not use query-dependent banding (QDB) for the final round of
+ # search.
+ '--no-qdb':FlagParameter(Prefix='--',Name='no-qdb'),\
+
+ # --beta " <x>" For query-dependent banding (QDB) during the final round of
+ # search, set the beta parameter to <x> where <x> is any positive real
+ # number less than 1.0.
+ '--beta':ValuedParameter(Prefix='--',Name='beta',Delimiter=' '),\
+
+ # --hbanded Use HMM bands to accelerate the final round of search.
+ # Constraints for the CM search are derived from posterior probabilities
+ # from an HMM. This is an experimental option and it is not recommended
+ # for use unless you know exactly what you're doing.
+ '--hbanded':FlagParameter(Prefix='--',Name='hbanded'),\
+
+ # --tau <x> Set the tail loss probability during HMM band calculation to
+ # <x>.
+ '--tau':ValuedParameter(Prefix='--',Name='tau',Delimiter=' '),\
+
+ # --fil-no-hmm Turn the HMM filter off.
+ '--fil-no-hmm':FlagParameter(Prefix='--',Name='fil-no-hmm'),\
+
+ # --fil-no-qdb Turn the QDB filter off.
+ '--fil-no-qdb':FlagParameter(Prefix='--',Name='fil-no-qdb'),\
+
+ # --fil-beta For the QDB filter, set the beta parameter to <x> where <x> is
+ # any positive real number less than 1.0.
+ '--fil-beta':FlagParameter(Prefix='--',Name='fil-beta'),\
+
+ # --fil-T-qdb <x> Set the bit score cutoff for the QDB filter round to <x>,
+ # where <x> is a positive real number.
+ '--fil-T-qdb':ValuedParameter(Prefix='--',Name='fil-T-qdb',Delimiter=' '),\
+
+ # --fil-T-hmm <x> Set the bit score cutoff for the HMM filter round to <x>,
+ # where <x> is a positive real number.
+ '--fil-T-hmm':ValuedParameter(Prefix='--',Name='fil-T-hmm',Delimiter=' '),\
+
+ # --fil-E-qdb <x> Set the E-value cutoff for the QDB filter round. <x>,
+ # where <x> is a positive real number. Hits with E-values better than
+ # (less than) or equal to this threshold will survive and be passed to the
+ # final round. This option is only available if the CM file has been
+ # calibrated.
+ '--fil-E-qdb':ValuedParameter(Prefix='--',Name='fil-E-qdb',Delimiter=' '),\
+
+ # --fil-E-hmm <x> Set the E-value cutoff for the HMM filter round. <x>,
+ # where <x> is a positive real number. Hits with E-values better than
+ # (less than) or equal to this threshold will survive and be passed to the
+ # next round, either a QDB filter round, or if the QDB filter is disable,
+ # to the final round of search. This option is only available if the CM
+ # file has been calibrated.
+ '--fil-E-hmm':ValuedParameter(Prefix='--',Name='fil-E-hmm',Delimiter=' '),\
+
+ # --fil-Smax-hmm <x> Set the maximum predicted survival fraction for an HMM
+ # filter as <x>, where <x> is a positive real number less than 1.0.
+ '--fil-Smax-hmm':ValuedParameter(Prefix='--',Name='fil-Smax-hmm',\
+ Delimiter=' '),\
+
+ # --noalign Do not calculate and print alignments of each hit, only print
+ # locations and scores.
+ '--noalign':FlagParameter(Prefix='--',Name='noalign'),\
+
+ # --aln-hbanded Use HMM bands to accelerate alignment during the hit
+ # alignment stage.
+ '--aln-hbanded':FlagParameter(Prefix='--',Name='aln-hbanded'),\
+
+ # --aln-optacc Calculate alignments of hits from final round of search using
+ # the optimal accuracy algorithm which computes the alignment that
+ # maximizes the summed posterior probability of all aligned residues given
+ # the model, which can be different from the highest scoring one.
+ '--aln-optacc':FlagParameter(Prefix='--',Name='aln-optacc'),\
+
+ # --tabfile <f> Create a new output file <f> and print tabular results to
+ # it.
+ '--tabfile':ValuedParameter(Prefix='--',Name='tabfile',Delimiter=' '),\
+
+ # --gcfile <f> Create a new output file <f> and print statistics of the GC
+ # content of the sequences in seqfile to it.
+ '--gcfile':ValuedParameter(Prefix='--',Name='gcfile',Delimiter=' '),\
+
+ # --rna Output the hit alignments as RNA sequences alignments. This is true
+ # by default.
+ '--rna':FlagParameter(Prefix='--',Name='rna'),\
+
+ # --dna Output the hit alignments as DNA sequence alignments.
+ '--dna':FlagParameter(Prefix='--',Name='dna'),\
+
+ }
+ _parameters = {}
+ _parameters.update(_options)
+ _command = "cmsearch"
+ _suppress_stderr=True
+
+ def getHelp(self):
+ """Method that points to the Infernal documentation."""
+
+ help_str = \
+ """
+ See Infernal documentation at:
+ http://infernal.janelia.org/
+ """
+ return help_str
+
+ def _tabfile_out_filename(self):
+
+ if self.Parameters['--tabfile'].isOn():
+ tabfile_filename = self._absolute(str(\
+ self.Parameters['--tabfile'].Value))
+ else:
+ raise ValueError, 'No tabfile output file specified.'
+ return tabfile_filename
+
+ def _tempfile_as_multiline_string(self, data):
+ """Write a multiline string to a temp file and return the filename.
+
+ data: a multiline string to be written to a file.
+
+ * Note: the result will be the filename as a FilePath object
+ (which is a string subclass).
+
+ """
+ filename = FilePath(self.getTmpFilename(self.TmpDir))
+ data_file = open(filename,'w')
+ data_file.write(data)
+ data_file.close()
+ return filename
+
+ def _get_result_paths(self,data):
+ result = {}
+ if self.Parameters['--tabfile'].isOn():
+ out_name = self._tabfile_out_filename()
+ result['SearchResults'] = ResultPath(Path=out_name,IsWritten=True)
+
+ return result
+
+class Cmstat(CommandLineApplication):
+ """cmstat application controller."""
+ _options = {
+
+ # -g Turn on the 'glocal' alignment algorithm, local with respect to the
+ # target database, and global with respect to the model. By default, the
+ # model is configured for local alignment which is local with respect to
+ # both the target sequence and the model.
+ '-g':FlagParameter(Prefix='-',Name='g'),\
+
+ # -m print general statistics on the models in cmfile and the alignment it
+ # was built from.
+ '-m':FlagParameter(Prefix='-',Name='m'),\
+
+ # -Z <x> Calculate E-values as if the target database size was <x> megabases
+ # (Mb). Ignore the actual size of the database. This option is only valid
+ # if the CM file has been calibrated.
+ '-Z':ValuedParameter(Prefix='-',Name='Z',Delimiter=' '),\
+
+ # --all print all available statistics
+ '--all':FlagParameter(Prefix='--',Name='all'),\
+
+ # --le print local E-value statistics. This option only works if cmfile has
+ # been calibrated with cmcalibrate.
+ '--le':FlagParameter(Prefix='--',Name='le'),\
+
+ # --ge print glocal E-value statistics. This option only works if cmfile has
+ # been calibrated with cmcalibrate.
+ '--ge':FlagParameter(Prefix='--',Name='ge'),\
+
+ # --beta <x> With the --search option set the beta parameter for the query-
+ # dependent banding algorithm stages to <x> Beta is the probability mass
+ # considered negligible during band calculation. The default is 1E-7.
+ '--beta':ValuedParameter(Prefix='--',Name='beta',Delimiter=' '),\
+
+ # --qdbfile <f> Save the query-dependent bands (QDBs) for each state to file
+ # <f>
+ '--qdbfile':ValuedParameter(Prefix='--',Name='qdbfile',Delimiter=' '),\
+
+ # Expert Options
+
+ # --lfi Print the HMM filter thresholds for the range of relevant CM bit
+ # score cutoffs for searches with locally configured models using the
+ # Inside algorithm.
+ '--lfi':FlagParameter(Prefix='--',Name='lfi'),\
+
+ # --gfi Print the HMM filter thresholds for the range of relevant CM bit
+ # score cutoffs for searches with globally configured models using the
+ # Inside algorithm.
+ '--gfi':FlagParameter(Prefix='--',Name='gfi'),\
+
+ # --lfc Print the HMM filter thresholds for the range of relevant CM bit
+ # score cutoffs for searches with locally configured models using the CYK
+ # algorithm.
+ '--lfc':FlagParameter(Prefix='--',Name='lfc'),\
+
+ # --gfc Print the HMM filter thresholds for the range of relevant CM bit
+ # score cutoffs for searches with globally configured models using the CYK
+ # algorithm.
+ '--gfc':FlagParameter(Prefix='--',Name='gfc'),\
+
+ # -E <x> Print filter threshold statistics for an HMM filter if a final CM
+ # E-value cutoff of <x> were to be used for a run of cmsearch on 1 MB of
+ # sequence.
+ '-E':ValuedParameter(Prefix='-',Name='E',Delimiter=' '),\
+
+ # -T <x> Print filter threshold statistics for an HMM filter if a final CM
+ # bit score cutoff of <x> were to be used for a run of cmsearch.
+ '-T':ValuedParameter(Prefix='-',Name='T',Delimiter=' '),\
+
+ # --nc Print filter threshold statistics for an HMM filter if a CM bit score
+ # cutoff equal to the Rfam NC cutoff were to be used for a run of
+ # cmsearch.
+ '--nc':FlagParameter(Prefix='--',Name='nc'),\
+
+ # --ga Print filter threshold statistics for an HMM filter if a CM bit score
+ # cutoff of Rfam GA cutoff value were to be used for a run of cmsearch.
+ '--ga':FlagParameter(Prefix='--',Name='ga'),\
+
+ # --tc Print filter threshold statistics for an HMM filter if a CM bit score
+ # cutoff equal to the Rfam TC cutoff value were to be used for a run of
+ # cmsearch.
+ '--tc':FlagParameter(Prefix='--',Name='tc'),\
+
+ # --seqfile <x> With the -E option, use the database size of the database in
+ # <x> instead of the default database size of 1 MB.
+ '--seqfile':ValuedParameter(Prefix='--',Name='seqfile',Delimiter=' '),\
+
+ # --toponly In combination with --seqfile <x> option, only consider the top
+ # strand of the database in <x> instead of both strands. --search perform
+ # an experiment to determine how fast the CM(s) can search with different
+ # search algorithms.
+ '--toponly':FlagParameter(Prefix='--',Name='toponly'),\
+
+ # --cmL <n> With the --search option set the length of sequence to search
+ # with CM algorithms as <n> residues. By default, <n> is 1000.
+ '--cmL':ValuedParameter(Prefix='--',Name='cmL',Delimiter=' '),\
+
+ # --hmmL <n> With the --search option set the length of sequence to search
+ # with HMM algorithms as <n> residues. By default, <n> is 100,000.
+ '--hmmL':ValuedParameter(Prefix='--',Name='hmmL',Delimiter=' '),\
+
+ # --efile <f> Save a plot of cmsearch HMM filter E value cutoffs versus CM
+ # E-value cutoffs in xmgrace format to file <f>. This option must be used
+ # in combination with --lfi, --gfi, --lfc or --gfc.
+ '--efile':ValuedParameter(Prefix='--',Name='efile',Delimiter=' '),\
+
+ # --bfile <f> Save a plot of cmsearch HMM bit score cutoffs versus CM bit
+ # score cutoffs in xmgrace format to file <f>. This option must be used in
+ # combination with --lfi, --gfi, --lfc or --gfc.
+ '--bfile':ValuedParameter(Prefix='--',Name='bfile',Delimiter=' '),\
+
+ # --sfile <f> Save a plot of cmsearch predicted survival fraction from the
+ # HMM filter versus CM E value cutoff in xmgrace format to file <f>. This
+ # option must be used in combination with --lfi, --gfi, --lfc or --gfc.
+ '--sfile':ValuedParameter(Prefix='--',Name='sfile',Delimiter=' '),\
+
+ # --xfile <f> Save a plot of 'xhmm' versus CM E value cutoff in xmgrace
+ # format to file <f> 'xhmm' is the ratio of the number of dynamic
+ # programming calculations predicted to be required for the HMM filter and
+ # the CM search of the filter survivors versus the number of dynamic
+ # programming calculations for the filter alone. This option must be
+ # used in combination with --lfi, --gfi, --lfc or --gfc.
+ '--xfile':ValuedParameter(Prefix='--',Name='xfile',Delimiter=' '),\
+
+ # --afile <f> Save a plot of the predicted acceleration for an HMM filtered
+ # search versus CM E value cutoff in xmgrace format to file <f>. This
+ # option must be used in combination with --lfi, --gfi, --lfc or --gfc.
+ '--afile':ValuedParameter(Prefix='--',Name='afile',Delimiter=' '),\
+
+ # --bits With --efile, --sfile, --xfile, and --afile use CM bit score
+ # cutoffs instead of CM E value cutoffs for the x-axis values of the plot.
+ '--bits':FlagParameter(Prefix='--',Name='bits'),\
+
+ }
+ _parameters = {}
+ _parameters.update(_options)
+ _command = "cmstat"
+ _suppress_stderr=True
+
+ def getHelp(self):
+ """Method that points to the Infernal documentation."""
+
+ help_str = \
+ """
+ See Infernal documentation at:
+ http://infernal.janelia.org/
+ """
+ return help_str
+
+def cmbuild_from_alignment(aln, structure_string, refine=False, \
+ return_alignment=False,params=None):
+ """Uses cmbuild to build a CM file given an alignment and structure string.
+
+ - aln: an Alignment object or something that can be used to construct
+ one. All sequences must be the same length.
+ - structure_string: vienna structure string representing the consensus
+ stucture for the sequences in aln. Must be the same length as the
+ alignment.
+ - refine: refine the alignment and realign before building the cm.
+ (Default=False)
+ - return_alignment: Return (in Stockholm format) alignment file used to
+ construct the CM file. This will either be the original alignment
+ and structure string passed in, or the refined alignment if --refine
+ was used. (Default=False)
+ - Note. This will be a string that can either be written to a file
+ or parsed.
+ """
+ aln = Alignment(aln)
+ if len(structure_string) != aln.SeqLen:
+ raise ValueError, """Structure string is not same length as alignment. Structure string is %s long. Alignment is %s long."""%(len(structure_string),\
+ aln.SeqLen)
+ else:
+ struct_dict = {'SS_cons':structure_string}
+ #Make new Cmbuild app instance.
+ app = Cmbuild(InputHandler='_input_as_paths',WorkingDir='/tmp',\
+ params=params)
+
+ #turn on refine flag if True.
+ if refine:
+ _, tmp_file = mkstemp(dir=app.WorkingDir)
+ app.Parameters['--refine'].on(tmp_file)
+
+ #Get alignment in Stockholm format
+ aln_file_string = stockholm_from_alignment(aln,GC_annotation=struct_dict)
+
+ #get path to alignment filename
+ aln_path = app._input_as_multiline_string(aln_file_string)
+ cm_path = aln_path.split('.txt')[0]+'.cm'
+ app.Parameters['-n'].on(cm_path)
+
+ filepaths = [cm_path,aln_path]
+
+ res = app(filepaths)
+
+ cm_file = res['CmFile'].read()
+
+ if return_alignment:
+ #If alignment was refined, return refined alignment and structure,
+ # otherwise return original alignment and structure.
+ if refine:
+ aln_file_string = res['Refined'].read()
+ res.cleanUp()
+ return cm_file, aln_file_string
+ #Just return cm_file
+ else:
+ res.cleanUp()
+ return cm_file
+
+
+def cmbuild_from_file(stockholm_file_path, refine=False,return_alignment=False,\
+ params=None):
+ """Uses cmbuild to build a CM file given a stockholm file.
+
+ - stockholm_file_path: a path to a stockholm file. This file should
+ contain a multiple sequence alignment formated in Stockholm format.
+ This must contain a sequence structure line:
+ #=GC SS_cons <structure string>
+ - refine: refine the alignment and realign before building the cm.
+ (Default=False)
+ - return_alignment: Return alignment and structure string used to
+ construct the CM file. This will either be the original alignment
+ and structure string passed in, or the refined alignment if
+ --refine was used. (Default=False)
+ """
+ #get alignment and structure string from stockholm file.
+ info, aln, structure_string = \
+ list(MinimalRfamParser(open(stockholm_file_path,'U'),\
+ seq_constructor=ChangedSequence))[0]
+
+ #call cmbuild_from_alignment.
+ res = cmbuild_from_alignment(aln, structure_string, refine=refine, \
+ return_alignment=return_alignment,params=params)
+ return res
+
+def cmalign_from_alignment(aln, structure_string, seqs, moltype=DNA,\
+ include_aln=True,refine=False, return_stdout=False,params=None,\
+ cmbuild_params=None):
+ """Uses cmbuild to build a CM file, then cmalign to build an alignment.
+
+ - aln: an Alignment object or something that can be used to construct
+ one. All sequences must be the same length.
+ - structure_string: vienna structure string representing the consensus
+ stucture for the sequences in aln. Must be the same length as the
+ alignment.
+ - seqs: SequenceCollection object or something that can be used to
+ construct one, containing unaligned sequences that are to be aligned
+ to the aligned sequences in aln.
+ - moltype: Cogent moltype object. Must be RNA or DNA.
+ - include_aln: Boolean to include sequences in aln in final alignment.
+ (Default=True)
+ - refine: refine the alignment and realign before building the cm.
+ (Default=False)
+ - return_stdout: Boolean to return standard output from infernal. This
+ includes alignment and structure bit scores and average
+ probabilities for each sequence. (Default=False)
+ """
+ #NOTE: Must degap seqs or Infernal well seg fault!
+ seqs = SequenceCollection(seqs,MolType=moltype).degap()
+ #Create mapping between abbreviated IDs and full IDs
+ int_map, int_keys = seqs.getIntMap()
+ #Create SequenceCollection from int_map.
+ int_map = SequenceCollection(int_map,MolType=moltype)
+
+ cm_file, aln_file_string = cmbuild_from_alignment(aln, structure_string,\
+ refine=refine,return_alignment=True,params=cmbuild_params)
+
+ if params is None:
+ params = {}
+ params.update({MOLTYPE_MAP[moltype]:True})
+
+ app = Cmalign(InputHandler='_input_as_paths',WorkingDir='/tmp',\
+ params=params)
+ app.Parameters['--informat'].on('FASTA')
+
+ #files to remove that aren't cleaned up by ResultPath object
+ to_remove = []
+ #turn on --withali flag if True.
+ if include_aln:
+ app.Parameters['--withali'].on(\
+ app._tempfile_as_multiline_string(aln_file_string))
+ #remove this file at end
+ to_remove.append(app.Parameters['--withali'].Value)
+
+ seqs_path = app._input_as_multiline_string(int_map.toFasta())
+ cm_path = app._tempfile_as_multiline_string(cm_file)
+
+ #add cm_path to to_remove
+ to_remove.append(cm_path)
+ paths = [cm_path,seqs_path]
+
+ _, tmp_file = mkstemp(dir=app.WorkingDir)
+ app.Parameters['-o'].on(tmp_file)
+
+ res = app(paths)
+
+ info, aligned, struct_string = \
+ list(MinimalRfamParser(res['Alignment'].readlines(),\
+ seq_constructor=SEQ_CONSTRUCTOR_MAP[moltype]))[0]
+
+ #Make new dict mapping original IDs
+ new_alignment={}
+ for k,v in aligned.NamedSeqs.items():
+ new_alignment[int_keys.get(k,k)]=v
+ #Create an Alignment object from alignment dict
+ new_alignment = Alignment(new_alignment,MolType=moltype)
+
+ std_out = res['StdOut'].read()
+ #clean up files
+ res.cleanUp()
+ for f in to_remove: remove(f)
+
+ if return_stdout:
+ return new_alignment, struct_string, std_out
+ else:
+ return new_alignment, struct_string
+
+
+def cmalign_from_file(cm_file_path, seqs, moltype=DNA, alignment_file_path=None,\
+ include_aln=False,return_stdout=False,params=None):
+ """Uses cmalign to align seqs to alignment in cm_file_path.
+
+ - cm_file_path: path to the file created by cmbuild, containing aligned
+ sequences. This will be used to align sequences in seqs.
+ - seqs: unaligned sequendes that are to be aligned to the sequences in
+ cm_file.
+ - moltype: cogent.core.moltype object. Must be DNA or RNA
+ - alignment_file_path: path to stockholm alignment file used to create
+ cm_file.
+ __IMPORTANT__: This MUST be the same file used by cmbuild
+ originally. Only need to pass in this file if include_aln=True.
+ This helper function will NOT check if the alignment file is correct
+ so you must use it correctly.
+ - include_aln: Boolean to include sequences in aln_file in final
+ alignment. (Default=False)
+ - return_stdout: Boolean to return standard output from infernal. This
+ includes alignment and structure bit scores and average
+ probabilities for each sequence. (Default=False)
+ """
+ #NOTE: Must degap seqs or Infernal well seg fault!
+ seqs = SequenceCollection(seqs,MolType=moltype).degap()
+
+ #Create mapping between abbreviated IDs and full IDs
+ int_map, int_keys = seqs.getIntMap()
+ #Create SequenceCollection from int_map.
+ int_map = SequenceCollection(int_map,MolType=moltype)
+
+ if params is None:
+ params = {}
+ params.update({MOLTYPE_MAP[moltype]:True})
+
+ app = Cmalign(InputHandler='_input_as_paths',WorkingDir='/tmp',\
+ params=params)
+ app.Parameters['--informat'].on('FASTA')
+
+ #turn on --withali flag if True.
+ if include_aln:
+ if alignment_file_path is None:
+ raise DataError, """Must have path to alignment file used to build CM if include_aln=True."""
+ else:
+ app.Parameters['--withali'].on(alignment_file_path)
+
+ seqs_path = app._input_as_multiline_string(int_map.toFasta())
+ paths = [cm_file_path,seqs_path]
+
+ _, tmp_file = mkstemp(dir=app.WorkingDir)
+ app.Parameters['-o'].on(tmp_file)
+ res = app(paths)
+
+ info, aligned, struct_string = \
+ list(MinimalRfamParser(res['Alignment'].readlines(),\
+ seq_constructor=SEQ_CONSTRUCTOR_MAP[moltype]))[0]
+
+
+ #Make new dict mapping original IDs
+ new_alignment={}
+ for k,v in aligned.items():
+ new_alignment[int_keys.get(k,k)]=v
+ #Create an Alignment object from alignment dict
+ new_alignment = Alignment(new_alignment,MolType=moltype)
+ std_out = res['StdOut'].read()
+ res.cleanUp()
+ if return_stdout:
+ return new_alignment, struct_string, std_out
+ else:
+ return new_alignment, struct_string
+
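+# Example usage (a minimal sketch, assuming Infernal is installed and
+# 'motif.cm' was built previously with cmbuild; the sequences are
+# illustrative):
+#
+#     from cogent import RNA
+#     from bfillings.infernal import cmalign_from_file
+#
+#     seqs = {'seq_1': 'ACGGCUAUGCGC', 'seq_2': 'GGCUAAUGCGCC'}
+#     aln, struct = cmalign_from_file('motif.cm', seqs, moltype=RNA)
+#     print aln.toFasta()
+#     print struct
+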
+def cmsearch_from_alignment(aln, structure_string, seqs, moltype, cutoff=0.0,\
+ refine=False,params=None):
+ """Uses cmbuild to build a CM file, then cmsearch to find homologs.
+
+ - aln: an Alignment object or something that can be used to construct
+ one. All sequences must be the same length.
+ - structure_string: Vienna structure string representing the consensus
+ structure for the sequences in aln. Must be the same length as the
+ alignment.
+ - seqs: SequenceCollection object or something that can be used to
+ construct one, containing unaligned sequences that are to be
+ searched.
+ - moltype: cogent.core.moltype object. Must be DNA or RNA
+ - cutoff: bit score cutoff. Sequences scoring below the cutoff are
+ dropped from the search results. (Default=0.0). The Infernal
+ documentation suggests a cutoff of log2(number of nucleotides being
+ searched) will return the most likely true homologs.
+ - refine: refine the alignment and realign before building the cm.
+ (Default=False)
+ """
+ #NOTE: Must degap seqs or Infernal will seg fault!
+ seqs = SequenceCollection(seqs,MolType=moltype).degap()
+ #Create mapping between abbreviated IDs and full IDs
+ int_map, int_keys = seqs.getIntMap()
+ #Create SequenceCollection from int_map.
+ int_map = SequenceCollection(int_map,MolType=moltype)
+
+ cm_file, aln_file_string = cmbuild_from_alignment(aln, structure_string,\
+ refine=refine,return_alignment=True)
+
+ app = Cmsearch(InputHandler='_input_as_paths',WorkingDir='/tmp',\
+ params=params)
+ app.Parameters['--informat'].on('FASTA')
+ app.Parameters['-T'].on(cutoff)
+
+ to_remove = []
+
+ seqs_path = app._input_as_multiline_string(int_map.toFasta())
+ cm_path = app._tempfile_as_multiline_string(cm_file)
+ paths = [cm_path,seqs_path]
+ to_remove.append(cm_path)
+
+ _, tmp_file = mkstemp(dir=app.WorkingDir)
+ app.Parameters['--tabfile'].on(tmp_file)
+ res = app(paths)
+
+ search_results = list(CmsearchParser(res['SearchResults'].readlines()))
+ if search_results:
+ for i,line in enumerate(search_results):
+ label = line[1]
+ search_results[i][1]=int_keys.get(label,label)
+
+ res.cleanUp()
+ for f in to_remove:remove(f)
+
+ return search_results
+
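+# Example usage (a minimal sketch, assuming Infernal is installed; the
+# alignment, consensus structure and query sequences are illustrative):
+#
+#     from cogent import RNA
+#     from bfillings.infernal import cmsearch_from_alignment
+#
+#     aln = {'s1': 'ACGGCUAUGCGC', 's2': 'ACGGCAAUGCGC'}
+#     struct = '((((....))))'
+#     hits = cmsearch_from_alignment(aln, struct,
+#                                    {'q1': 'ACGGCUAUGCGCAA'}, moltype=RNA)
+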
+def cmsearch_from_file(cm_file_path, seqs, moltype, cutoff=0.0, params=None):
+ """Uses cmbuild to build a CM file, then cmsearch to find homologs.
+
+ - cm_file_path: path to the covariance model (CM) file created by
+ cmbuild. This will be used to search sequences in seqs.
+ - seqs: SequenceCollection object or something that can be used to
+ construct one, containing unaligned sequences that are to be
+ searched.
+ - moltype: cogent.core.moltype object. Must be DNA or RNA
+ - cutoff: bit score cutoff. Sequences scoring below the cutoff are
+ dropped from the search results. (Default=0.0). The Infernal
+ documentation suggests a cutoff of log2(number of nucleotides being
+ searched) will return the most likely true homologs.
+ """
+ #NOTE: Must degap seqs or Infernal will seg fault!
+ seqs = SequenceCollection(seqs,MolType=moltype).degap()
+ #Create mapping between abbreviated IDs and full IDs
+ int_map, int_keys = seqs.getIntMap()
+ #Create SequenceCollection from int_map.
+ int_map = SequenceCollection(int_map,MolType=moltype)
+
+ app = Cmsearch(InputHandler='_input_as_paths',WorkingDir='/tmp',\
+ params=params)
+ app.Parameters['--informat'].on('FASTA')
+ app.Parameters['-T'].on(cutoff)
+
+ seqs_path = app._input_as_multiline_string(int_map.toFasta())
+
+ paths = [cm_file_path,seqs_path]
+
+ _, tmp_file = mkstemp(dir=app.WorkingDir)
+ app.Parameters['--tabfile'].on(tmp_file)
+ res = app(paths)
+
+ search_results = list(CmsearchParser(res['SearchResults'].readlines()))
+
+ if search_results:
+ for i,line in enumerate(search_results):
+ label = line[1]
+ search_results[i][1]=int_keys.get(label,label)
+
+ res.cleanUp()
+
+ return search_results
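+
+# Example usage (a minimal sketch, assuming 'motif.cm' already exists;
+# each row of the result is a CmsearchParser record whose second field
+# has been mapped back to the original sequence label):
+#
+#     from cogent import RNA
+#     from bfillings.infernal import cmsearch_from_file
+#
+#     hits = cmsearch_from_file('motif.cm', {'q1': 'ACGGCUAUGCGC'},
+#                               moltype=RNA, cutoff=10.0)
+#     for hit in hits:
+#         print hit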
diff --git a/bfillings/mafft.py b/bfillings/mafft.py
new file mode 100644
index 0000000..417bc08
--- /dev/null
+++ b/bfillings/mafft.py
@@ -0,0 +1,470 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+"""
+Provides an application controller for the commandline version of:
+MAFFT v6.602
+"""
+from random import choice
+from os import remove
+
+from burrito.parameters import FlagParameter, ValuedParameter, FilePath
+from burrito.util import CommandLineApplication, ResultPath, get_tmp_filename
+from skbio.parse.sequences import parse_fasta
+
+from cogent.core.moltype import DNA, RNA, PROTEIN
+from cogent.core.alignment import SequenceCollection, Alignment
+from cogent.core.tree import PhyloNode
+from cogent.parse.tree import DndParser
+
+
+MOLTYPE_MAP = {'DNA':'--nuc',\
+ 'RNA':'--nuc',\
+ 'PROTEIN':'--amino',\
+ }
+
+class Mafft(CommandLineApplication):
+ """Mafft application controller"""
+
+
+ _options ={
+ # Algorithm
+
+ # Automatically selects an appropriate strategy from L-INS-i, FFT-NS-i
+ # and FFT-NS-2, according to data size. Default: off (always FFT-NS-2)
+ '--auto':FlagParameter(Prefix='--',Name='auto'),\
+
+ # Distance is calculated based on the number of shared 6mers. Default: on
+ '--6merpair':FlagParameter(Prefix='--',Name='6merpair'),\
+
+ # All pairwise alignments are computed with the Needleman-Wunsch algorithm.
+ # More accurate but slower than --6merpair. Suitable for a set of globally
+ # alignable sequences. Applicable to up to ~200 sequences. A combination
+ # with --maxiterate 1000 is recommended (G-INS-i). Default: off
+ # (6mer distance is used)
+ '--globalpair':FlagParameter(Prefix='--',Name='globalpair'),\
+
+ # All pairwise alignments are computed with the Smith-Waterman algorithm.
+ # More accurate but slower than --6merpair. Suitable for a set of locally
+ # alignable sequences. Applicable to up to ~200 sequences. A combination
+ # with --maxiterate 1000 is recommended (L-INS-i). Default: off
+ # (6mer distance is used)
+ '--localpair':FlagParameter(Prefix='--',Name='localpair'),\
+
+ # All pairwise alignments are computed with a local algorithm with the
+ # generalized affine gap cost (Altschul 1998). More accurate but slower than
+ # --6merpair. Suitable when large internal gaps are expected. Applicable to
+ # up to ~200 sequences. A combination with --maxiterate 1000 is recommended
+ # (E-INS-i). Default: off (6mer distance is used)
+ '--genafpair':FlagParameter(Prefix='--',Name='genafpair'),\
+
+ # All pairwise alignments are computed with FASTA (Pearson and Lipman 1988).
+ # FASTA is required. Default: off (6mer distance is used)
+ '--fastapair':FlagParameter(Prefix='--',Name='fastapair'),\
+
+ # Weighting factor for the consistency term calculated from pairwise
+ # alignments. Valid when either of --globalpair, --localpair, --genafpair,
+ # --fastapair or --blastpair is selected. Default: 2.7
+ '--weighti':ValuedParameter(Prefix='--',Name='weighti',Delimiter=' '),\
+
+ # Guide tree is built number times in the progressive stage. Valid with 6mer
+ # distance. Default: 2
+ '--retree':ValuedParameter(Prefix='--',Name='retree',Delimiter=' '),\
+
+ # number cycles of iterative refinement are performed. Default: 0
+ '--maxiterate':ValuedParameter(Prefix='--',Name='maxiterate',\
+ Delimiter=' '),\
+
+ # Use FFT approximation in group-to-group alignment. Default: on
+ '--fft':FlagParameter(Prefix='--',Name='fft'),\
+
+ # Do not use FFT approximation in group-to-group alignment. Default: off
+ '--nofft':FlagParameter(Prefix='--',Name='nofft'),\
+
+ #Alignment score is not checked in the iterative refinement stage. Default:
+ # off (score is checked)
+ '--noscore':FlagParameter(Prefix='--',Name='noscore'),\
+
+ # Use the Myers-Miller (1988) algorithm. Default: automatically turned on
+ # when the alignment length exceeds 10,000 (aa/nt).
+ '--memsave':FlagParameter(Prefix='--',Name='memsave'),\
+
+ # Use a fast tree-building method (PartTree, Katoh and Toh 2007) with the
+ # 6mer distance. Recommended when a large number (> ~10,000) of sequences
+ # is input. Default: off
+ '--parttree':FlagParameter(Prefix='--',Name='parttree'),\
+
+ # The PartTree algorithm is used with distances based on DP. Slightly more
+ # accurate and slower than --parttree. Recommended when a large number
+ # (> ~10,000) of sequences is input. Default: off
+ '--dpparttree':FlagParameter(Prefix='--',Name='dpparttree'),\
+
+ # The PartTree algorithm is used with distances based on FASTA. Slightly
+ # more accurate and slower than --parttree. Recommended when a large
+ # number (> ~10,000) of sequences is input. FASTA is required. Default: off
+ '--fastaparttree':FlagParameter(Prefix='--',Name='fastaparttree'),\
+
+ # The number of partitions in the PartTree algorithm. Default: 50
+ '--partsize':ValuedParameter(Prefix='--',Name='partsize',Delimiter=' '),\
+
+ # Do not make alignment larger than number sequences. Valid only with the
+ # --*parttree options. Default: the number of input sequences
+ '--groupsize':ValuedParameter(Prefix='--',Name='groupsize',Delimiter=' '),\
+
+ # Parameter
+
+ # Gap opening penalty at group-to-group alignment. Default: 1.53
+ '--op':ValuedParameter(Prefix='--',Name='op',Delimiter=' '),\
+
+ # Offset value, which works like gap extension penalty, for group-to-group
+ # alignment. Default: 0.123
+ '--ep':ValuedParameter(Prefix='--',Name='ep',Delimiter=' '),\
+
+ # Gap opening penalty at local pairwise alignment. Valid when the
+ # --localpair or --genafpair option is selected. Default: -2.00
+ '--lop':ValuedParameter(Prefix='--',Name='lop',Delimiter=' '),\
+
+ # Offset value at local pairwise alignment. Valid when the --localpair or
+ # --genafpair option is selected. Default: 0.1
+ '--lep':ValuedParameter(Prefix='--',Name='lep',Delimiter=' '),\
+
+ # Gap extension penalty at local pairwise alignment. Valid when the
+ # --localpair or --genafpair option is selected. Default: -0.1
+ '--lexp':ValuedParameter(Prefix='--',Name='lexp',Delimiter=' '),\
+
+ # Gap opening penalty to skip the alignment. Valid when the --genafpair
+ # option is selected. Default: -6.00
+ '--LOP':ValuedParameter(Prefix='--',Name='LOP',Delimiter=' '),\
+
+ # Gap extension penalty to skip the alignment. Valid when the --genafpair
+ # option is selected. Default: 0.00
+ '--LEXP':ValuedParameter(Prefix='--',Name='LEXP',Delimiter=' '),\
+
+ # BLOSUM number matrix (Henikoff and Henikoff 1992) is used. number=30, 45,
+ # 62 or 80. Default: 62
+ '--bl':ValuedParameter(Prefix='--',Name='bl',Delimiter=' '),\
+
+ # JTT PAM number (Jones et al. 1992) matrix is used. number>0.
+ # Default: BLOSUM62
+ '--jtt':ValuedParameter(Prefix='--',Name='jtt',Delimiter=' '),\
+
+ # Transmembrane PAM number (Jones et al. 1994) matrix is used. number>0.
+ # Default: BLOSUM62
+ '--tm':ValuedParameter(Prefix='--',Name='tm',Delimiter=' '),\
+
+ # Use a user-defined AA scoring matrix. The format of matrixfile is the same
+ # as that of BLAST. Ignored when nucleotide sequences are input.
+ # Default: BLOSUM62
+ '--aamatrix':ValuedParameter(Prefix='--',Name='aamatrix',Delimiter=' '),\
+
+ # Incorporate the AA/nuc composition information into the scoring matrix.
+ # Default: off
+ '--fmodel':FlagParameter(Prefix='--',Name='fmodel'),\
+
+ # Output
+
+ # Output format: clustal format. Default: off (fasta format)
+ '--clustalout':FlagParameter(Prefix='--',Name='clustalout'),\
+
+ # Output order: same as input. Default: on
+ '--inputorder':FlagParameter(Prefix='--',Name='inputorder'),\
+
+ # Output order: aligned. Default: off (inputorder)
+ '--reorder':FlagParameter(Prefix='--',Name='reorder'),\
+
+ # Guide tree is output to the input.tree file. Default: off
+ '--treeout':FlagParameter(Prefix='--',Name='treeout'),\
+
+ # Do not report progress. Default: off
+ '--quiet':FlagParameter(Prefix='--',Name='quiet'),\
+
+ # Input
+
+ # Assume the sequences are nucleotide. Default: auto
+ '--nuc':FlagParameter(Prefix='--',Name='nuc'),\
+
+ # Assume the sequences are amino acid. Default: auto
+ '--amino':FlagParameter(Prefix='--',Name='amino'),\
+
+ # Seed alignments given in alignment_n (fasta format) are aligned with
+ # sequences in input. The alignment within every seed is preserved.
+ '--seed':ValuedParameter(Prefix='--',Name='seed',Delimiter=' '),\
+ }
+
+ _parameters = {}
+ _parameters.update(_options)
+ _command = "mafft"
+ _suppress_stderr=True
+
+ def _input_as_seqs(self,data):
+ lines = []
+ for i,s in enumerate(data):
+ #will number the sequences 1,2,3,etc.
+ lines.append(''.join(['>',str(i+1)]))
+ lines.append(s)
+ return self._input_as_lines(lines)
+
+ def _tree_out_filename(self):
+ if self.Parameters['--treeout'].isOn():
+ tree_filename = self._absolute(str(self._input_filename))+'.tree'
+ else:
+ raise ValueError, "No tree output file specified."
+ return tree_filename
+
+ def _tempfile_as_multiline_string(self, data):
+ """Write a multiline string to a temp file and return the filename.
+
+ data: a multiline string to be written to a file.
+
+ * Note: the result will be the filename as a FilePath object
+ (which is a string subclass).
+
+ """
+ filename = FilePath(self.getTmpFilename(self.TmpDir))
+ data_file = open(filename,'w')
+ data_file.write(data)
+ data_file.close()
+ return filename
+
+ def getHelp(self):
+ """Method that points to the Mafft documentation."""
+
+ help_str = \
+ """
+ See Mafft documentation at:
+ http://align.bmr.kyushu-u.ac.jp/mafft/software/manual/manual.html
+ """
+ return help_str
+
+ def _get_result_paths(self,data):
+ result = {}
+ if self.Parameters['--treeout'].isOn():
+ out_name = self._tree_out_filename()
+ result['Tree'] = ResultPath(Path=out_name,IsWritten=True)
+ return result
+
+def align_unaligned_seqs(seqs,moltype=DNA,params=None,accurate=False):
+ """Aligns unaligned sequences
+
+ seqs: either a list of sequence objects or a list of strings
+ moltype: a cogent.core.moltype object (DNA, RNA, or PROTEIN)
+ params: dict of parameters to pass in to the Mafft app controller
+ accurate: if True, use a slower but more accurate alignment
+ (--globalpair with --maxiterate 1000)
+ """
+ #create SequenceCollection object from seqs
+ seq_collection = SequenceCollection(seqs,MolType=moltype)
+ #Create mapping between abbreviated IDs and full IDs
+ int_map, int_keys = seq_collection.getIntMap()
+ #Create SequenceCollection from int_map.
+ int_map = SequenceCollection(int_map,MolType=moltype)
+ #Create Mafft app.
+ app = Mafft(InputHandler='_input_as_multiline_string',params=params)
+
+ #Turn on correct moltype
+ moltype_string = moltype.label.upper()
+ app.Parameters[MOLTYPE_MAP[moltype_string]].on()
+
+ #Do not report progress
+ app.Parameters['--quiet'].on()
+
+ #More accurate alignment, sacrificing performance.
+ if accurate:
+ app.Parameters['--globalpair'].on()
+ app.Parameters['--maxiterate'].Value=1000
+
+ #Get results using int_map as input to app
+ res = app(int_map.toFasta())
+ #Get alignment as dict out of results
+ alignment = dict(parse_fasta(res['StdOut']))
+ #Make new dict mapping original IDs
+ new_alignment = {}
+ for k,v in alignment.items():
+ new_alignment[int_keys[k]]=v
+ #Create an Alignment object from alignment dict
+ new_alignment = Alignment(new_alignment,MolType=moltype)
+ #Clean up
+ res.cleanUp()
+ del(seq_collection,int_map,int_keys,app,res,alignment)
+
+ return new_alignment
+
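+# Example usage (a minimal sketch, assuming MAFFT is on the PATH; the
+# input sequences are illustrative):
+#
+#     from cogent import DNA
+#     from bfillings.mafft import align_unaligned_seqs
+#
+#     seqs = {'a': 'ACCGTT', 'b': 'ACGTT', 'c': 'ACGGTT'}
+#     aln = align_unaligned_seqs(seqs, moltype=DNA)
+#     print aln.toFasta()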
+
+def align_and_build_tree(seqs, moltype, best_tree=False, params={}):
+ """Returns an alignment and a tree from Sequences object seqs.
+
+ seqs: SequenceCollection object, or data that can be used to build one.
+
+ best_tree: if True (default:False), uses a slower but more accurate
+ algorithm to build the tree.
+
+ params: dict of parameters to pass in to the Mafft app controller.
+
+ The result will be a tuple containing an Alignment object and a
+ cogent.core.tree.PhyloNode object (or None for the alignment and/or tree
+ if either fails).
+ """
+ #Current version of Mafft does not support tree building.
+ raise NotImplementedError, """Current version of Mafft does not support tree building."""
+
+def build_tree_from_alignment(aln, moltype=DNA, best_tree=False, params={},\
+ working_dir='/tmp'):
+ """Returns a tree from Alignment object aln.
+
+ aln: a cogent.core.alignment.Alignment object, or data that can be used
+ to build one.
+
+ best_tree: if True (default:False), uses a slower but more accurate
+ algorithm to build the tree.
+ NOTE: Mafft does not directly support a best_tree option; it can
+ only return the guide tree used to align the sequences. Passing
+ best_tree=True constructs the guide tree 100 times instead of the
+ default 2 times.
+
+ ***Mafft does allow you to get the guide tree back, but the IDs in the
+ output guide tree do not match the original IDs in the fasta file
+ and are impossible to map. Sent bug report to Mafft authors; possibly
+ expect this option in future version.***
+
+ params: dict of parameters to pass in to the Mafft app controller.
+
+ The result will be a cogent.core.tree.PhyloNode object, or None if the
+ tree fails.
+ """
+ #Current version of Mafft does not support tree building.
+ raise NotImplementedError, """Current version of Mafft does not support tree building."""
+
+def add_seqs_to_alignment(seqs, aln, moltype, params=None, accurate=False):
+ """Returns an Alignment object from seqs and existing Alignment.
+
+ seqs: a cogent.core.sequence.Sequence object, or data that can be used
+ to build one.
+
+ aln: an cogent.core.alignment.Alignment object, or data that can be used
+ to build one
+
+ params: dict of parameters to pass in to the Mafft app controller.
+ """
+ #create SequenceCollection object from seqs
+ seq_collection = SequenceCollection(seqs,MolType=moltype)
+ #Create mapping between abbreviated IDs and full IDs
+ seq_int_map, seq_int_keys = seq_collection.getIntMap()
+ #Create SequenceCollection from int_map.
+ seq_int_map = SequenceCollection(seq_int_map,MolType=moltype)
+
+ #create Alignment object from aln
+ aln = Alignment(aln,MolType=moltype)
+ #Create mapping between abbreviated IDs and full IDs
+ aln_int_map, aln_int_keys = aln.getIntMap(prefix='seqn_')
+ #Create SequenceCollection from int_map.
+ aln_int_map = Alignment(aln_int_map,MolType=moltype)
+
+ #Update seq_int_keys with aln_int_keys
+ seq_int_keys.update(aln_int_keys)
+
+ #Create Mafft app.
+ app = Mafft(InputHandler='_input_as_multiline_string',\
+ params=params,
+ SuppressStderr=True)
+
+ #Turn on correct moltype
+ moltype_string = moltype.label.upper()
+ app.Parameters[MOLTYPE_MAP[moltype_string]].on()
+
+ #Do not report progress
+ app.Parameters['--quiet'].on()
+
+ #Add aln_int_map as seed alignment
+ app.Parameters['--seed'].on(\
+ app._tempfile_as_multiline_string(aln_int_map.toFasta()))
+
+ #More accurate alignment, sacrificing performance.
+ if accurate:
+ app.Parameters['--globalpair'].on()
+ app.Parameters['--maxiterate'].Value=1000
+
+ #Get results using int_map as input to app
+ res = app(seq_int_map.toFasta())
+ #Get alignment as dict out of results
+ alignment = dict(parse_fasta(res['StdOut']))
+
+ #Make new dict mapping original IDs
+ new_alignment = {}
+ for k,v in alignment.items():
+ key = k.replace('_seed_','')
+ new_alignment[seq_int_keys[key]]=v
+ #Create an Alignment object from alignment dict
+ new_alignment = Alignment(new_alignment,MolType=moltype)
+ #Clean up
+ res.cleanUp()
+ remove(app.Parameters['--seed'].Value)
+ del(seq_collection,seq_int_map,seq_int_keys,\
+ aln,aln_int_map,aln_int_keys,app,res,alignment)
+
+ return new_alignment
+
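+# Example usage (a minimal sketch, assuming MAFFT is on the PATH; the
+# existing alignment and the new sequences are illustrative):
+#
+#     from cogent import DNA
+#     from bfillings.mafft import add_seqs_to_alignment
+#
+#     aln = {'a': 'AC-GT', 'b': 'ACGGT'}
+#     new_seqs = {'c': 'ACGT'}
+#     extended = add_seqs_to_alignment(new_seqs, aln, DNA)
+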
+def align_two_alignments(aln1, aln2, moltype, params=None):
+ """Returns an Alignment object from two existing Alignments.
+
+ aln1, aln2: cogent.core.alignment.Alignment objects, or data that can be
+ used to build them.
+ - Mafft profile alignment only works with aligned sequences, so
+ Alignment objects (rather than SequenceCollections) are used here.
+
+ params: dict of parameters to pass in to the Mafft app controller.
+ """
+ #create SequenceCollection object from seqs
+ aln1 = Alignment(aln1,MolType=moltype)
+ #Create mapping between abbreviated IDs and full IDs
+ aln1_int_map, aln1_int_keys = aln1.getIntMap()
+ #Create SequenceCollection from int_map.
+ aln1_int_map = Alignment(aln1_int_map,MolType=moltype)
+
+ #create Alignment object from aln
+ aln2 = Alignment(aln2,MolType=moltype)
+ #Create mapping between abbreviated IDs and full IDs
+ aln2_int_map, aln2_int_keys = aln2.getIntMap(prefix='seqn_')
+ #Create SequenceCollection from int_map.
+ aln2_int_map = Alignment(aln2_int_map,MolType=moltype)
+
+ #Update aln1_int_keys with aln2_int_keys
+ aln1_int_keys.update(aln2_int_keys)
+
+ #Create Mafft app.
+ app = Mafft(InputHandler='_input_as_paths',\
+ params=params,
+ SuppressStderr=False)
+ app._command = 'mafft-profile'
+
+ aln1_path = app._tempfile_as_multiline_string(aln1_int_map.toFasta())
+ aln2_path = app._tempfile_as_multiline_string(aln2_int_map.toFasta())
+ filepaths = [aln1_path,aln2_path]
+
+ #Get results using int_map as input to app
+ res = app(filepaths)
+
+ #Get alignment as dict out of results
+ alignment = dict(parse_fasta(res['StdOut']))
+
+ #Make new dict mapping original IDs
+ new_alignment = {}
+ for k,v in alignment.items():
+ key = k.replace('_seed_','')
+ new_alignment[aln1_int_keys[key]]=v
+ #Create an Alignment object from alignment dict
+ new_alignment = Alignment(new_alignment,MolType=moltype)
+ #Clean up
+ res.cleanUp()
+ remove(aln1_path)
+ remove(aln2_path)
+ remove('pre')
+ remove('trace')
+ del(aln1,aln1_int_map,aln1_int_keys,\
+ aln2,aln2_int_map,aln2_int_keys,app,res,alignment)
+
+ return new_alignment
diff --git a/bfillings/mothur.py b/bfillings/mothur.py
new file mode 100644
index 0000000..ae6aca9
--- /dev/null
+++ b/bfillings/mothur.py
@@ -0,0 +1,589 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+"""Provides an application controller for the commandline version of
+mothur Version 1.6.0
+"""
+
+
+from __future__ import with_statement, division
+from os import path, getcwd, mkdir, remove, listdir
+import re
+from shutil import copyfile, rmtree
+from subprocess import Popen
+from tempfile import NamedTemporaryFile, mkdtemp, gettempdir
+
+from skbio.parse.sequences import parse_fasta
+from burrito.parameters import ValuedParameter
+from burrito.util import (CommandLineApplication, ResultPath,
+ CommandLineAppResult, ApplicationError)
+
+
+def is_empty(line):
+ """Returns True empty lines and lines consisting only of whitespace."""
+ return (not line) or line.isspace()
+
+
+def parse_otu_list(lines, precision=0.0049):
+ """Parser for mothur *.list file
+
+ To ensure all distances are of type float, the parser returns a
+ distance of 0.0 for the unique groups. However, if some sequences
+ are very similar, mothur may return a grouping at zero distance.
+ What Mothur really means by this, however, is that the clustering
+ is at the level of Mothur's precision. In this case, the parser
+ returns the distance explicitly.
+
+ If you are parsing OTUs with a non-default precision, you must
+ specify the precision here to ensure that the parsed distances are
+ in order.
+
+ Returns an iterator over (distance, otu_list)
+ """
+ for line in lines:
+ if is_empty(line):
+ continue
+ tokens = line.strip().split('\t')
+
+ distance_str = tokens.pop(0)
+ if distance_str.lstrip().lower().startswith('u'):
+ distance = 0.0
+ elif distance_str == '0.0':
+ distance = float(precision)
+ else:
+ distance = float(distance_str)
+
+ num_otus = int(tokens.pop(0))
+ otu_list = [t.split(',') for t in tokens]
+
+ yield (distance, otu_list)
+
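+# Example (an illustrative sketch of the *.list format this parser
+# expects; the first token is a distance or 'unique', the second the
+# OTU count, and the rest are comma-separated OTU members):
+#
+#     lines = ['unique\t2\ta\tb,c\n', '0.01\t1\ta,b,c\n']
+#     for distance, otus in parse_otu_list(lines):
+#         print distance, otus
+#     # 0.0 [['a'], ['b', 'c']]
+#     # 0.01 [['a', 'b', 'c']]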
+
+class Mothur(CommandLineApplication):
+
+ """Mothur application controller
+ """
+ _options = {
+ # Clustering algorithm. Choices are furthest, nearest, and
+ # average
+ 'method': ValuedParameter(
+ Name='method', Value='furthest', Delimiter='=', Prefix=''),
+ # Cutoff distance for the distance matrix
+ 'cutoff': ValuedParameter(
+ Name='cutoff', Value=None, Delimiter='=', Prefix=''),
+ # Minimum pairwise distance to consider for clustering
+ 'precision': ValuedParameter(
+ Name='precision', Value=None, Delimiter='=', Prefix=''),
+ }
+ _parameters = {}
+ _parameters.update(_options)
+ _input_handler = '_input_as_multiline_string'
+ _command = 'mothur'
+
+ def __init__(self, params=None, InputHandler=None, SuppressStderr=None,
+ SuppressStdout=None, WorkingDir=None, TmpDir=None,
+ TmpNameLen=20, HALT_EXEC=False):
+ """Initialize a Mothur application controller
+
+ params: a dictionary mapping the Parameter id or synonym to its
+ value (or None for FlagParameters or MixedParameters in flag
+ mode) for Parameters that should be turned on
+ InputHandler: this is the method to be run on data when it is
+ passed into call. This should be a string containing the
+ method name. The default is _input_as_string which casts data
+ to a string before appending it to the command line argument
+ SuppressStderr: if set to True, will route standard error to
+ /dev/null, False by default
+ SuppressStdout: if set to True, will route standard out to
+ /dev/null, False by default
+ WorkingDir: the directory where you want the application to run,
+ default is the current working directory, but is useful to
+ change in cases where the program being run creates output
+ to its current working directory and you either don't want
+ it to end up where you are running the program, or the user
+ running the script doesn't have write access to the current
+ working directory
+ WARNING: WorkingDir MUST be an absolute path!
+ TmpDir: the directory where temp files will be created, default
+ value is determined by environment variables.
+ TmpNameLen: the length of the temp file name
+ HALT_EXEC: if True, raises exception w/ command output just
+ before execution, doesn't clean up temp files. Default False.
+
+ Note: Mothur input files are copied to the working directory,
+ not the temp directory, when the application controller is
+ called. Output files generated by mothur are generated based on
+ the name of the input file, and if written to the tmp directory
+ could collide with another filename. Our strategy is to allow
+ the user to specify a working directory where input, output, and
+ log files are created.
+
+ File cleanup is handled in the same way as other app
+ controllers: input files are removed when the controller is
+ called if remove_tmp is True; output and log files are
+ cleaned up when the cleanUp method is called on the results.
+ """
+ super(Mothur, self).__init__(
+ params=params, InputHandler=InputHandler,
+ SuppressStderr=SuppressStderr, SuppressStdout=SuppressStdout,
+ WorkingDir='', TmpDir='', TmpNameLen=TmpNameLen,
+ HALT_EXEC=HALT_EXEC)
+ # Prevent self.WorkingDir from being explicitly cast as a
+ # FilePath object. This behavior does not seem necessary in
+ # the parent's __init__() method, since the casting is
+ # repeated in _set_WorkingDir().
+ if WorkingDir is not None:
+ working_dir = WorkingDir
+ else:
+ working_dir = self._working_dir or getcwd()
+ self.WorkingDir = working_dir
+ if TmpDir is not None:
+ self.TmpDir = TmpDir
+ else:
+ self.TmpDir = gettempdir()
+
+ @staticmethod
+ def getHelp():
+ """Returns link to online manual"""
+ help = (
+ 'See manual, available on the MOTHUR wiki:\n'
+ 'http://schloss.micro.umass.edu/mothur/'
+ )
+ return help
+
+ def __call__(self, data=None, remove_tmp=True):
+ """Run the application with the specified kwargs on data
+
+ data: anything that can be cast into a string or written out to
+ a file. Usually either a list of things or a single string or
+ number. input_handler will be called on this data before it
+ is passed as part of the command-line argument, so by creating
+ your own input handlers you can customize what kind of data
+ you want your application to accept
+
+ remove_tmp: if True, removes tmp files
+ """
+ # Process the input data. Input filepath is stored in
+ # self._input_filename
+ getattr(self, self.InputHandler)(data)
+
+ if self.SuppressStdout:
+ outfile = None
+ else:
+ outfile = open(self.getTmpFilename(self.TmpDir), 'w+')
+ if self.SuppressStderr:
+ errfile = None
+ else:
+ errfile = open(self.getTmpFilename(self.TmpDir), 'w+')
+
+ args = [self._command, self._compile_mothur_script()]
+ process = Popen(
+ args, stdout=outfile, stderr=errfile, cwd=self.WorkingDir)
+ exit_status = process.wait()
+ if not self._accept_exit_status(exit_status):
+ raise ApplicationError(
+ 'Unacceptable application exit status: %s, command: %s' %
+ (exit_status, args))
+
+ if outfile is not None:
+ outfile.seek(0)
+ if errfile is not None:
+ errfile.seek(0)
+ result = CommandLineAppResult(
+ outfile, errfile, exit_status, result_paths=self._get_result_paths())
+
+ # Clean up the input file if one was created
+ if remove_tmp:
+ if self._input_filename:
+ remove(self._input_filename)
+ self._input_filename = None
+
+ return result
+
+ def _accept_exit_status(self, status):
+ return int(status) == 0
+
+ def _compile_mothur_script(self):
+ """Returns a Mothur batch script as a string"""
+ def format_opts(*opts):
+ """Formats a series of options for a Mothur script"""
+ return ', '.join(filter(None, map(str, opts)))
+ vars = {
+ 'in': self._input_filename,
+ 'unique': self._derive_unique_path(),
+ 'dist': self._derive_dist_path(),
+ 'names': self._derive_names_path(),
+ 'cluster_opts': format_opts(
+ self.Parameters['method'],
+ self.Parameters['cutoff'],
+ self.Parameters['precision'],
+ ),
+ }
+ script = (
+ '#'
+ 'unique.seqs(fasta=%(in)s); '
+ 'dist.seqs(fasta=%(unique)s); '
+ 'read.dist(column=%(dist)s, name=%(names)s); '
+ 'cluster(%(cluster_opts)s)' % vars
+ )
+ return script
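+
+ # For example, with the default parameters and an input file named
+ # seqs.fasta, the compiled batch script is (illustrative):
+ #
+ # #unique.seqs(fasta=seqs.fasta); dist.seqs(fasta=seqs.unique.fasta);
+ # read.dist(column=seqs.unique.dist, name=seqs.names);
+ # cluster(method=furthest)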
+
+ def _get_result_paths(self):
+ paths = {
+ 'distance matrix': self._derive_dist_path(),
+ 'otu list': self._derive_list_path(),
+ 'rank abundance': self._derive_rank_abundance_path(),
+ 'species abundance': self._derive_species_abundance_path(),
+ 'unique names': self._derive_names_path(),
+ 'unique seqs': self._derive_unique_path(),
+ 'log': self._derive_log_path(),
+ }
+ return dict([(k, ResultPath(v)) for (k, v) in paths.items()])
+
+ # Methods to derive/guess output pathnames produced by MOTHUR.
+ # TODO: test for input files that do not have a filetype extension
+
+ def _derive_log_path(self):
+ """Guess logfile path produced by Mothur
+
+ This method checks the working directory for log files
+ generated by Mothur. It will raise an ApplicationError if no
+ log file can be found.
+
+ Mothur generates log files named in a nondeterministic way,
+ using the current time. We return the log file with the most
+ recent time, although this may lead to incorrect log file
+ detection if you are running many instances of mothur
+ simultaneously.
+ """
+ filenames = listdir(self.WorkingDir)
+ lognames = [
+ x for x in filenames if re.match(
+ "^mothur\.\d+\.logfile$",
+ x)]
+ if not lognames:
+ raise ApplicationError(
+ 'No log file detected in directory %s. Contents: \n\t%s' % (
+ self.WorkingDir, '\n\t'.join(filenames)))
+ most_recent_logname = sorted(lognames, reverse=True)[0]
+ return path.join(self.WorkingDir, most_recent_logname)
+
+ def _derive_unique_path(self):
+ """Guess unique sequences path produced by Mothur"""
+ base, ext = path.splitext(self._input_filename)
+ return '%s.unique%s' % (base, ext)
+
+ def _derive_dist_path(self):
+ """Guess distance matrix path produced by Mothur"""
+ base, ext = path.splitext(self._input_filename)
+ return '%s.unique.dist' % base
+
+ def _derive_names_path(self):
+ """Guess unique names file path produced by Mothur"""
+ base, ext = path.splitext(self._input_filename)
+ return '%s.names' % base
+
+ def __get_method_abbrev(self):
+ """Abbreviated form of clustering method parameter.
+
+ Used to guess output filenames for MOTHUR.
+ """
+ abbrevs = {
+ 'furthest': 'fn',
+ 'nearest': 'nn',
+ 'average': 'an',
+ }
+ if self.Parameters['method'].isOn():
+ method = self.Parameters['method'].Value
+ else:
+ method = self.Parameters['method'].Default
+ return abbrevs[method]
+
+ def _derive_list_path(self):
+ """Guess otu list file path produced by Mothur"""
+ base, ext = path.splitext(self._input_filename)
+ return '%s.unique.%s.list' % (base, self.__get_method_abbrev())
+
+ def _derive_rank_abundance_path(self):
+ """Guess rank abundance file path produced by Mothur"""
+ base, ext = path.splitext(self._input_filename)
+ return '%s.unique.%s.rabund' % (base, self.__get_method_abbrev())
+
+ def _derive_species_abundance_path(self):
+ """Guess species abundance file path produced by Mothur"""
+ base, ext = path.splitext(self._input_filename)
+ return '%s.unique.%s.sabund' % (base, self.__get_method_abbrev())
+
+ def getTmpFilename(self, tmp_dir=None, prefix='tmp', suffix='.txt'):
+ """Returns a temporary filename
+
+ Similar interface to tempfile.mktemp()
+ """
+ # Override to change default constructor to str(). FilePath
+ # objects muck up the Mothur script.
+ return super(Mothur, self).getTmpFilename(
+ tmp_dir=tmp_dir, prefix=prefix, suffix=suffix,
+ result_constructor=str)
+
+ # Temporary input file needs to be in the working directory, so we
+ # override all input handlers.
+
+ def _input_as_multiline_string(self, data):
+ """Write multiline string to temp file, return filename
+
+ data: a multiline string to be written to a file.
+ """
+ self._input_filename = self.getTmpFilename(
+ self.WorkingDir, suffix='.fasta')
+ with open(self._input_filename, 'w') as f:
+ f.write(data)
+ return self._input_filename
+
+ def _input_as_lines(self, data):
+ """Write sequence of lines to temp file, return filename
+
+ data: a sequence to be written to a file, each element of the
+ sequence will compose a line in the file
+
+ * Note: '\n' will be stripped off the end of each sequence
+ element before writing to a file in order to avoid
+ multiple newlines accidentally being written to a file
+ """
+ self._input_filename = self.getTmpFilename(
+ self.WorkingDir, suffix='.fasta')
+ with open(self._input_filename, 'w') as f:
+ # Use lazy iteration instead of list comprehension to
+ # prevent reading entire file into memory
+ for line in data:
+ f.write(str(line).strip('\n'))
+ f.write('\n')
+ return self._input_filename
+
+ def _input_as_path(self, data):
+ """Copys the provided file to WorkingDir and returns the new filename
+
+ data: path or filename
+ """
+ self._input_filename = self.getTmpFilename(
+ self.WorkingDir, suffix='.fasta')
+ copyfile(data, self._input_filename)
+ return self._input_filename
+
+ def _input_as_paths(self, data):
+ raise NotImplementedError('Not applicable for MOTHUR controller.')
+
+ def _input_as_string(self, data):
+ raise NotImplementedError('Not applicable for MOTHUR controller.')
+
+ # FilePath objects muck up the Mothur script, so we override the
+ # property methods for self.WorkingDir
+
+ def _get_WorkingDir(self):
+ """Gets the working directory"""
+ return self._curr_working_dir
+
+ def _set_WorkingDir(self, path):
+ """Sets the working directory
+ """
+ self._curr_working_dir = path
+ try:
+ mkdir(self.WorkingDir)
+ except OSError:
+ # Directory already exists
+ pass
+
+ WorkingDir = property(_get_WorkingDir, _set_WorkingDir)
+
+
+def mothur_from_file(file):
+ app = Mothur(InputHandler='_input_as_lines')
+ result = app(file)
+ # Force evaluation, so we can safely clean up files
+ otus = list(parse_otu_list(result['otu list']))
+ result.cleanUp()
+ return otus
+
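+# Example usage (a minimal sketch, assuming mothur is on the PATH; the
+# FASTA lines are illustrative):
+#
+#     fasta_lines = ['>a', 'ACGT', '>b', 'ACGA']
+#     otus = mothur_from_file(fasta_lines)
+#     # otus is a list of (distance, otu_list) tuples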
+
+# Files with dashes currently break MOTHUR -- in the upcoming version
+# of the software, they may be escaped with a backslash. We implement
+# and test for this now, since it's broken anyway!
+
+
+class _MothurFilepathParameter(ValuedParameter):
+
+ """Inserts escape characters in filepath parameters for Mothur."""
+
+ def _get_value(self):
+ return self._Value
+
+ def _set_value(self, val):
+ if val:
+ self._Value = str(val).replace("-", "\\-")
+ else:
+ self._Value = val
+
+ Value = property(_get_value, _set_value)
+
+
+class MothurClassifySeqs(Mothur):
+ _options = {
+ 'reference': _MothurFilepathParameter(
+ Name='reference', Value=None, Delimiter='=', Prefix=''),
+ 'taxonomy': _MothurFilepathParameter(
+ Name='taxonomy', Value=None, Delimiter='=', Prefix=''),
+ 'cutoff': ValuedParameter(
+ Name='cutoff', Value=None, Delimiter='=', Prefix=''),
+ 'iters': ValuedParameter(
+ Name='iters', Value=None, Delimiter='=', Prefix=''),
+ 'ksize': ValuedParameter(
+ Name='ksize', Value=None, Delimiter='=', Prefix=''),
+ }
+ _parameters = {}
+ _parameters.update(_options)
+ _filepath_parameters = set(['reference', 'taxonomy'])
+
+ def _format_function_arguments(self, opts):
+ """Format a series of function arguments in a Mothur script."""
+ params = [self.Parameters[x] for x in opts]
+ return ', '.join(filter(None, map(str, params)))
+
+ def _compile_mothur_script(self):
+ """Returns a Mothur batch script as a string"""
+ fasta = self._input_filename
+
+ required_params = ["reference", "taxonomy"]
+ for p in required_params:
+ if self.Parameters[p].Value is None:
+ raise ValueError("Must provide value for parameter %s" % p)
+ optional_params = ["ksize", "cutoff", "iters"]
+ args = self._format_function_arguments(
+ required_params + optional_params)
+ script = '#classify.seqs(fasta=%s, %s)' % (fasta, args)
+ return script
+
+ def _get_result_paths(self):
+ input_base, ext = path.splitext(path.basename(self._input_filename))
+ result_by_suffix = {
+ ".summary": "summary",
+ ".taxonomy": "assignments",
+ ".accnos": "accnos",
+ }
+
+ paths = {'log': self._derive_log_path()}
+ input_dir = path.dirname(self._input_filename)
+ for fn in listdir(input_dir):
+ if fn.startswith(input_base):
+ for suffix, result_key in result_by_suffix.items():
+ if fn.endswith(suffix):
+ paths[result_key] = path.join(input_dir, fn)
+ return dict([(k, ResultPath(v)) for (k, v) in paths.items()])
+
+
+def parse_mothur_assignments(lines):
+ for line in lines:
+ line = line.strip()
+ if not line:
+ continue
+ seq_id, _, assignment = line.partition("\t")
+
+ # Special case: unidentified sequences should be given a
+ # confidence of 0.0. Newer versions of MOTHUR return a real
+ # value for the confidence -- maybe we should consider keeping
+ # the value if present, because a sequence may conceivably be
+ # unknown with 85% confidence.
+ if re.match('unknown', assignment, re.IGNORECASE):
+ yield seq_id, ["Unknown"], 0.0
+ continue
+
+ toks = assignment.rstrip(";").split(";")
+ lineage = []
+ conf = 0.0
+ for tok in toks:
+ matchobj = re.match("(.+)\((\d+)\)$", tok)
+ if matchobj:
+ lineage.append(matchobj.group(1))
+ pct_conf = int(matchobj.group(2))
+ conf = pct_conf / 100.0
+ yield seq_id, lineage, conf
+
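+# Example (an illustrative assignment line and its parsed form):
+#
+#     line = 'seq1\tBacteria(100);Firmicutes(85);\n'
+#     for seq_id, lineage, conf in parse_mothur_assignments([line]):
+#         print seq_id, lineage, conf
+#     # seq1 ['Bacteria', 'Firmicutes'] 0.85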
+
+def mothur_classify_file(
+ query_file, ref_fp, tax_fp, cutoff=None, iters=None, ksize=None,
+ output_fp=None, tmp_dir=None):
+ """Classify a set of sequences using Mothur's naive bayes method
+
+ Dashes are used in Mothur to provide multiple filenames. A
+ filepath with a dash typically breaks an otherwise valid command
+ in Mothur. This wrapper script makes a copy of both files, ref_fp
+ and tax_fp, to ensure that the path has no dashes.
+
+ For convenience, we also ensure that each taxon list in the
+ id-to-taxonomy file ends with a semicolon.
+ """
+ if tmp_dir is None:
+ tmp_dir = gettempdir()
+
+ ref_seq_ids = set()
+
+ user_ref_file = open(ref_fp)
+ tmp_ref_file = NamedTemporaryFile(dir=tmp_dir, suffix=".ref.fa")
+ for seq_id, seq in parse_fasta(user_ref_file):
+ id_token = seq_id.split()[0]
+ ref_seq_ids.add(id_token)
+ tmp_ref_file.write(">%s\n%s\n" % (seq_id, seq))
+ tmp_ref_file.seek(0)
+
+ user_tax_file = open(tax_fp)
+ tmp_tax_file = NamedTemporaryFile(dir=tmp_dir, suffix=".tax.txt")
+ for line in user_tax_file:
+ line = line.rstrip()
+ if not line:
+ continue
+
+ # MOTHUR is particular that each assignment end with a semicolon.
+ if not line.endswith(";"):
+ line = line + ";"
+
+ id_token, _, _ = line.partition("\t")
+ if id_token in ref_seq_ids:
+ tmp_tax_file.write(line)
+ tmp_tax_file.write("\n")
+ tmp_tax_file.seek(0)
+
+ params = {"reference": tmp_ref_file.name, "taxonomy": tmp_tax_file.name}
+ if cutoff is not None:
+ params["cutoff"] = cutoff
+ if ksize is not None:
+ params["ksize"] = ksize
+ if iters is not None:
+ params["iters"] = iters
+
+ # Create a temporary working directory to accommodate mothur's output
+ # files, which are generated automatically based on the input
+ # file.
+ work_dir = mkdtemp(dir=tmp_dir)
+
+ app = MothurClassifySeqs(
+ params, InputHandler='_input_as_lines', WorkingDir=work_dir,
+ TmpDir=tmp_dir)
+ result = app(query_file)
+
+ # Force evaluation so we can safely clean up files
+ assignments = list(parse_mothur_assignments(result['assignments']))
+ result.cleanUp()
+ rmtree(work_dir)
+
+ if output_fp is not None:
+ f = open(output_fp, "w")
+ for query_id, taxa, conf in assignments:
+ taxa_str = ";".join(taxa)
+ f.write("%s\t%s\t%.2f\n" % (query_id, taxa_str, conf))
+ f.close()
+ return None
+ return dict((a, (b, c)) for a, b, c in assignments)
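+
+# Example usage (a minimal sketch, assuming mothur is installed; the
+# file paths are illustrative):
+#
+#     query = open('query.fasta')
+#     assignments = mothur_classify_file(query, 'ref_seqs.fasta',
+#                                        'ref_taxonomy.txt',
+#                                        output_fp='assignments.txt')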
diff --git a/bfillings/muscle_v38.py b/bfillings/muscle_v38.py
new file mode 100644
index 0000000..3af9971
--- /dev/null
+++ b/bfillings/muscle_v38.py
@@ -0,0 +1,777 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+"""Application controller for muscle 3.8
+"""
+from os import remove
+from random import choice
+import tempfile
+
+from skbio.parse.sequences import parse_fasta
+from burrito.parameters import FlagParameter, ValuedParameter
+from burrito.util import (CommandLineApplication, ResultPath,
+ get_tmp_filename, guess_input_handler)
+
+from cogent.core.alignment import SequenceCollection, Alignment
+from cogent.parse.tree import DndParser
+from cogent.core.tree import PhyloNode
+from cogent import DNA
+
+
+class Muscle(CommandLineApplication):
+ """Muscle application controller"""
+
+ _options ={
+ # Minimum spacing between anchor columns. [Integer]
+ '-anchorspacing':ValuedParameter('-',Name='anchorspacing',Delimiter=' '),
+ # Center parameter. Should be negative [Float]
+ '-center':ValuedParameter('-',Name='center',Delimiter=' '),
+
+ # Clustering method. cluster1 is used in iteration 1
+ # and 2, cluster2 in later iterations
+ '-cluster1':ValuedParameter('-',Name='cluster1',Delimiter=' '),
+ '-cluster2':ValuedParameter('-',Name='cluster2',Delimiter=' '),
+
+ # Minimum length of diagonal.
+ '-diaglength':ValuedParameter('-',Name='diaglength',Delimiter=' '),
+
+ # Discard this many positions at ends of diagonal.
+ '-diagmargin':ValuedParameter('-',Name='diagmargin',Delimiter=' '),
+
+ # Distance measure for iteration 1.
+ '-distance1':ValuedParameter('-',Name='distance1',Delimiter=' '),
+
+ # Distance measure for iterations 2, 3 ...
+ '-distance2':ValuedParameter('-',Name='distance2',Delimiter=' '),
+
+ # The gap open score. Must be negative.
+ '-gapopen':ValuedParameter('-',Name='gapopen',Delimiter=' '),
+
+ # Window size for determining whether a region is hydrophobic.
+ '-hydro':ValuedParameter('-',Name='hydro',Delimiter=' '),
+
+ # Multiplier for gap open/close penalties in hydrophobic regions.
+ '-hydrofactor':ValuedParameter('-',Name='hydrofactor',Delimiter=' '),
+
+ # Where to find the input sequences.
+ '-in':ValuedParameter('-',Name='in',Delimiter=' ', Quote="\""),
+ '-in1':ValuedParameter('-',Name='in1',Delimiter=' ', Quote="\""),
+ '-in2':ValuedParameter('-',Name='in2',Delimiter=' ', Quote="\""),
+
+ # Log file name (delete existing file).
+ '-log':ValuedParameter('-',Name='log',Delimiter=' '),
+
+ # Log file name (append to existing file).
+ '-loga':ValuedParameter('-',Name='loga',Delimiter=' '),
+
+ # Maximum distance between two diagonals that allows them to merge
+ # into one diagonal.
+ '-maxdiagbreak':ValuedParameter('-',Name='maxdiagbreak',Delimiter=' '),
+
+ # Maximum time to run in hours. The actual time may exceed the
+ # requested limit by a few minutes. Decimals are allowed, so 1.5
+ # means one hour and 30 minutes.
+ '-maxhours':ValuedParameter('-',Name='maxhours',Delimiter=' '),
+
+ # Maximum number of iterations.
+ '-maxiters':ValuedParameter('-',Name='maxiters',Delimiter=' '),
+
+ # Maximum memory in Mb
+ '-maxmb': ValuedParameter('-', Name='maxmb', Delimiter=' '),
+
+ # Maximum number of new trees to build in iteration 2.
+ '-maxtrees':ValuedParameter('-',Name='maxtrees',Delimiter=' '),
+
+ # Minimum score a column must have to be an anchor.
+ '-minbestcolscore':ValuedParameter('-',Name='minbestcolscore',Delimiter=' '),
+
+ # Minimum smoothed score a column must have to be an anchor.
+ '-minsmoothscore':ValuedParameter('-',Name='minsmoothscore',Delimiter=' '),
+
+ # Objective score used by tree dependent refinement.
+ # sp=sum-of-pairs score.
+ # spf=sum-of-pairs score (dimer approximation)
+ # spm=sp for < 100 seqs, otherwise spf
+ # dp=dynamic programming score.
+ # ps=average profile-sequence score.
+ # xp=cross profile score.
+ '-objscore':ValuedParameter('-',Name='objscore',Delimiter=' '),
+
+ # Where to write the alignment.
+ '-out':ValuedParameter('-',Name='out',Delimiter=' ', Quote="\""),
+
+ # Where to write the file in phylip sequential format (v3.6 only).
+ '-physout':ValuedParameter('-',Name='physout',Delimiter=' '),
+
+ # Where to write the file in phylip interleaved format (v3.6 only).
+ '-phyiout':ValuedParameter('-',Name='phyiout',Delimiter=' '),
+
+ # Set to profile for aligning two alignments and adding seqs to an
+ # existing alignment
+ '-profile':FlagParameter(Prefix='-',Name='profile'),
+
+ # Method used to root tree; root1 is used in iteration 1 and 2, root2
+ # in later iterations.
+ '-root1':ValuedParameter('-',Name='root1',Delimiter=' '),
+ '-root2':ValuedParameter('-',Name='root2',Delimiter=' '),
+
+ # Sequence type.
+ '-seqtype':ValuedParameter('-',Name='seqtype',Delimiter=' '),
+
+ # Maximum value of column score for smoothing purposes.
+ '-smoothscoreceil':ValuedParameter('-',Name='smoothscoreceil',Delimiter=' '),
+
+ # Constant used in UPGMB clustering. Determines the relative fraction
+ # of average linkage (SUEFF) vs. nearest-neighbor linkage (1 - SUEFF).
+ '-SUEFF':ValuedParameter('-',Name='SUEFF',Delimiter=' '),
+
+ # Save tree produced in first or second iteration to given file in
+ # Newick (Phylip-compatible) format.
+ '-tree1':ValuedParameter('-',Name='tree1',Delimiter=' ', Quote="\""),
+ '-tree2':ValuedParameter('-',Name='tree2',Delimiter=' ', Quote="\""),
+
+ # Sequence weighting scheme.
+ # weight1 is used in iterations 1 and 2.
+ # weight2 is used for tree-dependent refinement.
+ # none=all sequences have equal weight.
+ # henikoff=Henikoff & Henikoff weighting scheme.
+ # henikoffpb=Modified Henikoff scheme as used in PSI-BLAST.
+ # clustalw=CLUSTALW method.
+ # threeway=Gotoh three-way method.
+ '-weight1':ValuedParameter('-',Name='weight1',Delimiter=' '),
+ '-weight2':ValuedParameter('-',Name='weight2',Delimiter=' '),
+
+ # Use anchor optimization in tree dependent refinement iterations
+ '-anchors':FlagParameter(Prefix='-',Name='anchors'),
+
+ # Write output in CLUSTALW format (default is FASTA).
+ '-clw':FlagParameter(Prefix='-',Name='clw'),
+
+ # Cluster sequences
+ '-clusteronly':FlagParameter(Prefix='-',Name='clusteronly'),
+ # neighborjoining is "unrecognized"
+ #'-neighborjoining':FlagParameter(Prefix='-',Name='neighborjoining'),
+
+
+ # Write output in CLUSTALW format with the "CLUSTAL W (1.81)" header
+ # rather than the MUSCLE version. This is useful when a post-processing
+ # step is picky about the file header.
+ '-clwstrict':FlagParameter(Prefix='-',Name='clwstrict'),
+
+ # Do not catch exceptions.
+ '-core':FlagParameter(Prefix='-',Name='core'),
+
+ # Write output in FASTA format. Alternatives include -clw,
+ # -clwstrict, -msf and -html.
+ '-fasta':FlagParameter(Prefix='-',Name='fasta'),
+
+ # Group similar sequences together in the output. This is the default.
+ # See also -stable.
+ '-group':FlagParameter(Prefix='-',Name='group'),
+
+ # Write output in HTML format (default is FASTA).
+ '-html':FlagParameter(Prefix='-',Name='html'),
+
+ # Use log-expectation profile score (VTML240). Alternatives are to use
+ # -sp or -sv. This is the default for amino acid sequences.
+ '-le':FlagParameter(Prefix='-',Name='le'),
+
+ # Write output in MSF format (default is FASTA).
+ '-msf':FlagParameter(Prefix='-',Name='msf'),
+
+ # Disable anchor optimization. Default is -anchors.
+ '-noanchors':FlagParameter(Prefix='-',Name='noanchors'),
+
+ # Catch exceptions and give an error message if possible.
+ '-nocore':FlagParameter(Prefix='-',Name='nocore'),
+
+ # Do not display progress messages.
+ '-quiet':FlagParameter(Prefix='-',Name='quiet'),
+
+ # Input file is already aligned, skip first two iterations and begin
+ # tree dependent refinement.
+ '-refine':FlagParameter(Prefix='-',Name='refine'),
+
+ # Use sum-of-pairs protein profile score (PAM200). Default is -le.
+ '-sp':FlagParameter(Prefix='-',Name='sp'),
+
+ # Use sum-of-pairs nucleotide profile score (BLASTZ parameters). This
+ # is the only option for nucleotides, and is therefore the default.
+ '-spn':FlagParameter(Prefix='-',Name='spn'),
+
+ # Preserve input order of sequences in output file. Default is to group
+ # sequences by similarity (-group).
+ '-stable':FlagParameter(Prefix='-',Name='stable'),
+
+ # Use sum-of-pairs profile score (VTML240). Default is -le.
+ '-sv':FlagParameter(Prefix='-',Name='sv'),
+
+ # Diagonal optimization
+ '-diags':FlagParameter(Prefix='-',Name='diags'),
+ '-diags1':FlagParameter(Prefix='-',Name='diags1'),
+ '-diags2':FlagParameter(Prefix='-',Name='diags2'),
+
+
+ # Terminal gaps penalized with full penalty.
+ # [1] Not fully supported in this version.
+ '-termgapsfull':FlagParameter(Prefix='-',Name='termgapsfull'),
+
+ # Terminal gaps penalized with half penalty.
+ # [1] Not fully supported in this version.
+ '-termgapshalf':FlagParameter(Prefix='-',Name='termgapshalf'),
+
+ # Terminal gaps penalized with half penalty if gap relative to
+ # longer sequence, otherwise with full penalty.
+ # [1] Not fully supported in this version.
+ '-termgapshalflonger':FlagParameter(Prefix='-',Name='termgapshalflonger'),
+
+ # Write parameter settings and progress messages to log file.
+ '-verbose':FlagParameter(Prefix='-',Name='verbose'),
+
+ # Write version string to stdout and exit.
+ '-version':FlagParameter(Prefix='-',Name='version'),
+ }
+
+ _parameters = {}
+ _parameters.update(_options)
+ _command = "muscle"
+
+ def _input_as_seqs(self,data):
+ lines = []
+ for i,s in enumerate(data):
+ #will number the sequences 1,2,3,etc.
+ lines.append(''.join(['>',str(i+1)]))
+ lines.append(s)
+ return self._input_as_lines(lines)
+
+ def _input_as_lines(self,data):
+ if data:
+ self.Parameters['-in']\
+ .on(super(Muscle,self)._input_as_lines(data))
+
+ return ''
+
+ def _input_as_string(self,data):
+ """Makes data the value of a specific parameter
+
+ This method returns the empty string. The parameter will be printed
+ automatically once set.
+ """
+ if data:
+ self.Parameters['-in'].on(str(data))
+ return ''
+
+ def _input_as_multiline_string(self, data):
+ if data:
+ self.Parameters['-in']\
+ .on(super(Muscle,self)._input_as_multiline_string(data))
+ return ''
+
+ def _input_as_multifile(self, data):
+ """For use with the -profile option
+
+ This input handler expects data to be a tuple containing two
+ filenames. Index 0 will be set to -in1 and index 1 to -in2
+ """
+ if data:
+ try:
+ filename1, filename2 = data
+ except:
+ raise ValueError, "Expected two filenames"
+
+ self.Parameters['-in'].off()
+ self.Parameters['-in1'].on(filename1)
+ self.Parameters['-in2'].on(filename2)
+ return ''
+
+ def _align_out_filename(self):
+
+ if self.Parameters['-out'].isOn():
+ aln_filename = self._absolute(str(self.Parameters['-out'].Value))
+ else:
+ raise ValueError, "No output file specified."
+ return aln_filename
+
+ def _tree1_out_filename(self):
+
+ if self.Parameters['-tree1'].isOn():
+ aln_filename = self._absolute(str(self.Parameters['-tree1'].Value))
+ else:
+ raise ValueError, "No tree output file specified."
+ return aln_filename
+
+ def _tree2_out_filename(self):
+
+ if self.Parameters['-tree2'].isOn():
+ tree_filename = self._absolute(str(self.Parameters['-tree2'].Value))
+ else:
+ raise ValueError, "No tree output file specified."
+ return tree_filename
+
+ def _get_result_paths(self,data):
+
+ result = {}
+ if self.Parameters['-out'].isOn():
+ out_name = self._align_out_filename()
+ result['MuscleOut'] = ResultPath(Path=out_name,IsWritten=True)
+ if self.Parameters['-tree1'].isOn():
+ out_name = self._tree1_out_filename()
+ result['Tree1Out'] = ResultPath(Path=out_name,IsWritten=True)
+ if self.Parameters['-tree2'].isOn():
+ out_name = self._tree2_out_filename()
+ result['Tree2Out'] = ResultPath(Path=out_name,IsWritten=True)
+ return result
+
+
+ def getHelp(self):
+ """Muscle help"""
+
+ help_str = """
+"""
+ return help_str
+
+#SOME FUNCTIONS TO EXECUTE THE MOST COMMON TASKS
+def muscle_seqs(seqs,
+ add_seq_names=False,
+ out_filename=None,
+ input_handler=None,
+ params={},
+ WorkingDir=tempfile.gettempdir(),
+ SuppressStderr=None,
+ SuppressStdout=None):
+ """Muscle align list of sequences.
+
+ seqs: a list of sequences as strings or objects, you must set add_seq_names=True
+ or sequences in a multiline string, as read() from a fasta file
+ or sequences in a list of lines, as readlines() from a fasta file
+ or a fasta seq filename.
+
+ == e.g., test code for guessing the input handler
+ #guess_input_handler should correctly identify input
+ gih = guess_input_handler
+ self.assertEqual(gih('abc.txt'), '_input_as_string')
+ self.assertEqual(gih('>ab\nTCAG'), '_input_as_multiline_string')
+ self.assertEqual(gih(['ACC','TGA'], True), '_input_as_seqs')
+ self.assertEqual(gih(['>a','ACC','>b','TGA']), '_input_as_lines')
+
+ == docstring for blast_seqs, apply to muscle_seqs ==
+ seqs: either file name or list of sequence objects or list of strings or
+ single multiline string containing sequences.
+
+ WARNING: DECISION RULES FOR INPUT HANDLING HAVE CHANGED. Decision rules
+ for data are as follows. If it's a list, treat as lines, unless
+ add_seq_names is true (in which case treat as list of seqs). If it's a
+ string, test whether it has newlines. If it doesn't have newlines, assume
+ it's a filename. If it does have newlines, it can't be a filename, so
+ assume it's a multiline string containing sequences.
+
+ If you want to skip the detection and force a specific type of input
+ handler, use input_handler='your_favorite_handler'.
+
+ add_seq_names: boolean. if True, sequence names are inserted in the list
+ of sequences. if False, it assumes seqs is a list of lines of some
+ proper format that the program can handle
+
+ Addl docs coming soon
+ """
+
+ if out_filename:
+ params["-out"] = out_filename
+ #else:
+ # params["-out"] = get_tmp_filename(WorkingDir)
+
+ ih = input_handler or guess_input_handler(seqs, add_seq_names)
+ muscle_app = Muscle(
+ params=params,
+ InputHandler=ih,
+ WorkingDir=WorkingDir,
+ SuppressStderr=SuppressStderr,
+ SuppressStdout=SuppressStdout)
+ return muscle_app(seqs)
+
+
+def cluster_seqs(seqs,
+ neighbor_join=False,
+ params={},
+ add_seq_names=True,
+ WorkingDir=tempfile.gettempdir(),
+ SuppressStderr=None,
+ SuppressStdout=None,
+ max_chars=1000000,
+ max_hours=1.0,
+ constructor=PhyloNode,
+ clean_up=True
+ ):
+ """Muscle cluster list of sequences.
+
+ seqs: either file name or list of sequence objects or list of strings or
+ single multiline string containing sequences.
+
+ Addl docs coming soon
+ """
+ num_seqs = len(seqs)
+ if num_seqs < 2:
+ raise ValueError, "Muscle requres 2 or more sequences to cluster."
+
+
+ num_chars = sum(map(len, seqs))
+ if num_chars > max_chars:
+ params["-maxiters"] = 2
+ params["-diags1"] = True
+ params["-sv"] = True
+ #params["-distance1"] = "kmer6_6"
+ #params["-distance1"] = "kmer20_3"
+ #params["-distance1"] = "kbit20_3"
+ print "lots of chars, using fast align", num_chars
+
+
+ params["-maxhours"] = max_hours
+ #params["-maxiters"] = 10
+
+ #cluster_type = "upgmb"
+ #if neighbor_join:
+ # cluster_type = "neighborjoining"
+
+ params["-clusteronly"] = True
+ params["-tree1"] = get_tmp_filename(WorkingDir)
+
+ muscle_res = muscle_seqs(seqs,
+ params=params,
+ add_seq_names=add_seq_names,
+ WorkingDir=WorkingDir,
+ SuppressStderr=SuppressStderr,
+ SuppressStdout=SuppressStdout)
+
+ tree = DndParser(muscle_res["Tree1Out"], constructor=constructor)
+
+ if clean_up:
+ muscle_res.cleanUp()
+ return tree
+
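+# A minimal usage sketch for cluster_seqs, assuming muscle is on PATH; the
+# toy sequences are illustrative only.
+def _example_cluster_seqs():
+    seqs = ['>a', 'ACGGCTACGTTA', '>b', 'ACGGCTACGATA', '>c', 'TCGGCTACGTTC']
+    tree = cluster_seqs(seqs, add_seq_names=False)
+    return tree.getNewick(with_distances=True)
+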
+def aln_tree_seqs(seqs,
+ input_handler=None,
+ tree_type='neighborjoining',
+ params={},
+ add_seq_names=True,
+ WorkingDir=tempfile.gettempdir(),
+ SuppressStderr=None,
+ SuppressStdout=None,
+ max_hours=5.0,
+ constructor=PhyloNode,
+ clean_up=True
+ ):
+ """Muscle align sequences and report tree from iteration2.
+
+    Unlike cluster_seqs, this returns tree2, the tree made during the second
+    muscle iteration (it should be more accurate than the cluster tree from
+    the first iteration, which is built quickly from k-mer words).
+
+ seqs: either file name or list of sequence objects or list of strings or
+ single multiline string containing sequences.
+ tree_type: can be either neighborjoining (default) or upgmb for UPGMA
+ clean_up: When true, will clean up output files
+ """
+
+ params["-maxhours"] = max_hours
+ if tree_type:
+ params["-cluster2"] = tree_type
+ params["-tree2"] = get_tmp_filename(WorkingDir)
+ params["-out"] = get_tmp_filename(WorkingDir)
+
+ muscle_res = muscle_seqs(seqs,
+ input_handler=input_handler,
+ params=params,
+ add_seq_names=add_seq_names,
+ WorkingDir=WorkingDir,
+ SuppressStderr=SuppressStderr,
+ SuppressStdout=SuppressStdout)
+ tree = DndParser(muscle_res["Tree2Out"], constructor=constructor)
+ aln = [line for line in muscle_res["MuscleOut"]]
+
+ if clean_up:
+ muscle_res.cleanUp()
+ return tree, aln
+
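+# A minimal usage sketch for aln_tree_seqs, assuming muscle is on PATH and
+# that the input fasta file exists (illustrative path only).
+def _example_aln_tree_seqs():
+    tree, aln = aln_tree_seqs('/tmp/unaligned.fasta',
+                              tree_type='neighborjoining')
+    return tree.getNewick(with_distances=True), ''.join(aln)
+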
+def fastest_aln_seqs(seqs,
+ params={},
+ out_filename=None,
+ add_seq_names=True,
+ WorkingDir=tempfile.gettempdir(),
+ SuppressStderr=None,
+ SuppressStdout=None
+ ):
+ """Fastest (and least accurate) version of muscle
+
+ seqs: either file name or list of sequence objects or list of strings or
+ single multiline string containing sequences.
+
+ Addl docs coming soon
+ """
+
+ params["-maxiters"] = 1
+ params["-diags1"] = True
+ params["-sv"] = True
+ params["-distance1"] = "kbit20_3"
+
+ muscle_res = muscle_seqs(seqs,
+ params=params,
+ add_seq_names=add_seq_names,
+ out_filename=out_filename,
+ WorkingDir=WorkingDir,
+ SuppressStderr=SuppressStderr,
+ SuppressStdout=SuppressStdout)
+ return muscle_res
+
+def align_unaligned_seqs(seqs, moltype=DNA, params=None):
+ """Returns an Alignment object from seqs.
+
+ seqs: SequenceCollection object, or data that can be used to build one.
+
+ moltype: a MolType object. DNA, RNA, or PROTEIN.
+
+ params: dict of parameters to pass in to the Muscle app controller.
+
+ Result will be an Alignment object.
+ """
+ if not params:
+ params = {}
+ #create SequenceCollection object from seqs
+ seq_collection = SequenceCollection(seqs,MolType=moltype)
+ #Create mapping between abbreviated IDs and full IDs
+ int_map, int_keys = seq_collection.getIntMap()
+ #Create SequenceCollection from int_map.
+ int_map = SequenceCollection(int_map,MolType=moltype)
+ #get temporary filename
+ params.update({'-out':get_tmp_filename()})
+ #Create Muscle app.
+ app = Muscle(InputHandler='_input_as_multiline_string',\
+ params=params, WorkingDir=tempfile.gettempdir())
+ #Get results using int_map as input to app
+ res = app(int_map.toFasta())
+ #Get alignment as dict out of results
+ alignment = dict(parse_fasta(res['MuscleOut']))
+ #Make new dict mapping original IDs
+ new_alignment = {}
+ for k,v in alignment.items():
+ new_alignment[int_keys[k]]=v
+ #Create an Alignment object from alignment dict
+ new_alignment = Alignment(new_alignment,MolType=moltype)
+ #Clean up
+ res.cleanUp()
+ del(seq_collection,int_map,int_keys,app,res,alignment,params)
+
+ return new_alignment
+
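+# A minimal usage sketch for align_unaligned_seqs, assuming muscle is on
+# PATH; the toy sequences are illustrative only.
+def _example_align_unaligned_seqs():
+    seqs = {'seq1': 'ACGTACGTTACG', 'seq2': 'ACGTACGTACG'}
+    aln = align_unaligned_seqs(seqs, moltype=DNA)
+    return aln.toFasta()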
+
+def align_and_build_tree(seqs, moltype, best_tree=False, params=None):
+ """Returns an alignment and a tree from Sequences object seqs.
+
+ seqs: a cogent.core.alignment.SequenceCollection object, or data that can
+ be used to build one.
+
+ moltype: cogent.core.moltype.MolType object
+
+ best_tree: if True (default:False), uses a slower but more accurate
+ algorithm to build the tree.
+
+ params: dict of parameters to pass in to the Muscle app controller.
+
+ The result will be a tuple containing a cogent.core.alignment.Alignment
+ and a cogent.core.tree.PhyloNode object (or None for the alignment
+ and/or tree if either fails).
+ """
+ aln = align_unaligned_seqs(seqs, moltype=moltype, params=params)
+ tree = build_tree_from_alignment(aln, moltype, best_tree, params)
+ return {'Align':aln, 'Tree':tree}
+
+def build_tree_from_alignment(aln, moltype=DNA, best_tree=False, params=None):
+ """Returns a tree from Alignment object aln.
+
+ aln: a cogent.core.alignment.Alignment object, or data that can be used
+ to build one.
+
+ moltype: cogent.core.moltype.MolType object
+
+ best_tree: unsupported
+
+ params: dict of parameters to pass in to the Muscle app controller.
+
+    The result will be a cogent.core.tree.PhyloNode object, or None if tree
+    building fails.
+ """
+ # Create instance of app controller, enable tree, disable alignment
+ app = Muscle(InputHandler='_input_as_multiline_string', params=params, \
+ WorkingDir=tempfile.gettempdir())
+
+ app.Parameters['-clusteronly'].on()
+ app.Parameters['-tree1'].on(get_tmp_filename(app.WorkingDir))
+ app.Parameters['-seqtype'].on(moltype.label)
+
+ seq_collection = SequenceCollection(aln, MolType=moltype)
+
+ #Create mapping between abbreviated IDs and full IDs
+ int_map, int_keys = seq_collection.getIntMap()
+ #Create SequenceCollection from int_map.
+ int_map = SequenceCollection(int_map,MolType=moltype)
+
+
+ # Collect result
+ result = app(int_map.toFasta())
+
+ # Build tree
+ tree = DndParser(result['Tree1Out'].read(), constructor=PhyloNode)
+
+ for tip in tree.tips():
+ tip.Name = int_keys[tip.Name]
+
+ # Clean up
+ result.cleanUp()
+ del(seq_collection, app, result)
+
+ return tree
+
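+# A minimal usage sketch for build_tree_from_alignment, assuming muscle is on
+# PATH; the toy alignment is illustrative only.
+def _example_build_tree_from_alignment():
+    aln = {'a': 'ACGTACGTTA', 'b': 'ACGTACATTA', 'c': 'ACGAACGTTA'}
+    tree = build_tree_from_alignment(aln, moltype=DNA)
+    return tree.getNewick(with_distances=True)
+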
+def add_seqs_to_alignment(seqs, aln, params=None):
+ """Returns an Alignment object from seqs and existing Alignment.
+
+ seqs: a cogent.core.alignment.SequenceCollection object, or data that can
+ be used to build one.
+
+ aln: a cogent.core.alignment.Alignment object, or data that can be used
+ to build one
+
+ params: dict of parameters to pass in to the Muscle app controller.
+ """
+ if not params:
+ params = {}
+
+ #create SequenceCollection object from seqs
+ seqs_collection = SequenceCollection(seqs)
+ #Create mapping between abbreviated IDs and full IDs
+ seqs_int_map, seqs_int_keys = seqs_collection.getIntMap(prefix='seq_')
+ #Create SequenceCollection from int_map.
+ seqs_int_map = SequenceCollection(seqs_int_map)
+
+ #create SequenceCollection object from aln
+ aln_collection = SequenceCollection(aln)
+ #Create mapping between abbreviated IDs and full IDs
+ aln_int_map, aln_int_keys = aln_collection.getIntMap(prefix='aln_')
+ #Create SequenceCollection from int_map.
+ aln_int_map = SequenceCollection(aln_int_map)
+
+ #set output and profile options
+ params.update({'-out':get_tmp_filename(), '-profile':True})
+
+ #save seqs to tmp file
+ seqs_filename = get_tmp_filename()
+ seqs_out = open(seqs_filename,'w')
+ seqs_out.write(seqs_int_map.toFasta())
+ seqs_out.close()
+
+ #save aln to tmp file
+ aln_filename = get_tmp_filename()
+ aln_out = open(aln_filename, 'w')
+ aln_out.write(aln_int_map.toFasta())
+ aln_out.close()
+
+ #Create Muscle app and get results
+ app = Muscle(InputHandler='_input_as_multifile', params=params,
+ WorkingDir=tempfile.gettempdir())
+ res = app((aln_filename, seqs_filename))
+
+ #Get alignment as dict out of results
+ alignment = dict(parse_fasta(res['MuscleOut']))
+ #Make new dict mapping original IDs
+ new_alignment = {}
+ for k,v in alignment.items():
+ if k in seqs_int_keys:
+ new_alignment[seqs_int_keys[k]] = v
+ else:
+ new_alignment[aln_int_keys[k]] = v
+
+ #Create an Alignment object from alignment dict
+ new_alignment = Alignment(new_alignment)
+
+ #Clean up
+ res.cleanUp()
+ del(seqs_collection, seqs_int_map, seqs_int_keys)
+ del(aln_collection, aln_int_map, aln_int_keys)
+ del(app, res, alignment, params)
+ remove(seqs_filename)
+ remove(aln_filename)
+
+ return new_alignment
+
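+# A minimal usage sketch for add_seqs_to_alignment, assuming muscle is on
+# PATH; the toy data are illustrative only.
+def _example_add_seqs_to_alignment():
+    aln = {'a': 'ACGTACGT-A', 'b': 'ACGTACATTA'}
+    seqs = {'c': 'ACGTACGTA'}
+    return add_seqs_to_alignment(seqs, aln).toFasta()
+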
+def align_two_alignments(aln1, aln2, params=None):
+ """Returns an Alignment object from two existing Alignments.
+
+ aln1, aln2: cogent.core.alignment.Alignment objects, or data that can be
+ used to build them.
+
+ params: dict of parameters to pass in to the Muscle app controller.
+ """
+ if not params:
+ params = {}
+
+ #create SequenceCollection object from aln1
+ aln1_collection = SequenceCollection(aln1)
+ #Create mapping between abbreviated IDs and full IDs
+ aln1_int_map, aln1_int_keys = aln1_collection.getIntMap(prefix='aln1_')
+ #Create SequenceCollection from int_map.
+ aln1_int_map = SequenceCollection(aln1_int_map)
+
+ #create SequenceCollection object from aln2
+ aln2_collection = SequenceCollection(aln2)
+ #Create mapping between abbreviated IDs and full IDs
+ aln2_int_map, aln2_int_keys = aln2_collection.getIntMap(prefix='aln2_')
+ #Create SequenceCollection from int_map.
+ aln2_int_map = SequenceCollection(aln2_int_map)
+
+ #set output and profile options
+ params.update({'-out':get_tmp_filename(), '-profile':True})
+
+ #save aln1 to tmp file
+ aln1_filename = get_tmp_filename()
+ aln1_out = open(aln1_filename,'w')
+ aln1_out.write(aln1_int_map.toFasta())
+ aln1_out.close()
+
+ #save aln2 to tmp file
+ aln2_filename = get_tmp_filename()
+ aln2_out = open(aln2_filename, 'w')
+ aln2_out.write(aln2_int_map.toFasta())
+ aln2_out.close()
+
+ #Create Muscle app and get results
+ app = Muscle(InputHandler='_input_as_multifile', params=params,
+ WorkingDir=tempfile.gettempdir())
+ res = app((aln1_filename, aln2_filename))
+
+ #Get alignment as dict out of results
+ alignment = dict(parse_fasta(res['MuscleOut']))
+
+ #Make new dict mapping original IDs
+ new_alignment = {}
+ for k,v in alignment.items():
+ if k in aln1_int_keys:
+ new_alignment[aln1_int_keys[k]] = v
+ else:
+ new_alignment[aln2_int_keys[k]] = v
+
+ #Create an Alignment object from alignment dict
+ new_alignment = Alignment(new_alignment)
+
+ #Clean up
+ res.cleanUp()
+ del(aln1_collection, aln1_int_map, aln1_int_keys)
+ del(aln2_collection, aln2_int_map, aln2_int_keys)
+ del(app, res, alignment, params)
+ remove(aln1_filename)
+ remove(aln2_filename)
+
+ return new_alignment
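+
+# A minimal usage sketch for align_two_alignments, assuming muscle is on
+# PATH; the toy alignments are illustrative only.
+def _example_align_two_alignments():
+    aln1 = {'a': 'ACGTACGTTA', 'b': 'ACGTACATTA'}
+    aln2 = {'c': 'ACGAACGT-A', 'd': 'ACGAACTTTA'}
+    return align_two_alignments(aln1, aln2).toFasta()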
diff --git a/bfillings/parsinsert.py b/bfillings/parsinsert.py
new file mode 100644
index 0000000..ce33754
--- /dev/null
+++ b/bfillings/parsinsert.py
@@ -0,0 +1,92 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+"""Application controller for ParsInsert
+
+designed for ParsInsert v1.03 """
+
+from StringIO import StringIO
+from os.path import splitext, join, abspath
+
+from burrito.parameters import ValuedParameter, FlagParameter, MixedParameter
+from burrito.util import (CommandLineApplication, FilePath, system,
+ CommandLineAppResult, ResultPath, remove,
+ ApplicationError)
+
+from cogent.core.tree import PhyloNode
+from cogent.parse.tree import DndParser
+from cogent.core.moltype import DNA, RNA, PROTEIN
+from cogent.core.alignment import SequenceCollection, Alignment
+from cogent.parse.phylip import get_align_for_phylip
+
+
+class ParsInsert(CommandLineApplication):
+ """ParsInsert application Controller"""
+
+ _command = 'ParsInsert'
+ _input_handler = '_input_as_multiline_string'
+ _parameters = {
+ # read mask from this file
+ '-m':ValuedParameter('-',Name='m',Delimiter=' '),
+
+ # read core tree sequences from this file
+ '-s':ValuedParameter('-',Name='s',Delimiter=' '),
+
+ # read core tree from this file
+ '-t':ValuedParameter('-',Name='t',Delimiter=' '),
+
+        # read core tree taxonomy from this file
+ '-x':ValuedParameter('-',Name='x',Delimiter=' '),
+
+ # output taxonomy for each insert sequence to this file
+ '-o':ValuedParameter('-',Name='o',Delimiter=' '),
+
+ # create log file
+ '-l':ValuedParameter('-',Name='l',Delimiter=' '),
+
+ # number of best matches to display
+ '-n':ValuedParameter('-',Name='n',Delimiter=' '),
+
+        # percent threshold cutoff
+ '-c':ValuedParameter('-',Name='c',Delimiter=' '),
+ }
+
+ def _handle_app_result_build_failure(self,out,err,exit_status,result_paths):
+ """ Catch the error when files are not produced """
+ raise ApplicationError, \
+ 'ParsInsert failed to produce an output file due to the following error: \n\n%s ' \
+ % err.read()
+
+ def _get_result_paths(self,data):
+ """ Get the resulting tree"""
+ result = {}
+ result['Tree'] = ResultPath(Path=splitext(self._input_filename)[0] + \
+ '.tree')
+ return result
+
+def insert_sequences_into_tree(aln, moltype, params={}):
+ """Returns a tree from placement of sequences
+ """
+    # convert aln to phylip, since seq names need to be fixed before running through ParsInsert
+ new_aln=get_align_for_phylip(StringIO(aln))
+
+ # convert aln to fasta in case it is not already a fasta file
+ aln2 = Alignment(new_aln)
+ seqs = aln2.toFasta()
+
+ parsinsert_app = ParsInsert(params=params)
+ result = parsinsert_app(seqs)
+
+ # parse tree
+ tree = DndParser(result['Tree'].read(), constructor=PhyloNode)
+
+ # cleanup files
+ result.cleanUp()
+
+ return tree
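+
+# A minimal usage sketch, assuming ParsInsert v1.03 is installed and that the
+# reference files below exist; all paths are illustrative only.
+def _example_insert_sequences_into_tree():
+    params = {'-t': '/tmp/core.tree',          # reference tree
+              '-s': '/tmp/core_seqs.fasta',    # reference sequences
+              '-x': '/tmp/core_taxonomy.txt'}  # reference taxonomy
+    query_aln = open('/tmp/query_aln.phy').read()  # phylip-formatted
+    tree = insert_sequences_into_tree(query_aln, DNA, params=params)
+    return tree.getNewick(with_distances=True)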
diff --git a/bfillings/pplacer.py b/bfillings/pplacer.py
new file mode 100644
index 0000000..66992dd
--- /dev/null
+++ b/bfillings/pplacer.py
@@ -0,0 +1,201 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+"""Application controller for pplacer 1.1"""
+
+from os.path import splitext, abspath, join, split
+from StringIO import StringIO
+
+from burrito.parameters import ValuedParameter, FlagParameter
+from burrito.util import (CommandLineApplication, FilePath, system,
+ CommandLineAppResult, ResultPath, remove,
+ ApplicationError, get_tmp_filename)
+
+from cogent.core.alignment import Alignment
+from cogent.app.guppy import build_tree_from_json_using_params
+from cogent.parse.phylip import get_align_for_phylip
+from cogent.parse.tree import DndParser
+from cogent.core.tree import PhyloNode
+
+class Pplacer(CommandLineApplication):
+ """pplacer Application Controller
+ """
+
+ _command = 'pplacer'
+ _input_handler = '_input_as_multiline_string'
+ _parameters = {
+ # -c Specify the path to the reference package.
+ '-c': ValuedParameter('-', Name='c', Delimiter=' ', IsPath=True),
+
+ # -t Specify the reference tree filename.
+ '-t': ValuedParameter('-', Name='t', Delimiter=' ', IsPath=True),
+
+ # -r Specify the reference alignment filename.
+ '-r': ValuedParameter('-', Name='r', Delimiter=' ', IsPath=True),
+
+ # -s Supply a phyml stats.txt or a RAxML info file giving the model parameters.
+ '-s': ValuedParameter('-', Name='s', Delimiter=' ', IsPath=True),
+
+ # -d Specify the directory containing the reference information.
+ '-d': ValuedParameter('-', Name='d', Delimiter=' ', IsPath=True),
+
+ # -p Calculate posterior probabilities.
+ '-p': FlagParameter('-', Name='p'),
+
+        # -m Substitution model. Protein models: LG, WAG, or JTT. Nucleotide model: GTR.
+ '-m': ValuedParameter('-', Name='m', Delimiter=' '),
+
+ # --model-freqs Use model frequencies instead of reference alignment frequencies.
+ '--model-freqs': FlagParameter('--', Name='model-freqs'),
+
+ # --gamma-cats Number of categories for discrete gamma model.
+ '--gamma-cats': ValuedParameter('--', Name='gamma-cats', Delimiter=' '),
+
+ # --gamma-alpha Specify the shape parameter for a discrete gamma model.
+ '--gamma-alpha': ValuedParameter('--', Name='gamma-alpha', Delimiter=' '),
+
+ # --ml-tolerance 1st stage branch len optimization tolerance (2nd stage to 1e-5). Default: 0.01.
+ '--ml-tolerance': ValuedParameter('--', Name='ml-tolerance', Delimiter=' '),
+
+ # --pp-rel-err Relative error for the posterior probability calculation. Default is 0.01.
+ '--pp-rel-err': ValuedParameter('--', Name='pp-rel-err', Delimiter=' '),
+
+ # --unif-prior Use a uniform prior rather than exponential.
+ '--unif-prior': FlagParameter('--', Name='unif-prior'),
+
+ # --start-pend Starting pendant branch length. Default is 0.1.
+ '--start-pend': ValuedParameter('--', Name='start-pend', Delimiter=' '),
+
+ # --max-pend Set the maximum ML pendant branch length. Default is 2.
+ '--max-pend': ValuedParameter('--', Name='max-pend', Delimiter=' '),
+
+ # --max-strikes Maximum number of strikes for baseball. 0 -> no ball playing. Default is 6.
+ '--max-strikes': ValuedParameter('--', Name='max-strikes', Delimiter=' '),
+
+ # --strike-box Set the size of the strike box in log likelihood units. Default is 3.
+ '--strike-box': ValuedParameter('--', Name='strike-box', Delimiter=' '),
+
+ # --max-pitches Set the maximum number of pitches for baseball. Default is 40.
+ '--max-pitches': ValuedParameter('--', Name='max-pitches', Delimiter=' '),
+
+ # --fantasy Desired likelihood cutoff for fantasy baseball mode. 0 -> no fantasy.
+ '--fantasy': ValuedParameter('--', Name='fantasy', Delimiter=' '),
+
+ # --fantasy-frac Fraction of fragments to use when running fantasy baseball. Default is 0.1.
+ '--fantasy-frac': ValuedParameter('--', Name='fantasy-frac', Delimiter=' '),
+
+ # --write-masked Write alignment masked to the region without gaps in the query.
+ '--write-masked': FlagParameter('--', Name='write-masked'),
+
+ # --verbosity Set verbosity level. 0 is silent, and 2 is quite a lot. Default is 1.
+ '--verbosity': ValuedParameter('--', Name='verbosity', Delimiter=' '),
+
+ # --unfriendly Do not run friend finder pre-analysis.
+ '--unfriendly': FlagParameter('--', Name='unfriendly'),
+
+ # --out-dir Specify the directory to write place files to.
+ '--out-dir': ValuedParameter('--', Name='out-dir', Delimiter=' ', IsPath=True),
+
+ # --pretend Only check out the files then report. Do not run the analysis.
+ '--pretend': FlagParameter('--', Name='pretend'),
+
+ # --csv Make a CSV file with the results.
+ '--csv': FlagParameter('--', Name='csv'),
+
+        # --old-format Make an old-format placefile with the results.
+ '--old-format': FlagParameter('--', Name='old-format'),
+
+ # --diagnostic Write file describing the 'diagnostic' mutations for various clades.
+ '--diagnostic': FlagParameter('--', Name='diagnostic'),
+
+ # --check-like Write out the likelihood of the reference tree, calculated two ways.
+ '--check-like': FlagParameter('--', Name='check-like'),
+
+ # --version Write out the version number and exit.
+ '--version': FlagParameter('--', Name='version'),
+
+ # --help Display this list of options
+ '--help': FlagParameter('--', Name='help'),
+ }
+
+ def getTmpFilename(self, tmp_dir="/tmp",prefix='tmp',suffix='.fasta',\
+ include_class_id=False,result_constructor=FilePath):
+ """ Define Tmp filename to contain .fasta suffix, since pplacer requires
+ the suffix to be .fasta """
+
+ return super(Pplacer,self).getTmpFilename(tmp_dir=tmp_dir,
+ prefix=prefix,
+ suffix=suffix,
+ include_class_id=include_class_id,
+ result_constructor=result_constructor)
+
+ def _handle_app_result_build_failure(self,out,err,exit_status,result_paths):
+ """ Catch the error when files are not produced """
+ raise ApplicationError, \
+ 'Pplacer failed to produce an output file due to the following error: \n\n%s ' \
+ % out.read()
+
+ def _get_result_paths(self,data):
+ """ Define the output filepaths """
+ output_dir = self.Parameters['--out-dir'].Value
+ result = {}
+ result['json'] = ResultPath(Path=join(output_dir,
+ splitext(split(self._input_filename)[-1])[0] + \
+ '.jplace'))
+ return result
+
+def insert_sequences_into_tree(aln, moltype, params={},
+ write_log=True):
+ """Returns a tree from Alignment object aln.
+
+ aln: an xxx.Alignment object, or data that can be used to build one.
+
+ moltype: cogent.core.moltype.MolType object
+
+    params: dict of parameters to pass in to the Pplacer app controller.
+
+    The result will be a tree (cogent.core.tree.PhyloNode), or None if
+    placement fails.
+ """
+
+    # convert aln to phylip, since seq names need to be fixed before running through pplacer
+
+ new_aln=get_align_for_phylip(StringIO(aln))
+
+ # convert aln to fasta in case it is not already a fasta file
+ aln2 = Alignment(new_aln)
+ seqs = aln2.toFasta()
+
+ ih = '_input_as_multiline_string'
+
+ pplacer_app = Pplacer(params=params,
+ InputHandler=ih,
+ WorkingDir=None,
+ SuppressStderr=False,
+ SuppressStdout=False)
+
+ pplacer_result = pplacer_app(seqs)
+
+ # write a log file
+ if write_log:
+ log_fp = join(params["--out-dir"],'log_pplacer_' + \
+ split(get_tmp_filename())[-1])
+ log_file=open(log_fp,'w')
+ log_file.write(pplacer_result['StdOut'].read())
+ log_file.close()
+
+ # use guppy to convert json file into a placement tree
+ guppy_params={'tog':None}
+
+ new_tree=build_tree_from_json_using_params(pplacer_result['json'].name, \
+ output_dir=params['--out-dir'], \
+ params=guppy_params)
+
+ pplacer_result.cleanUp()
+
+ return new_tree
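+
+# A minimal usage sketch, assuming pplacer and guppy are installed and that
+# the reference package and query alignment below exist (illustrative paths).
+def _example_insert_sequences_into_tree():
+    params = {'-c': '/tmp/my.refpkg', '--out-dir': '/tmp'}
+    query_aln = open('/tmp/query_aln.phy').read()  # phylip-formatted
+    # moltype is accepted for interface compatibility but is unused here
+    return insert_sequences_into_tree(query_aln, None, params=params)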
diff --git a/bfillings/raxml_v730.py b/bfillings/raxml_v730.py
new file mode 100644
index 0000000..84a7356
--- /dev/null
+++ b/bfillings/raxml_v730.py
@@ -0,0 +1,875 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+"""Application controller for RAxML (v7.3.0).
+
+WARNING: Because of the use of the -x option, this version is no longer
+compatible with RAxML version VI.
+"""
+from random import choice, randint
+from os import walk, listdir
+from os.path import isabs, join, split
+import re
+
+from cogent.core.tree import PhyloNode
+from cogent.core.alignment import Alignment
+from cogent.core.moltype import DNA, RNA, PROTEIN
+from cogent.parse.tree import DndParser
+from cogent.app.guppy import build_tree_from_json_using_params
+
+from burrito.parameters import FlagParameter, ValuedParameter, FilePath
+from burrito.util import (CommandLineApplication, ResultPath,
+ get_tmp_filename, ApplicationError)
+
+
+class Raxml(CommandLineApplication):
+ """RAxML application controller"""
+
+ _options ={
+
+        # Specify a column weight file name to assign individual weights to
+        # each column of the alignment. Those weights must be integers
+        # separated by any number and type of whitespace within a separate
+ # file, see file "example_weights" for an example.
+ '-a':ValuedParameter('-',Name='a',Delimiter=' '),
+
+ # Specify one of the secondary structure substitution models implemented
+ # in RAxML. The same nomenclature as in the PHASE manual is used,
+ # available models: S6A, S6B, S6C, S6D, S6E, S7A, S7B, S7C, S7D, S7E,
+ # S7F, S16, S16A, S16B
+ # DEFAULT: 16-state GTR model (S16)
+ '-A':ValuedParameter('-',Name='A',Delimiter=' '),
+
+ # Specify an integer number (random seed) for bootstrapping
+ '-b':ValuedParameter('-',Name='b',Delimiter=' '),
+
+ # specify a floating point number between 0.0 and 1.0 that will be used
+ # as cutoff threshold for the MR-based bootstopping criteria. The
+ # recommended setting is 0.03.
+ '-B':ValuedParameter('-',Name='B',Delimiter=' '),
+
+        # Specify number of distinct rate categories for raxml when
+ # ModelOfEvolution is set to GTRCAT or HKY85CAT.
+ # Individual per-site rates are categorized into numberOfCategories
+ # rate categories to accelerate computations. (Default = 50)
+ '-c':ValuedParameter('-',Name='c',Delimiter=' '),
+
+ # Conduct model parameter optimization on gappy, partitioned multi-gene
+ # alignments with per-partition branch length estimates (-M enabled)
+ # using the fast method with pointer meshes described in:
+ # Stamatakis and Ott: "Efficient computation of the phylogenetic
+ # likelihood function on multi-gene alignments and multi-core
+ # processors"
+ # WARNING: We can not conduct useful tree searches using this method
+ # yet! Does not work with Pthreads version.
+ '-C':ValuedParameter('-',Name='C',Delimiter=' '),
+
+ # This option allows you to start the RAxML search with a complete
+ # random starting tree instead of the default Maximum Parsimony
+ # Starting tree. On smaller datasets (around 100-200 taxa) it has
+ # been observed that this might sometimes yield topologies of distinct
+ # local likelihood maxima which better correspond to empirical
+ # expectations.
+ '-d':FlagParameter('-',Name='d'),
+
+ # ML search convergence criterion. This will break off ML searches if
+ # the relative Robinson-Foulds distance between the trees obtained from
+ # two consecutive lazy SPR cycles is smaller or equal to 1%. Usage
+ # recommended for very large datasets in terms of taxa. On trees with
+ # more than 500 taxa this will yield execution time improvements of
+        # approximately 50%, while yielding only slightly worse trees.
+ # DEFAULT: OFF
+ '-D':ValuedParameter('-',Name='D'),
+
+        # Set the model optimization precision in log likelihood units.
+ # Default is 0.1 log likelihood units, author recommends 1 or 2 to
+ # rapidly evaluate different trees.
+ '-e':ValuedParameter('-',Name='e',Delimiter=' '),
+
+        # specify an exclude file name that contains a specification of
+        # alignment positions you wish to exclude. Format is similar to
+        # Nexus; the file shall contain entries like "100-200 300-400". To
+        # exclude a single column write, e.g., "100-100". If you use a mixed
+        # model, an appropriately adapted model file will be written.
+ '-E':ValuedParameter('-',Name='E',Delimiter=' '),
+
+ # select search algorithm:
+ # a rapid Bootstrap analysis and search for best-scoring ML tree in
+ # one program run
+ # A compute marginal ancestral states on a ROOTED reference tree
+ # provided with "t" - ONLY IN 7.3.0
+ # b draw bipartition information on a tree provided with "-t" based on
+        #       multiple trees (e.g., from a bootstrap) in a file specified by
+ # "-z"
+ # c check if the alignment can be properly read by RAxML
+ # d for normal hill-climbing search (Default)
+ # when -f option is omitted this algorithm will be used
+ # e optimize model+branch lengths for given input tree under
+ # GAMMA/GAMMAI only
+ # E execute very fast experimental tree search, at present only for
+ # testing
+ # F execute fast experimental tree search, at present only for testing
+        #   g   compute per-site log likelihoods for one or more trees passed via
+ # "-z" and write them to a file that can be read by CONSEL
+ # WARNING: does not print likelihoods in the original column order
+ # h compute log likelihood test (SH-test) between best tree passed via
+ # "-t" and a bunch of other trees passed via "-z"
+ # i EXPERIMENTAL do not use for real tree inferences: conducts a
+ # single cycle of fast lazy SPR moves on a given input tree, to be
+ # used in combination with -C and -M
+ # I EXPERIMENTAL do not use for real tree inferences: conducts a
+ # single cycle of thorough lazy SPR moves on a given input tree,
+ # to be used in combination with -C and -M
+ # j generate a bunch of bootstrapped alignment files from an original
+        #       alignment file. You need to specify a seed with "-b" and the
+ # number of replicates with "-#"
+ # following "J" is for version 7.2.8
+ # J Compute SH-like support values on a given tree passed via "-t".
+ # m compare bipartitions between two bunches of trees passed via "-t"
+ # and "-z" respectively. This will return the Pearson correlation
+ # between all bipartitions found in the two tree files. A file
+        #       called RAxML_bipartitionFrequencies.outputFileName will be
+ # printed that contains the pair-wise bipartition frequencies of
+ # the two sets
+ # n compute the log likelihood score of all trees contained in a tree
+ # file provided by "-z" under GAMMA or GAMMA+P-Invar
+ # o old (slower) algorithm from v. 2.1.3
+ # p perform pure stepwise MP addition of new sequences to an
+ # incomplete starting tree and exit
+ # r compute pairwise Robinson-Foulds (RF) distances between all pairs
+ # of trees in a tree file passed via "-z" if the trees have node
+        #       labels represented as integer support values the program will
+ # also compute two flavors of the weighted Robinson-Foulds (WRF)
+ # distance
+ # following "R" is for version 7.2.8
+ # R compute rogue taxa using new statistical method based on the
+ # evolutionary placement algorithm
+ # WARNING: this is experimental code - DEPRECATED IN 7.3.0
+ # s (split) splits into individual genes, provided with model file
+ # following "S" is for version 7.2.8
+ # S compute site-specific placement bias using a leave one out test
+ # inspired by the evolutionary placement algorithm
+ # t do randomized tree searches on one fixed starting tree
+ # u execute morphological weight calibration using maximum likelihood,
+ # this will return a weight vector. you need to provide a
+ # morphological alignment and a reference tree via "-t"
+        #   U   execute morphological weight calibration using parsimony, this
+ # will return a weight vector. you need to provide a morphological
+ # alignment and a reference tree via "-t" - DEPRECATED IN 7.3.0
+ # v classify a bunch of environmental sequences into a reference tree
+ # using the slow heuristics without dynamic alignment you will
+ # need to start RAxML with a non-comprehensive reference tree and
+ # an alignment containing all sequences (reference + query)
+ # w compute ELW test on a bunch of trees passed via "-z"
+ # x compute pair-wise ML distances, ML model parameters will be
+ # estimated on an MP starting tree or a user-defined tree passed
+ # via "-t", only allowed for GAMMA-based models of rate
+ # heterogeneity
+ # y classify a bunch of environmental sequences into a reference tree
+ # using the fast heuristics without dynamic alignment you will
+ # need to start RAxML with a non-comprehensive reference tree and
+ # an alignment containing all sequences (reference + query)
+ '-f':ValuedParameter('-',Name='f',Delimiter=' ', Value="d"),
+
+ # enable ML tree searches under CAT model for very large trees without
+ # switching to GAMMA in the end (saves memory). This option can also be
+ # used with the GAMMA models in order to avoid the thorough optimization
+ # of the best-scoring ML tree in the end.
+ # DEFAULT: OFF
+ '-F':FlagParameter('-',Name='F'),
+
+ # select grouping file name: allows incomplete multifurcating constraint
+ # tree in newick format -- resolves multifurcations randomly, adds
+ # other taxa using parsimony insertion
+ '-g':ValuedParameter('-', Name='g',Delimiter=' '),
+
+ # enable the ML-based evolutionary placement algorithm heuristics by
+        # specifying a threshold value (fraction of insertion branches to be
+ # evaluated using slow insertions under ML).
+ '-G':FlagParameter('-', Name='G'),
+
+ # prints help and exits
+ '-h':FlagParameter('-', Name='h'),
+
+ # enable the MP-based evolutionary placement algorithm heuristics
+        # by specifying a threshold value (fraction of insertion branches to be
+ # evaluated using slow insertions under ML) - DEPRECATED IN 7.3.0
+ #'-H':ValuedParameter('-', Name='H',Delimiter=' '),
+
+ # allows initial rearrangement to be constrained, e.g. 10 means
+ # insertion will not be more than 10 nodes away from original.
+ # default is to pick a "good" setting.
+ '-i':ValuedParameter('-', Name='i', Delimiter=' '),
+
+ # a posteriori bootstopping analysis. Use:
+ # "-I autoFC" for the frequency-based criterion
+ # "-I autoMR" for the majority-rule consensus tree criterion
+ # "-I autoMRE" for the extended majority-rule consensus tree criterion
+ # "-I autoMRE_IGN" for metrics similar to MRE, but include
+ # bipartitions under the threshold whether they are compatible
+ # or not. This emulates MRE but is faster to compute.
+        #   You also need to pass a tree file containing several bootstrap
+ # replicates via "-z"
+ '-I':ValuedParameter('-', Name='I', Delimiter=' '),
+
+ # writes checkpoints (off by default)
+ '-j':FlagParameter('-', Name='j'),
+
+ # Compute majority rule consensus tree with "-J MR" or extended majority
+ # rule consensus tree with "-J MRE" or strict consensus tree with "-J
+ # STRICT" You will need to provide a tree file containing several
+ # UNROOTED trees via "-z"
+ '-J':ValuedParameter('-', Name='J', Delimiter=' '),
+
+        # Specifies that RAxML will optimize model parameters (for GTRMIX and
+ # GTRGAMMA) as well as calculating likelihoods for bootstrapped trees.
+ '-k':FlagParameter('-', Name='k'),
+
+ # Specify one of the multi-state substitution models (max 32 states)
+ # implemented in RAxML. Available models are: ORDERED, MK, GTR
+ '-K':ValuedParameter('-', Name='K', Delimiter=' '),
+
+ # Model of Binary (Morphological), Nucleotide, Multi-State, or Amino
+ # Acid Substitution::
+ # BINARY:
+ # -m BINCAT : Optimization of site-specific evolutionary rates which
+ # are categorized into numberOfCategories distinct rate categories
+ # for greater computational efficiency. Final tree might be
+ # evaluated automatically under BINGAMMA, depending on the tree
+ # search option
+ # -m BINCATI : Optimization of site-specific evolutionary rates which
+ # are categorized into numberOfCategories distinct rate categories
+ # for greater computational efficiency. Final tree might be
+ # evaluated automatically under BINGAMMAI, depending on the tree
+ # search option
+ # -m BINGAMMA : GAMMA model of rate heterogeneity (alpha parameter
+ # will be estimated)
+ # -m BINGAMMAI : Same as BINGAMMA, but with estimate of proportion of
+ # invariable sites
+ # NUCLEOTIDES
+ # -m GTRCAT: GTR + Optimization of substitution rates + Optimization
+ # of site-specific evolutionary rates which are categorized into
+ # numberOfCategories distinct rate categories for greater
+ # computational efficiency
+        # -m GTRCAT_FLOAT : Same as above but uses single-precision floating
+        #    point arithmetic instead of double precision. Usage is only
+        #    recommended for testing; the code will run slower, but can save
+ # almost 50% of memory. If you have problems with phylogenomic
+ # datasets and large memory requirements you may give it a shot.
+ # Keep in mind that numerical stability seems to be okay but needs
+ # further testing. - DEPRECATED IN 7.3.0
+ # -m GTRCATI : GTR + Optimization of substitution rates + Optimization
+ # of site-specific evolutionary rates which are categorized into
+ # numberOfCategories distinct rate categories for greater
+ # computational efficiency. Final tree might be evaluated under
+ # GTRGAMMAI, depending on the tree search option
+ # -m GTRGAMMA: GTR + Optimization of substitution rates + Gamma
+ # -m GTRGAMMA_FLOAT : Same as GTRGAMMA, but also with
+ # single-precision arithmetics, same cautionary notes as for
+ # GTRCAT_FLOAT apply. - DEPRECATED IN 7.3.0
+ # -m GTRGAMMAI : Same as GTRGAMMA, but with estimate of proportion of
+ # invariable sites
+ # MULTI-STATE:
+ # -m MULTICAT : Optimization of site-specific evolutionary rates which
+ # are categorized into numberOfCategories distinct rate categories
+ # for greater computational efficiency. Final tree might be
+ # evaluated automatically under MULTIGAMMA, depending on the tree
+ # search option
+ # -m MULTICATI : Optimization of site-specific evolutionary rates
+ # which are categorized into numberOfCategories distinct rate
+ # categories for greater computational efficiency. Final tree
+ # might be evaluated automatically under MULTIGAMMAI, depending on
+ # the tree search option
+ # -m MULTIGAMMA : GAMMA model of rate heterogeneity (alpha parameter
+ # will be estimated)
+ # -m MULTIGAMMAI : Same as MULTIGAMMA, but with estimate of proportion
+ # of invariable sites
+ # You can use up to 32 distinct character states to encode multi-state
+ # regions, they must be used in the following order: 0, 1, 2, 3, 4, 5,
+ # 6, 7, 8, 9, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S,
+ # T, U, V i.e., if you have 6 distinct character states you would use 0,
+ # 1, 2, 3, 4, 5 to encode these. The substitution model for the
+ # multi-state regions can be selected via the "-K" option
+ # Amino Acid Models:
+ # -m PROTCATmatrixName[F] : specified AA matrix + Optimization of
+ # substitution rates + Optimization of site-specific evolutionary
+ # rates which are categorized into numberOfCategories distinct
+ # rate categories for greater computational efficiency. Final
+ # tree might be evaluated automatically under
+ # PROTGAMMAmatrixName[f], depending on the tree search option
+ # -m PROTCATmatrixName[F]_FLOAT : PROTCAT with single precision
+ # arithmetics, same cautionary notes as for GTRCAT_FLOAT apply
+ # - DEPRECATED IN 7.3.0
+ # -m PROTCATImatrixName[F] : specified AA matrix + Optimization of
+ # substitution rates + Optimization of site-specific
+ # evolutionary rates which are categorized into numberOfCategories
+ # distinct rate categories for greater computational efficiency.
+ # Final tree might be evaluated automatically under
+ # PROTGAMMAImatrixName[f], depending on the tree search option
+ # -m PROTGAMMAmatrixName[F] : specified AA matrix + Optimization of
+ # substitution rates + GAMMA model of rate heterogeneity (alpha
+ # parameter will be estimated)
+ # -m PROTGAMMAmatrixName[F]_FLOAT : PROTGAMMA with single precision
+ # arithmetics, same cautionary notes as for GTRCAT_FLOAT apply
+ # - DEPRECATED IN 7.3.0
+ # -m PROTGAMMAImatrixName[F] : Same as PROTGAMMAmatrixName[F], but
+ # with estimate of proportion of invariable sites
+ # Available AA substitution models: DAYHOFF, DCMUT, JTT, MTREV, WAG,
+ # RTREV, CPREV, VT, BLOSUM62, MTMAM, LG, GTR. With the optional "F"
+ # appendix you can specify if you want to use empirical base frequencies
+ # Please note that for mixed models you can in addition specify the
+ # per-gene AA model in the mixed model file (see manual for details).
+ # Also note that if you estimate AA GTR parameters on a partitioned
+ # dataset, they will be linked (estimated jointly) across all partitions
+ # to avoid over-parametrization
+ '-m':ValuedParameter('-',Name='m',Delimiter=' '),
+
+ # Switch on estimation of individual per-partition branch lengths. Only
+ # has effect when used in combination with "-q". Branch lengths for
+ # individual partitions will be printed to separate files. A weighted
+ # average of the branch lengths is computed by using the respective
+ # partition lengths.
+ # DEFAULT: OFF
+ '-M':FlagParameter('-',Name='M'),
+
+ # Specifies the name of the output file.
+ '-n':ValuedParameter('-',Name='n',Delimiter=' '),
+
+ # Specifies the name of the outgroup (or outgroups: comma-delimited,
+ # no spaces, should be monophyletic).
+ '-o':ValuedParameter('-',Name='o',Delimiter=' '),
+
+ # Enable checkpointing using the dmtcp library available at
+        # http://dmtcp.sourceforge.net/. This only works if you precede the
+        # program call with the command "dmtcp_checkpoint" and if you compile
+        # a dedicated binary using the appropriate Makefile. With "-O" you can
+ # specify the interval between checkpoints in seconds.
+ # DEFAULT: 3600.0 seconds - DEPRECATED IN 7.3.0
+ #'-O':ValuedParameter('-',Name='O',Delimiter=' ',Value=3600.0),
+
+ # Specify a random number seed for the parsimony inferences. This allows
+ # you to reproduce your results and will help me debug the program.
+ '-p':ValuedParameter('-',Name='p',Delimiter=' '),
+
+ # Specify the file name of a user-defined AA (Protein) substitution
+ # model. This file must contain 420 entries, the first 400 being the AA
+ # substitution rates (this must be a symmetric matrix) and the last 20
+ # are the empirical base frequencies
+ '-P':ValuedParameter('-',Name='P',Delimiter=' '),
+
+ # Specified MultipleModel file name, in format:
+ # gene1 = 1-500
+ # gene2 = 501-1000
+ # (note: ranges can also be discontiguous, e.g. 1-100, 200-300,
+ # or can specify codon ranges as e.g. 1-100/3, 2-100/3, 3-100/3))
+ '-q':ValuedParameter('-', Name='q', Delimiter=' '),
+
+ # THE FOLLOWING "Q" is DEPRECATED IN 7.2.8
+ # Turn on computation of SH-like support values on tree.
+ # DEFAULT: OFF
+ '-Q':FlagParameter('-', Name='Q'),
+
+ # Constraint file name: allows a bifurcating Newick tree to be passed
+ # in as a constraint file, other taxa will be added by parsimony.
+ '-r':ValuedParameter('-',Name='r',Delimiter=' '),
+
+ # THE FOLLOWING "R" is IN 7.2.8
+ # Specify the file name of a binary model parameter file that has
+ # previously been generated with RAxML using the -f e tree evaluation
+ # option. The file name should be: RAxML_binaryModelParameters.runID
+ '-R':ValuedParameter('-',Name='R',Delimiter=' '),
+
+ # specify the name of the alignment data file, in relaxed PHYLIP
+ # format.
+ '-s':ValuedParameter('-',Name='s',Delimiter=' '),
+
+ # Specify the name of a secondary structure file. The file can contain
+ # "." for alignment columns that do not form part of a stem and
+ # characters "()<>[]{}" to define stem regions and pseudoknots
+ '-S':ValuedParameter('-',Name='S',Delimiter=' '),
+
+ # Specify a user starting tree file name in Newick format
+ '-t':ValuedParameter('-',Name='t',Delimiter=' '),
+
+ # PTHREADS VERSION ONLY! Specify the number of threads you want to run.
+ # Make sure to set "-T" to at most the number of CPUs you have on your
+ # machine, otherwise, there will be a huge performance decrease!
+ '-T':ValuedParameter('-',Name='T',Delimiter=' '),
+
+ # THE FOLLOWING "U" is IN 7.2.8
+ # Try to save memory by using SEV-based implementation for gap columns
+ # on large gappy alignments
+ # WARNING: this will only work for DNA under GTRGAMMA and is still in an
+ # experimental state.
+ '-U':ValuedParameter('-',Name='U',Delimiter=' '),
+
+ # Print the version
+ '-v':FlagParameter('-',Name='v'),
+
+ # Name of the working directory where RAxML-V will write its output
+ # files.
+ '-w':ValuedParameter('-',Name='w',Delimiter=' '),
+
+ # THE FOLLOWING "W" is IN 7.2.8
+ # Sliding window size for leave-one-out site-specific placement bias
+ # algorithm only effective when used in combination with "-f S"
+ # DEFAULT: 100 sites
+ '-W':ValuedParameter('-',Name='W',Delimiter=' '),
+
+ # Specify an integer number (random seed) and turn on rapid
+ # bootstrapping. CAUTION: unlike in version 7.0.4 RAxML will conduct
+ # rapid BS replicates under the model of rate heterogeneity you
+ # specified via "-m" and not by default under CAT
+ '-x':ValuedParameter('-',Name='x',Delimiter=' '),
+
+ # EXPERIMENTAL OPTION: This option will do a per-site estimate of
+ # protein substitution models by looping over all given, fixed models
+ # LG, WAG, JTT, etc and using their respective base frequencies to
+ # independently assign a prot subst. model to each site via ML
+ # optimization. At present this option only works with the GTR+GAMMA
+ # model, unpartitioned datasets, and in the sequential version only.
+ # DEFAULT: OFF
+ '-X':FlagParameter('-', Name='X'),
+
+ # Compute only randomized starting parsimony tree with RAxML, do not
+ # optimize an ML analysis of the tree
+ '-y':FlagParameter('-', Name='y'),
+
+ # Do a more thorough parsimony tree search using a parsimony ratchet and
+ # exit. Specify the number of ratchet searches via "-#" or "-N". This
+ # has just been implemented for completeness, if you want a fast MP
+ # implementation use TNT
+ # DEFAULT: OFF - DEPRECATED IN 7.3.0
+ #'-Y':FlagParameter('-', Name='Y'),
+
+ # Multiple tree file, for use with -f b (to draw bipartitions onto the
+ # common tree specified with -t)
+ '-z':ValuedParameter('-', Name='z', Delimiter=' '),
+
+ # Specifies number of runs on distinct starting trees.
+ '-#':ValuedParameter('-', Name='#', Delimiter=' ',Value=1),
+
+ # Specifies number of runs on distinct starting trees.
+ '-N':ValuedParameter('-', Name='N', Delimiter=' '),
+
+ }
+
+ _parameters = {}
+ _parameters.update(_options)
+ _command = "raxmlHPC"
+ _out_format = "RAxML_%s.%s"
+
+ def _format_output(self, outfile_name, out_type):
+ """ Prepend proper output prefix to output filename """
+
+ outfile_name = self._absolute(outfile_name)
+ outparts = outfile_name.split("/")
+ outparts[-1] = self._out_format % (out_type, outparts[-1] )
+
+ return '/'.join(outparts)
+
+ def _input_as_seqs(self,data):
+ lines = []
+ for i,s in enumerate(data):
+ #will number the sequences 1,2,3,etc.
+ lines.append(''.join(['>',str(i+1)]))
+ lines.append(s)
+ return self._input_as_lines(lines)
+
+ def _input_as_lines(self,data):
+ if data:
+ self.Parameters['-s']\
+ .on(super(Raxml,self)._input_as_lines(data))
+ return ''
+
+ def _input_as_string(self,data):
+ """Makes data the value of a specific parameter
+
+ This method returns the empty string. The parameter will be printed
+ automatically once set.
+ """
+        if data:
+            # the alignment file is passed to RAxML via "-s"; no "-in"
+            # parameter is defined in _options
+            self.Parameters['-s'].on(str(data))
+ return ''
+
+ def _input_as_multiline_string(self, data):
+ if data:
+ self.Parameters['-s']\
+ .on(super(Raxml,self)._input_as_multiline_string(data))
+ return ''
+
+ def _absolute(self,path):
+ path = FilePath(path)
+ if isabs(path):
+ return path
+ elif self.Parameters['-w'].isOn():
+ return self.Parameters['-w'].Value + path
+ else:
+ return self.WorkingDir + path
+
+ def _log_out_filename(self):
+ if self.Parameters['-n'].isOn():
+ return self._format_output(str(self.Parameters['-n'].Value), "log")
+ else:
+ raise ValueError, "No output file specified."
+
+ def _info_out_filename(self):
+ if self.Parameters['-n'].isOn():
+ return self._format_output(str(self.Parameters['-n'].Value), "info")
+ else:
+ raise ValueError, "No output file specified."
+
+ def _parsimony_tree_out_filename(self):
+ if self.Parameters['-n'].isOn():
+ return self._format_output(str(self.Parameters['-n'].Value), \
+ "parsimonyTree")
+ else:
+ raise ValueError, "No output file specified."
+
+ # added for tree-insertion
+ def _originallabelled_tree_out_filename(self):
+ if self.Parameters['-n'].isOn():
+ return self._format_output(str(self.Parameters['-n'].Value), \
+ "originalLabelledTree")
+ else:
+ raise ValueError, "No output file specified."
+
+ # added for tree-insertion
+ def _labelled_tree_out_filename(self):
+ if self.Parameters['-n'].isOn():
+ return self._format_output(str(self.Parameters['-n'].Value), \
+ "labelledTree")
+ else:
+ raise ValueError, "No output file specified."
+
+ # added for tree-insertion
+ def _classification_out_filename(self):
+ if self.Parameters['-n'].isOn():
+ return self._format_output(str(self.Parameters['-n'].Value), \
+ "classification")
+ else:
+ raise ValueError, "No output file specified."
+
+ # added for tree-insertion
+ def _classificationlikelihoodweights_out_filename(self):
+ if self.Parameters['-n'].isOn():
+ return self._format_output(str(self.Parameters['-n'].Value), \
+ "classificationLikelihoodWeights")
+ else:
+ raise ValueError, "No output file specified."
+
+ # added for tree-insertion
+ def _best_tree_out_filename(self):
+ if self.Parameters['-n'].isOn():
+ return self._format_output(str(self.Parameters['-n'].Value), \
+ "bestTree")
+ else:
+ raise ValueError, "No output file specified."
+
+ # added for tree-insertion
+ def _entropy_out_filename(self):
+ if self.Parameters['-n'].isOn():
+ return self._format_output(str(self.Parameters['-n'].Value), \
+ "entropy")
+ else:
+ raise ValueError, "No output file specified."
+
+ # added for tree-insertion
+ def _json_out_filename(self):
+ if self.Parameters['-n'].isOn():
+ return self._format_output(str(self.Parameters['-n'].Value), \
+ "portableTree")
+ else:
+ raise ValueError, "No output file specified."
+
+ # added for tree-insertion
+ def _parsimony_out_filename(self):
+ if self.Parameters['-n'].isOn():
+ return self._format_output(str(self.Parameters['-n'].Value), \
+ "equallyParsimoniousPlacements")
+ else:
+ raise ValueError, "No output file specified."
+
+ def _result_tree_out_filename(self):
+ if self.Parameters['-n'].isOn():
+ return self._format_output(str(self.Parameters['-n'].Value), \
+ "result")
+ else:
+ raise ValueError, "No output file specified."
+
+ def _result_bootstrap_out_filename(self):
+ if self.Parameters['-n'].isOn():
+ return self._format_output(str(self.Parameters['-n'].Value), \
+ "bootstrap")
+ else:
+ raise ValueError, "No output file specified"
+
+ def _checkpoint_out_filenames(self):
+ """
+        RAxML generates a large number of checkpoint files, so we need to
+        walk the directory to collect the names of all of them.
+ """
+ out_filenames = []
+ if self.Parameters['-n'].isOn():
+ out_name = str(self.Parameters['-n'].Value)
+ walk_root = self.WorkingDir
+ if self.Parameters['-w'].isOn():
+ walk_root = str(self.Parameters['-w'].Value)
+ for tup in walk(walk_root):
+ dpath, dnames, dfiles = tup
+ if dpath == walk_root:
+ for gen_file in dfiles:
+ if out_name in gen_file and "checkpoint" in gen_file:
+ out_filenames.append(walk_root + gen_file)
+ break
+
+ else:
+ raise ValueError, "No output file specified."
+ return out_filenames
+
+ def _handle_app_result_build_failure(self,out,err,exit_status,result_paths):
+ """ Catch the error when files are not produced """
+
+        # read stderr first; if reading fails (e.g., stderr was suppressed),
+        # fall back to a generic error message
+        try:
+            err_msg = err.read()
+        except Exception:
+            raise ApplicationError, 'RAxML failed to run properly.'
+        raise ApplicationError, \
+            'RAxML failed to produce an output file due to the following error: \n\n%s ' \
+            % err_msg
+
+ def _get_result_paths(self,data):
+
+ result = {}
+ result['Info'] = ResultPath(Path=self._info_out_filename(),
+ IsWritten=True)
+ if self.Parameters['-k'].isOn():
+ result['Bootstrap'] = ResultPath(
+ Path=self._result_bootstrap_out_filename(),
+ IsWritten=True)
+ elif self.Parameters["-f"].Value == 'v':
+ #these were added to handle the results from tree-insertion
+ result['Classification'] = ResultPath(
+ Path=self._classification_out_filename(),
+ IsWritten=True)
+ result['ClassificationLikelihoodWeights'] = ResultPath(
+ Path=self._classificationlikelihoodweights_out_filename(),
+ IsWritten=True)
+ result['OriginalLabelledTree'] = ResultPath(
+ Path=self._originallabelled_tree_out_filename(),
+ IsWritten=True)
+ result['Result'] = ResultPath(
+ Path=self._labelled_tree_out_filename(),IsWritten=True)
+ result['entropy'] = ResultPath(
+ Path=self._entropy_out_filename(),IsWritten=True)
+ result['json'] = ResultPath(
+ Path=self._json_out_filename()+'.jplace',IsWritten=True)
+ elif self.Parameters["-f"].Value == 'y':
+ #these were added to handle the results from tree-insertion
+
+ result['Parsimony'] = ResultPath(
+ Path=self._parsimony_out_filename(),
+ IsWritten=True)
+ result['OriginalLabelledTree'] = ResultPath(
+ Path=self._originallabelled_tree_out_filename(),
+ IsWritten=True)
+ result['json'] = ResultPath(
+ Path=self._json_out_filename()+'.jplace',IsWritten=True)
+ else:
+ result['Log'] = ResultPath(Path=self._log_out_filename(),
+ IsWritten=True)
+ result['ParsimonyTree'] = ResultPath(
+ Path=self._parsimony_tree_out_filename(),
+ IsWritten=True)
+ result['Result'] = ResultPath(
+ Path=self._result_tree_out_filename(),
+ IsWritten=True)
+ #
+ result['besttree'] = ResultPath(
+ Path=self._best_tree_out_filename(),
+ IsWritten=True)
+
+ for checkpoint_file in self._checkpoint_out_filenames():
+ checkpoint_num = checkpoint_file.split(".")[-1]
+ try:
+ checkpoint_num = int(checkpoint_num)
+            except ValueError:
+                raise ValueError, \
+                    "%s does not appear to be a valid checkpoint file" % checkpoint_file
+ result['Checkpoint%d' % checkpoint_num] = ResultPath(
+ Path=checkpoint_file,
+ IsWritten=True)
+
+ return result
+
+
+#SOME FUNCTIONS TO EXECUTE THE MOST COMMON TASKS
+def raxml_alignment(align_obj,
+ raxml_model="GTRCAT",
+ params={},
+ SuppressStderr=True,
+ SuppressStdout=True):
+ """Run raxml on alignment object
+
+ align_obj: Alignment object
+ params: you can set any params except -w and -n
+
+ returns: tuple (phylonode,
+ parsimonyphylonode,
+ log likelihood,
+ total exec time)
+ """
+
+ # generate temp filename for output
+ params["-w"] = "/tmp/"
+ params["-n"] = get_tmp_filename().split("/")[-1]
+ params["-m"] = raxml_model
+ params["-p"] = randint(1,100000)
+ ih = '_input_as_multiline_string'
+ seqs, align_map = align_obj.toPhylip()
+
+ #print params["-n"]
+
+ # set up command
+ raxml_app = Raxml(
+ params=params,
+ InputHandler=ih,
+ WorkingDir=None,
+ SuppressStderr=SuppressStderr,
+ SuppressStdout=SuppressStdout)
+
+ # run raxml
+ ra = raxml_app(seqs)
+
+ # generate tree
+ tree_node = DndParser(ra["Result"])
+
+ # generate parsimony tree
+ parsimony_tree_node = DndParser(ra["ParsimonyTree"])
+
+ # extract log likelihood from log file
+ log_file = ra["Log"]
+ total_exec_time = exec_time = log_likelihood = 0.0
+ for line in log_file:
+ exec_time, log_likelihood = map(float, line.split())
+ total_exec_time += exec_time
+
+ # remove output files
+ ra.cleanUp()
+
+ return tree_node, parsimony_tree_node, log_likelihood, total_exec_time
+
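+# A minimal usage sketch for raxml_alignment, assuming raxmlHPC is on PATH;
+# the toy alignment is illustrative only.
+def _example_raxml_alignment():
+    aln = Alignment({'a': 'ACGTACGTTA', 'b': 'ACGTACATTA',
+                     'c': 'ACGAACGTTA'})
+    tree, parsimony_tree, lnl, secs = raxml_alignment(aln)
+    return tree.getNewick(with_distances=True), lnl
+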
+def build_tree_from_alignment(aln, moltype=DNA, best_tree=False, params={}):
+ """Returns a tree from Alignment object aln.
+
+ aln: an xxx.Alignment object, or data that can be used to build one.
+
+ moltype: cogent.core.moltype.MolType object
+
+    best_tree: best_tree support is currently not implemented
+
+ params: dict of parameters to pass in to the RAxML app controller.
+
+ The result will be an xxx.Alignment object, or None if tree fails.
+ """
+ if best_tree:
+ raise NotImplementedError
+
+ if '-m' not in params:
+ if moltype == DNA or moltype == RNA:
+ #params["-m"] = 'GTRMIX'
+ # in version 7.2.3, GTRMIX is no longer supported but says GTRCAT
+ # behaves like GTRMIX (http://www.phylo.org/tools/raxmlhpc2.html)
+ params["-m"] = 'GTRGAMMA'
+        elif moltype == PROTEIN:
+            # "matrixName" here is a literal placeholder; supply a real AA
+            # matrix via params instead, e.g. params["-m"] = "PROTGAMMAWAG"
+            params["-m"] = 'PROTGAMMAmatrixName'
+ else:
+ raise ValueError, "Moltype must be either DNA, RNA, or PROTEIN"
+
+ if not hasattr(aln, 'toPhylip'):
+ aln = Alignment(aln)
+ seqs, align_map = aln.toPhylip()
+
+ # generate temp filename for output
+ params["-w"] = "/tmp/"
+ params["-n"] = get_tmp_filename().split("/")[-1]
+ params["-k"] = True
+ params["-p"] = randint(1,100000)
+ params["-x"] = randint(1,100000)
+
+ ih = '_input_as_multiline_string'
+
+ raxml_app = Raxml(params=params,
+ InputHandler=ih,
+ WorkingDir=None,
+ SuppressStderr=True,
+ SuppressStdout=True)
+
+ raxml_result = raxml_app(seqs)
+
+ tree = DndParser(raxml_result['Bootstrap'], constructor=PhyloNode)
+
+ for node in tree.tips():
+ node.Name = align_map[node.Name]
+
+ raxml_result.cleanUp()
+
+ return tree
+
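+# A minimal usage sketch for build_tree_from_alignment, assuming raxmlHPC is
+# on PATH; the toy alignment is illustrative only.
+def _example_build_tree_from_alignment():
+    aln = {'a': 'ACGTACGTTA', 'b': 'ACGTACATTA', 'c': 'ACGAACGTTA'}
+    tree = build_tree_from_alignment(aln, moltype=DNA)
+    return tree.getNewick(with_distances=True)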
+
+def insert_sequences_into_tree(seqs, moltype, params={},
+ write_log=True):
+ """Insert sequences into Tree.
+
+    seqs: the query sequences to place, in a form accepted by the
+        '_input_as_multiline_string' input handler.
+
+ moltype: cogent.core.moltype.MolType object
+
+ params: dict of parameters to pass in to the RAxML app controller.
+
+ The result will be a tree.
+ """
+
+ ih = '_input_as_multiline_string'
+
+ raxml_app = Raxml(params=params,
+ InputHandler=ih,
+ WorkingDir=None,
+ SuppressStderr=False,
+ SuppressStdout=False,
+ HALT_EXEC=False)
+
+ raxml_result = raxml_app(seqs)
+
+ # write a log file
+ if write_log:
+ log_fp = join(params["-w"],'log_raxml_'+split(get_tmp_filename())[-1])
+ log_file=open(log_fp,'w')
+ log_file.write(raxml_result['StdOut'].read())
+ log_file.close()
+
+ '''
+    # kept for future use: parsimony placement outputs no tree, only a
+    # jplace file, and that jplace output is currently corrupt
+
+ # use guppy to convert json file into a placement tree
+ guppy_params={'tog':None}
+
+ new_tree=build_tree_from_json_using_params(raxml_result['json'].name, \
+ output_dir=params["-w"], \
+ params=guppy_params)
+ '''
+
+    # get the tree from the 'Result' output, stripping the [I###]
+    # insertion labels that RAxML adds
+ new_tree=raxml_result['Result'].readlines()
+ filtered_tree=re.sub('\[I\d+\]','',str(new_tree))
+ tree = DndParser(filtered_tree, constructor=PhyloNode)
+
+ raxml_result.cleanUp()
+
+ return tree
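+
+# A minimal usage sketch, assuming raxmlHPC (v7.3.0) is on PATH and that the
+# reference files below exist; all paths are illustrative. "-f v" selects
+# placement of query sequences into a reference tree, which produces the
+# 'Result' file read above.
+def _example_insert_sequences_into_tree():
+    params = {'-w': '/tmp/', '-n': 'placement_run', '-f': 'v',
+              '-m': 'GTRGAMMA', '-t': '/tmp/reference.tree'}
+    seqs = open('/tmp/reference_plus_query.phy').read()
+    tree = insert_sequences_into_tree(seqs, DNA, params=params,
+                                      write_log=False)
+    return tree.getNewick(with_distances=True)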
diff --git a/bfillings/rdp_classifier.py b/bfillings/rdp_classifier.py
new file mode 100644
index 0000000..6c57fcf
--- /dev/null
+++ b/bfillings/rdp_classifier.py
@@ -0,0 +1,589 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+"""Application controller for rdp_classifier-2.0
+"""
+
+
+import os.path
+import re
+from os import environ, getenv
+from optparse import OptionParser
+from shutil import rmtree
+import tempfile
+import warnings
+
+from burrito.parameters import ValuedParameter
+from skbio.parse.sequences import parse_fasta
+from burrito.util import (CommandLineApplication, FilePath, ResultPath,
+ ApplicationNotFoundError, ApplicationError)
+
+from burrito.util import which
+
+
+class RdpClassifier(CommandLineApplication):
+
+ """RDP Classifier application controller
+
+ The RDP Classifier program is distributed as a java archive (.jar)
+ file. If the file 'rdp_classifier-2.2.jar' is not found in the
+ current directory, the app controller uses the JAR file specified
+ by the environment variable RDP_JAR_PATH. If this variable is not
+ set, and 'rdp_classifier-2.2.jar' is not found in the current
+ directory, the application controller raises an
+ ApplicationNotFoundError.
+
+ The RDP Classifier often requires memory in excess of Java's
+ default 64M. To correct this situation, the authors recommend
+ increasing the maximum heap size for the java virtual machine. An
+ option '-Xmx' (default 1000M) is provided for this purpose.
+ Details on this option may be found at
+ http://java.sun.com/j2se/1.5.0/docs/tooldocs/solaris/java.html
+
+ The classifier may optionally use a custom training set. The full
+ path to the training set properties file may be provided in the
+ option '-t'.
+ """
+ _input_handler = '_input_as_lines'
+ _command = "rdp_classifier-2.2.jar"
+ _options = {
+ # output file name for classification assignment
+ '-o': ValuedParameter('-', Name='o', Delimiter=' ', IsPath=True),
+ # a property file contains the mapping of the training
+ # files. Note: the training files and the property file should
+ # be in the same directory. The default property file is set
+ # to data/classifier/rRNAClassifier.properties.
+ '-t': ValuedParameter('-', Name='t', Delimiter=' ', IsPath=True),
+ # all tab delimited output format: [allrank|fixrank|db].
+ # Default is allrank.
+ #
+ # allrank: outputs the results for all ranks applied for
+ # each sequence: seqname, orientation, taxon name, rank,
+ # conf, ...
+ #
+ # fixrank: only outputs the results for fixed ranks in
+ # order: no rank, domain, phylum, class, order, family,
+ # genus
+ #
+ # db: outputs the seqname, trainset_no, tax_id, conf. This
+ # is good for storing in a database
+ '-f': ValuedParameter('-', Name='f', Delimiter=' '),
+ }
+
+ # The following are available in the attributes JvmParameters,
+ # JarParameters, and PositionalParameters
+
+ _jvm_synonyms = {}
+ _jvm_parameters = {
+ # Maximum heap size for JVM.
+ '-Xmx': ValuedParameter('-', Name='Xmx', Delimiter='', Value='1000m'),
+ }
+
+ _parameters = {}
+ _parameters.update(_options)
+ _parameters.update(_jvm_parameters)
+
+ def getHelp(self):
+ """Returns documentation string"""
+ # Summary paragraph copied from rdp_classifier-2.0, which is
+ # licensed under the GPL 2.0 and Copyright 2008 Michigan State
+ # University Board of Trustees
+ help_str = """\
+ usage: ClassifierCmd [-f <arg>] [-o <arg>] [-q <arg>] [-t <arg>]
+
+ -f,--format <arg> all tab delimited output format:
+ [allrank|fixrank|db]. Default is allrank.
+
+ allrank: outputs the results for all ranks applied for each
+ sequence: seqname, orientation, taxon name, rank, conf, ...
+
+ fixrank: only outputs the results for fixed ranks in order:
+ no rank, domain, phylum, class, order, family, genus
+
+ db: outputs the seqname, trainset_no, tax_id, conf. This is
+ good for storing in a database
+
+ -o,--outputFile <arg> output file name for classification
+ assignment
+
+ -q,--queryFile <arg> query file contains sequences in one of
+ the following formats: Fasta, Genbank and EMBL
+
+ -t,--train_propfile <arg> a property file contains the mapping
+ of the training files.
+
+ Note: the training files and the property file should be in
+ the same directory. The default property file is set to
+ data/classifier/rRNAClassifier.properties."""
+ return help_str
+
+ def _accept_exit_status(self, status):
+ """Returns false if an error occurred in execution
+ """
+ return (status == 0)
+
+ def _error_on_missing_application(self, params):
+ """Raise an ApplicationNotFoundError if the app is not accessible
+
+ In this case, checks for the java runtime and the RDP jar file.
+ """
+ if not (os.path.exists('java') or which('java')):
+ raise ApplicationNotFoundError(
+ "Cannot find java runtime. Is it installed? Is it in your "
+ "path?")
+ jar_fp = self._get_jar_fp()
+ if jar_fp is None:
+ raise ApplicationNotFoundError(
+ "JAR file not found in current directory and the RDP_JAR_PATH "
+ "environment variable is not set. Please set RDP_JAR_PATH to "
+ "the full pathname of the JAR file.")
+ if not os.path.exists(jar_fp):
+ raise ApplicationNotFoundError(
+ "JAR file %s does not exist." % jar_fp)
+
+ def _get_jar_fp(self):
+ """Returns the full path to the JAR file.
+
+ If the JAR file cannot be found in the current directory and
+ the environment variable RDP_JAR_PATH is not set, returns
+ None.
+ """
+ # handles case where the jar file is in the current working directory
+ if os.path.exists(self._command):
+ return self._command
+ # handles the case where the user has specified the location via
+ # an environment variable
+ elif 'RDP_JAR_PATH' in environ:
+ return getenv('RDP_JAR_PATH')
+ else:
+ return None
+
+ # Overridden to pull out JVM-specific command-line arguments.
+ def _get_base_command(self):
+ """Returns the base command plus command-line options.
+
+ Does not include input file, output file, and training set.
+ """
+ cd_command = ''.join(['cd ', str(self.WorkingDir), ';'])
+ jvm_command = "java"
+ jvm_arguments = self._commandline_join(
+ [self.Parameters[k] for k in self._jvm_parameters])
+ jar_arguments = '-jar "%s"' % self._get_jar_fp()
+ rdp_arguments = self._commandline_join(
+ [self.Parameters[k] for k in self._options])
+
+ command_parts = [
+ cd_command, jvm_command, jvm_arguments, jar_arguments,
+ rdp_arguments, '-q']
+ return self._commandline_join(command_parts).strip()
+
+ BaseCommand = property(_get_base_command)
+
+ def _commandline_join(self, tokens):
+ """Formats a list of tokens as a shell command
+
+ This seems to be a repeated pattern; may be useful in
+ superclass.
+ """
+ commands = filter(None, map(str, tokens))
+ return self._command_delimiter.join(commands).strip()
+
+ def _get_result_paths(self, data):
+ """ Return a dict of ResultPath objects representing all possible output
+ """
+ assignment_fp = str(self.Parameters['-o'].Value).strip('"')
+ if not os.path.isabs(assignment_fp):
+ assignment_fp = os.path.relpath(assignment_fp, self.WorkingDir)
+ return {'Assignments': ResultPath(assignment_fp, IsWritten=True)}
+
+
+class RdpTrainer(RdpClassifier):
+ _input_handler = '_input_as_lines'
+ TrainingClass = 'edu.msu.cme.rdp.classifier.train.ClassifierTraineeMaker'
+ PropertiesFile = 'RdpClassifier.properties'
+
+ _parameters = {
+ 'taxonomy_file': ValuedParameter(None, None, IsPath=True),
+ 'model_output_dir': ValuedParameter(None, None, IsPath=True),
+ 'training_set_id': ValuedParameter(None, None, Value='1'),
+ 'taxonomy_version': ValuedParameter(None, None, Value='version1'),
+ 'modification_info': ValuedParameter(None, None, Value='cogent'),
+ }
+ _jvm_parameters = {
+ # Maximum heap size for JVM.
+ '-Xmx': ValuedParameter('-', Name='Xmx', Delimiter='', Value='1000m'),
+ }
+ _parameters.update(_jvm_parameters)
+
+ def _get_base_command(self):
+ """Returns the base command plus command-line options.
+
+ Handles everything up to and including the classpath. The
+ positional training parameters are added by the
+ _input_handler_decorator method.
+ """
+ cd_command = ''.join(['cd ', str(self.WorkingDir), ';'])
+ jvm_command = "java"
+ jvm_args = self._commandline_join(
+ [self.Parameters[k] for k in self._jvm_parameters])
+ cp_args = '-cp "%s" %s' % (self._get_jar_fp(), self.TrainingClass)
+
+ command_parts = [cd_command, jvm_command, jvm_args, cp_args]
+ return self._commandline_join(command_parts).strip()
+
+ BaseCommand = property(_get_base_command)
+
+ def _set_input_handler(self, method_name):
+ """Stores the selected input handler in a private attribute.
+ """
+ self.__InputHandler = method_name
+
+ def _get_input_handler(self):
+ """Returns decorator that wraps the requested input handler.
+ """
+ return '_input_handler_decorator'
+
+ InputHandler = property(_get_input_handler, _set_input_handler)
+
+ @property
+ def ModelDir(self):
+ """Absolute FilePath to the training output directory.
+ """
+ model_dir = self.Parameters['model_output_dir'].Value
+ absolute_model_dir = os.path.abspath(model_dir)
+ return FilePath(absolute_model_dir)
+
+ def _input_handler_decorator(self, data):
+ """Adds positional parameters to selected input_handler's results.
+ """
+ input_handler = getattr(self, self.__InputHandler)
+ input_parts = [
+ self.Parameters['taxonomy_file'],
+ input_handler(data),
+ self.Parameters['training_set_id'],
+ self.Parameters['taxonomy_version'],
+ self.Parameters['modification_info'],
+ self.ModelDir,
+ ]
+ return self._commandline_join(input_parts)
+
+ def _get_result_paths(self, output_dir):
+ """Return a dict of output files.
+ """
+ # Only include the properties file here. Add the other result
+ # paths in the __call__ method, so we can catch errors if an
+ # output file is not written.
+ self._write_properties_file()
+ properties_fp = os.path.join(self.ModelDir, self.PropertiesFile)
+ result_paths = {
+ 'properties': ResultPath(properties_fp, IsWritten=True,)
+ }
+ return result_paths
+
+ def _write_properties_file(self):
+ """Write an RDP training properties file manually.
+ """
+ # The properties file specifies the names of the files in the
+ # training directory. We use the example properties file
+ # directly from the rdp_classifier distribution, which lists
+ # the default set of files created by the application. We
+ # must write this file manually after generating the
+ # training data.
+ properties_fp = os.path.join(self.ModelDir, self.PropertiesFile)
+ properties_file = open(properties_fp, 'w')
+ properties_file.write(
+ "# Sample ResourceBundle properties file\n"
+ "bergeyTree=bergeyTrainingTree.xml\n"
+ "probabilityList=genus_wordConditionalProbList.txt\n"
+ "probabilityIndex=wordConditionalProbIndexArr.txt\n"
+ "wordPrior=logWordPrior.txt\n"
+ "classifierVersion=Naive Bayesian rRNA Classifier Version 1.0, "
+ "November 2003\n"
+ )
+ properties_file.close()
+
+ def __call__(self, data=None, remove_tmp=True):
+ """Run the application with the specified kwargs on data
+
+ data: anything that can be cast into a string or written out
+ to a file. Usually either a list of things or a single
+ string or number. input_handler will be called on this data
+ before it is passed as part of the command-line argument, so
+ by creating your own input handlers you can customize what
+ kind of data you want your application to accept
+
+ remove_tmp: if True, removes tmp files
+ """
+ result = super(
+ RdpClassifier,
+ self).__call__(
+ data=data,
+ remove_tmp=remove_tmp)
+ training_files = {
+ 'bergeyTree': 'bergeyTrainingTree.xml',
+ 'probabilityList': 'genus_wordConditionalProbList.txt',
+ 'probabilityIndex': 'wordConditionalProbIndexArr.txt',
+ 'wordPrior': 'logWordPrior.txt',
+ }
+ for key, training_fn in sorted(training_files.items()):
+ training_fp = os.path.join(self.ModelDir, training_fn)
+ if not os.path.exists(training_fp):
+ exception_msg = (
+ "Training output file %s not found. This may "
+ "happen if an error occurred during the RDP training "
+ "process. More details may be available in the "
+ "standard error, printed below.\n\n" % training_fp
+ )
+ stderr_msg = result["StdErr"].read()
+ result["StdErr"].seek(0)
+ raise ApplicationError(exception_msg + stderr_msg)
+ # Not in try/except clause because we already know the
+ # file exists. Failure would be truly exceptional, and we
+ # want to maintain the original exception in that case.
+ result[key] = open(training_fp)
+ return result
+
+
+def parse_command_line_parameters(argv=None):
+ """ Parses command line arguments """
+ usage =\
+ 'usage: %prog [options] input_sequences_filepath'
+ version = 'Version: %prog ' + __version__
+ parser = OptionParser(usage=usage, version=version)
+
+ parser.add_option('-o', '--output_fp', action='store',
+ type='string', dest='output_fp', help='Path to store ' +
+ 'output file [default: generated from input_sequences_filepath]')
+
+ parser.add_option('-c', '--min_confidence', action='store',
+ type='float', dest='min_confidence', help='minimum confidence ' +
+ 'level to return a classification [default: %default]')
+
+ parser.set_defaults(verbose=False, min_confidence=0.80)
+
+ opts, args = parser.parse_args(argv)
+ if len(args) != 1:
+ parser.error('Exactly one argument is required.')
+
+ return opts, args
+
+
+def assign_taxonomy(
+ data, min_confidence=0.80, output_fp=None, training_data_fp=None,
+ fixrank=True, max_memory=None, tmp_dir=tempfile.gettempdir()):
+ """Assign taxonomy to each sequence in data with the RDP classifier
+
+ data: open fasta file object or list of fasta lines
+ min_confidence: minimum support threshold to assign taxonomy to a sequence
+ output_fp: path to write output; if not provided, result will be
+ returned in a dict of {seq_id:(taxonomy_assignment,confidence)}
+ """
+ # Going to iterate through this twice in succession, best to force
+ # evaluation now
+ data = list(data)
+
+ # RDP classifier doesn't preserve identifiers with spaces
+ # Use lookup table
+ seq_id_lookup = {}
+ for seq_id, seq in parse_fasta(data):
+ seq_id_lookup[seq_id.split()[0]] = seq_id
+
+ app_kwargs = {}
+ if tmp_dir is not None:
+ app_kwargs['TmpDir'] = tmp_dir
+ app = RdpClassifier(**app_kwargs)
+
+ if max_memory is not None:
+ app.Parameters['-Xmx'].on(max_memory)
+
+ temp_output_file = tempfile.NamedTemporaryFile(
+ prefix='RdpAssignments_', suffix='.txt', dir=tmp_dir)
+ app.Parameters['-o'].on(temp_output_file.name)
+ if training_data_fp is not None:
+ app.Parameters['-t'].on(training_data_fp)
+
+ if fixrank:
+ app.Parameters['-f'].on('fixrank')
+ else:
+ app.Parameters['-f'].on('allrank')
+
+ app_result = app(data)
+
+ assignments = {}
+
+ # ShortSequenceException messages are written to stdout
+ # Tag these IDs as unassignable
+ for line in app_result['StdOut']:
+ excep = parse_rdp_exception(line)
+ if excep is not None:
+ _, rdp_id = excep
+ orig_id = seq_id_lookup[rdp_id]
+ assignments[orig_id] = ('Unassignable', 1.0)
+
+ for line in app_result['Assignments']:
+ rdp_id, direction, taxa = parse_rdp_assignment(line)
+ if taxa[0][0] == "Root":
+ taxa = taxa[1:]
+ orig_id = seq_id_lookup[rdp_id]
+ lineage, confidence = get_rdp_lineage(taxa, min_confidence)
+ if lineage:
+ assignments[orig_id] = (';'.join(lineage), confidence)
+ else:
+ assignments[orig_id] = ('Unclassified', 1.0)
+
+ if output_fp:
+ try:
+ output_file = open(output_fp, 'w')
+ except OSError:
+ raise OSError("Can't open output file for writing: %s" % output_fp)
+ for seq_id, assignment in assignments.items():
+ lineage, confidence = assignment
+ output_file.write(
+ '%s\t%s\t%1.3f\n' % (seq_id, lineage, confidence))
+ output_file.close()
+ return None
+ else:
+ return assignments
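+
+# Example usage (an illustrative sketch; 'seqs.fasta' is a hypothetical
+# input file, and the RDP jar must be discoverable as described in the
+# RdpClassifier docstring):
+#
+# assignments = assign_taxonomy(open('seqs.fasta'), min_confidence=0.80)
+# for seq_id, (lineage, confidence) in assignments.items():
+#     print '%s\t%s\t%.3f' % (seq_id, lineage, confidence)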
+
+
+def train_rdp_classifier(
+ training_seqs_file, taxonomy_file, model_output_dir, max_memory=None,
+ tmp_dir=tempfile.gettempdir()):
+ """ Train RDP Classifier, saving to model_output_dir
+
+ training_seqs_file, taxonomy_file: file-like objects used to
+ train the RDP Classifier (see RdpTrainer documentation for
+ format of training data)
+
+ model_output_dir: directory in which to save the files
+ necessary to classify sequences according to the training
+ data
+
+ Once the model data has been generated, the RDP Classifier may be
+ used to classify new sequences by passing the resulting properties
+ file to assign_taxonomy via training_data_fp.
+ """
+ app_kwargs = {}
+ if tmp_dir is not None:
+ app_kwargs['TmpDir'] = tmp_dir
+ app = RdpTrainer(**app_kwargs)
+
+ if max_memory is not None:
+ app.Parameters['-Xmx'].on(max_memory)
+
+ temp_taxonomy_file = tempfile.NamedTemporaryFile(
+ prefix='RdpTaxonomy_', suffix='.txt', dir=tmp_dir)
+ temp_taxonomy_file.write(taxonomy_file.read())
+ temp_taxonomy_file.seek(0)
+
+ app.Parameters['taxonomy_file'].on(temp_taxonomy_file.name)
+ app.Parameters['model_output_dir'].on(model_output_dir)
+ return app(training_seqs_file)
+
+
+def train_rdp_classifier_and_assign_taxonomy(
+ training_seqs_file, taxonomy_file, seqs_to_classify, min_confidence=0.80,
+ model_output_dir=None, classification_output_fp=None, max_memory=None,
+ tmp_dir=tempfile.gettempdir()):
+ """ Train RDP Classifier and assign taxonomy in one fell swoop
+
+ The file objects training_seqs_file and taxonomy_file are used to
+ train the RDP Classifier (see RdpTrainer documentation for
+ details). Model data is stored in model_output_dir. If
+ model_output_dir is not provided, a temporary directory is created
+ and removed after classification.
+
+ The sequences in seqs_to_classify are classified according to the
+ model and filtered at the desired confidence level (default:
+ 0.80).
+
+ The results are saved to classification_output_fp if provided,
+ otherwise a dict of {seq_id:(taxonomy_assignment,confidence)} is
+ returned.
+ """
+ if model_output_dir is None:
+ training_dir = tempfile.mkdtemp(prefix='RdpTrainer_', dir=tmp_dir)
+ else:
+ training_dir = model_output_dir
+
+ training_results = train_rdp_classifier(
+ training_seqs_file, taxonomy_file, training_dir, max_memory=max_memory,
+ tmp_dir=tmp_dir)
+ training_data_fp = training_results['properties'].name
+
+ assignment_results = assign_taxonomy(
+ seqs_to_classify, min_confidence=min_confidence,
+ output_fp=classification_output_fp, training_data_fp=training_data_fp,
+ max_memory=max_memory, fixrank=False, tmp_dir=tmp_dir)
+
+ if model_output_dir is None:
+ # Forum user reported an error on the call to os.rmtree:
+ # https://groups.google.com/d/topic/qiime-forum/MkNe7-JtSBw/discussion
+ # We were not able to replicate the problem and fix it
+ # properly. However, even if an error occurs, we would like
+ # to return results, along with a warning.
+ try:
+ rmtree(training_dir)
+ except OSError:
+ msg = (
+ "Temporary training directory %s not removed" % training_dir)
+ if os.path.isdir(training_dir):
+ training_dir_files = os.listdir(training_dir)
+ msg += "\nDetected files %s" % training_dir_files
+ warnings.warn(msg, RuntimeWarning)
+
+ return assignment_results
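+
+# Example usage (an illustrative sketch; all paths are hypothetical, and the
+# training files must follow the format expected by RdpTrainer):
+#
+# assignments = train_rdp_classifier_and_assign_taxonomy(
+#     open('training_seqs.fasta'), open('taxonomy.txt'),
+#     open('seqs_to_classify.fasta'), min_confidence=0.80)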
+
+
+def get_rdp_lineage(rdp_taxa, min_confidence):
+ lineage = []
+ obs_confidence = 1.0
+ for taxon, rank, confidence in rdp_taxa:
+ if confidence >= min_confidence:
+ obs_confidence = confidence
+ lineage.append(taxon)
+ else:
+ break
+ return lineage, obs_confidence
+
+
+def parse_rdp_exception(line):
+ if line.startswith('ShortSequenceException'):
+ matchobj = re.search('recordID=(\S+)', line)
+ if matchobj:
+ rdp_id = matchobj.group(1)
+ return ('ShortSequenceException', rdp_id)
+ return None
+
+
+def parse_rdp_assignment(line):
+ """Returns a list of assigned taxa from an RDP classification line
+ """
+ toks = line.strip().split('\t')
+ seq_id = toks.pop(0)
+ direction = toks.pop(0)
+ if ((len(toks) % 3) != 0):
+ raise ValueError(
+ "Expected assignments in a repeating series of (rank, name, "
+ "confidence), received %s" % toks)
+ assignments = []
+ # Fancy way to create list of triples using consecutive items from
+ # input. See grouper function in documentation for itertools for
+ # more general example.
+ itoks = iter(toks)
+ for taxon, rank, confidence_str in zip(itoks, itoks, itoks):
+ if not taxon:
+ continue
+ assignments.append((taxon.strip('"'), rank, float(confidence_str)))
+ return seq_id, direction, assignments
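+
+# For example, given the fabricated allrank-style line
+# 'seq1\t-\tBacteria\tdomain\t1.0\tFirmicutes\tphylum\t0.9',
+# parse_rdp_assignment returns
+# ('seq1', '-', [('Bacteria', 'domain', 1.0), ('Firmicutes', 'phylum', 0.9)]).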
+
+
+if __name__ == "__main__":
+ opts, args = parse_command_line_parameters()
+ assign_taxonomy(
+ open(args[0]), min_confidence=opts.min_confidence,
+ output_fp=opts.output_fp)
diff --git a/bfillings/rtax.py b/bfillings/rtax.py
new file mode 100644
index 0000000..36f3c56
--- /dev/null
+++ b/bfillings/rtax.py
@@ -0,0 +1,293 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+"""Application controller for RTAX version 1.0
+
+Includes application controller for RTAX.
+
+Modified from uclust.py and rdp_classifier.py on 12-27-11
+"""
+
+from os import remove, makedirs
+from os.path import exists, split, splitext, basename, isdir, abspath, isfile
+import tempfile
+import os.path
+import re
+from sys import stderr
+from shutil import rmtree
+
+from skbio.parse.sequences import parse_fasta
+from burrito.parameters import ValuedParameter, FlagParameter
+from burrito.util import (CommandLineApplication, ResultPath,
+ get_tmp_filename, ApplicationError,
+ ApplicationNotFoundError)
+from skbio.util import remove_files
+
+from cogent.util.misc import app_path
+from cogent import DNA
+
+
+class RtaxParseError(Exception):
+ pass
+
+
+class Rtax(CommandLineApplication):
+ """ Rtax ApplicationController
+
+ """
+
+ _command = 'rtax'
+ _input_handler = '_input_as_parameters'
+ _parameters = {
+ # -r a reference database in FASTA format
+ '-r':ValuedParameter('-',Name='r',Delimiter=' ', IsPath=True),
+
+ # -t a taxonomy file with sequence IDs matching the reference database
+ '-t':ValuedParameter('-',Name='t',Delimiter=' ', IsPath=True),
+
+ # -a a FASTA file containing query sequences (single-ended, read 1, or paired-end delimited)
+ '-a':ValuedParameter('-',Name='a',Delimiter=' ', IsPath=True),
+
+ # -b a FASTA file containing query sequences (read 2, with matching IDs)
+ '-b':ValuedParameter('-',Name='b',Delimiter=' ', IsPath=True),
+
+ # -l a text file containing sequence IDs to process, one per line
+ '-l':ValuedParameter('-',Name='l',Delimiter=' ', IsPath=True),
+
+ # -d a delimiter separating the two reads when provided in a single file
+ '-d':ValuedParameter('-',Name='d',Delimiter=' ', IsPath=False, Quote="\""),
+
+ # -i a regular expression used to select part of the fasta header to use as the sequence id.
+ '-i':ValuedParameter('-',Name='i',Delimiter=' ', IsPath=False, Quote="'"),
+
+ # -o output file name for classification assignment
+ '-o': ValuedParameter('-', Name='o', Delimiter=' ', IsPath=True),
+
+ # -m temporary directory
+ '-m': ValuedParameter('-', Name='m', Delimiter=' ', IsPath=True),
+
+ # -f allow fallback from paired-end to single-ended classification when one read is missing
+ '-f':FlagParameter(Prefix='-',Name='f'),
+
+ # -g do not allow fallback from paired-end to single-ended classification when one read is too generic
+ '-g':FlagParameter(Prefix='-',Name='g')
+ }
+
+ _suppress_stdout = False
+ _suppress_stderr = False
+
+ #def __init__(self):
+ # super().__init__()...
+ # usearch_command = "usearch"
+ # if not (exists(usearch_command) or app_path(usearch_command)):
+ # raise ApplicationNotFoundError,\
+ # "Cannot find %s. Is it installed? Is it in your path?"\
+ # % usearch_command
+
+
+ def _input_as_parameters(self,data):
+ """ Set the input path (a fasta filepath)
+ """
+ # The list of values which can be passed on a per-run basis
+ allowed_values = ['-r', '-t', '-a', '-b', '-l', '-d', '-i', '-o',
+ '-m', '-f', '-g']
+
+ unsupported_parameters = set(data.keys()) - set(allowed_values)
+ if unsupported_parameters:
+ raise ApplicationError(
+ "Unsupported parameter(s) passed when calling rtax: %s" %
+ ' '.join(unsupported_parameters))
+
+ for v in allowed_values:
+ # turn the parameter off so subsequent runs are not
+ # affected by parameter settings from previous runs
+ self.Parameters[v].off()
+ if v in data:
+ # turn the parameter on if specified by the user
+ self.Parameters[v].on(data[v])
+
+ return ''
+
+ def _get_result_paths(self,data):
+ """ Return a dict of ResultPath objects representing all possible output
+ """
+ assignment_fp = str(self.Parameters['-o'].Value).strip('"')
+ if not os.path.isabs(assignment_fp):
+ assignment_fp = os.path.relpath(assignment_fp, self.WorkingDir)
+ return {'Assignments': ResultPath(assignment_fp, IsWritten=True)}
+
+
+
+ def _accept_exit_status(self,exit_status):
+ """ Test for acceptable exit status
+
+ rtax, like uclust (from which this controller was adapted), may
+ fail and still leave partial output behind, so we explicitly
+ check the exit status
+
+ """
+ return exit_status == 0
+
+ def getHelp(self):
+ """Method that points to documentation"""
+ help_str =\
+ """
+ RTAX is hosted at:
+ http://dev.davidsoergel.com/rtax/
+
+ The following paper should be cited if this resource is used:
+
+ Soergel D.A.W., Dey N., Knight R., and Brenner S.E. 2012.
+ Selection of primers for optimal taxonomic classification
+ of environmental 16S rRNA gene sequences. ISME J (6), 1440-1444
+ """
+ return help_str
+
+def assign_taxonomy(dataPath, reference_sequences_fp, id_to_taxonomy_fp,
+ read_1_seqs_fp, read_2_seqs_fp, single_ok=False,
+ no_single_ok_generic=False, header_id_regex=None,
+ read_id_regex="\S+\s+(\S+)", amplicon_id_regex="(\S+)\s+(\S+?)\/",
+ output_fp=None, log_path=None, HALT_EXEC=False,
+ base_tmp_dir='/tmp'):
+ """Assign taxonomy to each sequence in data with the RTAX classifier
+
+ dataPath: path to a fasta file of query sequences
+
+ output_fp: path to write output; if not provided, result will be
+ returned in a dict of {seq_id:(taxonomy_assignment,confidence)}
+ """
+
+ usearch_command = "usearch"
+ if not (exists(usearch_command) or app_path(usearch_command)):
+ raise ApplicationNotFoundError(
+ "Cannot find %s. Is it installed? Is it in your path?"
+ % usearch_command)
+
+ my_tmp_dir = get_tmp_filename(tmp_dir=base_tmp_dir, prefix='rtax_',
+ suffix='', result_constructor=str)
+ os.makedirs(my_tmp_dir)
+
+ try:
+ # The RTAX classifier doesn't necessarily preserve identifiers: it
+ # reports back only the id extracted as $1 using header_id_regex.
+ # Since rtax takes the original unclustered sequence files as input,
+ # the usual case is that the regex extracts the amplicon ID from the
+ # second field. Use a lookup table to restore the original IDs.
+ read_1_id_to_orig_id = {}
+ readIdExtractor = re.compile(read_id_regex) # OTU clustering produces ">clusterID read_1_id"
+ data = open(dataPath,'r')
+ for seq_id, seq in parse_fasta(data):
+ # apply the regex
+ extract = readIdExtractor.match(seq_id)
+ if extract is None:
+ stderr.write("Matched no ID with read_id_regex " + read_id_regex +" in '" + seq_id + "' from file " + dataPath + "\n")
+ else:
+ read_1_id_to_orig_id[extract.group(1)] = seq_id
+ #stderr.write(extract.group(1) + " => " + seq_id + "\n")
+ #seq_id_lookup[seq_id.split()[1]] = seq_id
+ data.close()
+
+ # make list of amplicon IDs to pass to RTAX
+
+ id_list_fp = open(my_tmp_dir+"/ampliconIdsToClassify", "w")
+
+ # Establish mapping of amplicon IDs to read_1 IDs
+ # simultaneously write the amplicon ID file for those IDs found in the input mapping above
+
+ amplicon_to_read_1_id = {}
+ ampliconIdExtractor = re.compile(amplicon_id_regex) # split_libraries produces >read_1_id ampliconID/1 ... // see also assign_taxonomy 631
+ read_1_data = open(read_1_seqs_fp,'r')
+ for seq_id, seq in parse_fasta(read_1_data):
+ # apply the regex
+ extract = ampliconIdExtractor.match(seq_id)
+ if extract is None:
+ stderr.write("Matched no ID with amplicon_id_regex " + amplicon_id_regex + " in '" + seq_id + "' from file " + read_1_seqs_fp + "\n")
+ else:
+ read_1_id = extract.group(1)
+ amplicon_id = extract.group(2)
+ try:
+ amplicon_to_read_1_id[amplicon_id] = read_1_id
+ bogus = read_1_id_to_orig_id[read_1_id] # verify that the id is valid
+ id_list_fp.write('%s\n' % (amplicon_id))
+ except KeyError:
+ pass
+ read_1_data.close()
+ id_list_fp.close()
+
+ app = Rtax(HALT_EXEC=HALT_EXEC)
+
+ temp_output_file = tempfile.NamedTemporaryFile(
+ prefix='RtaxAssignments_', suffix='.txt')
+ app.Parameters['-o'].on(temp_output_file.name)
+ app.Parameters['-r'].on(reference_sequences_fp)
+ app.Parameters['-t'].on(id_to_taxonomy_fp)
+ # app.Parameters['-d'].on(delimiter)
+ app.Parameters['-l'].on(id_list_fp.name) # these are amplicon IDs
+ app.Parameters['-a'].on(read_1_seqs_fp)
+ if read_2_seqs_fp is not None:
+ app.Parameters['-b'].on(read_2_seqs_fp)
+ app.Parameters['-i'].on(header_id_regex)
+ app.Parameters['-m'].on(my_tmp_dir)
+ if single_ok:
+ app.Parameters['-f'].on()
+ if no_single_ok_generic:
+ app.Parameters['-g'].on()
+ #app.Parameters['-v'].on()
+
+ app_result = app()
+
+ if log_path:
+ log_file = open(log_path, 'a')
+ log_file.write(''.join(app_result['StdErr'].readlines()) + '\n')
+ log_file.close()
+
+ assignments = {}
+
+ # restore original sequence IDs with spaces
+
+ for line in app_result['Assignments']:
+ toks = line.strip().split('\t')
+ rtax_id = toks.pop(0)
+ if len(toks):
+ bestpcid = toks.pop(0) # ignored
+ lineage = toks
+
+ # RTAX does not provide a measure of confidence. We could pass one in,
+ # based on the choice of primers, or even look it up on the fly in the tables
+ # from the "optimal primers" paper; but it would be the same for every
+ # query sequence anyway.
+ # we could also return bestpcid, but that's not the same thing as confidence.
+ confidence = 1.0
+
+ read_1_id = amplicon_to_read_1_id[rtax_id]
+ orig_id = read_1_id_to_orig_id[read_1_id]
+ if lineage:
+ assignments[orig_id] = (';'.join(lineage), confidence)
+ else:
+ assignments[orig_id] = ('Unclassified', 1.0)
+
+ if output_fp:
+ try:
+ output_file = open(output_fp, 'w')
+ except OSError:
+ raise OSError("Can't open output file for writing: %s" % output_fp)
+ for seq_id, assignment in assignments.items():
+ lineage, confidence = assignment
+ output_file.write(
+ '%s\t%s\t%1.3f\n' % (seq_id, lineage, confidence))
+ output_file.close()
+ return None
+ else:
+ return assignments
+ finally:
+ try:
+ rmtree(my_tmp_dir)
+ except OSError:
+ pass
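+
+# Example usage (an illustrative sketch; all paths below are hypothetical,
+# and both the usearch and rtax executables must be on the PATH):
+#
+# assignments = assign_taxonomy(
+#     'rep_set.fasta', 'ref_seqs.fasta', 'id_to_taxonomy.txt',
+#     'reads1.fasta', 'reads2.fasta', single_ok=True)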
diff --git a/bfillings/seqprep.py b/bfillings/seqprep.py
new file mode 100644
index 0000000..2ee9d48
--- /dev/null
+++ b/bfillings/seqprep.py
@@ -0,0 +1,351 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+
+# Application controller for SeqPrep
+# https://github.com/jstjohn/SeqPrep
+
+import os
+import tempfile
+
+from burrito.parameters import ValuedParameter, FlagParameter
+from burrito.util import CommandLineApplication, ResultPath
+
+# SeqPrep help:
+# Usage:
+# SeqPrep [Required Args] [Options]
+# NOTE 1: The output is always gziped compressed.
+# NOTE 2: If the quality strings in the output contain characters less than
+# ascii 33 on an ascii table (they look like lines from a binary file), try
+# running again with or without the -6 option.
+#
+
+
+class SeqPrep(CommandLineApplication):
+
+ """SeqPrep application controller for joining paired-end reads"""
+ _command = 'SeqPrep'
+ _parameters = {
+ # Required Arguments
+ # -f <first read input fastq filename>
+ # -r <second read input fastq filename>
+ # -1 <first read output fastq filename>
+ # -2 <second read output fastq filename>
+ '-f': ValuedParameter(Prefix='-', Delimiter=' ', Name='f'),
+ '-r': ValuedParameter(Prefix='-', Delimiter=' ', Name='r'),
+ '-1': ValuedParameter(Prefix='-', Delimiter=' ', Name='1'),
+ '-2': ValuedParameter(Prefix='-', Delimiter=' ', Name='2'),
+
+ # General Arguments (Optional):
+ # -3 <first read discarded fastq filename>
+ # -4 <second read discarded fastq filename>
+ # -h Display this help message and exit (also works with no args)
+ # -6 Input sequence is in phred+64 rather than phred+33 format, the
+ # output will still be phred+33
+ # -q <Quality score cutoff for mismatches to be counted in overlap; default = 13>
+ # -L <Minimum length of a trimmed or merged read to print it; default = 30>
+ '-3': ValuedParameter(Prefix='-', Delimiter=' ', Name='3'),
+ '-4': ValuedParameter(Prefix='-', Delimiter=' ', Name='4'),
+ '-h': FlagParameter(Prefix='-', Name='h'),
+ '-6': FlagParameter(Prefix='-', Name='6'),
+ '-q': ValuedParameter(Prefix='-', Delimiter=' ', Name='q'),
+ '-L': ValuedParameter(Prefix='-', Delimiter=' ', Name='L'),
+
+ # Arguments for Adapter/Primer Trimming (Optional):
+ # -A <forward read primer/adapter sequence to trim as it would appear at the
+ # end of a read (recommend about 20bp of this)
+ # (should validate by grepping a file);
+ # default (genomic non-multiplexed adapter1) = AGATCGGAAGAGCGGTTCAG>
+ # -B <reverse read primer/adapter sequence to trim as it would appear at the
+ # end of a read (recommend about 20bp of this)
+ # (should validate by grepping a file);
+ # default (genomic non-multiplexed adapter2) = AGATCGGAAGAGCGTCGTGT>
+ # -O <minimum overall base pair overlap with adapter sequence to trim;
+ # default = 10>
+ # -M <maximum fraction of good quality mismatching bases for primer/adapter
+ # overlap; default = 0.020000>
+ # -N <minimum fraction of matching bases for primer/adapter overlap;
+ # default = 0.870000>
+ # -b <adapter alignment band-width; default = 50>
+ # -Q <adapter alignment gap-open; default = 8>
+ # -t <adapter alignment gap-extension; default = 2>
+ # -e <adapter alignment gap-end; default = 2>
+ # -Z <adapter alignment minimum local alignment score cutoff
+ # [roughly (2*num_hits) - (num_gaps*gap_open) - (num_gaps*gap_close) -
+ # (gap_len*gap_extend) - (2*num_mismatches)]; default = 26>
+ # -w <read alignment band-width; default = 50>
+ # -W <read alignment gap-open; default = 26>
+ # -p <read alignment gap-extension; default = 9>
+ # -P <read alignment gap-end; default = 5>
+ # -X <read alignment maximum fraction gap cutoff; default = 0.125000>
+ '-A': ValuedParameter(Prefix='-', Delimiter=' ', Name='A'),
+ '-B': ValuedParameter(Prefix='-', Delimiter=' ', Name='B'),
+ '-O': ValuedParameter(Prefix='-', Delimiter=' ', Name='O'),
+ '-M': ValuedParameter(Prefix='-', Delimiter=' ', Name='M'),
+ '-N': ValuedParameter(Prefix='-', Delimiter=' ', Name='N'),
+ '-b': ValuedParameter(Prefix='-', Delimiter=' ', Name='b'),
+ '-Q': ValuedParameter(Prefix='-', Delimiter=' ', Name='Q'),
+ '-t': ValuedParameter(Prefix='-', Delimiter=' ', Name='t'),
+ '-e': ValuedParameter(Prefix='-', Delimiter=' ', Name='e'),
+ '-Z': ValuedParameter(Prefix='-', Delimiter=' ', Name='Z'),
+ '-w': ValuedParameter(Prefix='-', Delimiter=' ', Name='w'),
+ '-W': ValuedParameter(Prefix='-', Delimiter=' ', Name='W'),
+ '-p': ValuedParameter(Prefix='-', Delimiter=' ', Name='p'),
+ '-P': ValuedParameter(Prefix='-', Delimiter=' ', Name='P'),
+ '-X': ValuedParameter(Prefix='-', Delimiter=' ', Name='X'),
+
+ # Optional Arguments for Merging:
+ # -y <maximum quality score in output ((phred 33) default = ']' )>
+ # -g <print overhang when adapters are present and stripped (use this if
+ # reads are different length)>
+ # -s <perform merging and output the merged reads to this file>
+ # -E <write pretty alignments to this file for visual Examination>
+ # -x <max number of pretty alignments to write (if -E provided);
+ # default = 10000>
+ # -o <minimum overall base pair overlap to merge two reads; default = 15>
+ # -m <maximum fraction of good quality mismatching bases to overlap reads;
+ # default = 0.020000>
+ # -n <minimum fraction of matching bases to overlap reads;
+ # default = 0.900000>
+ '-y': ValuedParameter(Prefix='-', Delimiter=' ', Name='y'),
+ '-g': FlagParameter(Prefix='-', Name='g'),
+ '-s': ValuedParameter(Prefix='-', Delimiter=' ', Name='s'),
+ '-E': ValuedParameter(Prefix='-', Delimiter=' ', Name='E'),
+ '-x': ValuedParameter(Prefix='-', Delimiter=' ', Name='x'),
+ '-o': ValuedParameter(Prefix='-', Delimiter=' ', Name='o'),
+ '-m': ValuedParameter(Prefix='-', Delimiter=' ', Name='m'),
+ '-n': ValuedParameter(Prefix='-', Delimiter=' ', Name='n')}
+
+ def _unassembled_reads1_out_file_name(self):
+ """Checks file name is set for reads1 output.
+ Returns absolute path."""
+ if self.Parameters['-1'].isOn():
+ unassembled_reads1 = self._absolute(
+ str(self.Parameters['-1'].Value))
+ else:
+ raise ValueError("No reads1 (flag: -1) output path specified")
+ return unassembled_reads1
+
+ def _unassembled_reads2_out_file_name(self):
+ """Checks if file name is set for reads2 output.
+ Returns absolute path."""
+ if self.Parameters['-2'].isOn():
+ unassembled_reads2 = self._absolute(
+ str(self.Parameters['-2'].Value))
+ else:
+ raise ValueError("No reads2 (flag -2) output path specified")
+ return unassembled_reads2
+
+ def _discarded_reads1_out_file_name(self):
+ """Checks if file name is set for discarded reads1 output.
+ Returns absolute path."""
+ if self.Parameters['-3'].isOn():
+ discarded_reads1 = self._absolute(str(self.Parameters['-3'].Value))
+ else:
+ raise ValueError(
+ "No discarded-reads1 (flag -3) output path specified")
+ return discarded_reads1
+
+ def _discarded_reads2_out_file_name(self):
+ """Checks if file name is set for discarded reads2 output.
+ Returns absolute path."""
+ if self.Parameters['-4'].isOn():
+ discarded_reads2 = self._absolute(str(self.Parameters['-4'].Value))
+ else:
+ raise ValueError(
+ "No discarded-reads2 (flag -4) output path specified")
+ return discarded_reads2
+
+ def _assembled_out_file_name(self):
+ """Checks file name is set for assembled output.
+ Returns absolute path."""
+ if self.Parameters['-s'].isOn():
+ assembled_reads = self._absolute(str(self.Parameters['-s'].Value))
+ else:
+ raise ValueError(
+ "No assembled-reads (flag -s) output path specified")
+ return assembled_reads
+
+ def _pretty_alignment_out_file_name(self):
+ """Checks file name is set for pretty alignment output.
+ Returns absolute path."""
+ if self.Parameters['-E'].isOn():
+ pretty_alignment = self._absolute(str(self.Parameters['-E'].Value))
+ else:
+ raise ValueError(
+ "No pretty-=alignment (flag -E) output path specified")
+ return pretty_alignment
+
+ def _get_result_paths(self, data):
+ """Captures SeqPrep output.
+
+ """
+ result = {}
+
+ # Always output:
+ result['UnassembledReads1'] = ResultPath(
+ Path=self._unassembled_reads1_out_file_name(), IsWritten=True)
+ result['UnassembledReads2'] = ResultPath(
+ Path=self._unassembled_reads2_out_file_name(), IsWritten=True)
+
+ # optional output, so we check for each
+ # check for assembled reads file
+ if self.Parameters['-s'].isOn():
+ result['Assembled'] = ResultPath(
+ Path=self._assembled_out_file_name(), IsWritten=True)
+
+ # check for discarded (unassembled) reads1 file
+ if self.Parameters['-3'].isOn():
+ result['Reads1Discarded'] = ResultPath(
+ Path=self._discarded_reads1_out_file_name(), IsWritten=True)
+
+ # check for discarded (unassembled) reads2 file
+ if self.Parameters['-4'].isOn():
+ result['Reads2Discarded'] = ResultPath(
+ Path=self._discarded_reads2_out_file_name(), IsWritten=True)
+
+ # check for pretty-alignment file
+ if self.Parameters['-E'].isOn():
+ result['PrettyAlignments'] = ResultPath(
+ Path=self._pretty_alignment_out_file_name(), IsWritten=True)
+
+ return result
+
+ def getHelp(self):
+ """seqprep help"""
+ help_str = """
+ For basic help, type the following at the command line:
+ 'SeqPrep -h'
+
+ Website:
+ https://github.com/jstjohn/SeqPrep
+ """
+ return help_str
+
+
+def join_paired_end_reads_seqprep(
+ reads1_infile_path,
+ reads2_infile_path,
+ outfile_label='seqprep',
+ max_overlap_ascii_q_score='J',
+ min_overlap=None, # typical default is 15
+ max_mismatch_good_frac=None, # typical default is 0.02,
+ min_frac_matching=None, # typical default is 0.9,
+ phred_64=False,
+ params={},
+ working_dir=tempfile.gettempdir(),
+ SuppressStderr=True,
+ SuppressStdout=True,
+ HALT_EXEC=False):
+ """ Runs SeqPrep parameters to assemble paired-end reads.
+ -reads1_infile_path : reads1.fastq infile path
+ -reads2_infile_path : reads2.fastq infile path
+ -max_overlap_ascii_q_score : 'J' for Illumina 1.8+ phred+33,
+ representing a score of 41. See:
+ http://en.wikipedia.org/wiki/FASTQ_format
+ -min_overlap : minimum overall base pair overlap to merge two reads
+ -max_mismatch_good_frac : maximum fraction of good quality mismatching
+ bases to overlap reads
+ -min_frac_matching : minimum fraction of matching bases to overlap
+ reads
+ -phred_64 : if input is in phred+64. Output will always be phred+33.
+ -params : other optional SeqPrep parameters
+
+ NOTE: SeqPrep always outputs gzipped files
+ """
+
+ abs_r1_path = os.path.abspath(reads1_infile_path)
+ abs_r2_path = os.path.abspath(reads2_infile_path)
+
+ infile_paths = [abs_r1_path, abs_r2_path]
+
+ # check / make absolute infile paths
+ for p in infile_paths:
+ if not os.path.exists(p):
+ raise IOError('Infile not found at: %s' % p)
+
+ # set up controller
+ seqprep_app = SeqPrep(params=params,
+ WorkingDir=working_dir,
+ SuppressStderr=SuppressStderr,
+ SuppressStdout=SuppressStdout,
+ HALT_EXEC=HALT_EXEC)
+
+ # required by SeqPrep to assemble:
+ seqprep_app.Parameters['-f'].on(abs_r1_path)
+ seqprep_app.Parameters['-r'].on(abs_r2_path)
+
+ if outfile_label is not None:
+ seqprep_app.Parameters['-s'].on(outfile_label + '_assembled.fastq.gz')
+ seqprep_app.Parameters[
+ '-1'].on(outfile_label + '_unassembled_R1.fastq.gz')
+ seqprep_app.Parameters[
+ '-2'].on(outfile_label + '_unassembled_R2.fastq.gz')
+ else:
+ raise ValueError("Must set an outfile_label in order to set",
+ " the -s, -1, & -2 options!")
+
+ if min_overlap is not None:
+ if isinstance(min_overlap, int) and min_overlap > 0:
+ seqprep_app.Parameters['-o'].on(min_overlap)
+ else:
+ raise ValueError("min_overlap must be an int >= 0!")
+
+ if max_mismatch_good_frac is not None:
+ if isinstance(max_mismatch_good_frac, float) and 0.0 < max_mismatch_good_frac <= 1.0:
+ seqprep_app.Parameters['-m'].on(max_mismatch_good_frac)
+ else:
+ raise ValueError(
+ "max_mismatch_good_frac must be a float between 0.0-1.0!")
+
+ if min_frac_matching is not None:
+ if isinstance(min_frac_matching, float) and 0.0 < min_frac_matching <= 1.0:
+ seqprep_app.Parameters['-n'].on(min_frac_matching)
+ else:
+ raise ValueError(
+ "min_frac_matching must be a float between 0.0-1.0!")
+
+ if max_overlap_ascii_q_score is not None:
+ if isinstance(max_overlap_ascii_q_score, str) \
+ and len(max_overlap_ascii_q_score) == 1:
+ seqprep_app.Parameters['-y'].on(max_overlap_ascii_q_score)
+ else:
+ raise ValueError("max_overlap_ascii_q_score must be a single",
+ " ASCII character string. e.g. \'J\'!")
+
+ # if input is phred+64
+ if phred_64 is True:
+ seqprep_app.Parameters['-6'].on()
+
+ # run assembler
+ result = seqprep_app()
+
+ # Store output file path data to dict
+ path_dict = {}
+ path_dict['Assembled'] = result['Assembled'].name
+ path_dict['UnassembledReads1'] = result['UnassembledReads1'].name
+ path_dict['UnassembledReads2'] = result['UnassembledReads2'].name
+
+ # sanity check that files actually exist in path locations
+ for path in path_dict.values():
+ if not os.path.exists(path):
+ raise IOError('Output file not found at: %s' % path)
+
+ return path_dict
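+
+# Example usage (an illustrative sketch; the fastq paths are hypothetical
+# and SeqPrep must be installed):
+#
+# paths = join_paired_end_reads_seqprep('reads1.fastq', 'reads2.fastq',
+#                                       outfile_label='run1')
+# merged_fp = paths['Assembled']  # gzip-compressed fastq of merged reads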
diff --git a/bfillings/sortmerna_v2.py b/bfillings/sortmerna_v2.py
new file mode 100644
index 0000000..e6cac68
--- /dev/null
+++ b/bfillings/sortmerna_v2.py
@@ -0,0 +1,544 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+
+"""
+Application controller for SortMeRNA version 2.0
+================================================
+"""
+
+
+from os.path import split, splitext, dirname, join
+from glob import glob
+import re
+import tempfile
+
+from burrito.util import CommandLineApplication, ResultPath
+from burrito.parameters import ValuedParameter, FlagParameter
+from skbio.parse.sequences import parse_fasta
+
+
+class IndexDB(CommandLineApplication):
+ """ SortMeRNA generic application controller for building databases
+ """
+ _command = 'indexdb_rna'
+ _command_delimiter = ' '
+ _parameters = {
+ # Fasta reference file followed by indexed reference
+ # (ex. /path/to/refseqs.fasta,/path/to/refseqs.idx)
+ '--ref': ValuedParameter('--', Name='ref', Delimiter=' ', IsPath=True),
+
+ # Maximum number of positions to store for each unique seed
+ '--max_pos': ValuedParameter('--', Name='max_pos', Delimiter=' ',
+ IsPath=False, Value="10000"),
+
+ # tmp folder for storing unique L-mers (prior to calling CMPH
+ # in indexdb_rna), this tmp file is removed by indexdb_rna
+ # after it is not used any longer
+ '--tmpdir': ValuedParameter('--', Name='tmpdir', Delimiter=' ',
+ IsPath=True)
+ }
+
+ def _get_result_paths(self, data):
+ """ Build the dict of result filepaths
+ """
+ # get the filepath of the indexed database (after comma)
+ # /path/to/refseqs.fasta,/path/to/refseqs.idx
+ # ^------------------^
+ db_name = (self.Parameters['--ref'].Value).split(',')[1]
+
+ result = {}
+ extensions = ['bursttrie', 'kmer', 'pos', 'stats']
+ for extension in extensions:
+ for file_path in glob("%s.%s*" % (db_name, extension)):
+ # this will match e.g. nr.bursttrie_0.dat, nr.bursttrie_1.dat
+ # and nr.stats
+ key = file_path.split(db_name + '.')[1]
+ result[key] = ResultPath(Path=file_path, IsWritten=True)
+ return result
+
+
+def build_database_sortmerna(fasta_path,
+ max_pos=None,
+ output_dir=None,
+ temp_dir=tempfile.gettempdir(),
+ HALT_EXEC=False):
+ """ Build sortmerna db from fasta_path; return db name
+ and list of files created
+
+ Parameters
+ ----------
+ fasta_path : string
+ path to fasta file of sequences to build database.
+ max_pos : integer, optional
+ maximum positions to store per seed in index
+ [default: 10000].
+ output_dir : string, optional
+ directory where output should be written
+ [default: same directory as fasta_path].
+ temp_dir : string, optional
+ temporary directory used while indexing to store unique L-mers
+ [default: tempfile.gettempdir()].
+ HALT_EXEC : boolean, optional
+ halt just before running the indexdb_rna command
+ and print the command -- useful for debugging
+ [default: False].
+
+ Returns
+ -------
+ db_name : string
+ filepath to the indexed database.
+ db_filepaths : list
+ paths to the files written by indexdb_rna.
+ """
+
+ if fasta_path is None:
+ raise ValueError("Error: path to fasta reference "
+ "sequences must exist.")
+
+ fasta_dir, fasta_filename = split(fasta_path)
+ if not output_dir:
+ output_dir = fasta_dir or '.'
+ # Will cd to this directory, so just pass the filename
+ # so the app is not confused by relative paths
+ fasta_path = fasta_filename
+
+ index_basename = splitext(fasta_filename)[0]
+
+ db_name = join(output_dir, index_basename)
+
+ # Instantiate the object
+ sdb = IndexDB(WorkingDir=output_dir, HALT_EXEC=HALT_EXEC)
+
+ # The parameter --ref STRING must follow the format where
+ # STRING = /path/to/ref.fasta,/path/to/ref.idx
+ sdb.Parameters['--ref'].on("%s,%s" % (fasta_path, db_name))
+
+ # Set temporary directory
+ sdb.Parameters['--tmpdir'].on(temp_dir)
+
+ # Override --max_pos parameter
+ if max_pos is not None:
+ sdb.Parameters['--max_pos'].on(max_pos)
+
+ # Run indexdb_rna
+ app_result = sdb()
+
+ # Return all output files (written by indexdb_rna) as a list,
+ # excluding the StdErr and StdOut filepaths, as those files are
+ # destroyed on exit from this function (IndexDB is a local instance)
+ db_filepaths = [v.name for k, v in app_result.items()
+ if k not in {'StdErr', 'StdOut'} and hasattr(v, 'name')]
+
+ return db_name, db_filepaths
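+
+# Example usage (an illustrative sketch; 'refseqs.fasta' is a hypothetical
+# input and indexdb_rna must be on the PATH):
+#
+# db_name, db_filepaths = build_database_sortmerna('refseqs.fasta',
+#                                                  max_pos=10000,
+#                                                  output_dir='/tmp/smr_db')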
+
+
+class Sortmerna(CommandLineApplication):
+ """ SortMeRNA generic application controller for OTU picking
+ """
+
+ _command = 'sortmerna'
+ _command_delimiter = ' '
+ _parameters = {
+ # Verbose (log to stdout)
+ '-v': FlagParameter('-', Name='v', Value=True),
+
+ # Fasta or Fastq input query sequences file
+ '--reads': ValuedParameter('--', Name='reads', Delimiter=' ',
+ IsPath=True, Value=None),
+
+ # Fasta reference file followed by indexed reference
+ '--ref': ValuedParameter('--', Name='ref', Delimiter=' ',
+ IsPath=True, Value=None),
+
+ # File path + base name for all output files
+ '--aligned': ValuedParameter('--', Name='aligned', Delimiter=' ',
+ IsPath=True, Value=None),
+
+ # Output log file with parameters used to launch sortmerna and
+ # statistics on final results (the log file takes on
+ # the basename given in --aligned and the extension '.log')
+ '--log': FlagParameter('--', Name='log', Value=True),
+
+ # Output Fasta or Fastq file of aligned reads (flag)
+ '--fastx': FlagParameter('--', Name='fastx', Value=True),
+
+ # Output BLAST alignment file, options include [0,3] where:
+ # 0: Blast-like pairwise alignment,
+ # 1: Blast tabular format,
+ # 2: 1 + extra column for CIGAR string,
+ # 3: 2 + extra column for query coverage
+ '--blast': ValuedParameter('--', Name='blast', Delimiter=' ',
+ IsPath=False, Value=None),
+
+ # Output SAM alignment file
+ '--sam': FlagParameter('--', Name='sam', Value=False),
+
+ # Output SQ tags in the SAM file (useful for whole-genome alignment)
+ '--SQ': FlagParameter('--', Name='SQ', Value=False),
+
+ # Report the best INT number of alignments
+ '--best': ValuedParameter('--', Name='best', Delimiter=' ',
+ IsPath=False, Value="1"),
+
+ # Report first INT number of alignments
+ '--num_alignments': ValuedParameter('--', Name='num_alignments',
+ Delimiter=' ', IsPath=False,
+ Value=None),
+
+ # Number of threads
+ '-a': ValuedParameter('-', Name='a', Delimiter=' ',
+ IsPath=False, Value="1"),
+
+ # E-value threshold
+ '-e': ValuedParameter('-', Name='e', Delimiter=' ',
+ IsPath=False, Value="1"),
+
+ # Similarity threshold
+ '--id': ValuedParameter('--', Name='id', Delimiter=' ',
+ IsPath=False, Value="0.97"),
+
+ # Query coverage threshold
+ '--coverage': ValuedParameter('--', Name='coverage', Delimiter=' ',
+ IsPath=False, Value="0.97"),
+
+ # Output Fasta/Fastq file with reads failing to pass the --id and
+ # --coverage thresholds for de novo clustering
+ '--de_novo_otu': FlagParameter('--', Name='de_novo_otu', Value=True),
+
+ # Output an OTU map
+ '--otu_map': FlagParameter('--', Name='otu_map', Value=True),
+
+ # Print a NULL alignment string for non-aligned reads
+ '--print_all_reads': FlagParameter('--', Name='print_all_reads',
+ Value=False)
+ }
+ _synonyms = {}
+ _input_handler = '_input_as_string'
+ _suppress_stdout = False
+ _suppress_stderr = False
+
+ def _get_result_paths(self, data):
+ """ Set the result paths """
+
+ result = {}
+
+ # get the file extension of the reads file (sortmerna
+ # internally outputs all results with this extension)
+ fileExtension = splitext(self.Parameters['--reads'].Value)[1]
+
+ # at this point the parameter --aligned should be set as
+ # sortmerna will not run without it
+ if self.Parameters['--aligned'].isOff():
+ raise ValueError("Error: the --aligned parameter must be set.")
+
+ # file base name for aligned reads
+ output_base = self.Parameters['--aligned'].Value
+
+ # Blast alignments
+ result['BlastAlignments'] =\
+ ResultPath(Path=output_base + '.blast',
+ IsWritten=self.Parameters['--blast'].isOn())
+
+ # SAM alignments
+ result['SAMAlignments'] =\
+ ResultPath(Path=output_base + '.sam',
+ IsWritten=self.Parameters['--sam'].isOn())
+
+ # OTU map (mandatory output)
+ result['OtuMap'] =\
+ ResultPath(Path=output_base + '_otus.txt',
+ IsWritten=self.Parameters['--otu_map'].isOn())
+
+ # FASTA file of sequences in the OTU map (mandatory output)
+ result['FastaMatches'] =\
+ ResultPath(Path=output_base + fileExtension,
+ IsWritten=self.Parameters['--fastx'].isOn())
+
+ # FASTA file of sequences not in the OTU map (mandatory output)
+ result['FastaForDenovo'] =\
+ ResultPath(Path=output_base + '_denovo' +
+ fileExtension,
+ IsWritten=self.Parameters['--de_novo_otu'].isOn())
+ # Log file
+ result['LogFile'] =\
+ ResultPath(Path=output_base + '.log',
+ IsWritten=self.Parameters['--log'].isOn())
+
+ return result
+
+ def getHelp(self):
+ """Method that points to documentation"""
+ help_str = ("SortMeRNA is hosted at:\n"
+ "http://bioinfo.lifl.fr/RNA/sortmerna/\n"
+ "https://github.com/biocore/sortmerna\n\n"
+ "The following paper should be cited if this resource is "
+ "used:\n\n"
+ "Kopylova, E., Noe L. and Touzet, H.,\n"
+ "SortMeRNA: fast and accurate filtering of ribosomal RNAs "
+ "in\n"
+ "metatranscriptomic data, Bioinformatics (2012) 28(24)\n"
+ )
+ return help_str
+
+
+def sortmerna_ref_cluster(seq_path=None,
+ sortmerna_db=None,
+ refseqs_fp=None,
+ result_path=None,
+ tabular=False,
+ max_e_value=1,
+ similarity=0.97,
+ coverage=0.97,
+ threads=1,
+ best=1,
+ HALT_EXEC=False
+ ):
+ """Launch sortmerna OTU picker
+
+ Parameters
+ ----------
+ seq_path : str
+ filepath to query sequences.
+ sortmerna_db : str
+ indexed reference database.
+ refseqs_fp : str
+ filepath of reference sequences.
+ result_path : str
+ filepath to output OTU map.
+ max_e_value : float, optional
+ E-value threshold [default: 1].
+ similarity : float, optional
+ similarity %id threshold [default: 0.97].
+ coverage : float, optional
+ query coverage % threshold [default: 0.97].
+ threads : int, optional
+ number of threads to use (OpenMP) [default: 1].
+ tabular : bool, optional
+ output BLAST tabular alignments [default: False].
+ best : int, optional
+ number of best alignments to output per read
+ [default: 1].
+
+ Returns
+ -------
+ clusters : dict of lists
+ OTU ids and reads mapping to them
+
+ failures : list
+ reads which did not align
+
+ smr_files_to_remove : list
+ filepaths of sortmerna output files left for the caller to remove
+ """
+
+ # Instantiate the object
+ smr = Sortmerna(HALT_EXEC=HALT_EXEC)
+
+ # Set input query sequences path
+ if seq_path is not None:
+ smr.Parameters['--reads'].on(seq_path)
+ else:
+ raise ValueError("Error: a read file is mandatory input.")
+
+ # Set the input reference sequence + indexed database path
+ if sortmerna_db is not None:
+ smr.Parameters['--ref'].on("%s,%s" % (refseqs_fp, sortmerna_db))
+ else:
+ raise ValueError("Error: an indexed database for reference set %s must"
+ " already exist.\nUse indexdb_rna to index the"
+ " database." % refseqs_fp)
+
+ if result_path is None:
+ raise ValueError("Error: the result path must be set.")
+
+ # Set output results path (for Blast alignments, clusters and failures)
+ output_file = join(dirname(result_path), "sortmerna_otus")
+ smr.Parameters['--aligned'].on(output_file)
+
+ # Set E-value threshold
+ if max_e_value is not None:
+ smr.Parameters['-e'].on(max_e_value)
+
+ # Set similarity threshold
+ if similarity is not None:
+ smr.Parameters['--id'].on(similarity)
+
+ # Set query coverage threshold
+ if coverage is not None:
+ smr.Parameters['--coverage'].on(coverage)
+
+ # Set number of best alignments to output
+ if best is not None:
+ smr.Parameters['--best'].on(best)
+
+ # Set Blast tabular output
+ # The option --blast 3 represents an
+ # m8 blast tabular output + two extra
+ # columns containing the CIGAR string
+ # and the query coverage
+ if tabular:
+ smr.Parameters['--blast'].on("3")
+
+ # Set number of threads
+ if threads is not None:
+ smr.Parameters['-a'].on(threads)
+
+ # Run sortmerna
+ app_result = smr()
+
+ # Put clusters into a map of lists
+ f_otumap = app_result['OtuMap']
+ rows = (line.strip().split('\t') for line in f_otumap)
+ clusters = {r[0]: r[1:] for r in rows}
+
+ # Put failures into a list
+ f_failure = app_result['FastaForDenovo']
+ failures = [re.split('>| ', label)[0]
+ for label, seq in parse_fasta(f_failure)]
+
+ # remove the aligned FASTA file and failures FASTA file
+ # (currently these are re-constructed using pick_rep_set.py
+ # further in the OTU-picking pipeline)
+ smr_files_to_remove = [app_result['FastaForDenovo'].name,
+ app_result['FastaMatches'].name,
+ app_result['OtuMap'].name]
+
+ return clusters, failures, smr_files_to_remove
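+
+# Example usage (an illustrative sketch; the paths are hypothetical, and the
+# indexed database must first be built with build_database_sortmerna):
+#
+# db_name, db_filepaths = build_database_sortmerna('refseqs.fasta')
+# clusters, failures, to_remove = sortmerna_ref_cluster(
+#     seq_path='queries.fasta', sortmerna_db=db_name,
+#     refseqs_fp='refseqs.fasta', result_path='/tmp/otus/otu_map.txt')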
+
+
+def sortmerna_map(seq_path,
+ output_dir,
+ refseqs_fp,
+ sortmerna_db,
+ e_value=1,
+ threads=1,
+ best=None,
+ num_alignments=None,
+ HALT_EXEC=False,
+ output_sam=False,
+ sam_SQ_tags=False,
+ blast_format=3,
+ print_all_reads=True,
+ ):
+ """Launch sortmerna mapper
+
+ Parameters
+ ----------
+ seq_path : str
+ filepath to reads.
+ output_dir : str
+ dirpath to sortmerna output.
+ refseqs_fp : str
+ filepath of reference sequences.
+ sortmerna_db : str
+ indexed reference database.
+ e_value : float, optional
+ E-value threshold [default: 1].
+ threads : int, optional
+ number of threads to use (OpenMP) [default: 1].
+ best : int, optional
+ number of best alignments to output per read
+ [default: None].
+ num_alignments : int, optional
+ number of first alignments passing E-value threshold to
+ output per read [default: None].
+ HALT_EXEC : bool, debugging parameter
+ If passed, will exit just before the sortmerna command
+ is issued and will print out the command that would
+ have been called to stdout [default: False].
+ output_sam : bool, optional
+ flag to set SAM output format [default: False].
+    sam_SQ_tags : bool, optional
+        add SQ field to SAM output (if output_sam is True)
+        [default: False].
+    blast_format : int, optional
+        Output Blast m8 tabular + 2 extra columns for CIGAR
+        string and query coverage [default: 3].
+ print_all_reads : bool, optional
+ output NULL alignments for non-aligned reads
+ [default: True].
+
+ Returns
+ -------
+ dict of result paths set in _get_result_paths()
+ """
+
+ if not (blast_format or output_sam):
+ raise ValueError("Either Blast or SAM output alignment "
+ "format must be chosen.")
+
+ if (best and num_alignments):
+ raise ValueError("Only one of --best or --num_alignments "
+ "options must be chosen.")
+
+ # Instantiate the object
+ smr = Sortmerna(HALT_EXEC=HALT_EXEC)
+
+ # Set the input reference sequence + indexed database path
+ smr.Parameters['--ref'].on("%s,%s" % (refseqs_fp, sortmerna_db))
+
+ # Set input query sequences path
+ smr.Parameters['--reads'].on(seq_path)
+
+ # Set Blast tabular output
+ # The option --blast 3 represents an
+ # m8 blast tabular output + two extra
+ # columns containing the CIGAR string
+ # and the query coverage
+ if blast_format:
+ smr.Parameters['--blast'].on(blast_format)
+
+ # Output alignments in SAM format
+ if output_sam:
+ smr.Parameters['--sam'].on()
+ if sam_SQ_tags:
+ smr.Parameters['--SQ'].on()
+
+ # Turn on NULL string alignment output
+ if print_all_reads:
+ smr.Parameters['--print_all_reads'].on()
+
+ # Set output results path (for Blast alignments and log file)
+ output_file = join(output_dir, "sortmerna_map")
+ smr.Parameters['--aligned'].on(output_file)
+
+ # Set E-value threshold
+ if e_value is not None:
+ smr.Parameters['-e'].on(e_value)
+
+ # Set number of best alignments to output per read
+ if best is not None:
+ smr.Parameters['--best'].on(best)
+
+ # Set number of first alignments passing E-value threshold
+ # to output per read
+ if num_alignments is not None:
+ smr.Parameters['--num_alignments'].on(num_alignments)
+
+ # Set number of threads
+ if threads is not None:
+ smr.Parameters['-a'].on(threads)
+
+ # Turn off parameters related to OTU-picking
+ smr.Parameters['--fastx'].off()
+ smr.Parameters['--otu_map'].off()
+ smr.Parameters['--de_novo_otu'].off()
+ smr.Parameters['--id'].off()
+ smr.Parameters['--coverage'].off()
+
+ # Run sortmerna
+ app_result = smr()
+
+ return app_result
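+
+# Hypothetical usage sketch (the filepaths below are placeholders, not
+# part of this module): map reads against a database previously indexed
+# with indexdb_rna, then inspect the returned result paths.
+#
+#   app_result = sortmerna_map('/data/reads.fasta',
+#                              output_dir='/tmp/smr_out',
+#                              refseqs_fp='/data/refs.fasta',
+#                              sortmerna_db='/tmp/refs_idx',
+#                              threads=4)
+#   # result keys are those defined in Sortmerna._get_result_paths()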
diff --git a/bfillings/sumaclust_v1.py b/bfillings/sumaclust_v1.py
new file mode 100644
index 0000000..aa5d2b9
--- /dev/null
+++ b/bfillings/sumaclust_v1.py
@@ -0,0 +1,173 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+
+"""
+Application controller for SumaClust version 1.0
+================================================
+"""
+
+# ----------------------------------------------------------------------------
+# Copyright (c) 2014--, biocore development team
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+# ----------------------------------------------------------------------------
+
+from os.path import split, isdir, dirname, isfile, exists, realpath
+
+from burrito.util import CommandLineApplication, ResultPath
+from burrito.parameters import ValuedParameter, FlagParameter
+
+
+class Sumaclust(CommandLineApplication):
+ """ SumaClust generic application controller for de novo OTU picking
+ """
+
+ _command = 'sumaclust'
+ _command_delimiter = ' '
+ _parameters = {
+ # Reference sequence length is the shortest
+ '-l': FlagParameter('-', Name='l', Value=True),
+
+ # Filepath of the OTU-map
+ '-O': ValuedParameter('-', Name='O', Delimiter=' ',
+ Value=None, IsPath=True),
+
+ # Flag '-f' must be passed to deactivate FASTA output
+ '-f': FlagParameter('-', Name='f', Value=True),
+
+ # Number of threads
+ '-p': ValuedParameter('-', Name='p', Delimiter=' ',
+ Value=1, IsPath=False),
+
+ # Assign sequence to the best matching cluster seed, rather
+ # than the first matching cluster (having >= similarity threshold)
+ '-e': FlagParameter('-', Name='e', Value=False),
+
+ # Similarity threshold
+ '-t': ValuedParameter('-', Name='t', Delimiter=' ',
+ Value=0.97, IsPath=False),
+
+ # Maximum ratio between abundance of two sequences so that the
+ # less abundant one can be considered as a variant of the more
+ # abundant one.
+ '-R': ValuedParameter('-', Name='R', Delimiter=' ',
+ Value=1, IsPath=False)
+ }
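+    # With these defaults and an OTU map path set via '-O', the
+    # assembled command looks roughly like (illustrative; burrito may
+    # order the flags differently):
+    #   sumaclust -l -f -p 1 -t 0.97 -R 1 -O otu_map.txt reads.fasta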
+
+ _synonyms = {}
+ _input_handler = '_input_as_string'
+    _suppress_stdout = False
+    _suppress_stderr = False
+
+ def _get_result_paths(self, data):
+ """ Set the result paths
+ """
+
+ result = {}
+
+ # OTU map (mandatory output)
+ result['OtuMap'] = ResultPath(Path=self.Parameters['-O'].Value,
+ IsWritten=True)
+
+ # SumaClust will not produce any output file if the
+ # input file was empty, so we create an empty
+ # output file
+ if not isfile(result['OtuMap'].Path):
+ otumap_f = open(result['OtuMap'].Path, 'w')
+ otumap_f.close()
+
+ return result
+
+ def getHelp(self):
+ """ Method that points to documentation
+ """
+ help_str = ("SumaClust is hosted at:\n"
+ "http://metabarcoding.org/sumatra/\n\n"
+ "The following paper should be cited if this resource "
+ "is used:\n\n"
+ "SUMATRA and SUMACLUST: fast and exact comparison and "
+ "clustering "
+ "of full-length barcode sequences\n"
+ "Mercier, C., Boyer, F., Kopylova, E., Taberlet, P., "
+ "Bonin, A. and Coissac E.,"
+ "2014 (in preparation)\n"
+ )
+
+ return help_str
+
+
+def sumaclust_denovo_cluster(seq_path=None,
+ result_path=None,
+ shortest_len=True,
+ similarity=0.97,
+ threads=1,
+ exact=False,
+ HALT_EXEC=False
+ ):
+ """ Function : launch SumaClust de novo OTU picker
+
+ Parameters: seq_path, filepath to reads;
+ result_path, filepath to output OTU map;
+ shortest_len, boolean;
+ similarity, the similarity threshold (between (0,1]);
+ threads, number of threads to use;
+ exact, boolean to perform exact matching
+
+ Return : clusters, list of lists
+ """
+
+ # Sequence path is mandatory
+ if (seq_path is None
+ or not exists(seq_path)):
+ raise ValueError("Error: FASTA query sequence filepath is "
+ "mandatory input.")
+
+ # Output directory is mandatory
+ if (result_path is None
+ or not isdir(dirname(realpath(result_path)))):
+ raise ValueError("Error: output directory is mandatory input.")
+
+ # Instantiate the object
+ sumaclust = Sumaclust(HALT_EXEC=HALT_EXEC)
+
+ # Set the OTU-map filepath
+ sumaclust.Parameters['-O'].on(result_path)
+
+ # Set the similarity threshold
+ if similarity is not None:
+ sumaclust.Parameters['-t'].on(similarity)
+
+ # Set the option to perform exact clustering (default: False)
+ if exact:
+ sumaclust.Parameters['-e'].on()
+
+ # Turn off option for reference sequence length to be the shortest
+ if not shortest_len:
+ sumaclust.Parameters['-l'].off()
+
+ # Set the number of threads
+ if threads > 0:
+ sumaclust.Parameters['-p'].on(threads)
+ else:
+ raise ValueError("Number of threads must be positive.")
+
+ # Launch SumaClust,
+ # set the data string to include the read filepath
+ # (to be passed as final arguments in the sumaclust command)
+ app_result = sumaclust(seq_path)
+
+ # Put clusters into a list of lists
+ f_otumap = app_result['OtuMap']
+ clusters = [line.strip().split('\t')[1:] for line in f_otumap]
+
+ # Return clusters
+ return clusters
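+
+# Illustrative call (placeholder paths): cluster reads de novo at the
+# default 97% similarity using 4 threads.
+#
+#   clusters = sumaclust_denovo_cluster(seq_path='/data/reads.fasta',
+#                                       result_path='/tmp/otu_map.txt',
+#                                       threads=4)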
diff --git a/bfillings/swarm_v127.py b/bfillings/swarm_v127.py
new file mode 100644
index 0000000..820d3e6
--- /dev/null
+++ b/bfillings/swarm_v127.py
@@ -0,0 +1,299 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+
+"""
+Application controller for Swarm version 1.2.7
+==============================================
+"""
+
+# ----------------------------------------------------------------------------
+# Copyright (c) 2014--, biocore development team
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+# ----------------------------------------------------------------------------
+
+from os.path import exists
+from tempfile import mkstemp
+from os import close, linesep
+from subprocess import Popen, PIPE
+import re
+
+from burrito.util import (CommandLineApplication, ResultPath,
+ ApplicationNotFoundError)
+from burrito.parameters import ValuedParameter
+from skbio.parse.sequences import parse_fasta
+from skbio.util import remove_files
+
+
+class Swarm(CommandLineApplication):
+ """ Swarm generic application controller for de novo OTU picking
+ """
+
+ _command = 'swarm'
+ _command_delimiter = ' '
+ _parameters = {
+ # Resolution
+ '-d': ValuedParameter('-', Name='d', Delimiter=' ',
+ Value=1, IsPath=False),
+ # OTU-map result filename
+ '-o': ValuedParameter('-', Name='o', Delimiter=' ',
+ Value=None, IsPath=True),
+ # Threads
+ '-t': ValuedParameter('-', Name='t', Delimiter=' ',
+ Value=1, IsPath=False),
+ }
+
+ _synonyms = {}
+ _input_handler = '_input_as_string'
+    _suppress_stdout = False
+    _suppress_stderr = False
+ files_to_remove = []
+
+ def __call__(self, seq_path):
+ """
+ Input : seq_path, a filepath to input FASTA reads
+
+ Method: de-replicate FASTA reads,
+ launch Swarm followed by swarm_breaker.py,
+ expand clusters
+
+ Return: clusters, a list of lists
+ """
+
+ # De-replicate query sequences
+ exact_match_id_map, seq_path =\
+ self._apply_identical_sequences_prefilter(seq_path)
+
+ # Run Swarm
+ super(Swarm, self).__call__(seq_path)
+
+ # Run swarm_breaker.py to refine the clusters
+ clusters = self._swarm_breaker(seq_path)
+
+ # Expand clusters
+ clusters = self._map_filtered_clusters_to_full_clusters(
+ clusters, exact_match_id_map)
+
+ return clusters
+
+ def _accept_exit_status(self, exit_status):
+ """ Test for acceptable exit status
+ """
+ return exit_status == 0
+
+ def _swarm_breaker(self,
+ seq_path):
+ """
+ Input : seq_path, a filepath to de-replicated
+ input FASTA reads
+
+ Method: using swarm_breaker.py, break
+ chains of amplicons based on
+ abundance information. Abundance
+ is stored after the final
+ underscore '_' in each sequence
+ label (recommended procedure for
+ Swarm)
+
+ Return: clusters, a list of lists
+ """
+ swarm_breaker_command = ["swarm_breaker.py",
+ "-f",
+ seq_path,
+ "-s",
+ self.Parameters['-o'].Value,
+ "-d",
+ str(self.Parameters['-d'].Value)]
+
+ try:
+ # launch swarm_breaker.py as a subprocess,
+ # pipe refined OTU-map to the standard stream
+ proc = Popen(swarm_breaker_command,
+ stdout=PIPE,
+ stderr=PIPE,
+ close_fds=True)
+
+ stdout, stderr = proc.communicate()
+
+            if stderr:
+                # RuntimeError subclasses StandardError on Python 2,
+                # so existing callers remain compatible
+                raise RuntimeError(
+                    "swarm_breaker.py reported an error: %s" % stderr)
+
+ # store refined clusters in list of lists
+ clusters = []
+ for line in stdout.split(linesep):
+                # skip blank lines (e.g. the trailing empty string
+                # left by a final line separator)
+                if not line:
+                    continue
+ seq_ids = re.split("\t| ", line.strip())
+ # remove the abundance information from the labels
+ for i in range(len(seq_ids)):
+ seq_ids[i] = seq_ids[i].rsplit("_", 1)[0]
+ clusters.append(seq_ids)
+ except OSError:
+ raise ApplicationNotFoundError("Cannot find swarm_breaker.py "
+ "in the $PATH directories.")
+
+ return clusters
+
+ def _prefilter_exact_matches(self,
+ seqs):
+ """
+ """
+ unique_sequences = {}
+ seq_id_map = {}
+ filtered_seqs = []
+ for seq_id, seq in seqs:
+ seq_id = seq_id.split()[0]
+ try:
+ temp_seq_id = unique_sequences[seq]
+ except KeyError:
+ temp_seq_id = 'ExactMatch.%s' % seq_id
+ unique_sequences[seq] = temp_seq_id
+ seq_id_map[temp_seq_id] = []
+ filtered_seqs.append((temp_seq_id, seq))
+ seq_id_map[temp_seq_id].append(seq_id)
+ return filtered_seqs, seq_id_map
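+        # e.g. [('s1', 'ACGT'), ('s2', 'ACGT'), ('s3', 'TTGG')] yields
+        # filtered_seqs [('ExactMatch.s1', 'ACGT'), ('ExactMatch.s3', 'TTGG')]
+        # and seq_id_map {'ExactMatch.s1': ['s1', 's2'],
+        #                 'ExactMatch.s3': ['s3']}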
+
+ def _apply_identical_sequences_prefilter(self,
+ seq_path):
+ """
+ Input : seq_path, a filepath to input FASTA reads
+ Method: prepares and writes de-replicated reads
+ to a temporary FASTA file, calls
+ parent method to do the actual
+ de-replication
+ Return: exact_match_id_map, a dictionary storing
+ de-replicated amplicon ID as key and
+ all original FASTA IDs with identical
+ sequences as values;
+ unique_seqs_fp, filepath to FASTA file
+ holding only de-replicated sequences
+ """
+ # creating mapping for de-replicated reads
+ seqs_to_cluster, exact_match_id_map =\
+ self._prefilter_exact_matches(parse_fasta(seq_path))
+
+ # create temporary file for storing the de-replicated reads
+ fd, unique_seqs_fp = mkstemp(
+ prefix='SwarmExactMatchFilter', suffix='.fasta')
+ close(fd)
+
+ self.files_to_remove.append(unique_seqs_fp)
+
+ # write de-replicated reads to file
+ unique_seqs_f = open(unique_seqs_fp, 'w')
+ for seq_id, seq in seqs_to_cluster:
+ unique_seqs_f.write('>%s_%d\n%s\n'
+ % (seq_id,
+ len(exact_match_id_map[seq_id]),
+ seq))
+ unique_seqs_f.close()
+
+ return exact_match_id_map, unique_seqs_fp
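+        # the abundance suffix written above ('>ExactMatch.s1_2' for a
+        # read seen twice) is what swarm_breaker.py later strips with
+        # rsplit('_', 1)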
+
+ def _map_filtered_clusters_to_full_clusters(self,
+ clusters,
+ filter_map):
+ """
+ Input: clusters, a list of cluster lists
+ filter_map, the seq_id in each clusters
+ is the key to the filter_map
+ containing all seq_ids with
+ duplicate FASTA sequences
+ Output: an extended list of cluster lists
+ """
+ results = []
+ for cluster in clusters:
+ full_cluster = []
+ for seq_id in cluster:
+ full_cluster += filter_map[seq_id]
+ results.append(full_cluster)
+ return results
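+        # e.g. clusters = [['ExactMatch.s1']] with
+        # filter_map = {'ExactMatch.s1': ['s1', 's2']}
+        # expands to [['s1', 's2']]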
+
+ def _get_result_paths(self, data):
+ """ Set the result paths
+ """
+
+ # Swarm OTU map (mandatory output)
+ return {'OtuMap': ResultPath(Path=self.Parameters['-o'].Value,
+ IsWritten=True)}
+
+ def getHelp(self):
+ """ Method that points to documentation
+ """
+ help_str = ("Swarm is hosted at:\n"
+ "https://github.com/torognes/swarm\n\n"
+ "The following paper should be cited if this resource "
+ "is used:\n\n"
+ "Swarm: robust and fast clustering method for "
+ "amplicon-based studies\n"
+ "Mahe, F., Rognes, T., Quince, C., de Vargas, C., "
+ "and Dunthorn, M."
+ "2014 (submitted)\n"
+ )
+
+ return help_str
+
+
+def swarm_denovo_cluster(seq_path,
+ d=1,
+ threads=1,
+ HALT_EXEC=False):
+ """ Function : launch the Swarm de novo OTU picker
+
+ Parameters: seq_path, filepath to reads
+ d, resolution
+ threads, number of threads to use
+
+ Return : clusters, list of lists
+ """
+
+ # Check sequence file exists
+ if not exists(seq_path):
+ raise ValueError("%s does not exist" % seq_path)
+
+ # Instantiate the object
+ swarm = Swarm(HALT_EXEC=HALT_EXEC)
+
+ # Set the resolution
+ if d > 0:
+ swarm.Parameters['-d'].on(d)
+ else:
+ raise ValueError("Resolution -d must be a positive integer.")
+
+ # Set the number of threads
+ if threads > 0:
+ swarm.Parameters['-t'].on(threads)
+ else:
+ raise ValueError("Number of threads must be a positive integer.")
+
+ # create temporary file for Swarm OTU-map
+ f, tmp_swarm_otumap = mkstemp(prefix='temp_otumap_',
+ suffix='.swarm')
+ close(f)
+
+ swarm.Parameters['-o'].on(tmp_swarm_otumap)
+
+ # Remove this file later, the final OTU-map
+ # is output by swarm_breaker.py and returned
+ # as a list of lists (clusters)
+ swarm.files_to_remove.append(tmp_swarm_otumap)
+
+ # Launch Swarm
+ # set the data string to include the read filepath
+ # (to be passed as final arguments in the swarm command)
+ clusters = swarm(seq_path)
+
+ remove_files(swarm.files_to_remove, error_on_missing=False)
+
+ # Return clusters
+ return clusters
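+
+# Illustrative call (placeholder path): pick OTUs at resolution d=1 on
+# 4 threads; both swarm and swarm_breaker.py must be on the PATH.
+#
+#   clusters = swarm_denovo_cluster('/data/reads.fasta', d=1, threads=4)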
diff --git a/bfillings/tests/__init__.py b/bfillings/tests/__init__.py
new file mode 100644
index 0000000..0b50c1b
--- /dev/null
+++ b/bfillings/tests/__init__.py
@@ -0,0 +1,9 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
diff --git a/bfillings/tests/test_blast.py b/bfillings/tests/test_blast.py
new file mode 100644
index 0000000..f98e8e9
--- /dev/null
+++ b/bfillings/tests/test_blast.py
@@ -0,0 +1,256 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+
+from string import split, strip
+from os import popen, remove
+from glob import glob
+from unittest import TestCase, main
+
+from cogent.parse.blast import QMEBlast9
+
+from bfillings.blast import (seqs_to_stream, make_subject_match_scorer,
+ make_shotgun_scorer, keep_everything_scorer,
+ ids_from_seq_lower_threshold, PsiBlast,
+ psiblast_n_neighbors)
+
+
+class BlastTests(TestCase):
+ """Tests of top-level functions"""
+
+ def setUp(self):
+ """Define some standard data"""
+ self.rec = """# BLASTP 2.2.10 [Oct-19-2004]
+# Iteration: 1
+# Query: ece:Z4181
+# Database: db/everything.faa
+# Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score
+ece:Z4181 ece:Z4181 100.00 110 0 0 1 110 1 110 3e-47 187
+ece:Z4181 ecs:ECs3717 100.00 110 0 0 1 110 1 110 3e-47 187
+ece:Z4181 cvi:CV2421 41.67 72 42 0 39 110 29 100 2e-06 52.8
+# BLASTP 2.2.10 [Oct-19-2004]
+# Iteration: 2
+# Query: ece:Z4181
+# Database: db/everything.faa
+# Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score
+ece:Z4181 ece:Z4181 100.00 110 0 0 1 110 1 110 3e-54 211
+ece:Z4181 ecs:ECs3717 100.00 110 0 0 1 110 1 110 3e-54 211
+ece:Z4181 cvi:CV2421 41.67 72 42 0 39 110 29 100 2e-08 59.0
+ece:Z4181 sfl:CP0138 33.98 103 57 2 8 110 6 97 6e-06 50.5
+ece:Z4181 spt:SPA2730 37.50 72 45 0 39 110 30 101 1e-05 49.8
+ece:Z4181 sec:SC2804 37.50 72 45 0 39 110 30 101 1e-05 49.8
+ece:Z4181 stm:STM2872 37.50 72 45 0 39 110 30 101 1e-05 49.8
+# BLASTP 2.2.10 [Oct-19-2004]
+# Iteration: 1
+# Query: ece:Z4182
+# Database: db/everything.faa
+# Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score
+ece:Z4182 ece:Z4182 100.00 110 0 0 1 110 1 110 3e-47 187
+ece:Z4182 ecs:ECs3718 100.00 110 0 0 1 110 1 110 3e-47 187
+ece:Z4182 cvi:CV2422 41.67 72 42 0 39 110 29 100 2e-06 52.8""".split('\n')
+
+ self.rec2 = """# BLASTP 2.2.10 [Oct-19-2004]
+# Iteration: 1
+# Query: ece:Z4181
+# Database: db/everything.faa
+# Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score
+ece:Z4181 ece:Z4181 100.00 110 0 0 1 110 1 110 3e-47 187
+ece:Z4181 ecs:ECs3717 100.00 110 0 0 1 110 1 110 3e-47 187
+ece:Z4181 spt:SPA2730 37.50 72 45 0 39 110 30 101 1e-05 49.8
+# BLASTP 2.2.10 [Oct-19-2004]
+# Iteration: 2
+# Query: ece:Z4181
+# Database: db/everything.faa
+# Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score
+ece:Z4181 ecs:ECs3717 100.00 110 0 0 1 110 1 110 3e-54 211
+ece:Z4181 cvi:CV2421 41.67 72 42 0 39 110 29 100 2e-08 59.0
+# BLASTP 2.2.10 [Oct-19-2004]
+# Iteration: 1
+# Query: ece:Z4182
+# Database: db/everything.faa
+# Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score
+ece:Z4182 ece:Z4182 100.00 110 0 0 1 110 1 110 3e-47 187
+ece:Z4182 cvi:CV2421 41.67 72 42 0 39 110 29 100 2e-06 52.8""".split('\n')
+
+ self.rec3 = """# BLASTP 2.2.10 [Oct-19-2004]
+# BLASTP 2.2.10 [Oct-19-2004]
+# Query: ece:Z4181
+# Database: db/everything.faa
+# Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score
+ece:Z4181 ece:Z4182 100.00 110 0 0 1 110 1 110 3e-47 187
+ece:Z4181 ecs:ECs3717 100.00 110 0 0 1 110 1 110 3e-47 187
+ece:Z4181 spt:SPA2730 37.50 72 45 0 39 110 30 101 1e-05 49.8
+# BLASTP 2.2.10 [Oct-19-2004]
+# Iteration: 1
+# Query: ece:Z4182
+# Database: db/everything.faa
+# Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score
+ece:Z4182 ece:Z4182 100.00 110 0 0 1 110 1 110 3e-47 187
+ece:Z4182 cvi:CV2421 41.67 72 42 0 39 110 29 100 2e-06 52.8
+# BLASTP 2.2.10 [Oct-19-2004]
+# Query: ece:Z4183
+# Database: db/everything.faa
+# Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score
+ece:Z4183 ece:Z4182 100.00 110 0 0 1 110 1 110 3e-47 187
+ece:Z4183 ecs:ECs3717 100.00 110 0 0 1 110 1 110 3e-54 211
+ece:Z4183 cvi:CV2421 41.67 72 42 0 39 110 29 100 2e-08 59.0""".split('\n')
+
+ self.query_1 = """>gi|100002553| Bd2556c Bd2556c two-component system sensor histidine kinase 3092017:3094158 reverse MW:81963
+MRLKNRLNNWISIRMGMVIVIFLGVSCGSMRSSTPPPAKDRLTEIDSLERLLPDCPTIASTLPLLRRLAFLYQQQSEMKVYNERLYENAMAVDSISVAYLGLKNLAEYYYDQSVRDSLEYYCSLVDSIAKARHEYPNVLFDVKSLSSQDLLWLGNYELAMSEAMDLYRLASNLDHRYGLLRCSETLGLIYQRIRRDSDAVVSFQESLDLLKDIKDVPDIMDTKVRLTSYQLESSVRTKQYASTERILGQYMALLDEQYKIYQEKNDLLSIKREYWLLYSFYTSFYLSQGDLENAKRSLDQASSYADSNWVEGDYAINTYLTVKARYHKAAGDIPLALHCINEVLETERLPEDIQFKADILKEQGQLGEVMALYDELYSTLTKRRGTSFLRQVNQLRTLHELHEKELKETELKEAGQRIARKQDLLIFILSISVVLLILLYVLFLYYRHLRSLKNQLQREKELLLESQRQLIKEKTRAEEASLMKSAFLANMS [...]
+
+ self.query_2 = """>gi|100002557| Bd2560c Bd2560c conserved hypothetical protein 3097971:3098210 reverse MW:8927
+MGKNQLIHGNEFHLLKQAEIHKATGKLVESLNLAAGSTGGFDIYKVVEAYFTDLEKRKEINDLLGISEPCETRVTEECFS
+"""
+
+ self.fasta_recs = """>gi|100002550| Bd2553c Bd2553c conserved hypothetical protein 3090609:3091013 reverse MW:14682
+MMDFISVPLVVGIVCAGIYGLFELFVRKRERLAIIEKIGDKLDTSAFDGKLGLPNYMRNFSFSSLKAGCLLAGIGLGLLVGFIINMCMATNSYYDDGWYRHEVAGTAYGASVLLFGGIGLIIAFVIELKLGKNNK
+>gi|100002551| Bd2554 Bd2554 RNA polymerase ECF-type sigma factor 3091112:3091717 forward MW:23408
+LLPQVVTYLPGLRPLSTMELYTDTYYIQRIQAGDVACFACLLDKYSRPIHSLILKVVRSQEEAEELAQDTFMKVFKNLASFKGDCSFSTWIYRIAYNTAISSVRKKRYEFLAIEETTLENVSEEEITNLFGQTESTEQVQRLEVALEQLLPDERALILLFYWKEKTIEELVSITGLTASNIKVKLHRIRKKLFVLLNGMDHE
+>gi|100002552| Bd2555 Bd2555 conserved hypothetical protein 3091713:3092066 forward MW:13332
+MSKINTNKEQPDLLGDLFKRIPEEELPASFRSNVMRQIMLESAKAKKRDERFSLLAAIVASLIMISLAIVSFVYMEIPKIAIPTISTSALAFYLYIGAITLILLLADYKLRNLFHKKG
+>gi|100002553| Bd2556c Bd2556c two-component system sensor histidine kinase 3092017:3094158 reverse MW:81963
+MRLKNRLNNWISIRMGMVIVIFLGVSCGSMRSSTPPPAKDRLTEIDSLERLLPDCPTIASTLPLLRRLAFLYQQQSEMKVYNERLYENAMAVDSISVAYLGLKNLAEYYYDQSVRDSLEYYCSLVDSIAKARHEYPNVLFDVKSLSSQDLLWLGNYELAMSEAMDLYRLASNLDHRYGLLRCSETLGLIYQRIRRDSDAVVSFQESLDLLKDIKDVPDIMDTKVRLTSYQLESSVRTKQYASTERILGQYMALLDEQYKIYQEKNDLLSIKREYWLLYSFYTSFYLSQGDLENAKRSLDQASSYADSNWVEGDYAINTYLTVKARYHKAAGDIPLALHCINEVLETERLPEDIQFKADILKEQGQLGEVMALYDELYSTLTKRRGTSFLRQVNQLRTLHELHEKELKETELKEAGQRIARKQDLLIFILSISVVLLILLYVLFLYYRHLRSLKNQLQREKELLLESQRQLIKEKTRAEEASLMKSAFLANMS [...]
+>gi|100002554| Bd2557c Bd2557c two-component system sensor histidine kinase 3094158:3095507 reverse MW:51247
+LERKYNGEGKIFPVKRHRCLMSCYYCELYTMKGNSGKAQAYLDQATAYLDSSFGDRVEAQYLRTKSFYYWKEKDYRHALSAVNLALKINRDLDKLEMKKAVLQSSGQLQEAVTIYEEIINKTETINTDAFDRQIEQLRVLNDLNDLEKQDRELKLKSEQEALKQKQIVVSIGLLLVLMGLLYMLWRIYMHTKRLRNELLQEKDSLTASEKQLRVVTKEAEAANKKKSAFIANISHEVRTPLNAIVGFSELLASSEYSEEEKIRFAGEVNHSSELLLNLVNDVLDLSRLESGKIKFSVKPNDLVACCQRALDSIRHRVKPGVRLTFTPSIESYTLNTDALRLQQLLTNLLSNAAKFTSEGEINLSFTVDEGKEEVCFSVTDTGCGIPEDKCEKIFERFEKLDDFIQGTGLGLSVCQIISEQLNGSLSVDISYKDGARFVFIHPTNLIETPI
+>gi|100002555| Bd2558c Bd2558c hypothetical protein 3095527:3095985 reverse MW:17134
+LRGKNIHLGRVGCNYGKLLIFIDIYFVSLRIVSDKSMSRGFLRKSSVNTFIGIVWILFAVGTSAQNAVSKFRADSIRQSLSRIQKPQDKIPLLKELIGLYWQLPEEVLALKEIIDIAMPLDSIGIVYDAMAGLSRYYPAIRTFVRVGGALETV
+>gi|100002556| Bd2559 Bd2559 30S ribosomal protein S1 3096095:3097882 forward MW:67092
+MENLKNIQPVEDFNWDAFEQGETYTEVSKDDLVKTYDETLNTVKDKEVVMGTVTSMNKREVVVNIGFKSDGVVPMSEFRYNPDLKIGDEVEVYIESQEDKKGQLILSHKKARATRSWDRVNEALEKDEIIKGYIKCRTKGGMIVDVFGIEAFLPGSQIDVKPIRDYDVFVGKTMEFKIVKINQEFKNVVVSHKALIEAELEQQKKDIISKLEKGQVLEGTVKNITSYGVFIDLGGVDGLIHITDLSWGRVSHPEEIVQLDQKINVVILDFDDEKKRIALGLKQLTPHPWDALDTNLKVGDKVKGKVVVMADYGAFIEIAPGVEGLIHVSEMSWTQHLRSAQDFMKVGDEIEAVILTLDRDERKMSLGIKQLKADPWENIEERFPVGSRHAAKVRNFTNFGVFVEIEEGVDGLIHISDLSWTKKIKHPSEFTQIGAEIEVQVLEIDKENRRLSLGHKQLEENPWDVFETIFTVGSIHEGTIIEVLDKGAVISL [...]
+>gi|100002557| Bd2560c Bd2560c conserved hypothetical protein 3097971:3098210 reverse MW:8927
+MGKNQLIHGNEFHLLKQAEIHKATGKLVESLNLAAGSTGGFDIYKVVEAYFTDLEKRKEINDLLGISEPCETRVTEECFS
+>gi|100002558| Bd2561 Bd2561 phosphoglycolate phosphatase 3098389:3099033 forward MW:24182
+MKKLVIFDLDGTLLNTIADLAHSTNHALRQNGFPTHDVKEYNFFVGNGINKLFERALPEGEKTAENILKVREEFLKHYDLHNTDRSVPYPGVPELLALLQERGIKLAVASNKYQAATRKLIAHFFPSIQFTEVLGQREGVKAKPDPSIVNEIVERASISKESTLYVGDSDVDMQTAINSEVTSCGVTWGFRPRTELEKYAPDHIAEKAEDILKFI
+>gi|100002559| Bd2562 Bd2562 conserved hypothetical protein 3099382:3100299 forward MW:35872
+MSGNIKKIVEPNSGIDYSLEKDFKIFTLSKELPITTYPSYIRLGIVIYCVKGNAKIDIYSNKHIITPKELIIILPGQLVALTDVSVDFQIRYFTITESFYSDILSGISRFSPHFFFYMRQHYYFKMEDVETLSFVDFFELLIRKAVDPENQYRRESVILLLRILFLDIYNHYKVNSLDSTATIDVHKKELTHKFFQLVMSNYKVNRSVTFYANSLCITPKYLTMVVKEVSGKSAKDWITEYMILELKGLLTNSTLNIQEIVEKTQFSNQSSLGRFFRRHTGLSPLQYRKKYLTTEQRTNFSKNNTI
+"""
+
+ def test_seqs_to_stream(self):
+ """seqs_to_stream should iterate over seqs"""
+ sts = seqs_to_stream
+ self.assertEqual(list(sts('>a\nTG\n>b\nWW\n', \
+ '_input_as_multiline_string')),\
+ [['>a','TG'],['>b','WW']])
+ #skipping test for file open
+ self.assertEqual(list(sts(['TG','WW'], '_input_as_seqs')), \
+ [['>0','TG'],['>1','WW']])
+ self.assertEqual(list(sts(['>a','TG','>b','WW'], \
+ '_input_as_lines')),\
+ [['>a','TG'],['>b','WW']])
+ self.assertRaises(TypeError, sts, 'abc', 'xyz')
+
+ def test_make_subject_match_scorer(self):
+ """make_subject_match_scorer should keep ids matching n queries"""
+ qm1 = make_subject_match_scorer(1)
+ qm3 = make_subject_match_scorer(3)
+ qm5 = make_subject_match_scorer(5)
+ qmes = wrap_qmes(QMEBlast9(self.rec3))
+ self.assertItemsEqual(qm1(qmes), ['ece:Z4181','ece:Z4182','ece:Z4183'])
+ self.assertItemsEqual(qm3(qmes), ['ece:Z4181','ece:Z4183'])
+ self.assertItemsEqual(qm5(qmes), [])
+
+ def test_make_shotgun_scorer(self):
+ """make_shotgun_scorer should keep ids matching n queries"""
+ sg1 = make_shotgun_scorer(1)
+ sg2 = make_shotgun_scorer(2)
+ sg3 = make_shotgun_scorer(3)
+ sg4 = make_shotgun_scorer(4)
+ sg5 = make_shotgun_scorer(5)
+ qmes = wrap_qmes(QMEBlast9(self.rec3))
+ self.assertItemsEqual(sg1(qmes), keep_everything_scorer(qmes))
+ self.assertItemsEqual(sg2(qmes), \
+ ['ece:Z4181','ece:Z4182','ece:Z4183','cvi:CV2421','ecs:ECs3717'])
+ self.assertItemsEqual(sg3(qmes), \
+ ['ece:Z4181','ece:Z4182','ece:Z4183'])
+ self.assertItemsEqual(sg4(qmes), \
+ ['ece:Z4182'])
+ self.assertItemsEqual(sg5(qmes), [])
+
+ def test_keep_everything_scorer(self):
+ """keep_everything_scorer should keep all ids found."""
+ k = keep_everything_scorer(wrap_qmes(QMEBlast9(self.rec2)))
+ self.assertItemsEqual(k, \
+ ['ece:Z4181','ecs:ECs3717','spt:SPA2730','cvi:CV2421','ece:Z4182'])
+
+ def test_ids_from_seq_lower_threshold(self):
+ "ids_from_seq_lower_threshold returns psiblast hits, decreasing sens"
+ bdb_seqs = self.fasta_recs
+ f = open('test_bdb', 'w')
+ f.write(bdb_seqs)
+ f.close()
+ temp = popen('formatdb -i test_bdb -o T -p T')
+ params = {'-j':2,
+ '-d':'test_bdb'}
+ query = self.query_1.split('\n')
+ app = PsiBlast(params=params,
+ InputHandler='_input_as_lines')
+ #the command below should result in finding itself and 2554
+ #it should run for max_iterations
+ result = ids_from_seq_lower_threshold(query, n=12, \
+ max_iterations=4, app=app, core_threshold=1e-50, \
+ lower_threshold=1e-20, step=10000)
+ self.assertEqual(result[0],\
+ [('gi|100002553', '0.0'), ('gi|100002554', '0.0')])
+ self.assertEqual(result[1], 4)
+ #if n=2, it should find the same sequences but only run for 1 iteration
+ #since it would hit n after the first blast search
+ result = ids_from_seq_lower_threshold(query, n=2, \
+ max_iterations=4, app=app, core_threshold=1e-50, \
+ lower_threshold=1e-20, step=10000)
+ self.assertEqual(result[0],\
+ [('gi|100002553', '0.0'), ('gi|100002554', '0.0')])
+ self.assertEqual(result[1], 1)
+ query = self.query_2.split('\n')
+        #query_2's e-value for itself is 9e-47, so it should not be found
+        #with the lower_threshold set to 1e-48
+ result = ids_from_seq_lower_threshold(query, n=12, \
+ max_iterations=4, app=app, core_threshold=1e-50, \
+ lower_threshold=1e-48, step=10000)
+ self.assertEqual(result[0], [])
+ #it also should not be found if the max_iterations is set to 1
+ result = ids_from_seq_lower_threshold(query, n=12, \
+ max_iterations=1, app=app, core_threshold=1e-50, \
+ lower_threshold=1e-20, step=10000)
+ self.assertEqual(result[0], [])
+ for fname in ['formatdb.log'] + glob('test_bdb*'):
+ remove(fname)
+
+ def test_psiblast_n_neighbors(self):
+ "psiblast_n_neighbors psiblasts and stops when n neighbors are reached"
+ bdb_seqs = self.fasta_recs
+ f = open('test_bdb', 'w')
+ f.write(bdb_seqs)
+ f.close()
+ temp = popen('formatdb -i test_bdb -o T -p T')
+ params = {'-j':11}
+ lines = bdb_seqs.split('\n')
+ results = psiblast_n_neighbors(lines, n=12, blast_db='test_bdb', \
+ method='lower_threshold', params=params,\
+ core_threshold=1e-50, step=10000)
+ #there should be 10 result entries since there were 10 queries
+ self.assertEqual(len(results), 10)
+ for i in results:
+ #each query should at least find itself
+            self.assertTrue(len(results[i][0]) >= 1)
+            #each query should iterate 11 times (the -j value) since it
+            #can never reach n
+            self.assertEqual(results[i][1], 11)
+ for fname in ['formatdb.log'] + glob('test_bdb*'):
+ remove(fname)
+
+
+def wrap_qmes(qmes):
+ """Converts qmes into a dict of {q:{m:e}}"""
+ d = {}
+ for q, m, e in qmes:
+ if q not in d:
+ d[q] = {}
+ d[q][m] = e
+ return d
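+    # e.g. wrap_qmes([('q1', 'm1', '1e-5'), ('q1', 'm2', '0.0')]) returns
+    # {'q1': {'m1': '1e-5', 'm2': '0.0'}}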
+
+if __name__ == "__main__":
+ main()
diff --git a/bfillings/tests/test_blat.py b/bfillings/tests/test_blat.py
new file mode 100755
index 0000000..190d15a
--- /dev/null
+++ b/bfillings/tests/test_blat.py
@@ -0,0 +1,346 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+
+from unittest import TestCase, main
+from os.path import exists
+from os import remove
+from re import search
+from tempfile import mkstemp
+
+from bfillings.blat import (Blat, assign_reads_to_database,
+ assign_dna_reads_to_dna_database,
+ assign_dna_reads_to_protein_database)
+
+__author__ = "Adam Robbins-Pianka"
+__copyright__ = "Copyright 2007-2012, The Cogent Project"
+__credits__ = ["Adam Robbins-Pianka", "Daniel McDonald"]
+__license__ = "GPL"
+__version__ = "1.5.3-dev"
+__maintainer__ = "Adam Robbins-Pianka"
+__email__ = "adam.robbinspianka at colorado.edu"
+__status__ = "Production"
+
+
+class BlatTests(TestCase):
+ files_to_remove = []
+
+ def setUp(self):
+ """Sets up files for testing.
+ """
+ _, self.test_db_prot_filename = mkstemp()
+ self.test_db_prot = open(self.test_db_prot_filename, 'w')
+
+ _, self.test_db_dna_filename = mkstemp()
+ self.test_db_dna = open(self.test_db_dna_filename, 'w')
+
+ _, self.test_query_filename = mkstemp()
+ self.test_query = open(self.test_query_filename, 'w')
+
+ # write the global variables at the bottom of this file to the
+ # temporary test files. Can't use file-like objects because the
+ # external application needs actual files.
+ self.test_db_prot.write('\n'.join(test_db_prot))
+ self.test_db_dna.write('\n'.join(test_db_dna))
+ self.test_query.write('\n'.join(test_query))
+
+ # close the files
+ self.test_db_prot.close()
+ self.test_db_dna.close()
+ self.test_query.close()
+
+ # prepare output file path
+ _, self.testout = mkstemp()
+
+ self.files_to_remove += [self.test_db_prot_filename,
+ self.test_db_dna_filename,
+ self.test_query_filename, self.testout]
+
+ def tearDown(self):
+ """Removes temporary files created during the tests
+ """
+ for filename in self.files_to_remove:
+ if exists(filename):
+ remove(filename)
+
+ def test_assign_reads_to_database(self):
+ """Tests that assign_reads_to_database works as expected.
+
+ Checks the output file against the expected result when known
+ database and query files are used.
+ """
+ exp = [l for l in assign_reads_exp if not l.startswith('#')]
+ obs_lines = assign_reads_to_database(self.test_query_filename,
+ self.test_db_dna_filename,
+ self.testout).read().splitlines()
+ obs = [l for l in obs_lines if not l.startswith('#')]
+
+ self.assertEqual(obs, exp)
+
+ def test_assign_dna_reads_to_dna_database(self):
+ """Tests that assign_dna_reads_to_dna_database works as expected.
+
+ Checks the output file against the expected result when known
+ database and query files are used.
+ """
+ exp = [l for l in assign_reads_exp if not l.startswith('#')]
+
+ obs_lines = assign_dna_reads_to_dna_database(self.test_query_filename,
+ self.test_db_dna_filename,
+ self.testout).read().splitlines()
+ obs = [l for l in obs_lines if not l.startswith('#')]
+
+ self.assertEqual(obs, exp)
+
+ def test_assign_dna_reads_to_protein_database(self):
+ """Tests that assign_dna_reads_to_protein_database works as expected.
+
+ Checks the output file against the expected result when known
+ database and query files are used.
+ """
+ exp = [l for l in assign_reads_prot_exp if not l.startswith('#')]
+
+ obs_lines = assign_dna_reads_to_protein_database(
+ self.test_query_filename,
+ self.test_db_prot_filename,
+ self.testout).read().splitlines()
+ obs = [l for l in obs_lines if not l.startswith('#')]
+
+ self.assertEqual(obs, exp)
+
+ def test_get_base_command(self):
+ """Tests that _get_base_command generates the proper command given
+ various inputs.
+ """
+ test_parameters_blank = {}
+ files = (self.test_query_filename, self.test_db_dna_filename,
+ self.testout)
+ exp_blank = 'blat %s %s %s' % (files[1], files[0], files[2])
+
+ # initialize a Blat instance with these parameters and get the
+ # command string
+ b = Blat(params={}, HALT_EXEC=True)
+ # need to set the positional parameters' values
+ b._input_as_list(files)
+ cmd = b._get_base_command()
+
+ # find the end of the cd command and trim the base command
+ cmd_index = search('cd ".+"; ', cmd).end()
+ cmd = cmd[cmd_index:]
+ self.assertEqual(cmd, exp_blank)
+
+ test_parameters_1 = {
+ '-t': 'dna',
+ '-q': 'dna',
+ '-ooc': '11.ooc',
+ '-tileSize': 1,
+ '-stepSize': 2,
+ '-oneOff': 1,
+ '-minMatch': 2,
+ '-minScore': 3,
+ '-minIdentity': 4,
+ '-maxGap': 5,
+ '-makeOoc': 'N.ooc',
+ '-repMatch': 6,
+ '-mask': 'lower',
+ '-qMask': 'lower',
+ '-repeats': 'lower',
+ '-minRepDivergence': 7,
+ '-dots': 8,
+ '-out': 'psl',
+ '-maxIntron': 9}
+ exp_1 = 'blat %s %s ' % (files[1], files[0]) + \
+ '-dots=8 -makeOoc="N.ooc" -mask=lower -maxGap=5 ' + \
+ '-maxIntron=9 -minIdentity=4 -minMatch=2 ' + \
+ '-minRepDivergence=7 -minScore=3 -oneOff=1 -ooc="11.ooc" ' + \
+ '-out=psl -q=dna -qMask=lower -repMatch=6 -repeats=lower ' + \
+ '-stepSize=2 -t=dna -tileSize=1 %s' % files[2]
+
+ # initialize a Blat instance with these parameters and get the
+ # command string
+ b = Blat(params=test_parameters_1, HALT_EXEC=True)
+ # need to set the positional parameters' values
+ b._input_as_list(files)
+ cmd = b._get_base_command()
+
+ # find the end of the cd command and trim the base command
+ cmd_index = search('cd ".+"; ', cmd).end()
+ cmd = cmd[cmd_index:]
+ self.assertEqual(cmd, exp_1)
+
+ test_parameters_2 = {
+ '-tileSize': 1,
+ '-stepSize': 2,
+ '-minMatch': 2,
+ '-minScore': 3,
+ '-minIdentity': 4,
+ '-maxGap': 5,
+ '-makeOoc': 'N.ooc',
+ '-out': 'psl',
+ '-maxIntron': 9}
+ exp_2 = 'blat %s %s ' % (files[1], files[0]) + \
+ '-makeOoc="N.ooc" -maxGap=5 -maxIntron=9 -minIdentity=4 ' + \
+ '-minMatch=2 -minScore=3 -out=psl -stepSize=2 ' + \
+ '-tileSize=1 %s' % files[2]
+
+ # initialize a Blat instance with these parameters and get the
+ # command string
+ b = Blat(params=test_parameters_2, HALT_EXEC=True)
+ # need to set the positional parameters' values
+ b._input_as_list(files)
+ cmd = b._get_base_command()
+
+ # find the end of the cd command and trim the base command
+ cmd_index = search('cd ".+"; ', cmd).end()
+ cmd = cmd[cmd_index:]
+ self.assertEqual(cmd, exp_2)
+
+assign_reads_exp = """# BLAT 34 [2006/03/10]
+# Query: NZ_GG770509_647533119
+# Database: test_db.fasta
+# Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score
+NZ_GG770509_647533119 NZ_GG770509_647533119 100.00 1371 0 0 1 1371 1 1371 0.0e+00 2187.0
+NZ_GG770509_647533119 NZ_ACIZ01000148_643886127 85.49 634 92 0 336 969 337 970 4.5e-234 807.0
+NZ_GG770509_647533119 NZ_ACIZ01000148_643886127 86.08 237 33 0 1135 1371 1137 1373 1.2e-77 287.0
+NZ_GG770509_647533119 NZ_ACIZ01000148_643886127 83.12 154 26 0 976 1129 977 1130 2.2e-48 190.0
+NZ_GG770509_647533119 NZ_GG739926_647533195 78.42 329 71 0 656 984 657 985 4.8e-97 351.0
+NZ_GG770509_647533119 NZ_GG739926_647533195 89.09 110 11 1 1138 1246 1141 1250 1.1e-30 131.0
+NZ_GG770509_647533119 NZ_GG739926_647533195 86.96 69 9 0 1021 1089 1023 1091 3.2e-20 96.0
+NZ_GG770509_647533119 NZ_GG739926_647533195 75.26 97 22 2 356 450 356 452 2.3e-13 73.0
+NZ_GG770509_647533119 NZ_GG739926_647533195 90.57 53 5 0 1319 1371 1315 1367 2.5e-10 63.0
+NZ_GG770509_647533119 NZ_GG739926_647533195 81.82 22 4 0 989 1010 992 1013 1.5e+02 24.0
+# BLAT 34 [2006/03/10]
+# Query: NZ_GG739926_647533195
+# Database: test_db.fasta
+# Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score
+NZ_GG739926_647533195 NZ_GG739926_647533195 100.00 1367 0 0 1 1367 1 1367 0.0e+00 2235.0
+NZ_GG739926_647533195 NZ_ACIZ01000148_643886127 76.22 572 136 0 414 985 414 985 1.7e-158 556.0
+NZ_GG739926_647533195 NZ_ACIZ01000148_643886127 76.80 181 42 0 1023 1203 1022 1202 6.4e-53 205.0
+NZ_GG739926_647533195 NZ_ACIZ01000148_643886127 96.00 50 2 0 1209 1258 1207 1256 6.4e-14 75.0
+NZ_GG739926_647533195 NZ_ACIZ01000148_643886127 88.68 53 6 0 1315 1367 1321 1373 1.6e-09 61.0
+NZ_GG739926_647533195 NZ_ACIZ01000148_643886127 77.27 22 5 0 992 1013 990 1011 8.5e+02 22.0
+NZ_GG739926_647533195 NZ_GG770509_647533119 79.29 280 58 0 657 936 656 935 9.9e-82 301.0
+NZ_GG739926_647533195 NZ_GG770509_647533119 89.09 110 11 1 1141 1250 1138 1246 1.1e-30 131.0
+NZ_GG739926_647533195 NZ_GG770509_647533119 86.96 69 9 0 1023 1091 1021 1089 3.2e-20 96.0
+NZ_GG739926_647533195 NZ_GG770509_647533119 75.26 97 22 2 356 452 356 450 2.3e-13 73.0
+NZ_GG739926_647533195 NZ_GG770509_647533119 90.57 53 5 0 1315 1367 1319 1371 2.5e-10 63.0
+NZ_GG739926_647533195 NZ_GG770509_647533119 80.00 30 6 0 956 985 955 984 1.2e-03 41.0
+NZ_GG739926_647533195 NZ_GG770509_647533119 81.82 22 4 0 992 1013 989 1010 1.5e+02 24.0
+# BLAT 34 [2006/03/10]
+# Query: NZ_ACIZ01000148_643886127
+# Database: test_db.fasta
+# Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score
+NZ_ACIZ01000148_643886127 NZ_ACIZ01000148_643886127 100.00 1373 0 0 1 1373 1 1373 0.0e+00 2165.0
+NZ_ACIZ01000148_643886127 NZ_GG770509_647533119 85.49 634 92 0 337 970 336 969 4.5e-234 807.0
+NZ_ACIZ01000148_643886127 NZ_GG770509_647533119 86.08 237 33 0 1137 1373 1135 1371 1.2e-77 287.0
+NZ_ACIZ01000148_643886127 NZ_GG770509_647533119 83.12 154 26 0 977 1130 976 1129 2.2e-48 190.0
+NZ_ACIZ01000148_643886127 NZ_GG739926_647533195 76.22 572 136 0 414 985 414 985 1.7e-158 556.0
+NZ_ACIZ01000148_643886127 NZ_GG739926_647533195 76.80 181 42 0 1022 1202 1023 1203 6.4e-53 205.0
+NZ_ACIZ01000148_643886127 NZ_GG739926_647533195 96.00 50 2 0 1207 1256 1209 1258 6.4e-14 75.0
+NZ_ACIZ01000148_643886127 NZ_GG739926_647533195 88.68 53 6 0 1321 1373 1315 1367 1.6e-09 61.0
+NZ_ACIZ01000148_643886127 NZ_GG739926_647533195 77.27 22 5 0 990 1011 992 1013 8.5e+02 22.0
+""".splitlines()
+
+assign_reads_prot_exp = """# BLAT 34x13 [2009/02/26]
+# Query: NZ_GG770509_647533119_frame_1
+# Database: /home/adro2179/metagenome/test_db_prot.fasta
+# Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score
+NZ_GG770509_647533119_frame_1 NZ_GG770509_647533119 96.83 441 0 7 1 427 1 441 8.9e-254 872.0
+# BLAT 34x13 [2009/02/26]
+# Query: NZ_GG770509_647533119_frame_2
+# Database: /home/adro2179/metagenome/test_db_prot.fasta
+# Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score
+NZ_GG770509_647533119_frame_2 NZ_ACIZ01000148_643886127 85.37 41 6 0 359 399 362 402 8.0e-13 72.0
+NZ_GG770509_647533119_frame_2 NZ_ACIZ01000148_643886127 93.75 16 1 0 419 434 421 436 1.3e+00 31.0
+NZ_GG770509_647533119_frame_2 NZ_GG739926_647533195 75.86 29 7 0 320 348 326 354 2.9e-04 43.0
+# BLAT 34x13 [2009/02/26]
+# Query: NZ_GG770509_647533119_frame_3
+# Database: /home/adro2179/metagenome/test_db_prot.fasta
+# Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score
+NZ_GG770509_647533119_frame_3 NZ_ACIZ01000148_643886127 80.61 98 19 0 210 307 209 306 7.5e-39 158.0
+NZ_GG770509_647533119_frame_3 NZ_ACIZ01000148_643886127 66.33 98 33 0 43 140 44 141 8.9e-27 118.0
+NZ_GG770509_647533119_frame_3 NZ_ACIZ01000148_643886127 78.95 38 8 0 310 347 308 345 2.3e-08 57.0
+NZ_GG770509_647533119_frame_3 NZ_ACIZ01000148_643886127 66.67 30 10 0 178 207 178 207 2.5e-01 33.0
+NZ_GG770509_647533119_frame_3 NZ_GG739926_647533195 53.00 100 47 0 131 230 134 233 1.9e-18 90.0
+NZ_GG770509_647533119_frame_3 NZ_GG739926_647533195 68.89 45 14 0 238 282 241 285 5.9e-09 59.0
+NZ_GG770509_647533119_frame_3 NZ_GG739926_647533195 72.09 43 12 0 63 105 66 108 3.0e-08 56.0
+# BLAT 34x13 [2009/02/26]
+# Query: NZ_GG739926_647533195_frame_1
+# Database: /home/adro2179/metagenome/test_db_prot.fasta
+# Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score
+NZ_GG739926_647533195_frame_1 NZ_GG739926_647533195 100.00 437 0 0 1 437 1 437 1.7e-263 904.0
+NZ_GG739926_647533195_frame_1 NZ_ACIZ01000148_643886127 69.86 73 22 0 213 285 209 281 1.1e-20 98.0
+NZ_GG739926_647533195_frame_1 NZ_ACIZ01000148_643886127 53.33 60 28 0 148 207 145 204 1.3e-06 51.0
+NZ_GG739926_647533195_frame_1 NZ_ACIZ01000148_643886127 60.53 38 15 0 66 103 64 101 1.9e-03 41.0
+NZ_GG739926_647533195_frame_1 NZ_ACIZ01000148_643886127 76.92 26 6 0 2 27 3 28 9.7e-03 38.0
+NZ_GG739926_647533195_frame_1 NZ_ACIZ01000148_643886127 69.57 23 7 0 288 310 285 307 4.8e+00 29.0
+NZ_GG739926_647533195_frame_1 NZ_ACIZ01000148_643886127 90.00 10 1 0 134 143 132 141 1.6e+04 18.0
+# BLAT 34x13 [2009/02/26]
+# Query: NZ_GG739926_647533195_frame_2
+# Database: /home/adro2179/metagenome/test_db_prot.fasta
+# Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score
+NZ_GG739926_647533195_frame_2 NZ_GG770509_647533119 66.67 42 14 0 270 311 276 317 2.3e-08 57.0
+NZ_GG739926_647533195_frame_2 NZ_GG770509_647533119 60.00 45 18 0 185 229 188 232 3.9e-06 49.0
+NZ_GG739926_647533195_frame_2 NZ_GG770509_647533119 80.00 20 4 0 247 266 251 270 5.6e-01 32.0
+# BLAT 34x13 [2009/02/26]
+# Query: NZ_GG739926_647533195_frame_3
+# Database: /home/adro2179/metagenome/test_db_prot.fasta
+# Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score
+NZ_GG739926_647533195_frame_3 NZ_ACIZ01000148_643886127 94.44 18 1 0 390 407 385 402 4.3e-03 39.0
+# BLAT 34x13 [2009/02/26]
+# Query: NZ_ACIZ01000148_643886127_frame_1
+# Database: /home/adro2179/metagenome/test_db_prot.fasta
+# Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score
+NZ_ACIZ01000148_643886127_frame_1 NZ_ACIZ01000148_643886127 100.00 436 0 0 1 436 1 436 2.1e-261 897.0
+NZ_ACIZ01000148_643886127_frame_1 NZ_GG739926_647533195 78.57 42 9 0 240 281 244 285 4.0e-10 63.0
+NZ_ACIZ01000148_643886127_frame_1 NZ_GG739926_647533195 60.53 38 15 0 64 101 66 103 1.9e-03 41.0
+NZ_ACIZ01000148_643886127_frame_1 NZ_GG739926_647533195 76.92 26 6 0 3 28 2 27 9.7e-03 38.0
+NZ_ACIZ01000148_643886127_frame_1 NZ_GG739926_647533195 69.57 23 7 0 285 307 288 310 4.8e+00 29.0
+# BLAT 34x13 [2009/02/26]
+# Query: NZ_ACIZ01000148_643886127_frame_2
+# Database: /home/adro2179/metagenome/test_db_prot.fasta
+# Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score
+NZ_ACIZ01000148_643886127_frame_2 NZ_GG770509_647533119 79.59 147 26 2 182 324 189 335 2.3e-61 233.0
+NZ_ACIZ01000148_643886127_frame_2 NZ_GG770509_647533119 72.73 33 9 0 128 160 137 169 5.0e-04 42.0
+NZ_ACIZ01000148_643886127_frame_2 NZ_GG770509_647533119 90.91 22 2 0 70 91 76 97 2.5e-03 40.0
+# BLAT 34x13 [2009/02/26]
+# Query: NZ_ACIZ01000148_643886127_frame_3
+# Database: /home/adro2179/metagenome/test_db_prot.fasta
+# Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score
+NZ_ACIZ01000148_643886127_frame_3 NZ_GG770509_647533119 84.21 38 4 1 360 395 367 404 3.0e-08 56.0
+NZ_ACIZ01000148_643886127_frame_3 NZ_GG770509_647533119 94.12 17 1 0 413 429 425 441 1.6e+00 31.0
+NZ_ACIZ01000148_643886127_frame_3 NZ_GG739926_647533195 78.57 28 5 1 321 347 326 353 1.5e-03 41.0"""
+assign_reads_prot_exp = assign_reads_prot_exp.splitlines()
+
+test_db_prot = """>NZ_GG770509_647533119
+YLEFDPGSERTLAAGLTHASRASGRRVSNAWERTICYGITQGNLCYRMetWKVGKSARVGLASWWGKGSPRRRSIAGLRGSATLGLRHGPDSYGRQQWGILDNGRKPDPAMetPRERPGCKALSPVKMetTVTGEEAPANFVPAAAVIRRGLALFGFTGRKAHVGGLLSQGNPGAQPRNCLYWKSVWRVEFRVRNSIFGGTPVAKAAHWTNRGAKAWGANRIRYPGSPRRKRMetLAVGASVAQLTHTFRLGSAVARLKLKGIDGGPHKRWSMetWFNSKQRAEPYQPLTSTGAAWLSSARVVRCWVKSRNERNPRPLPAWALGDCRAGGRWGRQVLMetALTGWATHVLQWWSVGSEHASVSSPPSQFGCTLQLECRSWNRSRISMetPRIRSRALYTPPVTPWELVLPEGACAGDHGRVSDWGEVVTRPGNLRLDHLLS
+>NZ_GG739926_647533195
+WEFDPGSGTLATGLTHASRGTGARVSNAYPTFPRPRDNLPKGRLIPYVQSRSRMGMRPISLLAGQRPTKASIGRGSERKAPHTGTETRSRLLREAAVRNIGQWAEATSQVACRTTAYGLTAFMRGYAGTAIRTGFRASSRGNTEGPGVIRIYWVRERRPPCKRAVKSSGPTAALRRELLGLSAPEAGGIRGVAVKCLDITKNPDCEGSPLWRLTLRLEGAGIEQDIPWSARTMDTRCPALGGQAKALSIPPGEYAGNGETQRNRGPAQAEEHVVFDDTRGTLPGLELRCCMVVVSSCREVSAQVPRAQPLSAVAIGRALCGHCRRKVEEGGDDVKSARPLRPGPHTCYNGRQRAVRAQVRVNPLRSQFGWGLQPDPRSWIRSRISHGAVNTFPGLVHTARQAMKAGGASPCRPRAKPVIGAKSQGSRTGRCGWNTSF
+>NZ_ACIZ01000148_643886127
+NMEFDPGSGTLAACLIHASRTSGGRVSNTWVTCPVGDNIWKQMLIPHKESRFWMDPRRISLVRRLTKAMIRSRTERLIGHIGTETRPKLLREAAVGNLPQWTQVWSNAAVKKAFGSNSVVGEDDGIQPESHGLRASSRGNTVASVIRIYWASERRRFFKSDVKALGLTEEVHRKLGNLSAEEDSGTPCVAVKCVDIWKNTSGEGGCLVLTLRLESMGSEQDIPWSMPTMNARCWSFSAAANALSIPPGEYDRKVETQRNRGPAQAVEHVVFEATRRTLPGLDIDRWCMVVVSSCREMLGVPQRAQPLLVASMGTLVRLPVTNRRKVGMTSNHHAPYDLGYTRATMDGNELRDREVKLISSILSSDVGCNSPTEVGIASNRGSARRGEYVPGPCTHRPSHHESLHPKPVRSEPSKVGQMIRVKSQGSRRRTCGWITS"""
+test_db_prot = test_db_prot.splitlines()
+
+test_db_dna = """>NZ_GG770509_647533119
+UACUUGGAGUUUGAUCCUGGCUCAGAACGAACGCUGGCGGCAGGCUUAACACAUGCAAGUCGAGCGAGCGGCAGACGGGUGAGUAACGCGUGGGAACGUACCAUUUGCUACGGAAUAACUCAGGGAAACUUGUGCUAAUACCGUAUGUGGAAAGUCGGCAAAUGAUCGGCCCGCGUUGGAUUAGCUAGUUGGUGGGGUAAAGGCUCACCAAGGCGACGAUCCAUAGCUGGUCUGAGAGGAUGAUCAGCCACACUGGGACUGAGACACGGCCCAGACUCCUACGGGAGGCAGCAGUGGGGAAUAUUGGACAAUGGGCGCAAGCCUGAUCCAGCCAUGCCGCGUGAGUGAUGAAGGCCCUAGGGUUGUAAAGCUCUUUCACCGGUGAAGAUGACGGUAACCGGAGAAGAAGCCCCGGCUAACUUCGUGCCAGCAGCCGCGGUAAUACGAAGGGGGCUAGCGUUGUUCGGAUUUACUGGGCGUAAAGCGCACGUA [...]
+>NZ_GG739926_647533195
+UAAUGGGAGUUUGAUCCUGGCUCAGGAUGAACGCUGGCUACAGGCUUAACACAUGCAAGUCGAGGGACCGGCGCACGGGUGAGUAACGCGUAUCCAACCUUCCCGCGACCAAGGGAUAACCUGCCGAAAGGCAGACUAAUACCUUAUGUCCAAAGUCGGUCACGGAUGGGGAUGCGUCCGAUUAGCUUGUUGGCGGGGCAACGGCCCACCAAGGCAUCGAUCGGUAGGGGUUCUGAGAGGAAGGCCCCCCACACUGGAACUGAGACACGGUCCAGACUCCUACGGGAGGCAGCAGUGAGGAAUAUUGGUCAAUGGGCGGAAGCCUGAACCAGCCAAGUAGCGUGCAGGACGACGGCCUACGGGUUGUAAACUGCUUUUAUGCGGGGAUAUGCAGGUACCGCAUGAAUAAGGACCGGCUAAUUCCGUGCCAGCAGCCGCGGUAAUACGGAAGGUCCGGGCGUUAUCCGGAUUUAUUGGGUUUAAAGGGAGCGC [...]
+>NZ_ACIZ01000148_643886127
+AAUAUGGAGUUUGAUCCUGGCUCAGGAUGAACGCUGGCGGCGUGCCUAAUACAUGCAAGUCGAACGAGUGGCGGACGGGUGAGUAACACGUGGGUAACCUGCCCUUAAGUGGGGGAUAACAUUUGGAAACAGAUGCUAAUACCGCAUAAAGAAAGUCGCUUUUGGAUGGACCCGCGGCGUAUUAGCUAGUUGGUGAGGUAACGGCUCACCAAGGCAAUGAUACGUAGCCGAACUGAGAGGUUGAUCGGCCACAUUGGGACUGAGACACGGCCCAAACUCCUACGGGAGGCAGCAGUAGGGAAUCUUCCACAAUGGACGCAAGUCUGAUGGAGCAACGCCGCGUGAGUGAAGAAGGCUUUCGGGUCGUAAAACUCUGUUGUUGGAGAAGAUGACGGUAUCCAACCAGAAAGCCACGGCUAACUACGUGCCAGCAGCCGCGGUAAUACGUAGGUGGCAAGCGUUAUCCGGAUUUAUUGGGCGUAAAGCGAGCGC [...]
+test_db_dna = test_db_dna.splitlines()
+
+test_query = """>NZ_GG770509_647533119
+UACUUGGAGUUUGAUCCUGGCUCAGAACGAACGCUGGCGGCAGGCUUAACACAUGCAAGUCGAGCGAGCGGCAGACGGGUGAGUAACGCGUGGGAACGUACCAUUUGCUACGGAAUAACUCAGGGAAACUUGUGCUAAUACCGUAUGUGGAAAGUCGGCAAAUGAUCGGCCCGCGUUGGAUUAGCUAGUUGGUGGGGUAAAGGCUCACCAAGGCGACGAUCCAUAGCUGGUCUGAGAGGAUGAUCAGCCACACUGGGACUGAGACACGGCCCAGACUCCUACGGGAGGCAGCAGUGGGGAAUAUUGGACAAUGGGCGCAAGCCUGAUCCAGCCAUGCCGCGUGAGUGAUGAAGGCCCUAGGGUUGUAAAGCUCUUUCACCGGUGAAGAUGACGGUAACCGGAGAAGAAGCCCCGGCUAACUUCGUGCCAGCAGCCGCGGUAAUACGAAGGGGGCUAGCGUUGUUCGGAUUUACUGGGCGUAAAGCGCACGUA [...]
+>NZ_GG739926_647533195
+UAAUGGGAGUUUGAUCCUGGCUCAGGAUGAACGCUGGCUACAGGCUUAACACAUGCAAGUCGAGGGACCGGCGCACGGGUGAGUAACGCGUAUCCAACCUUCCCGCGACCAAGGGAUAACCUGCCGAAAGGCAGACUAAUACCUUAUGUCCAAAGUCGGUCACGGAUGGGGAUGCGUCCGAUUAGCUUGUUGGCGGGGCAACGGCCCACCAAGGCAUCGAUCGGUAGGGGUUCUGAGAGGAAGGCCCCCCACACUGGAACUGAGACACGGUCCAGACUCCUACGGGAGGCAGCAGUGAGGAAUAUUGGUCAAUGGGCGGAAGCCUGAACCAGCCAAGUAGCGUGCAGGACGACGGCCUACGGGUUGUAAACUGCUUUUAUGCGGGGAUAUGCAGGUACCGCAUGAAUAAGGACCGGCUAAUUCCGUGCCAGCAGCCGCGGUAAUACGGAAGGUCCGGGCGUUAUCCGGAUUUAUUGGGUUUAAAGGGAGCGC [...]
+>NZ_ACIZ01000148_643886127
+AAUAUGGAGUUUGAUCCUGGCUCAGGAUGAACGCUGGCGGCGUGCCUAAUACAUGCAAGUCGAACGAGUGGCGGACGGGUGAGUAACACGUGGGUAACCUGCCCUUAAGUGGGGGAUAACAUUUGGAAACAGAUGCUAAUACCGCAUAAAGAAAGUCGCUUUUGGAUGGACCCGCGGCGUAUUAGCUAGUUGGUGAGGUAACGGCUCACCAAGGCAAUGAUACGUAGCCGAACUGAGAGGUUGAUCGGCCACAUUGGGACUGAGACACGGCCCAAACUCCUACGGGAGGCAGCAGUAGGGAAUCUUCCACAAUGGACGCAAGUCUGAUGGAGCAACGCCGCGUGAGUGAAGAAGGCUUUCGGGUCGUAAAACUCUGUUGUUGGAGAAGAUGACGGUAUCCAACCAGAAAGCCACGGCUAACUACGUGCCAGCAGCCGCGGUAAUACGUAGGUGGCAAGCGUUAUCCGGAUUUAUUGGGCGUAAAGCGAGCGC [...]
+test_query = test_query.splitlines()
+
+if __name__ == '__main__':
+ main()
diff --git a/bfillings/tests/test_bwa.py b/bfillings/tests/test_bwa.py
new file mode 100755
index 0000000..91d495e
--- /dev/null
+++ b/bfillings/tests/test_bwa.py
@@ -0,0 +1,319 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+
+from unittest import TestCase, main
+from os.path import exists
+from os import remove
+from tempfile import mkstemp
+
+from bfillings.bwa import (BWA_index, BWA_aln, BWA_samse, BWA_sampe, BWA_bwasw,
+ create_bwa_index_from_fasta_file,
+ assign_reads_to_database,
+ InvalidArgumentApplicationError,
+ MissingRequiredArgumentApplicationError)
+
+
+class BWAtests(TestCase):
+
+ """Tests for the BWA app controller
+ """
+
+ # keeps track of which files are created during the tests so that they
+ # can be removed during tearDown
+ files_to_remove = []
+
+ def setUp(self):
+ """Performs setup for the tests.
+
+ Nothing to set up for these tests.
+ """
+
+ pass
+
+ def tearDown(self):
+ """Properly and politely terminates the test.
+
+ Removes files created during the tests.
+ """
+
+ for f in self.files_to_remove:
+ if exists(f):
+ remove(f)
+
+ def test_check_arguments(self):
+ """Tests the "check_arguments" method of the BWA base class.
+
+ Arguments passed to certain parameters of the various subcommands can
+ take only certain values. The check_arguments function enforces these
+ constraints. This function ensures that the rules are being enforced
+ as expected.
+ """
+
+ # set up test parameters
+ # should pass
+ index_params_is = {'-a': 'is'}
+ # should pass
+ index_params_bwtsw = {'-a': 'bwtsw'}
+ # should fail, -a must be one of "is" or "bwtsw"
+ index_params_invalid = {'-a': 'invalid'}
+ # should fail, -p must specify a prefix that is an absolute path
+ index_params_invalid_prefix = {'-p': 'invalid'}
+ # should pass
+ index_params_valid_prefix = {'-p': '/prefix'}
+
+ # instantiate objects built from the above parameters
+ index_is = BWA_index(params=index_params_is, HALT_EXEC=True)
+ index_bwtsw = BWA_index(params=index_params_bwtsw, HALT_EXEC=True)
+ index_invalid = BWA_index(params=index_params_invalid, HALT_EXEC=True)
+ index_invalid_prefix = BWA_index(params=index_params_invalid_prefix,
+ HALT_EXEC=True)
+ index_valid_prefix = BWA_index(params=index_params_valid_prefix,
+ HALT_EXEC=True)
+
+ # Should not be allowed
+ self.assertRaises(InvalidArgumentApplicationError,
+ index_invalid.check_arguments)
+ self.assertRaises(InvalidArgumentApplicationError,
+ index_invalid_prefix.check_arguments)
+
+ # Should execute and not raise any exceptions
+ index_is.check_arguments()
+ index_bwtsw.check_arguments()
+ index_valid_prefix.check_arguments()
+
+ # The rest of the _valid_arguments are for checking is_int and is_float
+ # and they all use the same function from the base-class, so testing
+ # just one of the subcommands should suffice
+
+ # -n must be a float (expressed either as a float or as a string)
+ # -o must be an int (expressed either as an int or as a string)
+ # pass, both valid
+ aln_params_valid = {'-n': 3.0, '-o': 5, '-f': '/sai_out'}
+ # fail, second invalid
+ aln_params_invalid1 = {'-n': 3.0, '-o': 'nope', '-f': '/sai_out'}
+ # fail, first invalid
+ aln_params_invalid2 = {'-n': '3.5.1', '-o': 4, '-f': '/sai_out'}
+ # fail, did not specify -f
+ aln_params_invalid3 = {'-n': 3.0, '-o': 5}
+
+ # instantiate objects
+ aln_valid = BWA_aln(params=aln_params_valid, HALT_EXEC=True)
+ aln_invalid1 = BWA_aln(params=aln_params_invalid1, HALT_EXEC=True)
+ aln_invalid2 = BWA_aln(params=aln_params_invalid2, HALT_EXEC=True)
+ aln_invalid3 = BWA_aln(params=aln_params_invalid3, HALT_EXEC=True)
+
+ test_paths = {'prefix': '/fa_in', 'fastq_in': '/fq_in'}
+
+ # Should Halt Exec (AssertionError) right before execution
+ self.assertRaisesRegexp(AssertionError, 'Halted exec', aln_valid,
+ test_paths)
+ # also need to make sure the base command is correct
+ self.assertIn('; bwa aln -f /sai_out -n 3.0 -o 5 /fa_in /fq_in',
+ aln_valid.BaseCommand)
+
+ # Should fail
+ self.assertRaises(InvalidArgumentApplicationError, aln_invalid1,
+ test_paths)
+
+ self.assertRaises(InvalidArgumentApplicationError, aln_invalid2,
+ test_paths)
+
+ self.assertRaises(InvalidArgumentApplicationError, aln_invalid3,
+ test_paths)
+
+ def test_input_as_dict(self):
+ """Tests the input handler (_input_as_dict)
+
+ The input handler should throw exceptions if there are not enough
+ arguments, or if there are unrecognized arguments, or if a file path
+ appears to be a relative filepath.
+ """
+
+ # Arguments for BWA_bwasw, which was chosen since it is the only one
+ # that also has an optional argument (optional arguments are denoted
+ # by a leading underscore)
+ missing = {'prefix': '/fa_in', '_query_fasta_2': '/mate'}
+ extra = {'prefix': '/fa_in', 'query_fasta': '/query_fasta',
+ 'extra': '/param'}
+ rel_fp = {'prefix': 'fa_in', 'query_fasta': '/query_fasta'}
+ valid = {'prefix': '/fa_in', 'query_fasta': '/query_fasta'}
+ valid_with_mate = {'prefix': '/fa_in', 'query_fasta': '/query_fasta',
+ '_query_fasta_2': '/mate'}
+
+ # instantiate the object
+ bwasw = BWA_bwasw(params={'-f': '/sam_out'}, HALT_EXEC=True)
+
+ # should raise ApplicationError for wrong I/O files; failure
+ self.assertRaises(MissingRequiredArgumentApplicationError, bwasw,
+ missing)
+ self.assertRaises(InvalidArgumentApplicationError, bwasw, extra)
+ self.assertRaises(InvalidArgumentApplicationError, bwasw, rel_fp)
+
+ # should raise AssertionError (Halt Exec); success
+ # tests valid arguments with and without the optional
+ # _query_fasta_2 argument
+ self.assertRaisesRegexp(AssertionError, 'Halted exec', bwasw, valid)
+ self.assertRaisesRegexp(AssertionError, 'Halted exec', bwasw,
+ valid_with_mate)
+
+ def test_get_base_command(self):
+ """Tests the function that generates the command string.
+
+ Tests whether an object can be instantiated and then called using
+ one set of files, and then another set of files.
+
+    Since the structure of the various subclasses is consistent, testing
+ that the correct command is generated by one of the subclasses should
+ suffice here.
+ """
+
+ # instantiate one instance
+ aln = BWA_aln(params={'-n': 1.0, '-f': '/sai_out'}, HALT_EXEC=True)
+
+ # set up two different sets of files
+ first_files = {'prefix': '/fa_in1', 'fastq_in': '/fq_in1'}
+ second_files = {'prefix': '/fa_in2', 'fastq_in': '/fq_in2'}
+
+ # make sure both sets run, and that the command appears to be correct
+ self.assertRaisesRegexp(AssertionError,
+ 'Halted exec', aln, first_files)
+ self.assertIn('; bwa aln -f /sai_out -n 1.0 /fa_in1 /fq_in1',
+ aln.BaseCommand)
+
+ self.assertRaisesRegexp(AssertionError, 'Halted exec', aln,
+ second_files)
+ self.assertIn('; bwa aln -f /sai_out -n 1.0 /fa_in2 /fq_in2',
+ aln.BaseCommand)
+
+        # instantiate another object, to test that there is no cross-talk
+        # between instances with the same base class
+ aln2 = BWA_aln(params={'-n': 2.5, '-o': 7, '-f': '/sai_out'},
+ HALT_EXEC=True)
+
+ self.assertRaisesRegexp(AssertionError, 'Halted exec', aln2,
+ first_files)
+ self.assertIn('; bwa aln -f /sai_out -n 2.5 -o 7 /fa_in1 /fq_in1',
+ aln2.BaseCommand)
+
+ def test_get_result_paths(self):
+ """Tests the function that retrieves the result paths.
+
+ aln, sampe, samse, bwasw return only one file.
+        BWA_index returns 5 files, whose names depend on whether or not the
+        -p option is set
+ """
+
+ # instantiate objects
+ index = BWA_index(params={}, HALT_EXEC=True)
+ index2 = BWA_index(params={'-p': '/prefix'}, HALT_EXEC=True)
+ aln = BWA_aln(params={'-f': '/sai_out'}, HALT_EXEC=True)
+ samse = BWA_samse(params={'-f': '/sam_out'}, HALT_EXEC=True)
+ sampe = BWA_sampe(params={'-f': '/sam_out'}, HALT_EXEC=True)
+ bwasw = BWA_bwasw(params={'-f': '/sam_out'}, HALT_EXEC=True)
+
+ # pass in the data, and make sure the output paths are as expected.
+ # -p is off here
+ index_data = {'fasta_in': '/fa_in'}
+ results = index._get_result_paths(index_data)
+ self.assertEqual(results['.amb'].Path, '/fa_in.amb')
+ self.assertEqual(results['.ann'].Path, '/fa_in.ann')
+ self.assertEqual(results['.bwt'].Path, '/fa_in.bwt')
+ self.assertEqual(results['.pac'].Path, '/fa_in.pac')
+ self.assertEqual(results['.sa'].Path, '/fa_in.sa')
+
+ # pass in the data, and make sure the output paths are as expected.
+ # -p is on here
+ results = index2._get_result_paths(index_data)
+ self.assertEqual(results['.amb'].Path, '/prefix.amb')
+ self.assertEqual(results['.ann'].Path, '/prefix.ann')
+ self.assertEqual(results['.bwt'].Path, '/prefix.bwt')
+ self.assertEqual(results['.pac'].Path, '/prefix.pac')
+ self.assertEqual(results['.sa'].Path, '/prefix.sa')
+
+ # pass in the data, and make sure the output path is as expected
+ aln_data = {'prefix': '/fa_in', 'fastq_in': '/fq_in'}
+ results = aln._get_result_paths(aln_data)
+ self.assertEqual(results['output'].Path, '/sai_out')
+
+ samse_data = {'prefix': '/fa_in', 'sai_in': '/sai_in',
+ 'fastq_in': '/fq_in'}
+ results = samse._get_result_paths(samse_data)
+ self.assertEqual(results['output'].Path, '/sam_out')
+
+ sampe_data = {'prefix': '/fa_in', 'sai1_in': '/sai1_in',
+ 'sai2_in': '/sai2_in', 'fastq1_in': '/fq1_in',
+ 'fastq2_in': '/fq2_in'}
+ results = sampe._get_result_paths(sampe_data)
+ self.assertEqual(results['output'].Path, '/sam_out')
+
+ def test_create_bwa_index_from_fasta_file(self):
+ """Test create_bwa_index_from_fasta_file
+
+ Makes sure that the file paths are as expected.
+ """
+
+ # get a new temp file for the input fasta
+ _, fasta_in = mkstemp(suffix=".fna")
+ # write the test fasta (see end of this file) to the temp file
+ fasta = open(fasta_in, 'w')
+ fasta.write(test_fasta)
+ fasta.close()
+
+ # make sure to remove this fasta file upon tearDown
+ self.files_to_remove.append(fasta_in)
+
+ # run the function
+ results = create_bwa_index_from_fasta_file(fasta_in, {})
+
+ # for each of the 5 output files (not counting stdout, stderr, and
+        # the exitStatus), make sure the file paths are as expected.
+ for filetype, result in results.iteritems():
+            # note the trailing comma: ('ExitStatus',) is a tuple; without it
+            # the parenthesized string would make this a substring test
+            if filetype not in ('ExitStatus',):
+ # be sure to remove these 5 files
+ self.files_to_remove.append(result.name)
+ if filetype not in ('StdOut', 'ExitStatus', 'StdErr'):
+ self.assertEqual(fasta_in + filetype, result.name)
+
+ def test_assign_reads_to_database(self):
+ """Tests for proper failure in assign_reads_to_database
+ """
+
+ # sets of params that should cause failure
+ no_alg = {}
+ wrong_alg = {'algorithm': 'not_an_algorithm'}
+ no_aln_params = {'algorithm': 'bwa-short'}
+
+ # dummy files -- checking for failure as expected, so the function
+ # won't get as far as actually running the program
+ database = '/db'
+ query = '/query'
+ out = '/sam'
+
+ self.assertRaises(InvalidArgumentApplicationError,
+ assign_reads_to_database, query, database,
+ out, no_alg)
+
+ self.assertRaises(InvalidArgumentApplicationError,
+ assign_reads_to_database, query, database, out,
+ wrong_alg)
+
+ self.assertRaises(InvalidArgumentApplicationError,
+ assign_reads_to_database, query, database, out,
+ no_aln_params)
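+        # For contrast, a params dict this function would accept looks
+        # roughly like (values hypothetical):
+        #   {'algorithm': 'bwa-short', 'aln_params': {'-n': 1}}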
+
+test_fasta = '''>NZ_GG770509_647533119
+UACUUGGAGUUUGAUCCUGGCUCAGAACGAACGCUGGCGGCAGGCUUAACACAUGCAAGUCGAGCGAGCGGCAGACGGGUGAGUAACGCGUGGGAACGUACCAUUUGCUACGGAAUAACUCAGGGAAACUUGUGCUAAUACCGUAUGUGGAAAGUCGGCAAAUGAUCGGCCCGCGUUGGAUUAGCUAGUUGGUGGGGUAAAGGCUCACCAAGGCGACGAUCCAUAGCUGGUCUGAGAGGAUGAUCAGCCACACUGGGACUGAGACACGGCCCAGACUCCUACGGGAGGCAGCAGUGGGGAAUAUUGGACAAUGGGCGCAAGCCUGAUCCAGCCAUGCCGCGUGAGUGAUGAAGGCCCUAGGGUUGUAAAGCUCUUUCACCGGUGAAGAUGACGGUAACCGGAGAAGAAGCCCCGGCUAACUUCGUGCCAGCAGCCGCGGUAAUACGAAGGGGGCUAGCGUUGUUCGGAUUUACUGGGCGUAAAGCGCACGUA [...]
+>NZ_GG739926_647533195
+UAAUGGGAGUUUGAUCCUGGCUCAGGAUGAACGCUGGCUACAGGCUUAACACAUGCAAGUCGAGGGACCGGCGCACGGGUGAGUAACGCGUAUCCAACCUUCCCGCGACCAAGGGAUAACCUGCCGAAAGGCAGACUAAUACCUUAUGUCCAAAGUCGGUCACGGAUGGGGAUGCGUCCGAUUAGCUUGUUGGCGGGGCAACGGCCCACCAAGGCAUCGAUCGGUAGGGGUUCUGAGAGGAAGGCCCCCCACACUGGAACUGAGACACGGUCCAGACUCCUACGGGAGGCAGCAGUGAGGAAUAUUGGUCAAUGGGCGGAAGCCUGAACCAGCCAAGUAGCGUGCAGGACGACGGCCUACGGGUUGUAAACUGCUUUUAUGCGGGGAUAUGCAGGUACCGCAUGAAUAAGGACCGGCUAAUUCCGUGCCAGCAGCCGCGGUAAUACGGAAGGUCCGGGCGUUAUCCGGAUUUAUUGGGUUUAAAGGGAGCGC [...]
+>NZ_ACIZ01000148_643886127
+AAUAUGGAGUUUGAUCCUGGCUCAGGAUGAACGCUGGCGGCGUGCCUAAUACAUGCAAGUCGAACGAGUGGCGGACGGGUGAGUAACACGUGGGUAACCUGCCCUUAAGUGGGGGAUAACAUUUGGAAACAGAUGCUAAUACCGCAUAAAGAAAGUCGCUUUUGGAUGGACCCGCGGCGUAUUAGCUAGUUGGUGAGGUAACGGCUCACCAAGGCAAUGAUACGUAGCCGAACUGAGAGGUUGAUCGGCCACAUUGGGACUGAGACACGGCCCAAACUCCUACGGGAGGCAGCAGUAGGGAAUCUUCCACAAUGGACGCAAGUCUGAUGGAGCAACGCCGCGUGAGUGAAGAAGGCUUUCGGGUCGUAAAACUCUGUUGUUGGAGAAGAUGACGGUAUCCAACCAGAAAGCCACGGCUAACUACGUGCCAGCAGCCGCGGUAAUACGUAGGUGGCAAGCGUUAUCCGGAUUUAUUGGGCGUAAAGCGAGCGC [...]
+
+if __name__ == "__main__":
+ main()
diff --git a/bfillings/tests/test_cd_hit.py b/bfillings/tests/test_cd_hit.py
new file mode 100644
index 0000000..68753a0
--- /dev/null
+++ b/bfillings/tests/test_cd_hit.py
@@ -0,0 +1,214 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+
+from os import getcwd, rmdir
+from unittest import TestCase, main
+
+from cogent.core.moltype import PROTEIN, DNA
+
+from bfillings.cd_hit import (CD_HIT, CD_HIT_EST, cdhit_from_seqs,
+ cdhit_clusters_from_seqs, clean_cluster_seq_id,
+ parse_cdhit_clstr_file)
+
+
+class CD_HIT_Tests(TestCase):
+ """Tests for the CD-HIT application controller"""
+
+ def test_base_command(self):
+ """CD_HIT BaseCommand should return the correct BaseCommand"""
+ c = CD_HIT()
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "',getcwd(),'/"; ','cd-hit']))
+ c.Parameters['-i'].on('seq.txt')
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "',getcwd(),'/"; ','cd-hit -i "seq.txt"']))
+ c.Parameters['-c'].on(0.8)
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "',getcwd(),'/"; ','cd-hit -c 0.8' +
+ ' -i "seq.txt"']))
+
+ def test_changing_working_dir(self):
+ """CD_HIT BaseCommand should change according to WorkingDir"""
+ c = CD_HIT(WorkingDir='/tmp/cdhit_test')
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "','/tmp/cdhit_test','/"; ','cd-hit']))
+ c = CD_HIT()
+ c.WorkingDir = '/tmp/cdhit_test2'
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "','/tmp/cdhit_test2','/"; ','cd-hit']))
+
+        # removing the dirs is proof that they were created when WorkingDir
+        # was set; if the dirs are not there, rmdir raises an OSError
+ rmdir('/tmp/cdhit_test')
+ rmdir('/tmp/cdhit_test2')
+
+ def test_cdhit_from_seqs(self):
+ """CD_HIT should return expected seqs"""
+ res = cdhit_from_seqs(protein_seqs, PROTEIN, {'-c':0.8})
+ self.assertEqual(res.toFasta(), protein_expected)
+
+class CD_HIT_EST_Tests(TestCase):
+ """Tests for the CD-HIT application controller"""
+
+ def test_base_command(self):
+ """CD_HIT_EST BaseCommand should return the correct BaseCommand"""
+ c = CD_HIT_EST()
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "',getcwd(),'/"; ','cd-hit-est']))
+ c.Parameters['-i'].on('seq.txt')
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "',getcwd(),'/"; ','cd-hit-est -i "seq.txt"']))
+ c.Parameters['-c'].on(0.8)
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "',getcwd(),'/"; ','cd-hit-est -c 0.8' +
+ ' -i "seq.txt"']))
+
+ def test_changing_working_dir(self):
+ """CD_HIT_EST BaseCommand should change according to WorkingDir"""
+ c = CD_HIT_EST(WorkingDir='/tmp/cdhitest_test')
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "','/tmp/cdhitest_test','/"; ','cd-hit-est']))
+ c = CD_HIT_EST()
+ c.WorkingDir = '/tmp/cdhitest_test2'
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "','/tmp/cdhitest_test2','/"; ','cd-hit-est']))
+
+        # removing the dirs is proof that they were created when WorkingDir
+        # was set; if the dirs are not there, rmdir raises an OSError
+ rmdir('/tmp/cdhitest_test')
+ rmdir('/tmp/cdhitest_test2')
+
+ def test_cdhit_from_seqs(self):
+ """CD_HIT should return expected seqs"""
+ res = cdhit_from_seqs(dna_seqs, DNA, {'-c':0.8})
+ self.assertEqual(res.toFasta(), dna_expected)
+
+ def test_cdhit_from_seqs_synonym(self):
+ """CD_HIT should return expected seqs with -c synonym"""
+ res = cdhit_from_seqs(dna_seqs, DNA, {'Similarity':0.8})
+ self.assertEqual(res.toFasta(), dna_expected)
+
+
+class CD_HIT_SupportMethodTests(TestCase):
+ """Tests for supporting methods"""
+ def test_clean_cluster_seq_id(self):
+ """clean_cluster_seq_id returns a cleaned sequence id"""
+ data = ">foobar..."
+ exp = "foobar"
+ obs = clean_cluster_seq_id(data)
+ self.assertEqual(obs, exp)
+
+ def test_parse_cdhit_clstr_file(self):
+ """parse_cdhit_clstr_file returns the correct clusters"""
+ data = cdhit_clstr_file.split('\n')
+ exp = [['seq0'],['seq1','seq10','seq3','seq23','seq145'],\
+ ['seq7','seq17','seq69','seq1231']]
+ obs = parse_cdhit_clstr_file(data)
+ self.assertEqual(obs, exp)
+
+ def test_cdhit_clusters_from_seqs(self):
+ """cdhit_clusters_from_seqs returns expected clusters"""
+ exp = [['cdhit_test_seqs_0'],['cdhit_test_seqs_1'],\
+ ['cdhit_test_seqs_2'],['cdhit_test_seqs_3'],\
+ ['cdhit_test_seqs_4'],['cdhit_test_seqs_5'],\
+ ['cdhit_test_seqs_6','cdhit_test_seqs_8'],\
+ ['cdhit_test_seqs_7'],['cdhit_test_seqs_9']]
+ obs = cdhit_clusters_from_seqs(dna_seqs, DNA)
+ self.assertEqual(obs, exp)
+
+dna_seqs = """>cdhit_test_seqs_0
+AACCCCCACGGTGGATGCCACACGCCCCATACAAAGGGTAGGATGCTTAAGACACATCGCGTCAGGTTTGTGTCAGGCCT
+>cdhit_test_seqs_1
+ACCCACACGGTGGATGCAACAGATCCCATACACCGAGTTGGATGCTTAAGACGCATCGCGTGAGTTTTGCGTCAAGGCT
+>cdhit_test_seqs_2
+CCCCCACGGTGGCAGCAACACGTCACATACAACGGGTTGGATTCTAAAGACAAACCGCGTCAAAGTTGTGTCAGAACT
+>cdhit_test_seqs_3
+CCCCACGGTAGCTGCAACACGTCCCATACCACGGGTAGGATGCTAAAGACACATCGGGTCTGTTTTGTGTCAGGGCT
+>cdhit_test_seqs_4
+GCCACGGTGGGTACAACACGTCCACTACATCGGCTTGGAAGGTAAAGACACGTCGCGTCAGTATTGCGTCAGGGCT
+>cdhit_test_seqs_5
+CCGCGGTAGGTGCAACACGTCCCATACAACGGGTTGGAAGGTTAAGACACAACGCGTTAATTTTGTGTCAGGGCA
+>cdhit_test_seqs_6
+CGCGGTGGCTGCAAGACGTCCCATACAACGGGTTGGATGCTTAAGACACATCGCAACAGTTTTGAGTCAGGGCT
+>cdhit_test_seqs_7
+ACGGTGGCTACAAGACGTCCCATCCAACGGGTTGGATACTTAAGGCACATCACGTCAGTTTTGTGTCAGAGCT
+>cdhit_test_seqs_8
+CGGTGGCTGCAACACGTGGCATACAACGGGTTGGATGCTTAAGACACATCGCCTCAGTTTTGTGTCAGGGCT
+>cdhit_test_seqs_9
+GGTGGCTGAAACACATCCCATACAACGGGTTGGATGCTTAAGACACATCGCATCAGTTTTATGTCAGGGGA"""
+
+dna_expected = """>cdhit_test_seqs_0
+AACCCCCACGGTGGATGCCACACGCCCCATACAAAGGGTAGGATGCTTAAGACACATCGCGTCAGGTTTGTGTCAGGCCT
+>cdhit_test_seqs_1
+ACCCACACGGTGGATGCAACAGATCCCATACACCGAGTTGGATGCTTAAGACGCATCGCGTGAGTTTTGCGTCAAGGCT
+>cdhit_test_seqs_2
+CCCCCACGGTGGCAGCAACACGTCACATACAACGGGTTGGATTCTAAAGACAAACCGCGTCAAAGTTGTGTCAGAACT
+>cdhit_test_seqs_4
+GCCACGGTGGGTACAACACGTCCACTACATCGGCTTGGAAGGTAAAGACACGTCGCGTCAGTATTGCGTCAGGGCT
+>cdhit_test_seqs_5
+CCGCGGTAGGTGCAACACGTCCCATACAACGGGTTGGAAGGTTAAGACACAACGCGTTAATTTTGTGTCAGGGCA
+>cdhit_test_seqs_7
+ACGGTGGCTACAAGACGTCCCATCCAACGGGTTGGATACTTAAGGCACATCACGTCAGTTTTGTGTCAGAGCT"""
+
+protein_seqs = """>seq1
+MGNKWSKSWPQVRDRMRRAAPAPAADGVGAVSQDLAKHGAITSSNTAATNDDCAWLEAQTEEEVGFPVRPQVPLRPMTYK
+>seq2
+MGGKWSKSSIVGWSTVRERMRKTPPAADGVGAVSQDLDKHGAVTSSNTAFNNPDCAWLEAQEDEDVGFPVRPQVPLRPT
+>seq3
+MGGKWSKSSIVGWPAIRERMRRARPAADRVGTQPAADGVGAVSQDLARHGAVTSSNTSHNNPDCAWLEAQEEEEVGVR
+>seq4
+MGKIWSKSSIVGWPEIRERMRRQRPHEPAVEPAVGVGAASQDLANRGALTTSNTRTNNPTVAWVEAQEEEGEVVRPQ
+>seq5
+MGKIWSKSSLVGWPEIRERMRRQTQEPAVEPAVGAGAASQDLANRGAITIRNTRDNNESIAWLEAQEEEFPVRPQV
+>seq6
+MGKIWSKSSLVGWPEIRERIRRQTPEPAVGVGAVSQDLANRGAITTSNTKDNNQTVAWLEAQEEPVRPQVPLRPM
+>seq7
+MGNALRKGKFEGWAAVRERMRRTRTFPESEPCAPGVGQISRELAARGGIPSSHTPQNNESHQEEEVGFPVAPQV
+>seq8
+MGNAWSKSKFAGWSEVRDRMRRSSSDPQQPCAPGVGAVSRELATRGGISSSALAFLDSHKDEDVGFPVRPQVP
+>seq9
+MGNVLGKDKFKGWAAVRERMRKTSSDPDPQPCAPGVGPVSRELSYTPQNNAALAFLESHEDEDVGFPVXPQV
+>seq10
+MGNVLGKDKFKGWSAVRERMRKTSPEPEPCAPGVRGGISNSHTPQNNAALAFLESHQDEDVGFPVRPQVPL"""
+
+protein_expected = """>seq1
+MGNKWSKSWPQVRDRMRRAAPAPAADGVGAVSQDLAKHGAITSSNTAATNDDCAWLEAQTEEEVGFPVRPQVPLRPMTYK
+>seq2
+MGGKWSKSSIVGWSTVRERMRKTPPAADGVGAVSQDLDKHGAVTSSNTAFNNPDCAWLEAQEDEDVGFPVRPQVPLRPT
+>seq3
+MGGKWSKSSIVGWPAIRERMRRARPAADRVGTQPAADGVGAVSQDLARHGAVTSSNTSHNNPDCAWLEAQEEEEVGVR
+>seq4
+MGKIWSKSSIVGWPEIRERMRRQRPHEPAVEPAVGVGAASQDLANRGALTTSNTRTNNPTVAWVEAQEEEGEVVRPQ
+>seq5
+MGKIWSKSSLVGWPEIRERMRRQTQEPAVEPAVGAGAASQDLANRGAITIRNTRDNNESIAWLEAQEEEFPVRPQV
+>seq7
+MGNALRKGKFEGWAAVRERMRRTRTFPESEPCAPGVGQISRELAARGGIPSSHTPQNNESHQEEEVGFPVAPQV
+>seq8
+MGNAWSKSKFAGWSEVRDRMRRSSSDPQQPCAPGVGAVSRELATRGGISSSALAFLDSHKDEDVGFPVRPQVP
+>seq9
+MGNVLGKDKFKGWAAVRERMRKTSSDPDPQPCAPGVGPVSRELSYTPQNNAALAFLESHEDEDVGFPVXPQV"""
+
+cdhit_clstr_file = """>Cluster 0
+0 2799aa, >seq0... *
+>Cluster 1
+0 2214aa, >seq1... at 80%
+1 2215aa, >seq10... at 84%
+2 2217aa, >seq3... *
+3 2216aa, >seq23... at 84%
+4 527aa, >seq145... at 63%
+>Cluster 2
+0 2202aa, >seq7... at 60%
+1 2208aa, >seq17... *
+2 2207aa, >seq69... at 73%
+3 2208aa, >seq1231... at 69%"""
+
+
+if __name__ == '__main__':
+ main()
diff --git a/bfillings/tests/test_clearcut.py b/bfillings/tests/test_clearcut.py
new file mode 100644
index 0000000..aff0ef5
--- /dev/null
+++ b/bfillings/tests/test_clearcut.py
@@ -0,0 +1,255 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+
+from os import getcwd, remove, rmdir, mkdir, path
+import tempfile
+import shutil
+from unittest import TestCase, main
+
+from cogent.core.moltype import DNA, RNA, PROTEIN
+from cogent.core.alignment import DataError
+from cogent.util.misc import flatten
+from cogent.util.dict2d import Dict2D
+
+from bfillings.clearcut import (Clearcut, build_tree_from_alignment,
+ _matrix_input_from_dict2d,
+ build_tree_from_distance_matrix)
+
+
+class GeneralSetUp(TestCase):
+
+ def setUp(self):
+ """Clearcut general setUp method for all tests"""
+ self.seqs1 = ['ACUGCUAGCUAGUAGCGUACGUA','GCUACGUAGCUAC',
+ 'GCGGCUAUUAGAUCGUA']
+
+ self.labels1 = ['>1','>2','>3']
+ self.lines1 = flatten(zip(self.labels1,self.seqs1))
+
+ self.seqs2=['UAGGCUCUGAUAUAAUAGCUCUC','UAUCGCUUCGACGAUUCUCUGAUAGAGA',
+ 'UGACUACGCAU']
+ self.labels2=['>a','>b','>c']
+ self.lines2 = flatten(zip(self.labels2,self.seqs2))
+
+ self.temp_dir = tempfile.mkdtemp()
+ #self.temp_dir_spaces = '/tmp/test for clearcut/'
+ #try:
+ # mkdir(self.temp_dir_spaces)
+ #except OSError:
+ # pass
+ try:
+ #create sequence files
+ f = open(path.join(self.temp_dir, 'seq1.txt'),'w')
+ f.write('\n'.join(self.lines1))
+ f.close()
+ g = open(path.join(self.temp_dir, 'seq2.txt'),'w')
+ g.write('\n'.join(self.lines2))
+ g.close()
+ except OSError:
+ pass
+
+
+class ClearcutTests(GeneralSetUp):
+ """Tests for the Clearcut application controller"""
+
+ def test_base_command(self):
+ """Clearcut BaseCommand should return the correct BaseCommand"""
+ c = Clearcut()
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "',getcwd(),'/"; ','clearcut -d -q']))
+ c.Parameters['--in'].on('seq.txt')
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "',getcwd(),'/"; ','clearcut -d --in="seq.txt" -q']))
+
+
+ def test_changing_working_dir(self):
+ """Clearcut BaseCommand should change according to WorkingDir"""
+ c = Clearcut(WorkingDir='/tmp/clearcut_test')
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "','/tmp/clearcut_test','/"; ','clearcut -d -q']))
+ c = Clearcut()
+ c.WorkingDir = '/tmp/clearcut_test2'
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "','/tmp/clearcut_test2','/"; ','clearcut -d -q']))
+
+        # removing the dirs is proof that they were created when WorkingDir
+        # was set; if the dirs are not there, rmdir raises an OSError
+ rmdir('/tmp/clearcut_test')
+ rmdir('/tmp/clearcut_test2')
+
+ def test_general_cleanUp(self):
+ """Last test executed: cleans up all files initially created"""
+ # remove the tempdir and contents
+ shutil.rmtree(self.temp_dir)
+ #shutil.rmtree(self.temp_dir_spaces)
+
+ def test_build_tree_from_alignment(self):
+ """Clearcut should return a tree built from the passed alignment"""
+ tree_short = build_tree_from_alignment(build_tree_seqs_short,\
+ moltype=DNA)
+ num_seqs = flatten(build_tree_seqs_short).count('>')
+ self.assertEqual(len(tree_short.tips()), num_seqs)
+
+ tree_long = build_tree_from_alignment(build_tree_seqs_long, moltype=DNA)
+ seq_names = []
+ for line in build_tree_seqs_long.split('\n'):
+ if line.startswith('>'):
+ seq_names.append(line[1:])
+
+ for node in tree_long.tips():
+ if node.Name not in seq_names:
+ self.fail()
+ #repeat with best_tree = True
+ tree_long = build_tree_from_alignment(build_tree_seqs_long,\
+ best_tree=True,\
+ moltype=DNA)
+ seq_names = []
+ for line in build_tree_seqs_long.split('\n'):
+ if line.startswith('>'):
+ seq_names.append(line[1:])
+
+ for node in tree_long.tips():
+ if node.Name not in seq_names:
+ self.fail()
+
+ #build_tree_from_alignment should raise DataError when constructing
+ # an Alignment from unaligned sequences. Clearcut only allows aligned
+ # or a distance matrix as input.
+ self.assertRaises(DataError,build_tree_from_alignment,\
+ build_tree_seqs_unaligned,DNA)
+
+ def test_matrix_input_from_dict2d(self):
+ """matrix_input_from_dict2d formats dict2d object into distance matrix
+ """
+        data = [('sample1aaaaaaa', 'sample2', 1.438),
+                ('sample2', 'sample1aaaaaaa', 1.438),
+                ('sample1aaaaaaa', 'sample3', 2.45678),
+                ('sample3', 'sample1aaaaaaa', 2.45678),
+                ('sample2', 'sample3', 2.7),
+                ('sample3', 'sample2', 2.7)]
+ data_dict2d = Dict2D(data, Pad=True, Default=0.0)
+ matrix, int_map = _matrix_input_from_dict2d(data_dict2d)
+ #of = open('temp.txt', 'w')
+ #of.write(matrix)
+ #of.close()
+ matrix = matrix.split('\n')
+ self.assertEqual(matrix[0], ' 3')
+ self.assertEqual(matrix[1], 'env_0 0.0 1.438 2.45678')
+ self.assertEqual(matrix[2], 'env_1 1.438 0.0 2.7')
+ self.assertEqual(matrix[3], 'env_2 2.45678 2.7 0.0')
+ self.assertEqual(int_map['env_1'], 'sample2')
+ self.assertEqual(int_map['env_0'], 'sample1aaaaaaa')
+ self.assertEqual(int_map['env_2'], 'sample3')
+
+ def test_build_tree_from_distance_matrix(self):
+ """build_tree_from_distance_matrix builds a tree from a dict2d
+ """
+        data = [('sample1aaaaaaa', 'sample2', 1.438),
+                ('sample2', 'sample1aaaaaaa', 1.438),
+                ('sample1aaaaaaa', 'sample3', 2.45678),
+                ('sample3', 'sample1aaaaaaa', 2.45678),
+                ('sample2', 'sample3', 2.7),
+                ('sample3', 'sample2', 2.7)]
+ data_dict2d = Dict2D(data, Pad=True, Default=0.0)
+ result = build_tree_from_distance_matrix(data_dict2d)
+ self.assertEqual(str(result), '((sample1aaaaaaa:0.59739,sample2:0.84061),sample3:1.85939);')
+
+
+align1 = ">seq_0\nACUGCUAGCUAGUAGCGUACGUA\n>seq_1\n---GCUACGUAGCUAC-------\n>seq_2\nGCGGCUAUUAGAUCGUA------"
+
+build_tree_seqs_short = """>clearcut_test_seqs_0
+AACCCCCACGGTGGATGCCACACGCCCCATACAAAGGGTAGGATGCTTAAGACACATCGCGTCAGGTTTGTGTCAGGCCT
+AGCTTTAAATCATGCCAGTG
+>clearcut_test_seqs_1
+GACCCACACGGTGGATGCAACAGATCCCATACACCGAGTTGGATGCTTAAGACGCATCGCGTGAGTTTTGCGTCAAGGCT
+TGCTTTCAATAATGCCAGTG
+>clearcut_test_seqs_2
+AACCCCCACGGTGGCAGCAACACGTCACATACAACGGGTTGGATTCTAAAGACAAACCGCGTCAAAGTTGTGTCAGAACT
+TGCTTTGAATCATGCCAGTA
+>clearcut_test_seqs_3
+AAACCCCACGGTAGCTGCAACACGTCCCATACCACGGGTAGGATGCTAAAGACACATCGGGTCTGTTTTGTGTCAGGGCT
+TGCTTTACATCATGCAAGTG
+>clearcut_test_seqs_4
+AACCGCCACGGTGGGTACAACACGTCCACTACATCGGCTTGGAAGGTAAAGACACGTCGCGTCAGTATTGCGTCAGGGCT
+TGCTTTAAATCATGCCAGTG
+>clearcut_test_seqs_5
+AACCCCCGCGGTAGGTGCAACACGTCCCATACAACGGGTTGGAAGGTTAAGACACAACGCGTTAATTTTGTGTCAGGGCA
+TGCTTTAAATCATGCCAGTT
+>clearcut_test_seqs_6
+GACCCCCGCGGTGGCTGCAAGACGTCCCATACAACGGGTTGGATGCTTAAGACACATCGCAACAGTTTTGAGTCAGGGCT
+TACTTTAGATCATGCCGGTG
+>clearcut_test_seqs_7
+AACCCCCACGGTGGCTACAAGACGTCCCATCCAACGGGTTGGATACTTAAGGCACATCACGTCAGTTTTGTGTCAGAGCT
+TGCTTTAAATCATGCCAGTG
+>clearcut_test_seqs_8
+AACCCCCACGGTGGCTGCAACACGTGGCATACAACGGGTTGGATGCTTAAGACACATCGCCTCAGTTTTGTGTCAGGGCT
+TGCATTAAATCATGCCAGTG
+>clearcut_test_seqs_9
+AAGCCCCACGGTGGCTGAAACACATCCCATACAACGGGTTGGATGCTTAAGACACATCGCATCAGTTTTATGTCAGGGGA
+TGCTTTAAATCCTGACAGCG
+"""
+
+build_tree_seqs_long = """>clearcut_test_seqs_0
+AACCCCCACGGTGGATGCCACACGCCCCATACAAAGGGTAGGATGCTTAAGACACATCGCGTCAGGTTTGTGTCAGGCCT
+AGCTTTAAATCATGCCAGTG
+>clearcut_test_seqsaaaaaaaa_1
+GACCCACACGGTGGATGCAACAGATCCCATACACCGAGTTGGATGCTTAAGACGCATCGCGTGAGTTTTGCGTCAAGGCT
+TGCTTTCAATAATGCCAGTG
+>clearcut_test_seqsaaaaaaaa_2
+AACCCCCACGGTGGCAGCAACACGTCACATACAACGGGTTGGATTCTAAAGACAAACCGCGTCAAAGTTGTGTCAGAACT
+TGCTTTGAATCATGCCAGTA
+>clearcut_test_seqsaaaaaaaa_3
+AAACCCCACGGTAGCTGCAACACGTCCCATACCACGGGTAGGATGCTAAAGACACATCGGGTCTGTTTTGTGTCAGGGCT
+TGCTTTACATCATGCAAGTG
+>clearcut_test_seqsaaaaaaaa_4
+AACCGCCACGGTGGGTACAACACGTCCACTACATCGGCTTGGAAGGTAAAGACACGTCGCGTCAGTATTGCGTCAGGGCT
+TGCTTTAAATCATGCCAGTG
+>clearcut_test_seqsaaaaaaaa_5
+AACCCCCGCGGTAGGTGCAACACGTCCCATACAACGGGTTGGAAGGTTAAGACACAACGCGTTAATTTTGTGTCAGGGCA
+TGCTTTAAATCATGCCAGTT
+>clearcut_test_seqsaaaaaaaa_6
+GACCCCCGCGGTGGCTGCAAGACGTCCCATACAACGGGTTGGATGCTTAAGACACATCGCAACAGTTTTGAGTCAGGGCT
+TACTTTAGATCATGCCGGTG
+>clearcut_test_seqsaaaaaaaa_7
+AACCCCCACGGTGGCTACAAGACGTCCCATCCAACGGGTTGGATACTTAAGGCACATCACGTCAGTTTTGTGTCAGAGCT
+TGCTTTAAATCATGCCAGTG
+>clearcut_test_seqsaaaaaaaa_8
+AACCCCCACGGTGGCTGCAACACGTGGCATACAACGGGTTGGATGCTTAAGACACATCGCCTCAGTTTTGTGTCAGGGCT
+TGCATTAAATCATGCCAGTG
+>clearcut_test_seqsaaaaaaaa_9
+AAGCCCCACGGTGGCTGAAACACATCCCATACAACGGGTTGGATGCTTAAGACACATCGCATCAGTTTTATGTCAGGGGA
+TGCTTTAAATCCTGACAGCG
+"""
+
+#Unaligned seqs. First two sequences are 3 nucleotides shorter.
+build_tree_seqs_unaligned = """>clearcut_test_seqs_0
+CCCCACGGTGGATGCCACACGCCCCATACAAAGGGTAGGATGCTTAAGACACATCGCGTCAGGTTTGTGTCAGGCCT
+AGCTTTAAATCATGCCAGTG
+>clearcut_test_seqs_1
+CCACACGGTGGATGCAACAGATCCCATACACCGAGTTGGATGCTTAAGACGCATCGCGTGAGTTTTGCGTCAAGGCT
+TGCTTTCAATAATGCCAGTG
+>clearcut_test_seqs_2
+AACCCCCACGGTGGCAGCAACACGTCACATACAACGGGTTGGATTCTAAAGACAAACCGCGTCAAAGTTGTGTCAGAACT
+TGCTTTGAATCATGCCAGTA
+>clearcut_test_seqs_3
+AAACCCCACGGTAGCTGCAACACGTCCCATACCACGGGTAGGATGCTAAAGACACATCGGGTCTGTTTTGTGTCAGGGCT
+TGCTTTACATCATGCAAGTG
+>clearcut_test_seqs_4
+AACCGCCACGGTGGGTACAACACGTCCACTACATCGGCTTGGAAGGTAAAGACACGTCGCGTCAGTATTGCGTCAGGGCT
+TGCTTTAAATCATGCCAGTG
+>clearcut_test_seqs_5
+AACCCCCGCGGTAGGTGCAACACGTCCCATACAACGGGTTGGAAGGTTAAGACACAACGCGTTAATTTTGTGTCAGGGCA
+TGCTTTAAATCATGCCAGTT
+>clearcut_test_seqs_6
+GACCCCCGCGGTGGCTGCAAGACGTCCCATACAACGGGTTGGATGCTTAAGACACATCGCAACAGTTTTGAGTCAGGGCT
+TACTTTAGATCATGCCGGTG
+>clearcut_test_seqs_7
+AACCCCCACGGTGGCTACAAGACGTCCCATCCAACGGGTTGGATACTTAAGGCACATCACGTCAGTTTTGTGTCAGAGCT
+TGCTTTAAATCATGCCAGTG
+>clearcut_test_seqs_8
+AACCCCCACGGTGGCTGCAACACGTGGCATACAACGGGTTGGATGCTTAAGACACATCGCCTCAGTTTTGTGTCAGGGCT
+TGCATTAAATCATGCCAGTG
+>clearcut_test_seqs_9
+AAGCCCCACGGTGGCTGAAACACATCCCATACAACGGGTTGGATGCTTAAGACACATCGCATCAGTTTTATGTCAGGGGA
+TGCTTTAAATCCTGACAGCG
+"""
+
+if __name__ == '__main__':
+ main()
diff --git a/bfillings/tests/test_clustalw.py b/bfillings/tests/test_clustalw.py
new file mode 100644
index 0000000..bdc57ee
--- /dev/null
+++ b/bfillings/tests/test_clustalw.py
@@ -0,0 +1,627 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+
+"""Tests for application controller for ClustalW v1.83"""
+import re
+from os import getcwd, remove, rmdir, mkdir, path
+import shutil
+from cogent.core.alignment import Alignment
+from cogent.core.moltype import RNA
+from cogent.util.unit_test import TestCase, main
+from cogent.util.misc import flatten
+from skbio.parse.sequences import parse_fasta
+from bfillings.clustalw import (Clustalw, alignUnalignedSeqsFromFile,
+ alignUnalignedSeqs, alignTwoAlignments,
+ addSeqsToAlignment, buildTreeFromAlignment,
+ build_tree_from_alignment,
+ bootstrap_tree_from_alignment,
+ align_unaligned_seqs, align_and_build_tree,
+ add_seqs_to_alignment, align_two_alignments)
+
+
+cw_vers = re.compile(r"CLUSTAL W [(]1\.8[1-3][.\d]*[)]")
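+# The version stamp in clustalw's output varies between releases; stripping
+# it with cw_vers lets the expected strings below match any 1.81-1.83 build.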
+
+class GeneralSetUp(TestCase):
+
+ def setUp(self):
+ """Clustalw general setUp method for all tests"""
+ self.seqs1 = ['ACUGCUAGCUAGUAGCGUACGUA','GCUACGUAGCUAC',
+ 'GCGGCUAUUAGAUCGUA']
+ self.aln1_fasta = ALIGN1_FASTA
+ self.labels1 = ['>1','>2','>3']
+ self.lines1 = flatten(zip(self.labels1,self.seqs1))
+ self.stdout1 = STDOUT1
+ self.aln1 = ALIGN1
+ self.dnd1 = DND1
+
+ self.multiline1 = '\n'.join(flatten(zip(self.labels1, self.seqs1)))
+
+ self.seqs2=['UAGGCUCUGAUAUAAUAGCUCUC','UAUCGCUUCGACGAUUCUCUGAUAGAGA',
+ 'UGACUACGCAU']
+ self.labels2=['>a','>b','>c']
+ self.lines2 = flatten(zip(self.labels2,self.seqs2))
+ self.aln2 = ALIGN2
+ self.dnd2 = DND2
+
+ self.twoalign = TWOALIGN
+ self.alignseqs = ALIGNSEQS
+ self.treeduringalignseqs = TREEDURINGALIGNSEQS
+ self.treefromalignseqs = TREEFROMALIGNSEQS
+
+ self.temp_dir_space = "/tmp/clustalw test"
+
+ self.build_tree_seqs_short = """>clustal_test_seqs_0
+AACCCCCACGGTGGATGCCACACGCCCCATACAAAGGGTAGGATGCTTAAGACACATCGCGTCAGGTTTGTGTCAGGCCT
+AGCTTTAAATCATGCCAGTG
+>clustal_test_seqs_1
+GACCCACACGGTGGATGCAACAGATCCCATACACCGAGTTGGATGCTTAAGACGCATCGCGTGAGTTTTGCGTCAAGGCT
+TGCTTTCAATAATGCCAGTG
+>clustal_test_seqs_2
+AACCCCCACGGTGGCAGCAACACGTCACATACAACGGGTTGGATTCTAAAGACAAACCGCGTCAAAGTTGTGTCAGAACT
+TGCTTTGAATCATGCCAGTA
+>clustal_test_seqs_3
+AAACCCCACGGTAGCTGCAACACGTCCCATACCACGGGTAGGATGCTAAAGACACATCGGGTCTGTTTTGTGTCAGGGCT
+TGCTTTACATCATGCAAGTG
+>clustal_test_seqs_4
+AACCGCCACGGTGGGTACAACACGTCCACTACATCGGCTTGGAAGGTAAAGACACGTCGCGTCAGTATTGCGTCAGGGCT
+TGCTTTAAATCATGCCAGTG
+>clustal_test_seqs_5
+AACCCCCGCGGTAGGTGCAACACGTCCCATACAACGGGTTGGAAGGTTAAGACACAACGCGTTAATTTTGTGTCAGGGCA
+TGCTTTAAATCATGCCAGTT
+>clustal_test_seqs_6
+GACCCCCGCGGTGGCTGCAAGACGTCCCATACAACGGGTTGGATGCTTAAGACACATCGCAACAGTTTTGAGTCAGGGCT
+TACTTTAGATCATGCCGGTG
+>clustal_test_seqs_7
+AACCCCCACGGTGGCTACAAGACGTCCCATCCAACGGGTTGGATACTTAAGGCACATCACGTCAGTTTTGTGTCAGAGCT
+TGCTTTAAATCATGCCAGTG
+>clustal_test_seqs_8
+AACCCCCACGGTGGCTGCAACACGTGGCATACAACGGGTTGGATGCTTAAGACACATCGCCTCAGTTTTGTGTCAGGGCT
+TGCATTAAATCATGCCAGTG
+>clustal_test_seqs_9
+AAGCCCCACGGTGGCTGAAACACATCCCATACAACGGGTTGGATGCTTAAGACACATCGCATCAGTTTTATGTCAGGGGA
+TGCTTTAAATCCTGACAGCG
+"""
+ self.build_tree_seqs_long = """>clustal_test_seqs_0
+AACCCCCACGGTGGATGCCACACGCCCCATACAAAGGGTAGGATGCTTAAGACACATCGCGTCAGGTTTGTGTCAGGCCT
+AGCTTTAAATCATGCCAGTG
+>clustal_test_seqsaaaaaaaa_1
+GACCCACACGGTGGATGCAACAGATCCCATACACCGAGTTGGATGCTTAAGACGCATCGCGTGAGTTTTGCGTCAAGGCT
+TGCTTTCAATAATGCCAGTG
+>clustal_test_seqsaaaaaaaa_2
+AACCCCCACGGTGGCAGCAACACGTCACATACAACGGGTTGGATTCTAAAGACAAACCGCGTCAAAGTTGTGTCAGAACT
+TGCTTTGAATCATGCCAGTA
+>clustal_test_seqsaaaaaaaa_3
+AAACCCCACGGTAGCTGCAACACGTCCCATACCACGGGTAGGATGCTAAAGACACATCGGGTCTGTTTTGTGTCAGGGCT
+TGCTTTACATCATGCAAGTG
+>clustal_test_seqsaaaaaaaa_4
+AACCGCCACGGTGGGTACAACACGTCCACTACATCGGCTTGGAAGGTAAAGACACGTCGCGTCAGTATTGCGTCAGGGCT
+TGCTTTAAATCATGCCAGTG
+>clustal_test_seqsaaaaaaaa_5
+AACCCCCGCGGTAGGTGCAACACGTCCCATACAACGGGTTGGAAGGTTAAGACACAACGCGTTAATTTTGTGTCAGGGCA
+TGCTTTAAATCATGCCAGTT
+>clustal_test_seqsaaaaaaaa_6
+GACCCCCGCGGTGGCTGCAAGACGTCCCATACAACGGGTTGGATGCTTAAGACACATCGCAACAGTTTTGAGTCAGGGCT
+TACTTTAGATCATGCCGGTG
+>clustal_test_seqsaaaaaaaa_7
+AACCCCCACGGTGGCTACAAGACGTCCCATCCAACGGGTTGGATACTTAAGGCACATCACGTCAGTTTTGTGTCAGAGCT
+TGCTTTAAATCATGCCAGTG
+>clustal_test_seqsaaaaaaaa_8
+AACCCCCACGGTGGCTGCAACACGTGGCATACAACGGGTTGGATGCTTAAGACACATCGCCTCAGTTTTGTGTCAGGGCT
+TGCATTAAATCATGCCAGTG
+>clustal_test_seqsaaaaaaaa_9
+AAGCCCCACGGTGGCTGAAACACATCCCATACAACGGGTTGGATGCTTAAGACACATCGCATCAGTTTTATGTCAGGGGA
+TGCTTTAAATCCTGACAGCG
+"""
+ try:
+ mkdir('/tmp/ct')
+ except OSError: #dir already exists
+ pass
+
+ try:
+ #create sequence files
+ f = open('/tmp/ct/seq1.txt','w')
+ f.write('\n'.join(self.lines1))
+ f.close()
+ g = open('/tmp/ct/seq2.txt','w')
+ g.write('\n'.join(self.lines2))
+ g.close()
+ #create alignment files
+ f = open('/tmp/ct/align1','w')
+ f.write(self.aln1)
+ f.close()
+ g = open('/tmp/ct/align2','w')
+ g.write(self.aln2)
+ g.close()
+ #create tree file
+ f = open('/tmp/ct/tree1','w')
+ f.write(DND1)
+ f.close()
+ except OSError:
+ pass
+
+
+
+class ClustalwTests(GeneralSetUp):
+ """Tests for the Clustalw application controller"""
+
+ def test_base_command(self):
+ """Clustalw BaseCommand should return the correct BaseCommand"""
+ c = Clustalw()
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "',getcwd(),'/"; ','clustalw -align']))
+ c.Parameters['-infile'].on('seq.txt')
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "',getcwd(),'/"; ',\
+ 'clustalw -infile="seq.txt" -align']))
+ c.Parameters['-align'].off()
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "',getcwd(),'/"; ','clustalw -infile="seq.txt"']))
+ c.Parameters['-nopgap'].on()
+ c.Parameters['-infile'].off()
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "',getcwd(),'/"; ','clustalw -nopgap']))
+
+ def test_changing_working_dir(self):
+ """Clustalw BaseCommand should change according to WorkingDir"""
+ c = Clustalw(WorkingDir='/tmp/clustaltest')
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "','/tmp/clustaltest','/"; ','clustalw -align']))
+ c = Clustalw(WorkingDir='/tmp/clustaltest/')
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "','/tmp/clustaltest/','/"; ','clustalw -align']))
+ c = Clustalw()
+ c.WorkingDir = '/tmp/clustaltest2/'
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "','/tmp/clustaltest2/','/"; ','clustalw -align']))
+
+        # removing the dirs is proof that they were created when WorkingDir
+        # was set; if the dirs are not there, rmdir raises an OSError
+ rmdir('/tmp/clustaltest')
+ rmdir('/tmp/clustaltest2')
+
+ def test_stdout_input_as_string(self):
+ """Clustalw input_as_string shoud function as expected"""
+ c = Clustalw(WorkingDir='/tmp/ct')
+ res = c('/tmp/ct/seq1.txt')
+ self.assertEqual(cw_vers.sub("", res['StdOut'].read()),
+ cw_vers.sub("", self.stdout1))
+ self.assertEqual(res['StdErr'].read(),'')
+ self.assertEqual(cw_vers.sub("", res['Align'].read()),
+ cw_vers.sub("", self.aln1))
+ self.assertEqual(res['Dendro'].read(),self.dnd1)
+ res.cleanUp()
+
+ def test_stdout_input_as_lines(self):
+ """Clustalw input_as_lines should function as expected"""
+ c = Clustalw(InputHandler='_input_as_lines',WorkingDir='/tmp/ct')
+ res = c(self.lines1)
+ #get info on input file name and change output accordingly
+ name = c.Parameters['-infile'].Value
+ out = self.stdout1.split('\n')
+ out[16] =\
+ 'Guide tree file created: ['+name.rsplit(".")[0]+'.dnd]'
+ out[23] =\
+ 'CLUSTAL-Alignment file created ['+name.rsplit(".")[0]+'.aln]'
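+        # (stdout lines 16 and 23 name the guide-tree and alignment files,
+        # which depend on the generated temp filename, so they are rewritten
+        # before comparison; the same patching recurs in the tests below)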
+
+ self.assertEqual(cw_vers.sub("", res['StdOut'].read()),
+ cw_vers.sub("", '\n'.join(out)))
+ self.assertEqual(res['StdErr'].read(),'')
+ self.assertEqual(cw_vers.sub("", res['Align'].read()),
+ cw_vers.sub("", self.aln1))
+ self.assertEqual(res['Dendro'].read(),self.dnd1)
+ res.cleanUp()
+
+ def test_stdout_input_as_lines_local(self):
+ """Clustalw input_as_lines should function as expected"""
+ c = Clustalw(InputHandler='_input_as_lines',WorkingDir=self.temp_dir_space)
+ res = c(self.lines1)
+ #get info on input file name and change output accordingly
+ name = c.Parameters['-infile'].Value
+ out = self.stdout1.split('\n')
+ out[16] =\
+ 'Guide tree file created: ['+name.rsplit(".")[0]+'.dnd]'
+ out[23] =\
+ 'CLUSTAL-Alignment file created ['+name.rsplit(".")[0]+'.aln]'
+
+ self.assertEqual(cw_vers.sub("", res['StdOut'].read()),
+ cw_vers.sub("", '\n'.join(out)))
+ self.assertEqual(res['StdErr'].read(),'')
+ self.assertEqual(cw_vers.sub("", res['Align'].read()),
+ cw_vers.sub("", self.aln1))
+ self.assertEqual(res['Dendro'].read(),self.dnd1)
+ res.cleanUp()
+
+ def test_stdout_input_as_seqs(self):
+ """Clustalw input_as_seqs should function as expected"""
+ c = Clustalw(InputHandler='_input_as_seqs',WorkingDir='/tmp/ct')
+ res = c(self.seqs1)
+ #get info on input file name and change output accordingly
+ name = c.Parameters['-infile'].Value
+ out = self.stdout1.split('\n')
+ out[16] =\
+ 'Guide tree file created: ['+name.rsplit(".")[0]+'.dnd]'
+ out[23] =\
+ 'CLUSTAL-Alignment file created ['+name.rsplit(".")[0]+'.aln]'
+
+ self.assertEqual(cw_vers.sub("", res['StdOut'].read()),
+ cw_vers.sub("", '\n'.join(out)))
+ self.assertEqual(res['StdErr'].read(),'')
+ self.assertEqual(cw_vers.sub("", res['Align'].read()),
+ cw_vers.sub("", self.aln1))
+ self.assertEqual(res['Dendro'].read(),self.dnd1)
+ res.cleanUp()
+
+ def test_stdout_input_as_multiline_string(self):
+ """Clustalw input_as_multiline_string should function as expected"""
+ c = Clustalw(InputHandler='_input_as_multiline_string',\
+ WorkingDir='/tmp/ct')
+ res = c(self.multiline1)
+ name = c.Parameters['-infile'].Value
+ out = self.stdout1.split('\n')
+ out[16] =\
+ 'Guide tree file created: ['+name.rsplit(".")[0]+'.dnd]'
+ out[23] =\
+ 'CLUSTAL-Alignment file created ['+name.rsplit(".")[0]+'.aln]'
+
+ self.assertEqual(cw_vers.sub("", res['StdOut'].read()),
+ cw_vers.sub("", '\n'.join(out)))
+ self.assertEqual(res['StdErr'].read(),'')
+ self.assertEqual(cw_vers.sub("", res['Align'].read()),
+ cw_vers.sub("", self.aln1))
+ self.assertEqual(res['Dendro'].read(),self.dnd1)
+ res.cleanUp()
+
+ def test_alignment_trees(self):
+ """Clustalw alignment should work correctly with new/usetree"""
+ c = Clustalw(params={'-quicktree':True,'-type':'DNA','-gapopen':10},\
+ WorkingDir='/tmp/ct')
+ res = c('/tmp/ct/seq1.txt')
+ self.assertEqual(res['Align'].name,'/tmp/ct/seq1.aln')
+ self.assertEqual(res['Dendro'].name,'/tmp/ct/seq1.dnd')
+ res.cleanUp()
+ c.Parameters['-usetree'].on('/tmp/ct/tree1')
+ c.Parameters['-output'].on('PHYLIP')
+ res = c('/tmp/ct/seq1.txt')
+ self.assertEqual(res['Align'].name,'/tmp/ct/seq1.phy')
+ self.assertEqual(res['Dendro'].name,'/tmp/ct/tree1')
+ res.cleanUp()
+ c.Parameters['-newtree'].on('newtree')
+ c.Parameters['-outfile'].on('outfile')
+ res = c('/tmp/ct/seq1.txt')
+ self.assertEqual(res['Align'].name, c.WorkingDir + 'outfile')
+ self.assertEqual(res['Dendro'].name, c.WorkingDir + 'newtree')
+ res.cleanUp()
+
+ def test_profile_newtree(self):
+ """Clustalw profile should work correctly with new/usetree"""
+ c = Clustalw(params={'-profile':None,'-profile1':'/tmp/ct/seq1.txt',\
+ '-profile2':'/tmp/ct/seq2.txt','-newtree1':'lala'},\
+ WorkingDir='/tmp/ct')
+ c.Parameters['-align'].off()
+ res = c()
+ self.assertEqual(res['Align'],None)
+ self.assertEqual(res['Dendro1'].name,'/tmp/ct/lala')
+ self.assertEqual(res['Dendro2'].name,'/tmp/ct/seq2.dnd')
+ res.cleanUp()
+
+ def test_sequences_newtree(self):
+ """Clustalw sequences should work correctly with new/usetree"""
+ c = Clustalw(params={'-sequences':None,'-newtree':'lala',\
+ '-profile1':'/tmp/ct/align1','-profile2':'/tmp/ct/seq2.txt'},\
+ WorkingDir='/tmp/ct')
+ c.Parameters['-align'].off()
+ res = c()
+ self.assertEqual(res['Align'],None)
+ self.assertEqual(res['Dendro'].name,'/tmp/ct/lala')
+ res.cleanUp()
+
+        # Is this a bug in clustalw? It creates an empty file 'seq2.aln'
+        # but doesn't report it in the stdout
+ remove('/tmp/ct/seq2.aln')
+
+ def test_tree_outputtree(self):
+ """Clustalw tree should work correctly with outputtree"""
+ c = Clustalw(params={'-tree':None,'-outputtree':'dist',\
+ '-infile':'/tmp/ct/align1'},WorkingDir='/tmp/ct/')
+ c.Parameters['-align'].off()
+ res = c()
+ self.assertEqual(res['Tree'].name,'/tmp/ct/align1.ph')
+ self.assertEqual(res['TreeInfo'].name,'/tmp/ct/align1.dst')
+ res.cleanUp()
+
+
+class clustalwTests(GeneralSetUp):
+ """Tests for module level functions in clustalw.py"""
+
+
+ def test_alignUnalignedSeqs(self):
+ """Clustalw alignUnalignedSeqs should work as expected"""
+ res = alignUnalignedSeqs(self.seqs1,WorkingDir='/tmp/ct')
+ self.assertNotEqual(res['StdErr'],None)
+ self.assertEqual(cw_vers.sub("", res['Align'].read()),
+ cw_vers.sub("", self.aln1))
+ self.assertEqual(res['Dendro'].read(),self.dnd1)
+ res.cleanUp()
+
+ #suppress stderr and stdout
+ res = alignUnalignedSeqs(self.seqs1,WorkingDir='/tmp/ct',\
+ SuppressStderr=True,SuppressStdout=True)
+ self.assertEqual(res['StdOut'],None)
+ self.assertEqual(res['StdErr'],None)
+ self.assertEqual(cw_vers.sub("", res['Align'].read()),
+ cw_vers.sub("", self.aln1))
+ self.assertEqual(res['Dendro'].read(),self.dnd1)
+ res.cleanUp()
+
+ def test_alignUnalignedSeqsFromFile(self):
+ """Clustalw alignUnalignedSeqsFromFile should work as expected"""
+ #make temp file
+ res = alignUnalignedSeqsFromFile('/tmp/ct/seq1.txt')
+ self.assertEqual(cw_vers.sub("", res['StdOut'].read()),
+ cw_vers.sub("", self.stdout1))
+ self.assertEqual(res['StdErr'].read(),'')
+ self.assertEqual(cw_vers.sub("", res['Align'].read()),
+ cw_vers.sub("", self.aln1))
+ self.assertEqual(res['Dendro'].read(),self.dnd1)
+ res.cleanUp()
+ #suppress stderr and stdout
+ res = alignUnalignedSeqsFromFile('/tmp/ct/seq1.txt',\
+ SuppressStderr=True, SuppressStdout=True)
+ self.assertEqual(res['StdOut'],None)
+ self.assertEqual(res['StdErr'],None)
+ self.assertEqual(cw_vers.sub("", res['Align'].read()),
+ cw_vers.sub("", self.aln1))
+ self.assertEqual(res['Dendro'].read(),self.dnd1)
+ res.cleanUp()
+
+ def test_alignTwoAlignments(self):
+ """Clustalw alignTwoAlignments should work as expected"""
+ res = alignTwoAlignments('/tmp/ct/align1','/tmp/ct/align2',\
+ 'twoalign.aln')
+ self.assertEqual(cw_vers.sub("", res['Align'].read()),
+ cw_vers.sub("", self.twoalign))
+ self.assertNotEqual(res['Dendro1'],None)
+ self.assertNotEqual(res['Dendro2'],None)
+        # Are new trees created during the profiling? The trees produced
+        # here differ from those produced when aligning individually
+ #self.assertEqual(res['Dendro1'].read(),self.dnd)
+ #self.assertEqual(res['Dendro2'].read(),self.dnd2)
+ res.cleanUp()
+
+ def test_addSeqsToAlignment(self):
+ """Clustalw addSeqsToAlignment shoudl work as expected"""
+ res = addSeqsToAlignment('/tmp/ct/align1','/tmp/ct/seq2.txt',\
+ 'alignseqs')
+ self.assertEqual(cw_vers.sub("", res['Align'].read()),
+ cw_vers.sub("", self.alignseqs))
+ self.assertEqual(res['Dendro'].read(),self.treeduringalignseqs)
+ res.cleanUp()
+
+ def test_buildTreeFromAlignment(self):
+ """Clustalw buildTreeFromAlignment shoudl work as expected"""
+ pre_res = addSeqsToAlignment('/tmp/ct/align1','/tmp/ct/seq2.txt',\
+ 'alignseqs',WorkingDir='/tmp/ct')
+ res = buildTreeFromAlignment('/tmp/ct/alignseqs',WorkingDir='/tmp/ct')
+ self.assertEqual(res['Tree'].read(),self.treefromalignseqs)
+
+ res.cleanUp()
+ pre_res.cleanUp()
+
+ def test_build_tree_from_alignment(self):
+ """Clustalw should return a tree built from the passed alignment"""
+ tree_short = build_tree_from_alignment(self.build_tree_seqs_short, \
+ RNA, best_tree=False)
+ num_seqs = flatten(self.build_tree_seqs_short).count('>')
+ self.assertEqual(len(tree_short.tips()), num_seqs)
+
+ tree_long = build_tree_from_alignment(self.build_tree_seqs_long, \
+ RNA, best_tree=False)
+ seq_names = []
+ for line in self.build_tree_seqs_long.split('\n'):
+ if line.startswith('>'):
+ seq_names.append(line[1:])
+
+ for node in tree_long.tips():
+ if node.Name not in seq_names:
+ self.fail()
+
+ tree_short = build_tree_from_alignment(self.build_tree_seqs_short, \
+ RNA, best_tree=True, params={'-bootstrap':3})
+ num_seqs = flatten(self.build_tree_seqs_short).count('>')
+ self.assertEqual(len(tree_short.tips()), num_seqs)
+
+ def test_align_unaligned_seqs(self):
+ """Clustalw align_unaligned_seqs should work as expected"""
+ res = align_unaligned_seqs(self.seqs1, RNA)
+ self.assertEqual(res.toFasta(), self.aln1_fasta)
+
+ def test_bootstrap_tree_from_alignment(self):
+ """Clustalw should return a bootstrapped tree from the passed aln"""
+ tree_short = bootstrap_tree_from_alignment(self.build_tree_seqs_short)
+ num_seqs = flatten(self.build_tree_seqs_short).count('>')
+ self.assertEqual(len(tree_short.tips()), num_seqs)
+
+ tree_long = bootstrap_tree_from_alignment(self.build_tree_seqs_long)
+ seq_names = []
+ for line in self.build_tree_seqs_long.split('\n'):
+ if line.startswith('>'):
+ seq_names.append(line[1:])
+
+ for node in tree_long.tips():
+ if node.Name not in seq_names:
+ self.fail()
+
+    def test_align_and_build_tree(self):
+ """Aligns and builds a tree for a set of sequences"""
+ res = align_and_build_tree(self.seqs1, RNA)
+ self.assertEqual(res['Align'].toFasta(), self.aln1_fasta)
+
+ tree = res['Tree']
+ seq_names = []
+ for line in self.aln1_fasta.split('\n'):
+ if line.startswith('>'):
+ seq_names.append(line[1:])
+
+ for node in tree.tips():
+ if node.Name not in seq_names:
+ self.fail()
+
+ def test_add_seqs_to_alignment(self):
+ """Clustalw add_seqs_to_alignment should work as expected."""
+ seq2 = dict(parse_fasta(self.lines2))
+ align1 = dict(parse_fasta(ALIGN1_FASTA.split('\n')))
+ res = add_seqs_to_alignment(seq2,align1,RNA)
+ self.assertEqual(res.toFasta(), SEQ_PROFILE_ALIGN)
+
+ def test_align_two_alignments(self):
+ """Clustalw align_two_alignments should work as expected."""
+ align1 = dict(parse_fasta(ALIGN1_FASTA.split('\n')))
+ align2 = dict(parse_fasta(ALIGN2_FASTA.split('\n')))
+ res = align_two_alignments(align1,align2,RNA)
+ self.assertEqual(res.toFasta(), PROFILE_PROFILE_ALIGN)
+
+ def test_zzz_general_cleanUp(self):
+ """Last test executed: cleans up all files initially created"""
+ remove('/tmp/ct/seq1.txt')
+ remove('/tmp/ct/seq2.txt')
+ remove('/tmp/ct/align1')
+ remove('/tmp/ct/align2')
+ remove('/tmp/ct/tree1')
+ rmdir('/tmp/ct')
+ shutil.rmtree(self.temp_dir_space)
+
+STDOUT1=\
+"""
+
+
+ CLUSTAL W (1.83) Multiple Sequence Alignments
+
+
+
+Sequence format is Pearson
+Sequence 1: 1 23 bp
+Sequence 2: 2 13 bp
+Sequence 3: 3 17 bp
+Start of Pairwise alignments
+Aligning...
+Sequences (1:2) Aligned. Score: 46
+Sequences (1:3) Aligned. Score: 41
+Sequences (2:3) Aligned. Score: 30
+Guide tree file created: [/tmp/ct/seq1.dnd]
+Start of Multiple Alignment
+There are 2 groups
+Aligning...
+Group 1: Sequences: 2 Score:171
+Group 2: Sequences: 3 Score:162
+Alignment Score 33
+CLUSTAL-Alignment file created [/tmp/ct/seq1.aln]
+"""
+
+ALIGN1=\
+"""CLUSTAL W (1.83) multiple sequence alignment
+
+
+1 ACUGCUAGCUAGUAGCGUACGUA
+2 ---GCUACGUAGCUAC-------
+3 GCGGCUAUUAGAUCGUA------
+ ****
+"""
+
+ALIGN1_FASTA = ">seq_0\nACUGCUAGCUAGUAGCGUACGUA\n>seq_1\n---GCUACGUAGCUAC-------\n>seq_2\nGCGGCUAUUAGAUCGUA------"
+
+DND1=\
+"""(
+1:0.21719,
+2:0.32127,
+3:0.37104);
+"""
+
+ALIGN2 =\
+"""CLUSTAL W (1.83) multiple sequence alignment
+
+
+a UAGGCUCUGAUAUAAUAGCUCUC---------
+b ----UAUCGCUUCGACGAUUCUCUGAUAGAGA
+c ------------UGACUACGCAU---------
+ * *
+"""
+
+ALIGN2_FASTA = ">a\nUAGGCUCUGAUAUAAUAGCUCUC---------\n>b\n----UAUCGCUUCGACGAUUCUCUGAUAGAGA\n>c\n------------UGACUACGCAU---------"
+
+DND2=\
+"""(
+a:0.30435,
+b:0.30435,
+c:0.33202);
+"""
+
+TWOALIGN=\
+"""CLUSTAL W (1.83) multiple sequence alignment
+
+
+1 ---ACUGCUAGCUAGUAGCGUACGUA------
+2 ------GCUACGUAGCUAC-------------
+3 ---GCGGCUAUUAGAUCGUA------------
+a UAGGCUCUGAUAUAAUAGCUCUC---------
+b ----UAUCGCUUCGACGAUUCUCUGAUAGAGA
+c ------------UGACUACGCAU---------
+
+"""
+
+ALIGNSEQS=\
+"""CLUSTAL W (1.83) multiple sequence alignment
+
+
+1 ----------ACUGCUAGCUAGUAGCGUACGUA
+2 -------------GCUACGUAGCUAC-------
+3 ----------GCGGCUAUUAGAUCGUA------
+a -------UAGGCUCUGAUAUAAUAGCUCUC---
+c -------------------UGACUACGCAU---
+b UAUCGCUUCGACGAUUCUCUGAUAGAGA-----
+
+"""
+
+TREEDURINGALIGNSEQS=\
+"""(
+1:0.34511,
+(
+2:0.25283,
+(
+(
+3:0.21486,
+a:0.19691)
+:0.11084,
+b:0.31115)
+:0.06785)
+:0.02780,
+c:0.20035);
+"""
+
+TREEFROMALIGNSEQS=\
+"""(
+(
+(
+1:0.17223,
+(
+2:0.14749,
+c:0.13822)
+:0.19541)
+:0.07161,
+a:0.25531)
+:0.03600,
+3:0.29438,
+b:0.23503);
+"""
+
+SEQ_PROFILE_ALIGN = """>a\n-------UAGGCUCUGAUAUAAUAGCUCUC---\n>b\nUAUCGCUUCGACGAUUCUCUGAUAGAGA-----\n>c\n-------------------UGACUACGCAU---\n>seq_0\n----------ACUGCUAGCUAGUAGCGUACGUA\n>seq_1\n-------------GCUACGUAGCUAC-------\n>seq_2\n----------GCGGCUAUUAGAUCGUA------"""
+
+PROFILE_PROFILE_ALIGN = """>a\nUAGGCUCUGAUAUAAUAGCUCUC---------\n>b\n----UAUCGCUUCGACGAUUCUCUGAUAGAGA\n>c\n------------UGACUACGCAU---------\n>seq_0\n---ACUGCUAGCUAGUAGCGUACGUA------\n>seq_1\n------GCUACGUAGCUAC-------------\n>seq_2\n---GCGGCUAUUAGAUCGUA------------"""
+
+if __name__ == '__main__':
+ main()
diff --git a/bfillings/tests/test_fasttree.py b/bfillings/tests/test_fasttree.py
new file mode 100644
index 0000000..e86cd56
--- /dev/null
+++ b/bfillings/tests/test_fasttree.py
@@ -0,0 +1,182 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+
+"""Tests for FastTree v1.1 application controller.
+Also works with v2.0.1, v2.1.0 and v2.1.3"""
+
+from shutil import rmtree
+from os import getcwd
+from unittest import TestCase, main
+
+from cogent.core.alignment import Alignment
+from cogent.parse.tree import DndParser
+from cogent.core.moltype import DNA
+
+from skbio.parse.sequences import parse_fasta
+
+from bfillings.fasttree import FastTree, build_tree_from_alignment
+
+
+class FastTreeTests(TestCase):
+ def setUp(self):
+ self.seqs = Alignment(dict(parse_fasta(test_seqs.split())))
+
+ def test_base_command(self):
+ app = FastTree()
+ self.assertEqual(app.BaseCommand, \
+ ''.join(['cd "',getcwd(),'/"; ','FastTree']))
+ app.Parameters['-nt'].on()
+ self.assertEqual(app.BaseCommand, \
+ ''.join(['cd "',getcwd(),'/"; ','FastTree -nt']))
+
+ def test_change_working_dir(self):
+ app = FastTree(WorkingDir='/tmp/FastTreeTest')
+ self.assertEqual(app.BaseCommand, \
+ ''.join(['cd "','/tmp/FastTreeTest','/"; ','FastTree']))
+ rmtree('/tmp/FastTreeTest')
+
+ def test_build_tree_from_alignment(self):
+ tree = build_tree_from_alignment(self.seqs, DNA)
+ # test expected output for fasttree 1.1 and 2.0.1
+ try:
+ for o,e in zip(tree.traverse(), DndParser(exp_tree).traverse()):
+ self.assertEqual(o.Name,e.Name)
+ self.assertAlmostEqual(o.Length,e.Length)
+ except AssertionError:
+ for o,e in zip(tree.traverse(), DndParser(exp_tree_201).traverse()):
+ self.assertEqual(o.Name,e.Name)
+ self.assertAlmostEqual(o.Length,e.Length)
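+        # (both outputs are supported because FastTree 2.x orders nodes and
+        # reports branch lengths slightly differently, so the 1.1 tree is
+        # tried first and the 2.0.1 tree on AssertionError)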
+test_seqs = """>test_set1_0
+GGTAGATGGGACTACCTCATGACATGAAACTGCAGTCTGTTCTTTTATAGAAGCTTCATACTTGGAGATGTATACTATTA
+CTTAGGACTATGGAGGTATA
+>test_set1_1
+GGTTGATGGGACTACGTAGTGACATGAAATTGCAGTCTGTGCTTTTATAGAAGTTTGATACTTGGAGCTCTCTACTATTA
+CTTAGGACTATGGAGGTATA
+>test_set1_2
+GGTTGATGGGCCTACCTCATGACAATAAACTGAAGTCTGTGCTTTTATAGAGGCTTGATACTTGGAGCTCTATACTATTA
+CTTAGGATTATGGAGGTCTA
+>test_set1_3
+GGTTGATGGGACTACCTCATGACATGAAACTGCAGTCTGTGCTTTTATAGAAGCTTGATACTTGGAGATCTATACTATTA
+CTTAGGACTATGGAGGTCAC
+>test_set1_4
+GGTTGGTGGGACTACCTCATGACATGAAGATGCAGTCTGTGCTTGTATAGAAGCTTGAAACTTGGATATCTATACTATTA
+CTTAAGACTATGGAGGTCTA
+>test_set1_5
+GGTTGATGCGACTACCTCATGACATGAGACTGCAGTCTGTGCTTTTACTGAAGCTTGATACTTGGAGATCTATACTATTA
+CTTAGGACTATGGAGGTTTA
+>test_set1_6
+GGTTGATGGGACTACCTCATGACATGAAAATGCAGTCTGTCCTTTTATAGAAGCTTGATACTTGTAGATCTATACTGTTA
+CTTAGGACTATGGAGGTCTA
+>test_set1_7
+GGTTGATGGGACTCCCTCATGACATAAAACTGCAGTCTGTGCTTTTACAGAAGCTTGATACTTGGAGATCTATACTATTA
+CATAGGACTATGGAGGTCTA
+>test_set1_8
+GGTTGATGGCACTACCTCATGAGATGAAACTGCAGTCTGTGCTTTTATAGAAGCTTGATACTTGGATATCTATACTATAA
+CTTAGTACTATGGAGGCCTA
+>test_set1_9
+GGTTTATGTTACTACCTCATGACATGAAACGGCAGCATGTGCTTTTATAGAAGCTTGATACTTGGAGATCTAAACTATTA
+CTTAGGACTATGGAGGTCTA
+>test_set2_0
+AGCGAATCATACTCTGGAAAGAAAAGGACGACTCCTTTGCTCGCGGTCTAGCTGCTACAGCTTCACCGAGTACATCTGAA
+TGATGGTTGAACCGGGTTCA
+>test_set2_1
+AGAGAATAGTACTCTGGAAAGACAAGGACGACTCCTTTGATCGCGGTCTAGCTGCTACAGCTTCACCGAGTACATCTGAA
+TGATGGTTGAACCGGATTCA
+>test_set2_2
+AGAGTATAATACTCTGGAAAGAAAAGGACGACTCCTTTGATCGCGGTCTAGCTGCTACAGCTTCACCGAGTACATCTTAA
+TGATGGTTGAACCGGGGTCA
+>test_set2_3
+AGAGAATCATACTCTGGAAAGAAATGGACGACTCCTTTGATCGCGGTCCAGCTGCTACAGCTTCACCGAGTACATCTGAA
+TGATGGTTGGACCGGGTTCA
+>test_set2_4
+AGAGAATAATAGTCTGGAAAGAAAAGGACGACTCCTTTGTTCCCGGTCTAGCTGCTACAGCTTCCCCGAGTACATCTGAA
+TGATGGTTGAACCGGGTTCA
+>test_set2_5
+ACAGAATACTACTCTGGAAAGAAAAGGCCGACTCCTTTGATCGCTGTCTAGCTGCGACAGCTGCACGGAGTCCATCCGAA
+TGATGGTTGAACCGGGTTCA
+>test_set2_6
+AGAGAATAATACTCTGGACAGAAATGGACGACTCCTTTGATCGCGGTCTAGCTGCTACAGCTTCACCGAGTACATCTGAA
+TGATGGCTGAACCGGGTTCA
+>test_set2_7
+AGAGAATATTACTCTGGAAAGAAAAGGACGACTCCTTGGATCGCGGTCTAGCTGCTACAGCTTCAGCGAGTACATCGGAA
+TGATGGTTTAACCGGGTTCA
+>test_set2_8
+AGTGAATAATACTCTGGAAAGAAAAGGACGACTCCTTTGATCGCGGTCTAGCTGCTAGAGCTTCACCGAGTACATCTGAA
+TGATGGTTGAACCGGGTTCA
+>test_set2_9
+AGAGATTAATACTCTGGATAGAAAATGACGACTCCTTTGATCGCGGTCTAGCTGCTACAGATTGACCTATTACATCTGAA
+TGATGGTTGAACCGGGTTCA
+>test_set3_0
+TTGTCTCCATTGAGCACTCTAATCTTGCCGTGTATTCAGGAAAGGAGGATAGAACTCGGACAGTATTCTGAACATTACAG
+AATCGCCGTATTTACGGTGT
+>test_set3_1
+TTGTCTCCATTGAGCACTCTAATCATGCCGTGTATTCAGGAACGGAGGAGAGGACTCGGTCAGTATTCGGAACATTACAG
+AATGGCGTTATTTACGGTGT
+>test_set3_2
+TTGTCTCCATTGAGCACTCTAATCTTGCCGTGTATTCAGGAACGGAGGATAGAACTCGGACAGAATCCTGAATATTACAA
+AATCGGGTTATTTACGGTGT
+>test_set3_3
+TTGTCTCCATTGAGCACTCTAATCTTGCCGTGTTTTCAGGAACGGAGGATAGAACTCGGACAGTAGCCTGAACATTACAG
+AATCCCGTTATTTACGGTGT
+>test_set3_4
+TTGTCTCCATCGAGCACTCTAATCTTGCCGTGTATTCAGGAACGGAGGATTGAACTCGGACAGTATCCTGAACATTACAG
+AATCGCGTTATTTACGGTGT
+>test_set3_5
+TTGTCTCCATTGAGCACGCTAAGCTTGCCGTGTATTCAGGAACGGAGGATAGAACTCGGACAGTATCCTGAACATTACAG
+AATCGCGTTATTTACGGTGT
+>test_set3_6
+TTGTCGTCATTGAGCACTCTAATCTTGCCGTGTATTCAGGAACGAAGGATAGAACTCGGACAGTATCCTGAACTTTGCAA
+AATCGCGTTATTTACGGTGT
+>test_set3_7
+TTGTCTCCATTGAGCACTCTAATCTAGCCGTGTAGTCAGGAACGGAGGATGGAACGCGCACAGTATCCTGAACATAACAG
+AATCGCGTTATTTACGGTGT
+>test_set3_8
+TTGTCTCCATTGAGCACTCTAATCTTGCCGTATATTCCCGAACGGAGGATAGAACTCGGACAGTAGCCTGAACAGTACAG
+AATCGCGTTATTTACGGTGT
+>test_set3_9
+TTGTCTCCCTTGAGCACTCTAATCTTGCCGTGTATTCAGGAACGGAGGATAGAACTCGGACAGTATCCTGAACATTACAG
+AATCGCGTTATTTACGGTGT
+>test_set4_0
+CTTTTACCGGGCTGCCCGAGAGCACTATCTGCGTCGTGCCCTGCTTCGATGCCCACACTACCATCATACTATTCGTGAAT
+TTGCGGCCGCTAAGATCCGA
+>test_set4_1
+CTTTTATCGGGGTGCCTGATAGCACCATCTGCGTCGTGCCCTGCTTCGATGCCTAAACCACCGTCATGCTATTTGTGAAT
+TTGAGGTCGCTAAGAGCCCA
+>test_set4_2
+CTTTTATCGGGGTGCCCGAGAGCACCATCTGCGTCGTGCCCTGCTTCGATGCCCAGGCCACCATCATACTATTTGTGGCT
+TAGGGGTCGCTAAGAGCCGA
+>test_set4_3
+CTTTTATCGGGGGGCCCGAGAGCACCACCTGCGTCGTGCCCTGCTTCGATGCCCAAACCACCATCATACTATTTGTGAAT
+TTGGGGTCGCTAAGAGCCGA
+>test_set4_4
+CTTTTATAGGGGTGCCCGAGAGCACCATCTGCGTCGTGCCCAGCTTCGATTTCCAAACCACCATCATACTATTTGTGAAC
+TTGGGGACGTTAAGAGCCGA
+>test_set4_5
+CTTTTCGCGGGGTGCCCGAGAGCACCATCTGCGTCGCGCCCTGCTTCGGTGCCCATACCACCATCATAATATTTGGGAAA
+TTGGGATCGCTAAGAGTCGA
+>test_set4_6
+CTTTTCTCGGGGTGCCCGAGAGCCCCATCTGCGTTGTGCCCTGCTACTATGCCCAAACCACCATCATACTATTTGTGAAT
+GTGGCGTCGCTCAGAGCCGA
+>test_set4_7
+CTTTTATCGGGGTGCCCGAGAGCACCATCTGCGTCGTGCCCTGCTTCGATGCCCACGTCACCATACTACTATTTGTGAAT
+TTGGGGTCGCTAATAGCCGA
+>test_set4_8
+CTTTTATCGGGGGGCCCGAGAGCATCATCTGCGTCGTGCCCTGCTTCGATGCCCAAACTACCATCATACTATTTGTGAAT
+TTGGGGTTTCTAAGAGCCGA
+>test_set4_9
+CTTTTACCGGGGTGACCGAGAGCACCATCTGCGCCGTGCCCTGCTTCGAGGCCCAAACCACCATCATACTGTTTGTGAAT
+CAGGGGTTGCTAAGAGCCGA"""
+
+exp_tree = """((test_set2_0:0.02121,(test_set2_8:-0.03148,(((test_set3_6:0.05123,(test_set3_5:0.01878,((test_set3_0:0.03155,test_set3_1:0.06432)0.664:0.01096,(((test_set3_3:0.02014,test_set3_8:0.04240)0.880:0.01129,(test_set3_7:0.05900,test_set3_4:0.01449)0.756:0.00571)0.514:0.00038,test_set3_9:0.00907)0.515:0.00020)0.834:0.00164)0.708:0.01349)0.754:0.19207,test_set3_2:-0.16026)0.999:1.34181,(test_set1_2:0.00324,((test_set1_0:0.04356,test_set1_1:0.07539)0.393:0.00223,((test_set1_3:0.0199 [...]
+# for FastTree version 2.0.1
+exp_tree_201 = """(((test_set2_8:0.00039,(((test_set3_6:0.05278,(test_set3_5:0.02030,(((test_set3_0:0.03166,test_set3_1:0.06412)0.783:0.00945,(test_set3_7:0.06330,test_set3_4:0.02026)0.896:0.00014)0.911:0.00014,((test_set3_3:0.02053,test_set3_8:0.04149)0.790:0.00995,test_set3_9:0.01011)0.927:0.00015)0.922:0.00015)0.780:0.00976)0.763:0.03112,test_set3_2:0.00014)0.881:1.40572,(((((test_set1_9:0.07378,(test_set1_7:0.03123,test_set1_5:0.04198)0.756:0.00995)0.883:0.00016,(test_set1_3:0.02027, [...]
+
+if __name__ == '__main__':
+ main()
diff --git a/bfillings/tests/test_fasttree_v1.py b/bfillings/tests/test_fasttree_v1.py
new file mode 100644
index 0000000..23a6890
--- /dev/null
+++ b/bfillings/tests/test_fasttree_v1.py
@@ -0,0 +1,174 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+
+"""Tests for FastTree v1.0.0 application controller"""
+
+from shutil import rmtree
+from os import getcwd
+from unittest import TestCase, main
+
+from skbio.parse.sequences import parse_fasta
+
+from cogent.core.alignment import Alignment
+from cogent.parse.tree import DndParser
+from cogent.core.moltype import DNA
+
+from bfillings.fasttree_v1 import FastTree, build_tree_from_alignment
+
+
+class FastTreeTests(TestCase):
+ def setUp(self):
+ self.seqs = Alignment(dict(parse_fasta(test_seqs.split())))
+
+ def test_base_command(self):
+ app = FastTree()
+ self.assertEqual(app.BaseCommand, \
+ ''.join(['cd "',getcwd(),'/"; ','FastTree']))
+ app.Parameters['-nt'].on()
+ self.assertEqual(app.BaseCommand, \
+ ''.join(['cd "',getcwd(),'/"; ','FastTree -nt']))
+
+ def test_change_working_dir(self):
+ app = FastTree(WorkingDir='/tmp/FastTreeTest')
+ self.assertEqual(app.BaseCommand, \
+ ''.join(['cd "','/tmp/FastTreeTest','/"; ','FastTree']))
+ rmtree('/tmp/FastTreeTest')
+
+ def test_build_tree_from_alignment(self):
+ tree = build_tree_from_alignment(self.seqs, DNA)
+ for o,e in zip(tree.traverse(), DndParser(exp_tree).traverse()):
+ self.assertEqual(o.Name,e.Name)
+ self.assertAlmostEqual(o.Length,e.Length)
+
+test_seqs = """>test_set1_0
+GGTAGATGGGACTACCTCATGACATGAAACTGCAGTCTGTTCTTTTATAGAAGCTTCATACTTGGAGATGTATACTATTA
+CTTAGGACTATGGAGGTATA
+>test_set1_1
+GGTTGATGGGACTACGTAGTGACATGAAATTGCAGTCTGTGCTTTTATAGAAGTTTGATACTTGGAGCTCTCTACTATTA
+CTTAGGACTATGGAGGTATA
+>test_set1_2
+GGTTGATGGGCCTACCTCATGACAATAAACTGAAGTCTGTGCTTTTATAGAGGCTTGATACTTGGAGCTCTATACTATTA
+CTTAGGATTATGGAGGTCTA
+>test_set1_3
+GGTTGATGGGACTACCTCATGACATGAAACTGCAGTCTGTGCTTTTATAGAAGCTTGATACTTGGAGATCTATACTATTA
+CTTAGGACTATGGAGGTCAC
+>test_set1_4
+GGTTGGTGGGACTACCTCATGACATGAAGATGCAGTCTGTGCTTGTATAGAAGCTTGAAACTTGGATATCTATACTATTA
+CTTAAGACTATGGAGGTCTA
+>test_set1_5
+GGTTGATGCGACTACCTCATGACATGAGACTGCAGTCTGTGCTTTTACTGAAGCTTGATACTTGGAGATCTATACTATTA
+CTTAGGACTATGGAGGTTTA
+>test_set1_6
+GGTTGATGGGACTACCTCATGACATGAAAATGCAGTCTGTCCTTTTATAGAAGCTTGATACTTGTAGATCTATACTGTTA
+CTTAGGACTATGGAGGTCTA
+>test_set1_7
+GGTTGATGGGACTCCCTCATGACATAAAACTGCAGTCTGTGCTTTTACAGAAGCTTGATACTTGGAGATCTATACTATTA
+CATAGGACTATGGAGGTCTA
+>test_set1_8
+GGTTGATGGCACTACCTCATGAGATGAAACTGCAGTCTGTGCTTTTATAGAAGCTTGATACTTGGATATCTATACTATAA
+CTTAGTACTATGGAGGCCTA
+>test_set1_9
+GGTTTATGTTACTACCTCATGACATGAAACGGCAGCATGTGCTTTTATAGAAGCTTGATACTTGGAGATCTAAACTATTA
+CTTAGGACTATGGAGGTCTA
+>test_set2_0
+AGCGAATCATACTCTGGAAAGAAAAGGACGACTCCTTTGCTCGCGGTCTAGCTGCTACAGCTTCACCGAGTACATCTGAA
+TGATGGTTGAACCGGGTTCA
+>test_set2_1
+AGAGAATAGTACTCTGGAAAGACAAGGACGACTCCTTTGATCGCGGTCTAGCTGCTACAGCTTCACCGAGTACATCTGAA
+TGATGGTTGAACCGGATTCA
+>test_set2_2
+AGAGTATAATACTCTGGAAAGAAAAGGACGACTCCTTTGATCGCGGTCTAGCTGCTACAGCTTCACCGAGTACATCTTAA
+TGATGGTTGAACCGGGGTCA
+>test_set2_3
+AGAGAATCATACTCTGGAAAGAAATGGACGACTCCTTTGATCGCGGTCCAGCTGCTACAGCTTCACCGAGTACATCTGAA
+TGATGGTTGGACCGGGTTCA
+>test_set2_4
+AGAGAATAATAGTCTGGAAAGAAAAGGACGACTCCTTTGTTCCCGGTCTAGCTGCTACAGCTTCCCCGAGTACATCTGAA
+TGATGGTTGAACCGGGTTCA
+>test_set2_5
+ACAGAATACTACTCTGGAAAGAAAAGGCCGACTCCTTTGATCGCTGTCTAGCTGCGACAGCTGCACGGAGTCCATCCGAA
+TGATGGTTGAACCGGGTTCA
+>test_set2_6
+AGAGAATAATACTCTGGACAGAAATGGACGACTCCTTTGATCGCGGTCTAGCTGCTACAGCTTCACCGAGTACATCTGAA
+TGATGGCTGAACCGGGTTCA
+>test_set2_7
+AGAGAATATTACTCTGGAAAGAAAAGGACGACTCCTTGGATCGCGGTCTAGCTGCTACAGCTTCAGCGAGTACATCGGAA
+TGATGGTTTAACCGGGTTCA
+>test_set2_8
+AGTGAATAATACTCTGGAAAGAAAAGGACGACTCCTTTGATCGCGGTCTAGCTGCTAGAGCTTCACCGAGTACATCTGAA
+TGATGGTTGAACCGGGTTCA
+>test_set2_9
+AGAGATTAATACTCTGGATAGAAAATGACGACTCCTTTGATCGCGGTCTAGCTGCTACAGATTGACCTATTACATCTGAA
+TGATGGTTGAACCGGGTTCA
+>test_set3_0
+TTGTCTCCATTGAGCACTCTAATCTTGCCGTGTATTCAGGAAAGGAGGATAGAACTCGGACAGTATTCTGAACATTACAG
+AATCGCCGTATTTACGGTGT
+>test_set3_1
+TTGTCTCCATTGAGCACTCTAATCATGCCGTGTATTCAGGAACGGAGGAGAGGACTCGGTCAGTATTCGGAACATTACAG
+AATGGCGTTATTTACGGTGT
+>test_set3_2
+TTGTCTCCATTGAGCACTCTAATCTTGCCGTGTATTCAGGAACGGAGGATAGAACTCGGACAGAATCCTGAATATTACAA
+AATCGGGTTATTTACGGTGT
+>test_set3_3
+TTGTCTCCATTGAGCACTCTAATCTTGCCGTGTTTTCAGGAACGGAGGATAGAACTCGGACAGTAGCCTGAACATTACAG
+AATCCCGTTATTTACGGTGT
+>test_set3_4
+TTGTCTCCATCGAGCACTCTAATCTTGCCGTGTATTCAGGAACGGAGGATTGAACTCGGACAGTATCCTGAACATTACAG
+AATCGCGTTATTTACGGTGT
+>test_set3_5
+TTGTCTCCATTGAGCACGCTAAGCTTGCCGTGTATTCAGGAACGGAGGATAGAACTCGGACAGTATCCTGAACATTACAG
+AATCGCGTTATTTACGGTGT
+>test_set3_6
+TTGTCGTCATTGAGCACTCTAATCTTGCCGTGTATTCAGGAACGAAGGATAGAACTCGGACAGTATCCTGAACTTTGCAA
+AATCGCGTTATTTACGGTGT
+>test_set3_7
+TTGTCTCCATTGAGCACTCTAATCTAGCCGTGTAGTCAGGAACGGAGGATGGAACGCGCACAGTATCCTGAACATAACAG
+AATCGCGTTATTTACGGTGT
+>test_set3_8
+TTGTCTCCATTGAGCACTCTAATCTTGCCGTATATTCCCGAACGGAGGATAGAACTCGGACAGTAGCCTGAACAGTACAG
+AATCGCGTTATTTACGGTGT
+>test_set3_9
+TTGTCTCCCTTGAGCACTCTAATCTTGCCGTGTATTCAGGAACGGAGGATAGAACTCGGACAGTATCCTGAACATTACAG
+AATCGCGTTATTTACGGTGT
+>test_set4_0
+CTTTTACCGGGCTGCCCGAGAGCACTATCTGCGTCGTGCCCTGCTTCGATGCCCACACTACCATCATACTATTCGTGAAT
+TTGCGGCCGCTAAGATCCGA
+>test_set4_1
+CTTTTATCGGGGTGCCTGATAGCACCATCTGCGTCGTGCCCTGCTTCGATGCCTAAACCACCGTCATGCTATTTGTGAAT
+TTGAGGTCGCTAAGAGCCCA
+>test_set4_2
+CTTTTATCGGGGTGCCCGAGAGCACCATCTGCGTCGTGCCCTGCTTCGATGCCCAGGCCACCATCATACTATTTGTGGCT
+TAGGGGTCGCTAAGAGCCGA
+>test_set4_3
+CTTTTATCGGGGGGCCCGAGAGCACCACCTGCGTCGTGCCCTGCTTCGATGCCCAAACCACCATCATACTATTTGTGAAT
+TTGGGGTCGCTAAGAGCCGA
+>test_set4_4
+CTTTTATAGGGGTGCCCGAGAGCACCATCTGCGTCGTGCCCAGCTTCGATTTCCAAACCACCATCATACTATTTGTGAAC
+TTGGGGACGTTAAGAGCCGA
+>test_set4_5
+CTTTTCGCGGGGTGCCCGAGAGCACCATCTGCGTCGCGCCCTGCTTCGGTGCCCATACCACCATCATAATATTTGGGAAA
+TTGGGATCGCTAAGAGTCGA
+>test_set4_6
+CTTTTCTCGGGGTGCCCGAGAGCCCCATCTGCGTTGTGCCCTGCTACTATGCCCAAACCACCATCATACTATTTGTGAAT
+GTGGCGTCGCTCAGAGCCGA
+>test_set4_7
+CTTTTATCGGGGTGCCCGAGAGCACCATCTGCGTCGTGCCCTGCTTCGATGCCCACGTCACCATACTACTATTTGTGAAT
+TTGGGGTCGCTAATAGCCGA
+>test_set4_8
+CTTTTATCGGGGGGCCCGAGAGCATCATCTGCGTCGTGCCCTGCTTCGATGCCCAAACTACCATCATACTATTTGTGAAT
+TTGGGGTTTCTAAGAGCCGA
+>test_set4_9
+CTTTTACCGGGGTGACCGAGAGCACCATCTGCGCCGTGCCCTGCTTCGAGGCCCAAACCACCATCATACTGTTTGTGAAT
+CAGGGGTTGCTAAGAGCCGA"""
+
+exp_tree = """(test_set1_3:0.02062,(test_set1_8:0.05983,test_set1_9:0.07093)0.652:0.00422,((test_set1_5:0.04140,test_set1_7:0.03208)0.634:0.00995,((test_set1_0:0.04748,(test_set1_1:0.07025,(test_set1_2:-0.00367,((((((test_set3_4:0.01485,test_set3_7:0.05863)0.862:0.00569,(test_set3_5:0.02048,(test_set3_3:0.02036,test_set3_8:0.04218)0.724:0.01088)0.397:0.00005)0.519:0.00018,((test_set3_0:0.03139,test_set3_1:0.06448)0.699:0.01095,test_set3_9:0.00940)0.505:0.00036)0.721:0.01080,test_set3_6:0 [...]
+
+if __name__ == '__main__':
+ main()
diff --git a/bfillings/tests/test_formatdb.py b/bfillings/tests/test_formatdb.py
new file mode 100755
index 0000000..168c8d4
--- /dev/null
+++ b/bfillings/tests/test_formatdb.py
@@ -0,0 +1,233 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+""" Description
+File created on 16 Sep 2009.
+
+"""
+from __future__ import division
+from os.path import split, exists
+from unittest import TestCase, main
+
+from skbio.util import remove_files
+
+from cogent import LoadSeqs
+from cogent.app.util import get_tmp_filename
+
+from bfillings.blast import blastn
+from bfillings.formatdb import (FormatDb, build_blast_db_from_seqs,
+ build_blast_db_from_fasta_path,
+ build_blast_db_from_fasta_file)
+
+
+class FormatDbTests(TestCase):
+
+ def setUp(self):
+ self.in_seqs1_fp =\
+ get_tmp_filename(prefix='FormatDbTests',suffix='.fasta')
+ self.in_seqs1_file = open(self.in_seqs1_fp,'w')
+ self.in_seqs1_file.write(in_seqs1)
+ self.in_seqs1_file.close()
+ self.in_seqs1 = LoadSeqs(self.in_seqs1_fp,aligned=False)
+ self.test_seq = test_seq
+
+ self.in_aln1_fp =\
+ get_tmp_filename(prefix='FormatDbTests',suffix='.fasta')
+ self.in_aln1_file = open(self.in_aln1_fp,'w')
+ self.in_aln1_file.write(in_aln1)
+ self.in_aln1_file.close()
+ self.in_aln1 = LoadSeqs(self.in_aln1_fp)
+
+
+ self.files_to_remove = [self.in_seqs1_fp,self.in_aln1_fp]
+
+ def tearDown(self):
+ remove_files(self.files_to_remove)
+
+ def test_call(self):
+ """FormatDb: Calling on a nucleotide data functions as expected
+ """
+ fdb = FormatDb(WorkingDir='/tmp')
+ result = fdb(self.in_seqs1_fp)
+
+ # test successful run
+ self.assertEqual(result['ExitStatus'],0)
+
+ expected_result_keys = set(\
+ ['log','nhr','nin','nsd','nsi','nsq','ExitStatus','StdOut','StdErr'])
+ self.assertEqual(set(result.keys()),expected_result_keys)
+
+ inputfile_basename = split(self.in_seqs1_fp)[1]
+ # got all the expected out files, and filepaths are as expected
+ outpaths = []
+ for ext in ['log','nhr','nin','nsd','nsi','nsq']:
+ outpath = '/tmp/%s.%s' % (inputfile_basename,ext)
+ outpaths.append(outpath)
+ self.assertEqual(result[ext].name,outpath)
+ result.cleanUp()
+
+ # all created files are cleaned up
+ for outpath in outpaths:
+ self.assertFalse(exists(outpath),\
+ "%s was not cleaned up." % outpath)
+
+ def test_blast_against_new_db(self):
+ """Formatdb: blastall against a newly created DB functions as expected
+ """
+ fdb = FormatDb(WorkingDir='/tmp')
+ result = fdb(self.in_seqs1_fp)
+ blast_res = blastn(self.test_seq,blast_db=self.in_seqs1_fp)
+ result.cleanUp()
+
+ # Test that a blast result was returned
+ self.assertTrue('s1' in blast_res,\
+ "Not getting any blast results.")
+ # Test that the sequence we expect was a good blast hit
+ subject_ids = [r['SUBJECT ID'] for r in blast_res['s1'][0]]
+ self.assertTrue('11472384' in subject_ids,\
+ "Not getting expected blast results.")
+
+ def test_build_blast_db_from_seqs(self):
+ """build_blast_db_from_seqs convenience function works as expected
+ """
+ blast_db, db_files = build_blast_db_from_seqs(self.in_seqs1,output_dir='/tmp')
+ self.assertTrue(blast_db.startswith('/tmp/Blast_tmp_db'))
+ self.assertTrue(blast_db.endswith('.fasta'))
+ expected_db_files = set([blast_db + ext\
+ for ext in ['.nhr','.nin','.nsq','.nsd','.nsi','.log']])
+ self.assertEqual(set(db_files),expected_db_files)
+
+ # result returned when blasting against new db
+ self.assertEqual(\
+ len(blastn(self.test_seq,blast_db=blast_db)),1)
+
+ # Make sure all db_files exist
+ for fp in db_files:
+ self.assertTrue(exists(fp))
+
+ # Remove all db_files
+ remove_files(db_files)
+
+ # Make sure nothing weird happened in the remove
+ for fp in db_files:
+ self.assertFalse(exists(fp))
+
+ def test_build_blast_db_from_fasta_path(self):
+ """build_blast_db_from_fasta_path convenience function works as expected
+ """
+ blast_db, db_files = \
+ build_blast_db_from_fasta_path(self.in_seqs1_fp)
+ self.assertEqual(blast_db,self.in_seqs1_fp)
+ expected_db_files = set([self.in_seqs1_fp + ext\
+ for ext in ['.nhr','.nin','.nsq','.nsd','.nsi','.log']])
+ self.assertEqual(set(db_files),expected_db_files)
+
+ # result returned when blasting against new db
+ self.assertEqual(\
+ len(blastn(self.test_seq,blast_db=blast_db)),1)
+
+ # Make sure all db_files exist
+ for fp in db_files:
+ self.assertTrue(exists(fp))
+
+ # Remove all db_files
+ remove_files(db_files)
+
+ # Make sure nothing weird happened in the remove
+ for fp in db_files:
+ self.assertFalse(exists(fp))
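+
+ # Contract shared by the build_blast_db_* helpers, as exercised in
+ # these tests: each returns (blast_db, db_files), where blast_db is the
+ # path to hand to blastn(..., blast_db=...) and db_files lists the
+ # formatdb artifacts (.nhr/.nin/.nsq/.nsd/.nsi/.log) the caller must
+ # remove (the file-based variant also includes the copied fasta itself).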
+
+ def test_build_blast_db_from_fasta_path_aln(self):
+ """build_blast_db_from_fasta_path works with alignment as input
+ """
+ blast_db, db_files = build_blast_db_from_fasta_path(self.in_aln1_fp)
+ self.assertEqual(blast_db,self.in_aln1_fp)
+ expected_db_files = set([blast_db + ext\
+ for ext in ['.nhr','.nin','.nsq','.nsd','.nsi','.log']])
+ self.assertEqual(set(db_files),expected_db_files)
+ # result returned when blasting against new db
+ self.assertEqual(\
+ len(blastn(self.test_seq,blast_db=blast_db,e_value=0.0)),1)
+
+ # Make sure all db_files exist
+ for fp in db_files:
+ self.assertTrue(exists(fp))
+
+ # Remove all db_files
+ remove_files(db_files)
+
+ # Make sure nothing weird happened in the remove
+ for fp in db_files:
+ self.assertFalse(exists(fp))
+
+ def test_build_blast_db_from_fasta_file(self):
+ """build_blast_db_from_fasta_file works with open files as input
+ """
+ blast_db, db_files = \
+ build_blast_db_from_fasta_file(open(self.in_aln1_fp),output_dir='/tmp/')
+ self.assertTrue(blast_db.startswith('/tmp/BLAST_temp_db'))
+ self.assertTrue(blast_db.endswith('.fasta'))
+ expected_db_files = set([blast_db] + [blast_db + ext\
+ for ext in ['.nhr','.nin','.nsq','.nsd','.nsi','.log']])
+ self.assertEqual(set(db_files),expected_db_files)
+ # result returned when blasting against new db
+ self.assertEqual(\
+ len(blastn(self.test_seq,blast_db=blast_db,e_value=0.0)),1)
+
+ # Make sure all db_files exist
+ for fp in db_files:
+ self.assertTrue(exists(fp))
+
+ # Remove all db_files
+ remove_files(db_files)
+
+ # Make sure nothing weird happened in the remove
+ for fp in db_files:
+ self.assertFalse(exists(fp))
+
+
+in_seqs1 = """>11472286
+GATGAACGCTGGCGGCATGCTTAACACATGCAAGTCGAACGGAACACTTTGTGTTTTGAGTTAATAGTTCGATAGTAGATAGTAAATAGTGAACACTATGAACTAGTAAACTATTTAACTAGAAACTCTTAAACGCAGAGCGTTTAGTGGCGAACGGGTGAGTAATACATTGGTATCTACCTCGGAGAAGGACATAGCCTGCCGAAAGGTGGGGTAATTTCCTATAGTCCCCGCACATATTTGTTCTTAAATCTGTTAAAATGATTATATGTTTTATGTTTATTTGATAAAAAGCAGCAAGACAAATGAGTTTTATATTGGTTATACAGCAGATTTAAAAAATAGAATTAGGTCTCATAATCAGGGAGAAAACAAATCAACTAAATCTAAAATACCTTGGGAATTGGTTTACTATGAAGCCTACAAAAACCAAACATCAGCAAGGGTTAGAGAATCAAAGTTGAAACATTATGGGCAATCATTAACTAGACT [...]
+>11472384
+AGAGTTTGATCCTGGCTCAGATTGAACGCTGGCGGCATGCCTTACACATGCAAGTCGAACGGCAGCACGGGGGCAACCCTGGTGGCGAGTGGCGAACGGGTGAGTAATACATCGGAACGTGTCCTGTAGTGGGGGATAGCCCGGCGAAAGCCGGATTAATACCGCATACGCTCTACGGAGGAAAGGGGGGGATCTTAGGACCTCCCGCTACAGGGGCGGCCGATGGCAGATTAGCTAGTTGGTGGGGTAAAGGCCTACCAAGGCGACGATCTGTAGCTGGTCTGAGAGGACGACCAGCCACACTGGGACTGAGACACGGCCCAGACTCCTACGGGAGGCAGCAGTGGGGAATTTTGGACAATGGGGGCAACCCTGATCCAGCAATGCCGCGTGTGTGAAGAAGGCCTTCGGGTTGTAAAGCACTTTTGTCCGGAAAGAAAACGCCGTGGTTAATACCCGTGGCGGATGACGGTACCGGAAGAATAAGCACCG [...]
+>11468680
+TAAACTGAAGAGTTTGATCCTGGCTCAGATTGAACGCTGGCGGCATGCCTTACACATGCAAGTCGAACGGCAGCACGGGTGCTTGCACCTGGTGGCGAGTGGCGAACGGGTGAGTAATACATCGGAACATGTCCTGTAGTGGGGGATAGCCCGGCGAAAGCCGGATTAATACCGCATACGATCTACGGATGAAAGCGGGGGACCTTCGGGCCTCGCGCTATAGGGTTGGCCGATGGCTGATTAGCTAGTTGGTGGGGTAAAGGCCTACCAAGGCGACGATCAGTAGCTGGTCTGAGAGGACGACCAGCCACACTGGGACTGAGACACGGCCCAGACTCCTACGGGAGGCAGCAGTGGGGAATTTTGGACAATGGGCGAAAGCCTGATCCAGCAATGCCGCGTGTGTGAAGAAGGCCTTCGGGTTGTAAAGCACTTTTGTCCGGAAAGAAATCCTTGGCTCTAATACAGTCGGGGGATGACGGTACCGGAAGA [...]
+>11458037
+GACGAACGCTGGCGGCGTGCCTAACACATGCAAGTCGAACGGTTTCGAAGATCGGACTTCGAATTTCGAATTTCGATCATCGAGATAGTGGCGGACGGGTGAGTAACGCGTGGGTAACCTACCCATAAAGCCGGGACAACCCTTGGAAACGAGGGCTAATACCGGATAAGCTTGAGAAGTGGCATCACTTTTTAAGGAAAGGTGGCCGATGAGAATGCTGCCGATTATGGATGGACCCGCGTCTGATTAGCTGGTTGGTGGGGTAAAGGCCTACCAAGGCGACGATCAGTAGCCGGCCTGAGAGGGTGAACGGCCACACTGGGACTGAGACACGGCCCAGACTCCTACGGGAGGCAGCAGTGGGGAATCTTCCGCAATGGACGAAAGTCTGACGGAGCAACGCCGCGTGTATGATGAAGGTTTTCGGATTGTAAAGTACTGTCTATGGGGAAGAATGGTGTGCTTGAGAATATTAAGTACAAATGACGGTAC [...]
+>11469739
+AGAGTTTGATCCTGGCTCAGGATGAACGCTGGCGGCGTGCCTAACACATGCAAGTCGAACGAGAAGCTAACTTCTGATTCCTTCGGGATGATGAGGTTAGCAGAAAGTGGCGAACGGGTGAGTAACGCGTGGGTAATCTACCCTGTAAGTGGGGGATAACCCTCCGAAAGGAGGGCTAATACCGCATAATATCTTTATCCCAAAAGAGGTAAAGATTAAAGATGGCCTCTATACTATGCTATCGCTTCAGGATGAGTCCGCGTCCTATTAGTTAGTTGGTGGGGTAATGGCCTACCAAGACGACAATGGGTAGCCGGTCTGAGAGGATGTACGGCCACACTGGGACTGAGATACGGCCCAGACTCCTACGGGAGACAGCAGTGGGGAATATTGCGCAATGGGGGAAACCCTGACGCAGCGACGCCGCGTGGATGATGAAGGCCCTTGGGTTGTAAAATCCTGTTCTGGGGGAAGAAAGCTTAAAGGTCCAAT [...]
+>11469752
+AGAGTTTGATCCTGGCTCAGGATGAACGCTGGCGGCGTGCCTAATACATGCAAGTCGAGCGGCAGCGAGTTCCTCACCGAGGTTCGGAACAGTTGACAGTAAACAGTTGACAGTAAACAGTAACTTCAGAAATGAAGCGGACTGTGAACTGTTTACTGTAACCTGTTAGCTATTATTTCGAGCTTTAGTGAGGAATGTCGGCGAGCGGCGGACGGCTGAGTAACGCGTAGGAACGTACCCCAAACTGAGGGATAAGCACCAGAAATGGTGTCTAATACCGCATATGGCCCAGCACCTTTTTTAATCAACCACGACCCTAAAATCGTGAATAATTGGTAGGAAAAGGTGTTGGGTTAAAGCTTCGGCGGTTTGGGAACGGCCTGCGTATGATTAGCTTGTTGGTGAGGTAAAAGCTCACCAAGGCGACGATCATTAGCTGGTCTGAGAGGATGATCAGCCAGACTGGGACTGAGACACGGCCCAGACTCCTAC [...]
+>11460523
+AGAGTTTGATCCTGGCTCAGAACGAACGCTGGCGGCGTGCTTAACACATGCAAGTCGAACGCGAAATCGGGCACTCAATTTTGCTTTTCAAACATTAACTGATGAAACGACCAGAGAGATTGTTCCAGTTTAAAGAGTGAAAAGCAGGCTTGAGTGCCTGAGAGTAGAGTGGCGCACGGGTGAGTAACGCGTAAATAATCTACCCCTGCATCTGGGATAACCCACCGAAAGGTGAGCTAATACCGGATACGTTCTTTTAACCGCGAGGTTTTAAGAAGAAAGGTGGCCTCTGATATAAGCTACTGTGCGGGGAGGAGTTTGCGTACCATTAGCTAGTTGGTAGGGTAATGGCCTACCAAGGCATCGATGGTTAGCGGGTCTGAGAGGATGATCCGCCACACTGGAACTGGAACACGGACCAGACTCCTACGGGAGGCAGCAGTGAGGAATATTGCGCAATGGGGGCAACCCTGACGCAGCGACGCCGCGTGG [...]
+>11460543
+TGGTTTGATCCTGGCTCAGGACAAACGCTGGCGGCGTGCCTAACACATGCAAGTCGAACGAGAAGCCAGCTTTTGATTCCTTCGGGATGAGAAAGCAGGTAGAAAGTGGCGAACGGGTGAGTAACGCGTGGGTAATCTACCCTGTAAGTAGGGGATAACCCTCTGAAAAGAGGGCTAATACCGCATAATATCTTTACCCCATAAGAAGTAAAGATTAAAGATGGCCTCTGTATATGCTATCGCTTCAGGATGAGCCCGCGTCCTATTAGTTAGTTGGTAAGGTAATGGCTTACCAAGACCACGATGGGTAGCCGGTCTGAGAGGATGTACGGCCACACTGGGACTGAGATACGGCCCAGACTCCTACGGGAGGCAGCAGTGGGGAATATTGCGCAATGGGGGAAACCCTGACGCAGCGACGCCGCGTGGATGATGAAGGCCTTCGGGTTGTAAAATCCTGTTTTGGGGGACGAAACCTTAAGGGTCCAATAA [...]
+>11480235
+TGGTTTGATCCTGGCTCAGGATTAACGCTGGCGGCGCGCCTTATACATGCAAGTCGAACGAGCCTTGTGCTTCGCACAAGGAAATTCCAAGCACCAAGCACCAAATCTCAAACAAATCCCAATGACCAAAATTCCAAAAACCTAAACATTTTAAATGTTTAGAATTTGGAAAATTGGAATTTGGAATTTATTTGTTATTTGGAATTTATGATTTGGGATTTTCTCGCGCGGAGANCNTNAGTGGCGAACGGGTGAGTAATACGTTGGTATCTACCCCAAAGTAGAGAATAAGCCCGAGAAATCGGGGTTAATACTCTATGTGTTCGAAAGAACAAAGACTTCGGTTGCTTTGGGAAGAACCTGCGGCCTATCAGCTTGTTGGTAAGGTAACGGCTTACCAAGGCTTTGACGGGTAGCTGGTCTGGGAAGACGACCAGCCACAATGGGACTTAGACACGGCCCATACTCCTACGGGAGGCAGCAGTAGGGAAT [...]
+>11480408
+AATTTAGCGGCCGCGAATTCGCCCTTGAGTTTGATCCTGGCTCAGGACGAACGCTGGCGGCGTGCTTAACACATGCAAGTCGAACGGGGATATCCGAGCGGAAGGTTTCGGCCGGAAGGTTGGGTATTCGAGTGGCGGACGGGTGAGTAACGCGTGAGCAATCTGTCCCGGACAGGGGGATAACACTTGGAAACAGGTGCTAATACCGCATAAGACCACAGCATCGCATGGTGCAGGGGTAAAAGGAGCGATCCGGTCTGGGGTGAGCTCGCGTCCGATTAGATAGTTGGTGAGGTAACGGCCCACCAAGTCAACGATCGGTAGCCGACCTGAGAGGGTGATCGGCCACATTGGAACTGAGAGACGGTCCAAACTCCTACGGGAGGCAGCAGTGGGGAATATTGGGCAATGGGCGAAAGCCTGACCCAGCAACGCCGCGTGAGTGAAGAAGGCCTTCGGGTTGTAAAGCTCTGTTATGCGAGACGAAGGAAG [...]
+"""
+
+test_seq = """>s1 (11472384)
+AGAGTTTGATCCTGGCTCAGATTGAACGCTGGCGGCATGCCTTACACATGCAAGTCGAACGGCAGCACGGGGGCAACCCTGGTGGCGAGTGGCGAACGGGTGAGTAATACATCGGAACGTGTCCTGTAGTGGGGGATAGCCCGGCGAAAGCCGGATTAATACCGCATACGCTCTACGGAGGAAAGGGGGGGATCTTAGGACCTCCCGCTACAGGGGCGGCCGATGGCAGATTAGCTAGTTGGTGGGGTAAAGGCCTACCAAGGCGACGATCTGTAGCTGGTCTGAGAGGACGACCAGCCACACTGGGACTGAGACACGGCCCAGACTCCTACGGGAGGCAGCAGTGGGGAATTTTGGACAATGGGGGCAACCCTGATCCAGCAATGCCGCGTGTGTGAAGAAGGCCTTCGGGTTGTAAAGCACTTTTGTCCGGAAAGAAAACGCCGTGGTTAATACCCGTGGCGGATGACGGTACCGGAAGAATAAGCACCG [...]
+"""
+
+in_aln1 = """>a1
+AAACCTTT----TTTTAAATTCCGAAGAGTTTGATCCTGGCTCAGATTGAACGCTGGCGGCATGCCTTACACATGCAAGTCGAACGGCAGCACGGGGGCAACCCTGGTGGCGAGTGGCGAACGGGTGAGTAATACATCGGAACGTGTCCTGTAGTGGGGGATAGCCCGGCGAAAGCCGGATTAATACCGCATACGCTCTACGGAGGAAAGGGGGGGATCTTAGGACCTCCCGCTACAGGGGCGGCCGATGGCAGATTAGCTAGTTGGTGGGGTAAAGGCCTACCAAGGCGACGATCTGTAGCTGGTCTGAGAGGACGACCAGCCACACTGGGACTGAGACACGGCCCAGACTCCTACGGGAGGCAGCAGTGGGGAATTTTGGACAATGGGGGCAACCCTGATCCAGCAATGCCGCGTGTGTGAAGAAGGCCTTC
+>a2
+AAACCTTT----TTTTAAATTCCGCAGAGTTTGATCCTGGCTCAGATTGAACGCTGGCGGCATGCCTTACACATGCAAGTCGAACGGCAGCACGGGGGCAACCCTGGTGGCGAGTGGCGAACGGGTGAGTAATACATCGGAACGTGTCCTGTAGTGGGGGATAGCCCGGCGAAAGCCGGATTAATACCGCATACGCTCTACGGAGGAAAGGGGGGGATCTTAGGACCTCCCGCTACAGGGGCGGCCGATGGCAGATTAGCTAGTTGGTGGGGTAAAGGCCTACCAAGGCGACGATCTGTAGCTGGTCTGAGAGGACGACCAGCCACACTGGGACTGAGACACGGCCCAGACTCCTACGGGAGGCAGCAGTGGGGAATTTTGGACAATGGGGGCAACCCTGATCCAGCAATGCCGCGTGTGTGAAGAAGGCCTTC
+>a3
+AAACCTTT----TTTTAAATTCCGGAGAGTTTGATCCTGGCTCAGATTGAACGCTGGCGGCATGCCTTACACATGCAAGTCGAACGGCAGCACGGGGGCAACCCTGGTGGCGAGTGGCGAACGGGTGAGTAATACATCGGAACGTGTCCTGTAGTGGGGGATAGCCCGGCGAAAGCCGGATTAATACCGCATACGCTCTACGGAGGAAAGGGGGGGATCTTAGGACCTCCCGCTACAGGGGCGGCCGATGGCAGATTAGCTAGTTGGTGGGGTAAAGGCCTACCAAGGCGACGATCTGTAGCTGGTCTGAGAGGACGACCAGCCACACTGGGACTGAGACACGGCCCAGACTCCTACGGGAGGCAGCAGTGGGGAATTTTGGACAATGGGGGCAACCCTGATCCAGCAATGCCGCGTGTGTGAAGAAGGCCTTC
+>a4
+AAACCTTT----TTTTAAATTCCGTAGAGTTTGATCCTGGCTCAGATTGAACGCTGGCGGCATGCCTTACACATGCAAGTCGAACGGCAGCACGGGGGCAACCCTGGTGGCGAGTGGCGAACGGGTGAGTAATACATCGGAACGTGTCCTGTAGTGGGGGATAGCCCGGCGAAAGCCGGATTAATACCGCATACGCTCTACGGAGGAAAGGGGGGGATCTTAGGACCTCCCGCTACAGGGGCGGCCGATGGCAGATTAGCTAGTTGGTGGGGTAAAGGCCTACCAAGGCGACGATCTGTAGCTGGTCTGAGAGGACGACCAGCCACACTGGGACTGAGACACGGCCCAGACTCCTACGGGAGGCAGCAGTGGGGAATTTTGGACAATGGGGGCAACCCTGATCCAGCAATGCCGCGTGTGTGAAGAAGGCCTTC
+"""
+
+
+if __name__ == "__main__":
+ main()
diff --git a/bfillings/tests/test_infernal.py b/bfillings/tests/test_infernal.py
new file mode 100644
index 0000000..bc8d849
--- /dev/null
+++ b/bfillings/tests/test_infernal.py
@@ -0,0 +1,620 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+
+from os import getcwd, remove, rmdir, mkdir, path
+import tempfile
+import shutil
+from unittest import TestCase, main
+
+from cogent.util.misc import flatten
+from cogent.core.moltype import DNA, RNA, PROTEIN
+from cogent.core.alignment import DataError
+from cogent.parse.rfam import (MinimalRfamParser, ChangedRnaSequence,
+ ChangedSequence)
+from cogent.format.stockholm import stockholm_from_alignment
+from cogent.struct.rna2d import ViennaStructure, wuss_to_vienna
+
+from bfillings.infernal import (Cmalign, Cmbuild, Cmcalibrate, Cmemit, Cmscore,
+ Cmsearch, Cmstat, cmbuild_from_alignment,
+ cmbuild_from_file, cmalign_from_alignment,
+ cmalign_from_file, cmsearch_from_alignment,
+ cmsearch_from_file)
+
+
+class GeneralSetUp(TestCase):
+
+ def setUp(self):
+ """Infernal general setUp method for all tests"""
+ self.seqs1_unaligned = {'1':'ACUGCUAGCUAGUAGCGUACGUA',\
+ '2':'GCUACGUAGCUAC',\
+ '3':'GCGGCUAUUAGAUCGUA'}
+ self.struct1_unaligned_string = '....(((...)))....'
+ self.seqs1_unaligned_gaps = {'1':'ACUGCUAGCUAGU-AGCGUAC--GUA',\
+ '2':'--GCUACGUAGCUAC',\
+ '3':'GCGGCUAUUAGAUCGUA--'}
+
+
+
+ self.seqs2_aligned = {'a': 'UAGGCUCUGAUAUAAUAGCUCUC---------',\
+ 'c': '------------UGACUACGCAU---------',\
+ 'b': '----UAUCGCUUCGACGAUUCUCUGAUAGAGA'}
+
+ self.seqs2_unaligned = {'a': 'UAGGCUCUGAUAUAAUAGCUCUC',\
+ 'c': 'UGACUACGCAU',\
+ 'b': 'UAUCGCUUCGACGAUUCUCUGAUAGAGA'}
+
+ self.struct2_aligned_string = '............((.(...)))..........'
+ self.struct2_aligned_dict = {'SS_cons':self.struct2_aligned_string}
+
+ self.lines2 = stockholm_from_alignment(aln=self.seqs2_aligned,\
+ GC_annotation=self.struct2_aligned_dict)
+
+ #self.seqs1 aligned to self.seqs2 with self.seqs2 included.
+ self.seqs1_and_seqs2_aligned = \
+ {'a': 'UAGGCUCUGAUAUAAUAGC-UCUC---------',\
+ 'b': '----UAUCGCUUCGACGAU-UCUCUGAUAGAGA',\
+ 'c': '------------UGACUAC-GCAU---------',\
+ '1': '-ACUGCUAGCUAGUAGCGUACGUA---------',\
+ '2': '----------GCUACGUAG-CUAC---------',\
+ '3': '-----GCGGCUAUUAG-AU-CGUA---------',\
+ }
+
+ self.seqs1_and_seqs2_aligned_struct_string = \
+ '............((.(....)))..........'
+
+ #self.seqs1 aligned to self.seqs2 without self.seqs2 included.
+ self.seqs1_aligned = \
+ {'1': 'ACUGCUAGCUAGUAGCGUACGUA',\
+ '2': '---------GCUACGUAG-CUAC',\
+ '3': '----GCGGCUAUUAG-AU-CGUA',\
+ }
+
+ self.seqs1_aligned_struct_string = \
+ '...........((.(....))).'
+
+ self.temp_dir = tempfile.mkdtemp()
+ self.temp_dir_spaces = '/tmp/test for infernal/'
+ try:
+ mkdir(self.temp_dir_spaces)
+ except OSError:
+ pass
+ try:
+ #create sequence files
+ f = open(path.join(self.temp_dir, 'seqs1.sto'),'w')
+ f.write(self.lines2)
+ f.close()
+ #create cm file.
+ self.cmfile = path.join(self.temp_dir, 'aln2.cm')
+ cm = open(self.cmfile,'w')
+ cm.write(ALN1_CM)
+ cm.close()
+ #create alignment file used to create cm file.
+ self.aln2_file = path.join(self.temp_dir, 'aln2.sto')
+ af = open(self.aln2_file,'w')
+ af.write(self.lines2)
+ af.close()
+ except OSError:
+ pass
+
+
+class CmalignTests(GeneralSetUp):
+ """Tests for the Cmalign application controller"""
+
+ def test_base_command(self):
+ """Infernal BaseCommand should return the correct BaseCommand"""
+ c = Cmalign()
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "',getcwd(),'/"; ','cmalign']))
+ c.Parameters['-l'].on()
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "',getcwd(),'/"; ','cmalign -l']))
+
+
+ def test_changing_working_dir(self):
+ """Infernal BaseCommand should change according to WorkingDir"""
+ c = Cmalign(WorkingDir='/tmp/cmalign_test')
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "','/tmp/cmalign_test','/"; ','cmalign']))
+ c = Cmalign()
+ c.WorkingDir = '/tmp/cmalign_test2'
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "','/tmp/cmalign_test2','/"; ','cmalign']))
+
+ #removing the dirs is proof that they were created by setting WorkingDir;
+ #if the dirs are not there, an OSError will be raised
+ rmdir('/tmp/cmalign_test')
+ rmdir('/tmp/cmalign_test2')
+
+ def test_general_cleanUp(self):
+ """Last test executed: cleans up all files initially created"""
+ # remove the tempdir and contents
+ shutil.rmtree(self.temp_dir)
+ shutil.rmtree(self.temp_dir_spaces)
+
+ def test_cmalign_from_alignment(self):
+ """cmalign_from_alignment should work as expected.
+ """
+ #Align with cmalign_from_alignment without original alignment.
+ aln, struct = cmalign_from_alignment(aln=self.seqs2_aligned,\
+ structure_string=self.struct2_aligned_string,\
+ seqs=self.seqs1_unaligned_gaps,moltype=RNA,include_aln=False)
+ #Check correct alignment
+ self.assertEqual(aln.todict(),self.seqs1_aligned)
+ #Check correct struct
+ self.assertEqual(wuss_to_vienna(str(struct)),\
+ self.seqs1_aligned_struct_string)
+
+ #should work with gapped seqs. Need to test that this case is handled,
+ # since cmalign segfaults when there are gaps in the seqs to be aligned.
+ aln, struct = cmalign_from_alignment(aln=self.seqs2_aligned,\
+ structure_string=self.struct2_aligned_string,\
+ seqs=self.seqs1_unaligned_gaps,moltype=RNA)
+ #alignment should be correct
+ self.assertEqual(aln.todict(),self.seqs1_and_seqs2_aligned)
+ #structure should be correct
+ self.assertEqual(wuss_to_vienna(str(struct)),\
+ self.seqs1_and_seqs2_aligned_struct_string)
+
+ #should work with ungapped seqs.
+ aln, struct = cmalign_from_alignment(aln=self.seqs2_aligned,\
+ structure_string=self.struct2_aligned_string,\
+ seqs=self.seqs1_unaligned,moltype=RNA)
+ #alignment should be correct
+ self.assertEqual(aln.todict(),self.seqs1_and_seqs2_aligned)
+ #structure should be correct
+ self.assertEqual(wuss_to_vienna(str(struct)),\
+ self.seqs1_and_seqs2_aligned_struct_string)
+
+ #should return standard out
+ aln, struct,stdout = cmalign_from_alignment(aln=self.seqs2_aligned,\
+ structure_string=self.struct2_aligned_string,\
+ seqs=self.seqs1_unaligned_gaps,moltype=RNA,\
+ return_stdout=True)
+ #Test that standard out is same length as expected
+ self.assertEqual(len(stdout.split('\n')),\
+ len(CMALIGN_STDOUT.split('\n')))
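+
+ # Keyword summary for cmalign_from_alignment as used above (names taken
+ # from these calls; an illustration, not the full API): aln and
+ # structure_string give the template, seqs the sequences to align,
+ # include_aln=False drops the template from the output, and
+ # return_stdout=True additionally returns cmalign's stdout.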
+
+ def test_cmalign_from_file(self):
+ """cmalign_from_file should work as expected.
+ """
+ #Align with cmalign_from_file without original alignment.
+ aln,struct = cmalign_from_file(cm_file_path=self.cmfile,\
+ seqs=self.seqs1_unaligned,\
+ moltype=RNA)
+ #Check correct alignment
+ self.assertEqual(aln.todict(),self.seqs1_aligned)
+ #Check correct struct
+ self.assertEqual(wuss_to_vienna(str(struct)),\
+ self.seqs1_aligned_struct_string)
+
+ #Align with cmalign_from_file using original alignment.
+ aln,struct = cmalign_from_file(cm_file_path=self.cmfile,\
+ seqs=self.seqs1_unaligned,\
+ moltype=RNA,\
+ alignment_file_path=self.aln2_file,\
+ include_aln=True)
+ #alignment should be correct
+ self.assertEqual(aln.todict(),self.seqs1_and_seqs2_aligned)
+ #structure should be correct
+ self.assertEqual(wuss_to_vienna(str(struct)),\
+ self.seqs1_and_seqs2_aligned_struct_string)
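+
+ # cmalign_from_file mirrors the alignment variant but starts from a CM
+ # file on disk; passing alignment_file_path together with
+ # include_aln=True merges the original alignment into the result, as
+ # the second call above shows.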
+
+
+class CmbuildTests(GeneralSetUp):
+ """Tests for the Cmbuild application controller"""
+
+ def test_base_command(self):
+ """Infernal BaseCommand should return the correct BaseCommand"""
+ c = Cmbuild()
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "',getcwd(),'/"; ','cmbuild']))
+ c.Parameters['-A'].on()
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "',getcwd(),'/"; ','cmbuild -A']))
+
+ def test_changing_working_dir(self):
+ """Infernal BaseCommand should change according to WorkingDir"""
+ c = Cmbuild(WorkingDir='/tmp/cmbuild_test')
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "','/tmp/cmbuild_test','/"; ','cmbuild']))
+ c = Cmbuild()
+ c.WorkingDir = '/tmp/cmbuild_test2'
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "','/tmp/cmbuild_test2','/"; ','cmbuild']))
+
+ #removing the dirs is proof that they were created by setting WorkingDir;
+ #if the dirs are not there, an OSError will be raised
+ rmdir('/tmp/cmbuild_test')
+ rmdir('/tmp/cmbuild_test2')
+
+ def test_general_cleanUp(self):
+ """Last test executed: cleans up all files initially created"""
+ # remove the tempdir and contents
+ shutil.rmtree(self.temp_dir)
+ shutil.rmtree(self.temp_dir_spaces)
+
+ def test_cmbuild_from_alignment(self):
+ """cmbuild_from_alignment should work as expected.
+ """
+ #Test unaligned seqs and unaligned struct fail.
+ #DataError should be raised when the Alignment is constructed
+ self.assertRaises(DataError,cmbuild_from_alignment,\
+ self.seqs1_unaligned,self.struct1_unaligned_string)
+
+ #Test aligned seqs and unaligned struct fail.
+ self.assertRaises(ValueError,cmbuild_from_alignment,\
+ self.seqs2_aligned,self.struct1_unaligned_string)
+
+ #Test get cm back without alignment.
+ cm_res = cmbuild_from_alignment(self.seqs2_aligned,\
+ self.struct2_aligned_string)
+ cm_lines = cm_res.split('\n')
+ ALN1_CM_lines = ALN1_CM.split('\n')
+ #Check that the same number of lines are in both CMs
+ self.assertEqual(len(cm_lines),len(ALN1_CM_lines))
+
+ #The first 13 lines are unique to the specific run. The rest of the
+ # CM should be the same, since it was built from the same data.
+ self.assertEqual(cm_lines[13:],ALN1_CM_lines[13:])
+
+ #Make sure same alignment is returned if return_alignment=True
+ cm_res, cm_aln = cmbuild_from_alignment(self.seqs2_aligned,\
+ self.struct2_aligned_string,return_alignment=True)
+ self.assertEqual(cm_aln,self.lines2)
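+
+ # cmbuild_from_alignment returns the CM file contents as a string;
+ # with return_alignment=True it returns (cm_string, stockholm_aln)
+ # instead, which is what the final assertion checks.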
+
+ def test_cmbuild_from_file(self):
+ """cmbuild_from_file should work as expected.
+ """
+ cm_res = cmbuild_from_file(self.temp_dir+'/seqs1.sto')
+ cm_lines = cm_res.split('\n')
+ ALN1_CM_lines = ALN1_CM.split('\n')
+ #Check that the same number of lines are in both CMs
+ self.assertEqual(len(cm_lines),len(ALN1_CM_lines))
+
+ #The first 13 lines are unique to the specific run. The rest of the
+ # CM should be the same, since it was built from the same data.
+ self.assertEqual(cm_lines[13:],ALN1_CM_lines[13:])
+
+ #Make sure same alignment is returned if return_alignment=True
+ cm_res, cm_aln = cmbuild_from_alignment(self.seqs2_aligned,\
+ self.struct2_aligned_string,return_alignment=True)
+ self.assertEqual(cm_aln,self.lines2)
+
+class CmcalibrateTests(GeneralSetUp):
+ """Tests for the Cmcalibrate application controller"""
+
+ def test_base_command(self):
+ """Infernal BaseCommand should return the correct BaseCommand"""
+ c = Cmcalibrate()
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "',getcwd(),'/"; ','cmcalibrate']))
+ c.Parameters['--mpi'].on()
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "',getcwd(),'/"; ','cmcalibrate --mpi']))
+
+
+ def test_changing_working_dir(self):
+ """Infernal BaseCommand should change according to WorkingDir"""
+ c = Cmcalibrate(WorkingDir='/tmp/cmcalibrate_test')
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "','/tmp/cmcalibrate_test','/"; ','cmcalibrate']))
+ c = Cmcalibrate()
+ c.WorkingDir = '/tmp/cmcalibrate_test2'
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "','/tmp/cmcalibrate_test2','/"; ','cmcalibrate']))
+
+ #removing the dirs is proof that they were created by setting WorkingDir;
+ #if the dirs are not there, an OSError will be raised
+ rmdir('/tmp/cmcalibrate_test')
+ rmdir('/tmp/cmcalibrate_test2')
+
+ def test_general_cleanUp(self):
+ """Last test executed: cleans up all files initially created"""
+ # remove the tempdir and contents
+ shutil.rmtree(self.temp_dir)
+ shutil.rmtree(self.temp_dir_spaces)
+
+class CmemitTests(GeneralSetUp):
+ """Tests for the Cmemit application controller"""
+
+ def test_base_command(self):
+ """Infernal BaseCommand should return the correct BaseCommand"""
+ c = Cmemit()
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "',getcwd(),'/"; ','cmemit']))
+ c.Parameters['-u'].on()
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "',getcwd(),'/"; ','cmemit -u']))
+
+
+ def test_changing_working_dir(self):
+ """Infernal BaseCommand should change according to WorkingDir"""
+ c = Cmemit(WorkingDir='/tmp/cmemit_test')
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "','/tmp/cmemit_test','/"; ','cmemit']))
+ c = Cmemit()
+ c.WorkingDir = '/tmp/cmemit_test2'
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "','/tmp/cmemit_test2','/"; ','cmemit']))
+
+ #removing the dirs is proof that they were created by setting WorkingDir;
+ #if the dirs are not there, an OSError will be raised
+ rmdir('/tmp/cmemit_test')
+ rmdir('/tmp/cmemit_test2')
+
+ def test_general_cleanUp(self):
+ """Last test executed: cleans up all files initially created"""
+ # remove the tempdir and contents
+ shutil.rmtree(self.temp_dir)
+ shutil.rmtree(self.temp_dir_spaces)
+
+class CmscoreTests(GeneralSetUp):
+ """Tests for the Cmscore application controller"""
+
+ def test_base_command(self):
+ """Infernal BaseCommand should return the correct BaseCommand"""
+ c = Cmscore()
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "',getcwd(),'/"; ','cmscore']))
+ c.Parameters['-l'].on()
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "',getcwd(),'/"; ','cmscore -l']))
+
+
+ def test_changing_working_dir(self):
+ """Infernal BaseCommand should change according to WorkingDir"""
+ c = Cmscore(WorkingDir='/tmp/cmscore_test')
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "','/tmp/cmscore_test','/"; ','cmscore']))
+ c = Cmscore()
+ c.WorkingDir = '/tmp/cmscore_test2'
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "','/tmp/cmscore_test2','/"; ','cmscore']))
+
+ #removing the dirs is proof that they were created by setting WorkingDir;
+ #if the dirs are not there, an OSError will be raised
+ rmdir('/tmp/cmscore_test')
+ rmdir('/tmp/cmscore_test2')
+
+ def test_general_cleanUp(self):
+ """Last test executed: cleans up all files initially created"""
+ # remove the tempdir and contents
+ shutil.rmtree(self.temp_dir)
+ shutil.rmtree(self.temp_dir_spaces)
+
+
+class CmsearchTests(GeneralSetUp):
+ """Tests for the Cmsearch application controller"""
+
+ def test_base_command(self):
+ """Infernal BaseCommand should return the correct BaseCommand"""
+ c = Cmsearch()
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "',getcwd(),'/"; ','cmsearch']))
+ c.Parameters['-p'].on()
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "',getcwd(),'/"; ','cmsearch -p']))
+
+
+ def test_changing_working_dir(self):
+ """Infernal BaseCommand should change according to WorkingDir"""
+ c = Cmsearch(WorkingDir='/tmp/cmsearch_test')
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "','/tmp/cmsearch_test','/"; ','cmsearch']))
+ c = Cmsearch()
+ c.WorkingDir = '/tmp/cmsearch_test2'
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "','/tmp/cmsearch_test2','/"; ','cmsearch']))
+
+ #removing the dirs is proof that they were created by setting WorkingDir;
+ #if the dirs are not there, an OSError will be raised
+ rmdir('/tmp/cmsearch_test')
+ rmdir('/tmp/cmsearch_test2')
+
+ def test_general_cleanUp(self):
+ """Last test executed: cleans up all files initially created"""
+ # remove the tempdir and contents
+ shutil.rmtree(self.temp_dir)
+ shutil.rmtree(self.temp_dir_spaces)
+
+ def test_cmsearch_from_alignment_no_hits(self):
+ """cmsearch_from_alignment should work as expected
+ """
+ search_res = cmsearch_from_alignment(aln=self.seqs2_aligned,\
+ structure_string=self.struct2_aligned_string,\
+ seqs=self.seqs1_unaligned,moltype=RNA)
+ self.assertEqual(search_res,[])
+
+ def test_cmsearch_from_alignment(self):
+ """cmsearch_from_alignment should work as expected
+ """
+ exp_search_res = [['a', 5, 23, 1, 19, 12.85, '-', 37],\
+ ['b', 1, 19, 1, 19, 14.359999999999999, '-', 47]]
+ search_res = cmsearch_from_alignment(aln=self.seqs2_aligned,\
+ structure_string=self.struct2_aligned_string,\
+ seqs=self.seqs2_unaligned,moltype=RNA)
+ for search, exp in zip(search_res, exp_search_res):
+ self.assertEqual(search[1:],exp)
+
+ def test_cmsearch_from_file_no_hits(self):
+ """cmsearch_from_file should work as expected
+ """
+ search_res = cmsearch_from_file(cm_file_path=self.cmfile,\
+ seqs=self.seqs1_unaligned,moltype=RNA)
+ self.assertEqual(search_res,[])
+
+ def test_cmsearch_from_file(self):
+ """cmsearch_from_file should work as expected
+ """
+ exp_search_res = [['a', 5, 23, 1, 19, 12.85, '-', 37],\
+ ['b', 1, 19, 1, 19, 14.359999999999999, '-', 47]]
+ search_res = cmsearch_from_file(cm_file_path=self.cmfile,\
+ seqs=self.seqs2_unaligned,moltype=RNA)
+ for search, exp in zip(search_res, exp_search_res):
+ self.assertEqual(search[1:],exp)
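+
+ # Layout of a cmsearch hit row after the leading field is sliced off,
+ # read off the expected values above (a best-effort interpretation):
+ # target name, target start/stop, query start/stop, bit score, an
+ # E-value placeholder ('-' for an uncalibrated CM) and GC percent.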
+
+class CmstatTests(GeneralSetUp):
+ """Tests for the Cmstat application controller"""
+
+ def test_base_command(self):
+ """Infernal BaseCommand should return the correct BaseCommand"""
+ c = Cmstat()
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "',getcwd(),'/"; ','cmstat']))
+ c.Parameters['-g'].on()
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "',getcwd(),'/"; ','cmstat -g']))
+
+
+ def test_changing_working_dir(self):
+ """Infernal BaseCommand should change according to WorkingDir"""
+ c = Cmstat(WorkingDir='/tmp/cmstat_test')
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "','/tmp/cmstat_test','/"; ','cmstat']))
+ c = Cmstat()
+ c.WorkingDir = '/tmp/cmstat_test2'
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "','/tmp/cmstat_test2','/"; ','cmstat']))
+
+ #removing the dirs is proof that they were created by setting WorkingDir;
+ #if the dirs are not there, an OSError will be raised
+ rmdir('/tmp/cmstat_test')
+ rmdir('/tmp/cmstat_test2')
+
+ def test_general_cleanUp(self):
+ """Last test executed: cleans up all files initially created"""
+ # remove the tempdir and contents
+ shutil.rmtree(self.temp_dir)
+ shutil.rmtree(self.temp_dir_spaces)
+
+ALN1_CM = """INFERNAL-1 [1.0rc1]
+NAME aln1-1
+STATES 61
+NODES 18
+ALPHABET 1
+ELSELF -0.08926734
+WBETA 1e-07
+NSEQ 3
+EFFNSEQ 3.000
+CLEN 19
+BCOM cmbuild aln1.cm aln1.sto
+BDATE Sun Oct 5 18:45:35 2008
+NULL 0.000 0.000 0.000 0.000
+MODEL:
+ [ ROOT 0 ]
+ S 0 -1 0 1 4 -2.071 -2.210 -1.649 -2.140
+ IL 1 1 2 1 4 -0.556 -5.022 -1.818 -7.508 0.000 0.000 0.000 0.000
+ IR 2 2 3 2 3 -0.310 -2.439 -6.805 0.000 0.000 0.000 0.000
+ [ MATL 1 ]
+ ML 3 2 3 5 3 -8.003 -0.020 -6.657 -0.389 0.377 -1.236 0.597
+ D 4 2 3 5 3 -7.923 -3.436 -0.146
+ IL 5 5 3 5 3 -1.442 -0.798 -4.142 0.000 0.000 0.000 0.000
+ [ MATL 2 ]
+ ML 6 5 3 8 3 -8.003 -0.020 -6.657 0.711 -1.015 -1.162 0.507
+ D 7 5 3 8 3 -7.923 -3.436 -0.146
+ IL 8 8 3 8 3 -1.442 -0.798 -4.142 0.000 0.000 0.000 0.000
+ [ MATL 3 ]
+ ML 9 8 3 11 3 -8.003 -0.020 -6.657 -0.389 0.377 -1.236 0.597
+ D 10 8 3 11 3 -7.923 -3.436 -0.146
+ IL 11 11 3 11 3 -1.442 -0.798 -4.142 0.000 0.000 0.000 0.000
+ [ MATL 4 ]
+ ML 12 11 3 14 3 -8.003 -0.020 -6.657 -0.392 0.246 -1.238 0.703
+ D 13 11 3 14 3 -7.923 -3.436 -0.146
+ IL 14 14 3 14 3 -1.442 -0.798 -4.142 0.000 0.000 0.000 0.000
+ [ MATL 5 ]
+ ML 15 14 3 17 3 -8.003 -0.020 -6.657 -1.340 -2.411 1.644 -1.777
+ D 16 14 3 17 3 -7.923 -3.436 -0.146
+ IL 17 17 3 17 3 -1.442 -0.798 -4.142 0.000 0.000 0.000 0.000
+ [ MATL 6 ]
+ ML 18 17 3 20 3 -8.003 -0.020 -6.657 0.830 0.106 -1.204 -0.492
+ D 19 17 3 20 3 -7.923 -3.436 -0.146
+ IL 20 20 3 20 3 -1.442 -0.798 -4.142 0.000 0.000 0.000 0.000
+ [ MATL 7 ]
+ ML 21 20 3 23 3 -8.003 -0.020 -6.657 -1.143 -1.575 -1.925 1.560
+ D 22 20 3 23 3 -7.923 -3.436 -0.146
+ IL 23 23 3 23 3 -1.442 -0.798 -4.142 0.000 0.000 0.000 0.000
+ [ MATL 8 ]
+ ML 24 23 3 26 3 -8.391 -0.018 -6.709 0.821 -1.044 -1.178 0.385
+ D 25 23 3 26 3 -6.905 -0.258 -2.688
+ IL 26 26 3 26 3 -1.925 -0.554 -4.164 0.000 0.000 0.000 0.000
+ [ MATR 9 ]
+ MR 27 26 3 29 5 -7.411 -0.031 -7.227 -7.439 -8.330 -0.726 0.967 -1.567 0.142
+ D 28 26 3 29 5 -5.352 -0.707 -2.978 -4.409 -2.404
+ IR 29 29 3 29 5 -2.408 -0.496 -5.920 -4.087 -5.193 0.000 0.000 0.000 0.000
+ [ MATP 10 ]
+ MP 30 29 3 34 6 -9.266 -9.205 -0.019 -7.982 -8.261 -8.656 -1.570 -1.865 -1.898 0.327 -1.331 -2.318 0.651 0.994 -1.872 0.282 -2.224 -0.666 1.972 -1.608 -0.242 1.187
+ ML 31 29 3 34 6 -6.250 -6.596 -1.310 -1.005 -6.446 -3.975 0.660 -0.612 -0.293 -0.076
+ MR 32 29 3 34 6 -6.988 -5.717 -1.625 -5.695 -0.829 -3.908 0.660 -0.612 -0.293 -0.076
+ D 33 29 3 34 6 -9.049 -7.747 -3.544 -4.226 -4.244 -0.319
+ IL 34 34 5 34 6 -2.579 -2.842 -0.760 -4.497 -5.274 -4.934 0.000 0.000 0.000 0.000
+ IR 35 35 6 35 5 -2.408 -0.496 -5.920 -4.087 -5.193 0.000 0.000 0.000 0.000
+ [ MATP 11 ]
+ MP 36 35 6 40 4 -7.331 -7.538 -0.041 -5.952 -4.114 0.397 -4.664 0.815 -4.665 -4.015 -0.462 -4.315 -3.939 3.331 -3.732 -0.830 -0.398 -3.640 -1.958 -3.517
+ ML 37 35 6 40 4 -3.758 -3.940 -0.507 -2.670 0.660 -0.612 -0.293 -0.076
+ MR 38 35 6 40 4 -4.809 -3.838 -1.706 -0.766 0.660 -0.612 -0.293 -0.076
+ D 39 35 6 40 4 -4.568 -4.250 -2.265 -0.520
+ IL 40 40 5 40 4 -1.686 -2.369 -1.117 -4.855 0.000 0.000 0.000 0.000
+ IR 41 41 6 41 3 -1.442 -0.798 -4.142 0.000 0.000 0.000 0.000
+ [ MATL 12 ]
+ ML 42 41 6 44 5 -7.411 -0.031 -7.227 -7.439 -8.330 1.826 -2.947 -2.856 -2.413
+ D 43 41 6 44 5 -4.959 -0.803 -4.221 -2.596 -2.508
+ IL 44 44 3 44 5 -2.408 -0.496 -4.087 -5.920 -5.193 0.000 0.000 0.000 0.000
+ [ MATP 13 ]
+ MP 45 44 3 49 4 -7.331 -7.538 -0.041 -5.952 -1.592 -1.722 -1.807 0.471 -1.387 -2.146 1.822 0.774 -1.836 0.505 -2.076 -0.521 1.055 -1.515 -0.260 0.958
+ ML 46 44 3 49 4 -3.758 -3.940 -0.507 -2.670 0.660 -0.612 -0.293 -0.076
+ MR 47 44 3 49 4 -4.809 -3.838 -1.706 -0.766 0.660 -0.612 -0.293 -0.076
+ D 48 44 3 49 4 -4.568 -4.250 -2.265 -0.520
+ IL 49 49 5 49 4 -1.686 -2.369 -1.117 -4.855 0.000 0.000 0.000 0.000
+ IR 50 50 6 50 3 -1.442 -0.798 -4.142 0.000 0.000 0.000 0.000
+ [ MATL 14 ]
+ ML 51 50 6 53 3 -8.323 -0.016 -6.977 0.481 -1.091 -0.011 0.192
+ D 52 50 6 53 3 -6.174 -1.687 -0.566
+ IL 53 53 3 53 3 -1.442 -0.798 -4.142 0.000 0.000 0.000 0.000
+ [ MATL 15 ]
+ ML 54 53 3 56 3 -8.323 -0.016 -6.977 1.148 -1.570 -0.075 -1.007
+ D 55 53 3 56 3 -6.174 -1.687 -0.566
+ IL 56 56 3 56 3 -1.442 -0.798 -4.142 0.000 0.000 0.000 0.000
+ [ MATL 16 ]
+ ML 57 56 3 59 2 * 0.000 -0.726 0.967 -1.567 0.142
+ D 58 56 3 59 2 * 0.000
+ IL 59 59 3 59 2 -1.823 -0.479 0.000 0.000 0.000 0.000
+ [ END 17 ]
+ E 60 59 3 -1 0
+//
+"""
+
+CMALIGN_STDOUT = """# cmalign :: align sequences to an RNA CM
+# INFERNAL 1.0rc1 (June 2008)
+# Copyright 2007-2009 (C) 2008 HHMI Janelia Farm Research Campus
+# Freely distributed under the GNU General Public License (GPL)
+# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+# command: cmalign --withali aln1.sto -o all_aligned.sto aln1.cm seqs1.fasta
+# date: Sun Oct 5 22:04:30 2008
+#
+# cm name algorithm config sub bands tau
+# ------------------------- --------- ------ --- ----- ------
+# aln1-1 opt acc global no hmm 1e-07
+#
+# bit scores
+# ------------------
+# seq idx seq name len total struct avg prob elapsed
+# ------- -------- ----- -------- -------- -------- -----------
+ 1 1 23 -9.98 5.71 0.260 00:00:00.01
+ 2 2 13 -6.79 6.73 0.710 00:00:00.00
+ 3 3 17 -7.43 5.86 0.754 00:00:00.01
+
+# Alignment saved in file all_aligned.sto.
+#
+# CPU time: 0.02u 0.00s 00:00:00.02 Elapsed: 00:00:00
+"""
+
+
+if __name__ == '__main__':
+ main()
diff --git a/bfillings/tests/test_mafft.py b/bfillings/tests/test_mafft.py
new file mode 100644
index 0000000..d4ca8db
--- /dev/null
+++ b/bfillings/tests/test_mafft.py
@@ -0,0 +1,132 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+
+from os import getcwd, remove, rmdir, mkdir, path
+import tempfile
+import shutil
+from unittest import TestCase, main
+
+from cogent.core.moltype import RNA
+from cogent.util.misc import flatten
+from bfillings.mafft import (Mafft, align_unaligned_seqs, add_seqs_to_alignment,
+ align_two_alignments)
+
+
+class GeneralSetUp(TestCase):
+
+ def setUp(self):
+ """Mafft general setUp method for all tests"""
+ self.seqs1 = ['ACUGCUAGCUAGUAGCGUACGUA','GCUACGUAGCUAC',
+ 'GCGGCUAUUAGAUCGUA']
+
+ self.labels1 = ['>1','>2','>3']
+ self.lines1 = flatten(zip(self.labels1,self.seqs1))
+
+ self.aligned1 = {'1': 'acugcuagcuaguagcguacgua',\
+ '2': 'gcuacguagcuac----------',\
+ '3': 'gcggcuauuagau------cgua',\
+ }
+
+
+ self.seqs2=['UAGGCUCUGAUAUAAUAGCUCUC','UAUCGCUUCGACGAUUCUCUGAUAGAGA',
+ 'UGACUACGCAU']
+ self.labels2=['>a','>b','>c']
+ self.lines2 = flatten(zip(self.labels2,self.seqs2))
+
+ self.aligned2 = {'a': 'UAGGCUCUGAUAUAAUAGCUCUC---------',\
+ 'b': 'UA----UCGCUUCGACGAUUCUCUGAUAGAGA',\
+ 'c': 'UG------------ACUACGCAU---------',\
+ }
+
+
+ self.temp_dir = tempfile.mkdtemp()
+ self.temp_dir_spaces = '/tmp/test for mafft/'
+ try:
+ mkdir(self.temp_dir_spaces)
+ except OSError:
+ pass
+ try:
+ #create sequence files
+ f = open(path.join(self.temp_dir, 'seq1.txt'),'w')
+ f.write('\n'.join(self.lines1))
+ f.close()
+ g = open(path.join(self.temp_dir, 'seq2.txt'),'w')
+ g.write('\n'.join(self.lines2))
+ g.close()
+ except OSError:
+ pass
+
+
+class MafftTests(GeneralSetUp):
+ """Tests for the Mafft application controller"""
+
+ def test_base_command(self):
+ """Mafft BaseCommand should return the correct BaseCommand"""
+ c = Mafft()
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "',getcwd(),'/"; ','mafft']))
+ c.Parameters['--quiet'].on()
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "',getcwd(),'/"; ','mafft --quiet']))
+ c.Parameters['--globalpair'].on()
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "',getcwd(),'/"; ','mafft --globalpair --quiet']))
+ c.Parameters['--maxiterate'].on(1000)
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "',getcwd(),'/"; ',"""mafft --maxiterate 1000 --globalpair --quiet"""]))
+
+ def test_changing_working_dir(self):
+ """Mafft BaseCommand should change according to WorkingDir"""
+ c = Mafft(WorkingDir='/tmp/mafft_test')
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "','/tmp/mafft_test','/"; ','mafft']))
+ c = Mafft()
+ c.WorkingDir = '/tmp/mafft_test2'
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "','/tmp/mafft_test2','/"; ','mafft']))
+
+ #removing the dirs is proof that they were created by setting WorkingDir;
+ #if the dirs are not there, an OSError will be raised
+ rmdir('/tmp/mafft_test')
+ rmdir('/tmp/mafft_test2')
+
+ def test_general_cleanUp(self):
+ """Last test executed: cleans up all files initially created"""
+ # remove the tempdir and contents
+ shutil.rmtree(self.temp_dir)
+ shutil.rmtree(self.temp_dir_spaces)
+
+ def test_align_unaligned_seqs(self):
+ """align_unaligned_seqs should work as expected"""
+ res = align_unaligned_seqs(self.seqs1, RNA)
+ self.assertEqual(res.toFasta(), align1)
+ res = align_unaligned_seqs(self.lines2, RNA)
+ self.assertEqual(res.toFasta(), align2)
+
+ def test_add_seqs_to_alignment(self):
+ """add_seqs_to_alignment should work as expected."""
+ res = add_seqs_to_alignment(self.lines1,self.aligned2, RNA)
+ self.assertEqual(res.toFasta(), add_seqs_align)
+
+ def test_align_two_alignments(self):
+ """align_two_alignments should work as expected."""
+ res = align_two_alignments(self.aligned1, self.aligned2, RNA)
+ self.assertEqual(res.toFasta(), align_two_align)
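+
+ # The three convenience functions exercised above share one pattern
+ # (a sketch; mafft must be on the PATH):
+ #
+ # aln = align_unaligned_seqs(seqs, RNA)         # de novo alignment
+ # aln = add_seqs_to_alignment(seqs, aln2, RNA)  # add seqs to existing aln
+ # aln = align_two_alignments(aln1, aln2, RNA)   # align two alignments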
+
+align1 = ">seq_0\nACUGCUAGCUAGUAGCGUACGUA\n>seq_1\nGCUACGUAGCUAC----------\n>seq_2\nGCGGCUAUUAGAU------CGUA"
+
+align2 = ">a\nUAGGCUCUGAUAUAAUAGCUCUC---------\n>b\nUA----UCGCUUCGACGAUUCUCUGAUAGAGA\n>c\nUG------------ACUACGCAU---------"
+
+add_seqs_align = """>1\nACUGC-UAGCUAGUAGCGUACGUA--------\n>2\nGCUACGUAGCUA-----------C--------\n>3\nGCGGCUAUUAGAUCGUA---------------\n>a\nUAGGCUCUGAUAUAAUAGCUCUC---------\n>b\nUA----UCGCUUCGACGAUUCUCUGAUAGAGA\n>c\nUG------------ACUACGCAU---------"""
+
+align_two_align = """>1\nACUGCUAGCUAGUAGCGUACGUA---------\n>2\nGCUACGUAGCUAC-------------------\n>3\nGCGGCUAUUAGAU------CGUA---------\n>a\nUAGGCUCUGAUAUAAUAGCUCUC---------\n>b\nUA----UCGCUUCGACGAUUCUCUGAUAGAGA\n>c\nUG------------ACUACGCAU---------"""
+
+if __name__ == '__main__':
+ main()
diff --git a/bfillings/tests/test_mothur.py b/bfillings/tests/test_mothur.py
new file mode 100644
index 0000000..7a1cd1e
--- /dev/null
+++ b/bfillings/tests/test_mothur.py
@@ -0,0 +1,315 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+
+from __future__ import with_statement
+from cStringIO import StringIO
+from os import remove, rmdir
+import os.path
+from shutil import rmtree
+from tempfile import mkdtemp, mkstemp, NamedTemporaryFile
+from unittest import TestCase, main
+
+from bfillings.mothur import (Mothur, mothur_from_file, MothurClassifySeqs,
+ mothur_classify_file)
+
+
+__author__ = "Kyle Bittinger"
+__copyright__ = "Copyright 2007-2012, The Cogent Project"
+__credits__ = ["Kyle Bittinger", "Jose Carlos Clemente Litran"]
+__license__ = "GPL"
+__version__ = "1.5.3-dev"
+__maintainer__ = "Kyle Bittinger"
+__email__ = "kylebittinger at gmail.com"
+__status__ = "Development"
+
+
+class MothurTests(TestCase):
+ def setUp(self):
+ self.small_fasta = (
+ '>aaaaaa\nTAGGCTCTGATATAATAGCTCTC---------\n'
+ '>cccccc\n------------TGACTACGCAT---------\n'
+ '>bbbbbb\n----TATCGCTTCGACGATTCTCTGATAGAGA\n'
+ )
+ self.small_otus = (
+ 'unique\t3\taaaaaa\tcccccc\tbbbbbb\t\n'
+ '0.62\t2\taaaaaa\tbbbbbb,cccccc\t\n'
+ '0.67\t1\tbbbbbb,cccccc,aaaaaa\t\n'
+ )
+ self.small_otus_parsed = [
+ (float('0'), [['aaaaaa'], ['cccccc'], ['bbbbbb']]),
+ (float('0.62'), [['aaaaaa'], ['bbbbbb', 'cccccc']]),
+ (float('0.67'), [['bbbbbb', 'cccccc', 'aaaaaa']]),
+ ]
+ self.complement_fasta = (
+ '>a\n--AGGGGTAATAA--\n'
+ '>b\n--TTATTACCCCT--\n'
+ '>c\n-------AAAAAA--\n'
+ )
+ self.complement_otus = (
+ 'unique\t3\ta\tb\tc\t\n'
+ '0.43\t2\tc,a\tb\t\n'
+ '1.00\t1\tb,c,a\t\n'
+ )
+ self.work_dir = mkdtemp()
+
+ def tearDown(self):
+ rmtree(self.work_dir)
+
+ def test_get_help(self):
+ """Mothur.getHelp() should return help string"""
+ expected_help = (
+ 'See manual, available on the MOTHUR wiki:\n'
+ 'http://schloss.micro.umass.edu/mothur/'
+ )
+ self.assertEqual(Mothur.getHelp(), expected_help)
+
+ def test_compile_mothur_script(self):
+ """Mothur._compile_mothur_script() should return valid Mothur script"""
+ app = Mothur()
+ app._input_filename = 'test.fasta'
+ observed_script = app._compile_mothur_script()
+ expected_script = (
+ '#unique.seqs(fasta=test.fasta); '
+ 'dist.seqs(fasta=test.unique.fasta); '
+ 'read.dist(column=test.unique.dist, name=test.names); '
+ 'cluster(method=furthest)')
+ self.assertEqual(observed_script, expected_script)
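+
+ # As the expected string shows, a mothur batch script is a single
+ # '#'-prefixed line of semicolon-separated commands; the controller
+ # chains unique.seqs, dist.seqs, read.dist and cluster over filenames
+ # derived from the input fasta.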
+
+ def test_get_result_paths(self):
+ """Mothur._get_result_paths() should guess correct output paths"""
+ app = Mothur()
+ app._input_filename = 'test.fasta'
+ observed_paths = {
+ 'distance matrix': app._derive_dist_path(),
+ 'otu list': app._derive_list_path(),
+ 'rank abundance': app._derive_rank_abundance_path(),
+ 'species abundance': app._derive_species_abundance_path(),
+ 'unique names': app._derive_names_path(),
+ 'unique seqs': app._derive_unique_path(),
+ }
+ expected_paths = {
+ 'distance matrix': 'test.unique.dist',
+ 'otu list': 'test.unique.fn.list',
+ 'rank abundance': 'test.unique.fn.rabund',
+ 'species abundance': 'test.unique.fn.sabund',
+ 'unique names': 'test.names',
+ 'unique seqs': 'test.unique.fasta',
+ }
+ self.assertEqual(observed_paths, expected_paths)
+
+ def test_working_directory(self):
+ """Mothur.WorkingDir attribute should not be cast to FilePath object"""
+ app = Mothur(WorkingDir='/tmp')
+ self.assertEquals(str(app.WorkingDir), '/tmp')
+
+ def test_working_directory_used(self):
+ """Mothur input file should be created in the working dir."""
+ app = Mothur(WorkingDir=self.work_dir)
+ result = app(self.small_fasta, remove_tmp=False)
+ input_dir, _ = os.path.split(app._input_filename)
+ self.assertEqual(input_dir, self.work_dir)
+ result.cleanUp()
+
+ def test_call_with_multiline_string(self):
+ """Mothur.__call__() should return correct otu's for input as single string"""
+ app = Mothur()
+ result = app(self.small_fasta)
+ observed_otus = result['otu list'].read()
+ self.assertEquals(observed_otus, self.small_otus)
+ result.cleanUp()
+
+ def test_call_with_lines(self):
+ """Mothur.__call__() should return correct otu's for input as lines"""
+ lines = self.small_fasta.split('\n')
+ app = Mothur(InputHandler='_input_as_lines')
+ result = app(lines)
+ observed_otus = result['otu list'].read()
+ self.assertEquals(observed_otus, self.small_otus)
+ result.cleanUp()
+
+ def test_call_with_path(self):
+ """Mothur.__call__() should return correct otu's for input as path"""
+ working_dir = mkdtemp()
+ _, filename = mkstemp(dir=working_dir, suffix='.fasta')
+ with open(filename, 'w') as f:
+ f.write(self.small_fasta)
+ app = Mothur(InputHandler='_input_as_path', WorkingDir=working_dir)
+ result = app(filename)
+ observed_otus = result['otu list'].read()
+ self.assertEquals(observed_otus, self.small_otus)
+ remove(filename)
+ result.cleanUp()
+ rmdir(working_dir)
+
+ def test_call_with_working_dir(self):
+ """Mothur.__call__() should return correct otu's when input dir is changed"""
+ working_dir = mkdtemp()
+ app = Mothur(WorkingDir=working_dir)
+ result = app(self.small_fasta)
+ observed_otus = result['otu list'].read()
+ self.assertEquals(observed_otus, self.small_otus)
+ result.cleanUp()
+ rmdir(working_dir)
+
+ def test_call_with_complement(self):
+ """Mothur.__call__() should return correct otu's for input sequences which are reverse complements"""
+ app = Mothur()
+ result = app(self.complement_fasta)
+ observed_otus = result['otu list'].read()
+ self.assertEquals(observed_otus, self.complement_otus)
+ result.cleanUp()
+
+ def test_mothur_from_file(self):
+ """mothur_from_file() should return parsed otus"""
+ f = StringIO(self.small_fasta)
+ f.seek(0)
+ parsed_otus = mothur_from_file(f)
+ self.assertEquals(parsed_otus, self.small_otus_parsed)
+
+
+class TestMothurClassifySeqs(TestCase):
+ def setUp(self):
+ self.ref_file = NamedTemporaryFile()
+ self.ref_file.write(mothur_ref_seqs)
+ self.ref_file.seek(0)
+
+ self.tax_file = NamedTemporaryFile()
+ self.tax_file.write(mothur_taxonomy)
+ self.tax_file.seek(0)
+
+ self.work_dir = mkdtemp()
+
+ def tearDown(self):
+ rmtree(self.work_dir)
+
+ def test_app(self):
+ app = MothurClassifySeqs({
+ 'reference': self.ref_file.name,
+ 'taxonomy': self.tax_file.name,
+ }, WorkingDir=self.work_dir)
+ res = app(mothur_seqs)
+ assignments = res['assignments'].read()
+ self.assertEqual(assignments, mothur_assignments)
+ summary = res['summary'].read()
+ # Later versions of mothur add a tab before the newline. We
+ # do not care about trailing whitespace as long as content is
+ # the same.
+ summary = summary.replace("\t\n", "\n")
+ self.assertEqual(summary, mothur_summary)
+ res.cleanUp()
+
+ def test_format_function_arguments(self):
+ app = MothurClassifySeqs({
+ 'reference': '/home/myuser/ref-seqs.fasta',
+ 'taxonomy': '/home/MyUser/data/tax.txt',
+ 'cutoff': 80,
+ })
+ obs_args = app._format_function_arguments(
+ ['reference', 'taxonomy', 'cutoff', 'iters'])
+ exp_args = (
+ "reference=/home/myuser/ref\\-seqs.fasta, "
+ "taxonomy=/home/MyUser/data/tax.txt, cutoff=80")
+ self.assertEqual(obs_args, exp_args)
+
+ def test_compile_mothur_script(self):
+ app = MothurClassifySeqs({
+ 'reference': '/home/myuser/ref-seqs.fasta',
+ 'taxonomy': '/home/MyUser/data/tax.txt',
+ 'cutoff': 80,
+ })
+ app._input_filename = "/my/input.fasta"
+ exp_script = (
+ "#classify.seqs(fasta=/my/input.fasta, "
+ "reference=/home/myuser/ref\-seqs.fasta, "
+ "taxonomy=/home/MyUser/data/tax.txt, "
+ "cutoff=80)")
+ self.assertEqual(app._compile_mothur_script(), exp_script)
+
+ def test_mothur_classify_file(self):
+ query_file = StringIO(mothur_seqs)
+ res = mothur_classify_file(
+ query_file, self.ref_file.name, self.tax_file.name)
+ exp_res = {
+ 'A': (['k__Bacteria', 'p__Firmicutes', 'c__Clostridia',
+ 'o__Clostridale', 'f__Eubacteriaceae', 'g__Eubacterium',
+ 's__Eubacteriumfoedans'], 1.0),
+ 'Very': (['k__Bacteria', 'p__Bacteriodetes'], 1.0),
+ '01': (['k__Bacteria', 'p__Firmicutes'], 1.0),
+ }
+ self.assertEqual(res, exp_res)
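+
+ # Return shape of mothur_classify_file, per the expectations above:
+ # {seq_id: ([taxon, taxon, ...], confidence)}, with (['Unknown'], 0.0)
+ # entries for unclassifiable sequences (see the next test).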
+
+ def test_unclassifiable_sequence(self):
+ query_file = StringIO(
+ ">MostlyTs\nTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT"
+ "TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTATTTTTTTTTTTTTTTTTTTTTTTTTTTTTT"
+ "TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT\n")
+ res = mothur_classify_file(
+ query_file, self.ref_file.name, self.tax_file.name)
+ exp_res = {
+ 'MostlyTs': (['Unknown'], 0.0),
+ }
+ self.assertEqual(res, exp_res)
+
+
+mothur_assignments = """\
+01 k__Bacteria(100);p__Firmicutes(100);unclassified;unclassified;unclassified;unclassified;unclassified;
+A k__Bacteria(100);p__Firmicutes(100);c__Clostridia(100);o__Clostridale(100);f__Eubacteriaceae(100);g__Eubacterium(100);s__Eubacteriumfoedans(100);
+Very k__Bacteria(100);p__Bacteriodetes(100);unclassified;unclassified;unclassified;unclassified;unclassified;
+"""
+
+mothur_summary = """\
+taxlevel rankID taxon daughterlevels total
+0 0 Root 1 3
+1 0.1 k__Bacteria 2 3
+2 0.1.1 p__Bacteriodetes 1 1
+3 0.1.1.1 unclassified 1 1
+4 0.1.1.1.1 unclassified 1 1
+5 0.1.1.1.1.1 unclassified 1 1
+6 0.1.1.1.1.1.1 unclassified 1 1
+7 0.1.1.1.1.1.1.1 unclassified 0 1
+2 0.1.2 p__Firmicutes 2 2
+3 0.1.2.1 c__Clostridia 1 1
+4 0.1.2.1.1 o__Clostridale 1 1
+5 0.1.2.1.1.1 f__Eubacteriaceae 1 1
+6 0.1.2.1.1.1.1 g__Eubacterium 1 1
+7 0.1.2.1.1.1.1.1 s__Eubacteriumfoedans 0 1
+3 0.1.2.2 unclassified 1 1
+4 0.1.2.2.1 unclassified 1 1
+5 0.1.2.2.1.1 unclassified 1 1
+6 0.1.2.2.1.1.1 unclassified 1 1
+7 0.1.2.2.1.1.1.1 unclassified 0 1
+"""
+
+mothur_seqs = """\
+>01
+GGAGTCTGGGCCGGTGTCGTCAAGGTCCCAATCTGGCTGGTCGGTCTCTCAACCCAGCTACCCATCATTGCCTTGGTAGGCCGTTACCCACCAACAAGCTAACAGGCCGCGGGCCCATCCCTCTCCGCCGGAGCTTTCTCGAGTCTTCCATGCGGAAGTCCCGAAGTATTCGGTATTATCCACGGTTTCCCGTGGCTATCCCAATGAGAGGGGCAGGTTGCCCACGTGTTACTCAGCCGTTCGCCACTTTATACACACCCGAAGGTGCTTTAATCGTTCGACTTGCATGTGTTAGGCGCGCCGCCAGCGTTCATC
+>A
+GGAGTCTGGGCCGTGTCTCAGTCCCAATGTGGCCGATCACCCTCTCAGGTCGGCTATGCATCACGGCCTTGGTGAGCCGTTACCTCACCAACTAGCTAATGCACCGCGGGTCCATCCATCAGCAGAAGCTTGCGCCTCTTTTCCTCTTCAAACCATGCGGTTCGAAGACCTATGCGGTTTTAGCATCCGTTTCCGAATGTTATCCCCCTCTGATGGGCAGGTTACCCACGTGTTACTCACCCGTTCGCCACTAGATTGACCAGTGCAAGCACCGGTCGCTCTCGTTCGACTTGCATGTATTAGGCACGCCGCCAGCGTTCGTC
+>Very long seq name with many spaces!
+GGAGTCTGGACCGTGTCTCAGTTCCAGTGTGACTGATCATCCTCTCAGACCAGTTATGCGTCATAGCCTTGGTGAGCCATTACCTCACCAACTAGCTGATACAATATAGCCTCATCCTACACCGAAAAACTTTCCCTATCTAACTTATGTTAGAGAGGAGTATAGAGTATTAGCAGTCGTTTCCAACTGTTGTCCTCTAGTGTAGGGCAGATTAGCTACACATTACTCACCCGTGCGCCACTAACTCATAAGAGCAAGCTCTTACTTGTCCGTTCGACTTGCATGTATTAGGCACGCCGCCAGCGTTCACT
+"""
+
+mothur_ref_seqs = """\
+>ref1
+GGAGTCTGGGCCGGTGTCGTCAAGGTCCCAATCTGGCTGGTCGGTCTCTCTGGTAGGCCGTTACCCACCAACAAGCTAACAGGCCGCGGGCCCATCCCTCTCCGCCGGAGCTTTCTCGAGTCTTCCATGCGGAAGTCCCTGCGGAAGTCCCGAAGTATTCGGTATTATCCACGGTTTCCCGTGGCTATCCCAATGAGAGGGGCAGGTTGCCCACGTGTTACTCAGCCGTTCGCCACTTTATACACACCCGAAGGTGCTTTAATCGTTCGACTTGCATGTGTTAGGCGCGCCGCCAGCGTTCATC
+>ref2
+GGAGTCTGGGCCGTGTCTCAGTCCCAATGTGGCCGATCACCCTCTCAGGTCGGCTATGCATCACGGCCTTGGTGAGCCGTTACCTCACCAACTAGCTACTCTTTTCCTCTTCAAACCATGCGGTTCGAAGACCTATGCGGTTTTAGCATCCGTAAACTTTCCCTATCTAACTTATGTTAGAGAGGAGTATAGAGTATTAGCAGTCGTTTCCAACTTCCGAATGTTATCCCCCTCTGATGGGCAGGTTACCCACGTGTTACTCACCCGTTCGCCACTAGATTGACCAGTGCAAGCACCGGTCGCTCTCGTTCGACTTGCATGTATTAGGCACGCCGCCAGCGTTCGTC
+>3333
+GGAGTCTGGACCGTGTCTCAGTTCCAGTGTGACTGATCATCCTCTCAGACAGTTATGCGTCATAGCCTTGGTGAGCCATTACCTCACCAACTAGCTGATACAATATAGCCTCATCCTACACCGAAAAACTTTCCCTATCTCTTATGTTAGAGAGGAGTATAGAGTATTAGCAGTCGTTTCCAACTGTTGTCCTCTAGTGTAGGGCAGATTAGCACACATTACTCACCCGTGCGCCACTAACTCATAAGAGCAAGCTCTTACTTGTCCGTTCGACTTGCATGTATTAGGCACGCCGCCAGCGTTCACT
+"""
+
+mothur_taxonomy = """\
+ref1 k__Bacteria;p__Firmicutes;
+ref2 k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridale;f__Eubacteriaceae;g__Eubacterium;s__Eubacteriumfoedans;
+3333 k__Bacteria;p__Bacteriodetes;
+"""
+
+if __name__ == '__main__':
+ main()
diff --git a/bfillings/tests/test_muscle_v38.py b/bfillings/tests/test_muscle_v38.py
new file mode 100644
index 0000000..39fd5e0
--- /dev/null
+++ b/bfillings/tests/test_muscle_v38.py
@@ -0,0 +1,286 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+
+from os import getcwd, remove, rmdir, mkdir, path
+from subprocess import Popen, PIPE, STDOUT
+import tempfile
+import shutil
+from unittest import TestCase, main
+
+from cogent.core.moltype import RNA, DNA
+from cogent.util.misc import flatten
+
+from bfillings.muscle_v38 import (Muscle, muscle_seqs, aln_tree_seqs,
+ align_unaligned_seqs, build_tree_from_alignment,
+ align_and_build_tree, add_seqs_to_alignment,
+ align_two_alignments)
+
+
+class GeneralSetUp(TestCase):
+
+ def setUp(self):
+ """Muscle general setUp method for all tests"""
+ # Check if muscle version is supported for this test
+ acceptable_version = (3,8)
+ command = "muscle -version"
+ proc = Popen(command,shell=True,universal_newlines=True,\
+ stdout=PIPE,stderr=STDOUT)
+ stdout = proc.stdout.read()
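+ # stdout looks like e.g. "MUSCLE v3.8.31 by ...": take the second token and drop the leading "v"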
+ version_string = stdout.strip().split(' ')[1].strip()[1:]
+ try:
+ version = tuple(map(int,version_string.split('.')))
+ pass_test = version[:2] == acceptable_version
+ except ValueError:
+ pass_test = False
+ version_string = stdout
+ self.assertTrue(pass_test,\
+ "Unsupported muscle version. %s is required, but running %s." \
+ % ('.'.join(map(str,acceptable_version)), version_string))
+
+ self.seqs1 = ['ACUGCUAGCUAGUAGCGUACGUA','GCUACGUAGCUAC',
+ 'GCGGCUAUUAGAUCGUA']
+
+ self.labels1 = ['>1','>2','>3']
+ self.lines1 = flatten(zip(self.labels1,self.seqs1))
+
+ self.seqs2=['UAGGCUCUGAUAUAAUAGCUCUC','UAUCGCUUCGACGAUUCUCUGAUAGAGA',
+ 'UGACUACGCAU']
+ self.labels2=['>a','>b','>c']
+ self.lines2 = flatten(zip(self.labels2,self.seqs2))
+
+ self.temp_dir = tempfile.mkdtemp()
+ self.temp_dir_spaces = '/tmp/test for muscle/'
+ try:
+ mkdir(self.temp_dir_spaces)
+ except OSError:
+ pass
+ try:
+ #create sequence files
+ f = open(path.join(self.temp_dir, 'seq1.txt'),'w')
+ f.write('\n'.join(self.lines1))
+ f.close()
+ g = open(path.join(self.temp_dir, 'seq2.txt'),'w')
+ g.write('\n'.join(self.lines2))
+ g.close()
+ except OSError:
+ pass
+
+ def tearDown(self):
+ """cleans up all files initially created"""
+ # remove the tempdir and contents
+ shutil.rmtree(self.temp_dir)
+ shutil.rmtree(self.temp_dir_spaces)
+
+class MuscleTests(GeneralSetUp):
+ """Tests for the Muscle application controller"""
+
+ def test_base_command(self):
+ """Muscle BaseCommand should return the correct BaseCommand"""
+ c = Muscle()
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "',getcwd(),'/"; ','muscle']))
+ c.Parameters['-in'].on('seq.txt')
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "',getcwd(),'/"; ','muscle -in "seq.txt"']))
+ c.Parameters['-cluster2'].on('neighborjoining')
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "',getcwd(),'/"; ','muscle -cluster2 neighborjoining' +
+ ' -in "seq.txt"']))
+
+ def test_maxmb(self):
+ """maxmb option should not break Muscle"""
+ app = Muscle()
+ app.Parameters['-maxmb'].on('250')
+ outfile = tempfile.NamedTemporaryFile()
+ app.Parameters['-out'].on(outfile.name)
+
+ infile = tempfile.NamedTemporaryFile()
+ infile.write(
+ ">Seq1\nAAAGGGTTTCCCCT\n"
+ ">Seq2\nAAAGGGGGTTTCCACT\n")
+ infile.flush()
+ result = app(infile.name)
+
+ observed = result['MuscleOut'].read()
+ expected = (
+ ">Seq1\nAAA--GGGTTTCCCCT\n"
+ ">Seq2\nAAAGGGGGTTTCCACT\n"
+ )
+ self.assertEqual(observed, expected)
+
+ def test_changing_working_dir(self):
+ """Muscle BaseCommand should change according to WorkingDir"""
+ c = Muscle(WorkingDir='/tmp/muscle_test')
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "','/tmp/muscle_test','/"; ','muscle']))
+ c = Muscle()
+ c.WorkingDir = '/tmp/muscle_test2'
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "','/tmp/muscle_test2','/"; ','muscle']))
+
+ # removing the dirs is proof that they were created when the Muscle
+ # objects were instantiated; if a dir is missing, rmdir raises an OSError
+ rmdir('/tmp/muscle_test')
+ rmdir('/tmp/muscle_test2')
+
+ def test_aln_tree_seqs(self):
+ "aln_tree_seqs returns the muscle alignment and tree from iteration2"
+ tree, aln = aln_tree_seqs(path.join(self.temp_dir, 'seq1.txt'),
+ tree_type="neighborjoining",
+ WorkingDir=self.temp_dir,
+ clean_up=True)
+ self.assertEqual(str(tree), '((1:1.125,2:1.125):0.375,3:1.5);')
+ self.assertEqual(len(aln), 6)
+ self.assertEqual(aln[-2], '>3\n')
+ self.assertEqual(aln[-1], 'GCGGCUAUUAGAUCGUA------\n')
+
+ def test_aln_tree_seqs_spaces(self):
+ "aln_tree_seqs should work on filename with spaces"
+ try:
+ #create sequence files
+ f = open(path.join(self.temp_dir_spaces, 'muscle_test_seq1.txt'),'w')
+ f.write('\n'.join(self.lines1))
+ f.close()
+ except OSError:
+ pass
+ tree, aln = aln_tree_seqs(path.join(self.temp_dir_spaces,\
+ 'muscle_test_seq1.txt'),
+ tree_type="neighborjoining",
+ WorkingDir=getcwd(),
+ clean_up=True)
+ self.assertEqual(str(tree), '((1:1.125,2:1.125):0.375,3:1.5);')
+ self.assertEqual(len(aln), 6)
+ self.assertEqual(aln[-2], '>3\n')
+ self.assertEqual(aln[-1], 'GCGGCUAUUAGAUCGUA------\n')
+ remove(self.temp_dir_spaces+'/muscle_test_seq1.txt')
+
+ def test_align_unaligned_seqs(self):
+ """align_unaligned_seqs should work as expected"""
+ res = align_unaligned_seqs(self.seqs1, RNA)
+ self.assertEqual(res.toFasta(), align1)
+
+ def test_build_tree_from_alignment(self):
+ """Muscle should return a tree built from the passed alignment"""
+ tree_short = build_tree_from_alignment(build_tree_seqs_short, DNA)
+ num_seqs = flatten(build_tree_seqs_short).count('>')
+ self.assertEqual(len(tree_short.tips()), num_seqs)
+
+ tree_long = build_tree_from_alignment(build_tree_seqs_long, DNA)
+ seq_names = []
+ for line in build_tree_seqs_long.split('\n'):
+ if line.startswith('>'):
+ seq_names.append(line[1:])
+
+ for node in tree_long.tips():
+ if node.Name not in seq_names:
+ self.fail()
+
+ def test_align_and_build_tree(self):
+ """Should align and build a tree from a set of sequences"""
+ res = align_and_build_tree(self.seqs1, RNA)
+ self.assertEqual(res['Align'].toFasta(), align1)
+
+ tree = res['Tree']
+ seq_names = []
+ for line in align1.split('\n'):
+ if line.startswith('>'):
+ seq_names.append(line[1:])
+
+ for node in tree.tips():
+ if node.Name not in seq_names:
+ self.fail()
+
+ def test_add_seqs_to_alignment(self):
+ """Should add sequences to an alignment"""
+ res = add_seqs_to_alignment(seqs_to_add, align1)
+ self.assertEqual(res.toFasta(), added_align_result)
+
+ def test_align_two_alignments(self):
+ """Should align to multiple sequence alignments"""
+ res = align_two_alignments(align1, aln_to_merge)
+ self.assertEqual(res.toFasta(), merged_align_result)
+
+align1 = ">seq_0\nACUGCUAGCUAGUAGCGUACGUA\n>seq_1\n---GCUACGUAGCUAC-------\n>seq_2\nGCGGCUAUUAGAUCGUA------"
+
+# for use in test_add_seqs_to_alignment()
+seqs_to_add = ">foo\nGCUACGUAGCU\n>bar\nGCUACGUAGCC"
+added_align_result = ">bar\n---GCUACGUAGCC---------\n>foo\n---GCUACGUAGCU---------\n>seq_0\nACUGCUAGCUAGUAGCGUACGUA\n>seq_1\n---GCUACGUAGCUAC-------\n>seq_2\nGCGGCUAUUAGAUCGUA------"
+
+# for use in test_align_two_alignments()
+aln_to_merge = ">foo\nGCUACGUAGCU\n>bar\n--UACGUAGCC"
+merged_align_result = ">bar\n-----UACGUAGCC---------\n>foo\n---GCUACGUAGCU---------\n>seq_0\nACUGCUAGCUAGUAGCGUACGUA\n>seq_1\n---GCUACGUAGCUAC-------\n>seq_2\nGCGGCUAUUAGAUCGUA------"
+
+build_tree_seqs_short = """>muscle_test_seqs_0
+AACCCCCACGGTGGATGCCACACGCCCCATACAAAGGGTAGGATGCTTAAGACACATCGCGTCAGGTTTGTGTCAGGCCT
+AGCTTTAAATCATGCCAGTG
+>muscle_test_seqs_1
+GACCCACACGGTGGATGCAACAGATCCCATACACCGAGTTGGATGCTTAAGACGCATCGCGTGAGTTTTGCGTCAAGGCT
+TGCTTTCAATAATGCCAGTG
+>muscle_test_seqs_2
+AACCCCCACGGTGGCAGCAACACGTCACATACAACGGGTTGGATTCTAAAGACAAACCGCGTCAAAGTTGTGTCAGAACT
+TGCTTTGAATCATGCCAGTA
+>muscle_test_seqs_3
+AAACCCCACGGTAGCTGCAACACGTCCCATACCACGGGTAGGATGCTAAAGACACATCGGGTCTGTTTTGTGTCAGGGCT
+TGCTTTACATCATGCAAGTG
+>muscle_test_seqs_4
+AACCGCCACGGTGGGTACAACACGTCCACTACATCGGCTTGGAAGGTAAAGACACGTCGCGTCAGTATTGCGTCAGGGCT
+TGCTTTAAATCATGCCAGTG
+>muscle_test_seqs_5
+AACCCCCGCGGTAGGTGCAACACGTCCCATACAACGGGTTGGAAGGTTAAGACACAACGCGTTAATTTTGTGTCAGGGCA
+TGCTTTAAATCATGCCAGTT
+>muscle_test_seqs_6
+GACCCCCGCGGTGGCTGCAAGACGTCCCATACAACGGGTTGGATGCTTAAGACACATCGCAACAGTTTTGAGTCAGGGCT
+TACTTTAGATCATGCCGGTG
+>muscle_test_seqs_7
+AACCCCCACGGTGGCTACAAGACGTCCCATCCAACGGGTTGGATACTTAAGGCACATCACGTCAGTTTTGTGTCAGAGCT
+TGCTTTAAATCATGCCAGTG
+>muscle_test_seqs_8
+AACCCCCACGGTGGCTGCAACACGTGGCATACAACGGGTTGGATGCTTAAGACACATCGCCTCAGTTTTGTGTCAGGGCT
+TGCATTAAATCATGCCAGTG
+>muscle_test_seqs_9
+AAGCCCCACGGTGGCTGAAACACATCCCATACAACGGGTTGGATGCTTAAGACACATCGCATCAGTTTTATGTCAGGGGA
+TGCTTTAAATCCTGACAGCG
+"""
+
+build_tree_seqs_long = """>muscle_test_seqs_0
+AACCCCCACGGTGGATGCCACACGCCCCATACAAAGGGTAGGATGCTTAAGACACATCGCGTCAGGTTTGTGTCAGGCCT
+AGCTTTAAATCATGCCAGTG
+>muscle_test_seqsaaaaaaaa_1
+GACCCACACGGTGGATGCAACAGATCCCATACACCGAGTTGGATGCTTAAGACGCATCGCGTGAGTTTTGCGTCAAGGCT
+TGCTTTCAATAATGCCAGTG
+>muscle_test_seqsaaaaaaaa_2
+AACCCCCACGGTGGCAGCAACACGTCACATACAACGGGTTGGATTCTAAAGACAAACCGCGTCAAAGTTGTGTCAGAACT
+TGCTTTGAATCATGCCAGTA
+>muscle_test_seqsaaaaaaaa_3
+AAACCCCACGGTAGCTGCAACACGTCCCATACCACGGGTAGGATGCTAAAGACACATCGGGTCTGTTTTGTGTCAGGGCT
+TGCTTTACATCATGCAAGTG
+>muscle_test_seqsaaaaaaaa_4
+AACCGCCACGGTGGGTACAACACGTCCACTACATCGGCTTGGAAGGTAAAGACACGTCGCGTCAGTATTGCGTCAGGGCT
+TGCTTTAAATCATGCCAGTG
+>muscle_test_seqsaaaaaaaa_5
+AACCCCCGCGGTAGGTGCAACACGTCCCATACAACGGGTTGGAAGGTTAAGACACAACGCGTTAATTTTGTGTCAGGGCA
+TGCTTTAAATCATGCCAGTT
+>muscle_test_seqsaaaaaaaa_6
+GACCCCCGCGGTGGCTGCAAGACGTCCCATACAACGGGTTGGATGCTTAAGACACATCGCAACAGTTTTGAGTCAGGGCT
+TACTTTAGATCATGCCGGTG
+>muscle_test_seqsaaaaaaaa_7
+AACCCCCACGGTGGCTACAAGACGTCCCATCCAACGGGTTGGATACTTAAGGCACATCACGTCAGTTTTGTGTCAGAGCT
+TGCTTTAAATCATGCCAGTG
+>muscle_test_seqsaaaaaaaa_8
+AACCCCCACGGTGGCTGCAACACGTGGCATACAACGGGTTGGATGCTTAAGACACATCGCCTCAGTTTTGTGTCAGGGCT
+TGCATTAAATCATGCCAGTG
+>muscle_test_seqsaaaaaaaa_9
+AAGCCCCACGGTGGCTGAAACACATCCCATACAACGGGTTGGATGCTTAAGACACATCGCATCAGTTTTATGTCAGGGGA
+TGCTTTAAATCCTGACAGCG
+"""
+
+
+if __name__ == '__main__':
+ main()
diff --git a/bfillings/tests/test_parsinsert.py b/bfillings/tests/test_parsinsert.py
new file mode 100644
index 0000000..40ac95a
--- /dev/null
+++ b/bfillings/tests/test_parsinsert.py
@@ -0,0 +1,138 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+"""Tests for ParsInsert v1.03 application controller."""
+
+
+from shutil import rmtree
+from os.path import splitext
+from os import getcwd, remove, rmdir, mkdir
+from unittest import TestCase, main
+
+from cogent.core.alignment import Alignment
+from cogent.parse.tree import DndParser
+from cogent.core.moltype import DNA
+
+from skbio.parse.sequences import parse_fasta
+from burrito.util import get_tmp_filename
+
+from bfillings.parsinsert import ParsInsert, insert_sequences_into_tree
+
+
+class ParsInsertTests(TestCase):
+ def setUp(self):
+
+ # create a list of files to cleanup
+ self._paths_to_clean_up = []
+ self._dirs_to_clean_up = []
+
+ # load query seqs
+ self.seqs = Alignment(parse_fasta(QUERY_SEQS.split()))
+
+ # generate temp filename
+ tmp_dir='/tmp'
+ self.outfile = get_tmp_filename(tmp_dir)
+
+ # create and write out reference sequence file
+ self.outfasta=splitext(self.outfile)[0]+'.fasta'
+ fastaout=open(self.outfasta,'w')
+ fastaout.write(REF_SEQS)
+ fastaout.close()
+ self._paths_to_clean_up.append(self.outfasta)
+
+ # create and write out starting tree file
+ self.outtree=splitext(self.outfile)[0]+'.tree'
+ treeout=open(self.outtree,'w')
+ treeout.write(REF_TREE)
+ treeout.close()
+ self._paths_to_clean_up.append(self.outtree)
+
+ def tearDown(self):
+ """cleans up all files initially created"""
+ # remove the tempdir and contents
+ map(remove,self._paths_to_clean_up)
+ map(rmdir,self._dirs_to_clean_up)
+
+ def test_base_command(self):
+ """Base command-calls"""
+
+ app = ParsInsert()
+ self.assertEqual(app.BaseCommand, \
+ ''.join(['cd "',getcwd(),'/"; ','ParsInsert']))
+
+ def test_change_working_dir(self):
+ """Change working dir"""
+
+ app = ParsInsert(WorkingDir='/tmp/ParsInsertTest')
+ self.assertEqual(app.BaseCommand, \
+ ''.join(['cd "','/tmp/ParsInsertTest',\
+ '/"; ','ParsInsert']))
+
+ rmtree('/tmp/ParsInsertTest')
+
+ def test_insert_sequences_into_tree(self):
+ """Inserts sequences into Tree"""
+
+ # define log fp
+ log_fp='/tmp/parsinsert.log'
+ self._paths_to_clean_up.append(log_fp)
+
+ # define tax assignment values fp
+ tax_assign_fp='/tmp/tax_assignments.log'
+ self._paths_to_clean_up.append(tax_assign_fp)
+
+ # set the reference alignment and starting tree
+ param={
+ '-t':self.outtree,
+ '-s':self.outfasta,
+ '-l':log_fp,
+ '-o':tax_assign_fp
+ }
+
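+ # toPhylip() renames sequences to phylip-safe ids; align_map maps those ids back to the originals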
+ seqs, align_map = self.seqs.toPhylip()
+
+ # insert sequences into tree
+ tree = insert_sequences_into_tree(seqs, DNA, params=param)
+
+ # rename tips back to query names
+ for node in tree.tips():
+ if node.Name in align_map:
+ node.Name = align_map[node.Name]
+
+ self.assertEqual(tree.getNewick(with_distances=True),exp_tree)
+
+
+
+QUERY_SEQS= """\
+>6
+TGCATGTCAGTATAGCTTTGGTGAAACTGCGAATGGCTCATTAAATCAGT
+>7
+TGCATGTCAGTATAACTTTGGTGAAACTGCGAATGGCTCATTAAATCAGT
+"""
+
+REF_SEQS= """\
+>seq0000011
+TGCATGTCAGTATAGCTTTAGTGAAACTGCGAATGGCTCATTAAATCAGT
+>seq0000012
+TGCATGTCAGTATAGCTTTAGTGAAACTGCGAATGGCTNNTTAAATCAGT
+>seq0000013
+TGCATGTCAGTATAGCATTAGTGAAACTGCGAATGGCTCATTAAATCAGT
+>seq0000014
+TCCATGTCAGTATAACTTTGGTGAAACTGCGAATGGCTCATTAAATCAGG
+>seq0000015
+NNNNNNNNNNTATATCTTATGTGAAACTTCGAATGCCTCATTAAATCAGT
+"""
+
+REF_TREE="""((seq0000014:0.08408,seq0000015:0.13713)0.609:0.00215,seq0000013:0.02032,(seq0000011:0.00014,seq0000012:0.00014)0.766:0.00015);
+"""
+
+exp_tree = """((seq0000014:0.08408,seq0000015:0.13713,7:0.02027):0.00215,seq0000013:0.02032,(seq0000011:0.00014,seq0000012:0.00014,6:0.02027):0.00015):0.0;"""
+
+if __name__ == '__main__':
+ main()
diff --git a/bfillings/tests/test_pplacer.py b/bfillings/tests/test_pplacer.py
new file mode 100644
index 0000000..064003f
--- /dev/null
+++ b/bfillings/tests/test_pplacer.py
@@ -0,0 +1,254 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+
+from os import getcwd, remove, rmdir, mkdir
+from os.path import splitext
+from random import randint
+from StringIO import StringIO
+from unittest import TestCase, main
+
+from skbio.parse.sequences import parse_fasta
+from burrito.util import ApplicationError, get_tmp_filename
+
+from cogent.util.misc import flatten
+from cogent.core.tree import PhyloNode
+from cogent.core.moltype import RNA, DNA
+from cogent.core.alignment import Alignment
+
+from bfillings.pplacer import Pplacer, insert_sequences_into_tree
+
+
+class Genericpplacer(TestCase):
+
+ def setUp(self):
+ '''setup the files for testing pplacer'''
+
+ # create a list of files to cleanup
+ self._paths_to_clean_up = []
+ self._dirs_to_clean_up = []
+
+ # get a tmp filename to use
+ basename=splitext(get_tmp_filename())[0]
+
+ # create and write out RAxML stats file
+ self.stats_fname=basename+'.stats'
+ stats_out=open(self.stats_fname,'w')
+ stats_out.write(RAXML_STATS)
+ stats_out.close()
+ self._paths_to_clean_up.append(self.stats_fname)
+
+ # create and write out reference sequence file
+ self.refseq_fname=basename+'_refseqs.fasta'
+ refseq_out=open(self.refseq_fname,'w')
+ refseq_out.write(REF_SEQS)
+ refseq_out.close()
+ self._paths_to_clean_up.append(self.refseq_fname)
+
+ # create and write out query sequence file
+ self.query_fname=basename+'_queryseqs.fasta'
+ query_out=open(self.query_fname,'w')
+ query_out.write(QUERY_SEQS)
+ query_out.close()
+ self._paths_to_clean_up.append(self.query_fname)
+
+ # create and write out starting tree file
+ self.tree_fname=basename+'.tre'
+ tree_out=open(self.tree_fname,'w')
+ tree_out.write(REF_TREE)
+ tree_out.close()
+ self._paths_to_clean_up.append(self.tree_fname)
+
+ def writeTmp(self, outname):
+ """Write data to temp file"""
+ t = open(outname, "w+")
+ t.write(PHYLIP_FILE)
+ t.close()
+
+ def tearDown(self):
+ """cleans up all files initially created"""
+ # remove the tempdir and contents
+ map(remove,self._paths_to_clean_up)
+ map(rmdir,self._dirs_to_clean_up)
+
+class pplacerTests(Genericpplacer):
+ """Tests for the pplacer application controller"""
+
+ def test_pplacer(self):
+ """Base command-calls"""
+
+ app=Pplacer()
+
+ self.assertEqual(app.BaseCommand, \
+ ''.join(['cd "',getcwd(),'/"; ','pplacer']))
+
+ app.Parameters['--help'].on()
+ self.assertEqual(app.BaseCommand, \
+ ''.join(['cd "',getcwd(),'/"; ','pplacer --help']))
+
+ def test_change_working_dir(self):
+ """Change working dir"""
+
+ working_dir='/tmp/Pplacer'
+ self._dirs_to_clean_up.append(working_dir)
+
+ # define working directory for output
+ app = Pplacer(WorkingDir=working_dir)
+
+ self.assertEqual(app.BaseCommand, \
+ ''.join(['cd "','/tmp/Pplacer','/"; ','pplacer']))
+
+
+ def test_insert_sequences_into_tree(self):
+ """Inserts sequences into Tree"""
+
+ params={}
+ # generate temp filename for output
+ params["-r"] = self.refseq_fname
+ params["-t"] = self.tree_fname
+ params["-s"] = self.stats_fname
+ params["--out-dir"] = "/tmp"
+
+ aln_ref_query=parse_fasta(StringIO(QUERY_SEQS))
+ aln = Alignment(aln_ref_query)
+ seqs, align_map = aln.toPhylip()
+ tree = insert_sequences_into_tree(seqs, DNA, params=params,
+ write_log=False)
+
+ # rename tips back to query names
+ for node in tree.tips():
+ if node.Name in align_map:
+ node.Name = align_map[node.Name]
+
+ self.assertEqual(tree.getNewick(with_distances=True), RESULT_TREE)
+
+
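+ # expected pplacer placement file ("version": 1 JSON); whitespace is stripped after the literal so it can stay readable here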
+JSON_RESULT="""\
+{"tree":
+ "((seq0000004:0.08408[0],seq0000005:0.13713[1])0.609:0.00215[2],seq0000003:0.02032[3],(seq0000001:0.00014[4],seq0000002:0.00014[5])0.766:0.00015[6]):0[7];",
+ "placements":
+ [
+ {"p":
+ [[0, -113.210938, 0.713818, 0.064504, 0.000006],
+ [1, -114.929894, 0.127954, 0.137122, 0.000007],
+ [2, -114.932766, 0.127587, 0.000008, 0.000006],
+ [6, -117.743534, 0.007675, 0.000141, 0.027211],
+ [3, -117.743759, 0.007674, 0.020310, 0.027207],
+ [4, -117.747386, 0.007646, 0.000131, 0.027266],
+ [5, -117.747396, 0.007646, 0.000131, 0.027266]
+ ], "n": ["seq0000006"]
+ },
+ {"p": [[0, -113.476305, 1.000000, 0.035395, 0.000006]], "n":
+ ["seq0000007"]
+ }
+ ], "metadata":
+ {"invocation":
+ "pplacer -t %s -r %s -s %s --out-dir \/tmp %s"
+ }, "version": 1, "fields":
+ ["edge_num", "likelihood", "like_weight_ratio", "distal_length",
+ "pendant_length"
+ ]
+}
+""".replace('\n','').replace(' ','')
+
+
+QUERY_SEQS= """\
+>6
+TGCATGTCAGTATAGCTTTGGTGAAACTGCGAATGGCTCATTAAATCAGT
+>7
+TGCATGTCAGTATAACTTTGGTGAAACTGCGAATGGCTCATTAAATCAGT
+"""
+
+
+REF_SEQS= """\
+>seq0000011
+TGCATGTCAGTATAGCTTTAGTGAAACTGCGAATGGCTCATTAAATCAGT
+>seq0000012
+TGCATGTCAGTATAGCTTTAGTGAAACTGCGAATGGCTNNTTAAATCAGT
+>seq0000013
+TGCATGTCAGTATAGCATTAGTGAAACTGCGAATGGCTCATTAAATCAGT
+>seq0000014
+TCCATGTCAGTATAACTTTGGTGAAACTGCGAATGGCTCATTAAATCAGG
+>seq0000015
+NNNNNNNNNNTATATCTTATGTGAAACTTCGAATGCCTCATTAAATCAGT
+"""
+
+REF_TREE="""((seq0000014:0.08408,seq0000015:0.13713)0.609:0.00215,seq0000013:0.02032,(seq0000011:0.00014,seq0000012:0.00014)0.766:0.00015);
+"""
+
+RESULT_TREE="""((((seq0000014:0.0353946,7:6.11352e-06):0.0291093,6:6.11352e-06):0.019576,seq0000015:0.13713)0.609:0.00215,seq0000013:0.02032,(seq0000011:0.00014,seq0000012:0.00014)0.766:0.00015);"""
+
+RAXML_STATS="""
+
+
+This is RAxML version 7.2.6 released by Alexandros Stamatakis in February 2010.
+
+With greatly appreciated code contributions by:
+Andre Aberer (TUM)
+Simon Berger (TUM)
+John Cazes (TACC)
+Michael Ott (TUM)
+Nick Pattengale (UNM)
+Wayne Pfeiffer (SDSC)
+
+
+Alignment has 18 distinct alignment patterns
+
+Proportion of gaps and completely undetermined characters in this alignment: 4.80%
+
+RAxML rapid hill-climbing mode
+
+Using 1 distinct models/data partitions with joint branch length optimization
+
+
+Executing 1 inferences on the original alignment using 1 distinct randomized MP trees
+
+All free model parameters will be estimated by RAxML
+ML estimate of 25 per site rate categories
+
+Likelihood of final tree will be evaluated and optimized under GAMMA
+
+GAMMA Model parameters will be estimated up to an accuracy of 0.1000000000 Log Likelihood units
+
+Partition: 0
+Alignment Patterns: 18
+Name: No Name Provided
+DataType: DNA
+Substitution Matrix: GTR
+
+
+
+
+RAxML was called as follows:
+
+raxmlHPC -m GTRCAT -s test_raxml.phy -n results
+
+
+Inference[0]: Time 0.072128 CAT-based likelihood -85.425107, best rearrangement setting 2
+alpha[0]: 1.000000 rates[0] ac ag at cg ct gt: 0.000017 0.037400 0.859448 1.304301 0.000017 1.000000
+
+
+Conducting final model optimizations on all 1 trees under GAMMA-based models ....
+
+Inference[0] final GAMMA-based Likelihood: -107.575676 tree written to file /home/RAxML_result.results
+
+
+Starting final GAMMA-based thorough Optimization on tree 0 likelihood -107.575676 ....
+
+Final GAMMA-based Score of best tree -107.575676
+
+Program execution info written to /home/RAxML_info.results
+Best-scoring ML tree written to: /home/RAxML_bestTree.results
+
+Overall execution time: 0.078965 secs or 0.000022 hours or 0.000001 days
+"""
+
+if __name__ == '__main__':
+ main()
diff --git a/bfillings/tests/test_raxml_v730.py b/bfillings/tests/test_raxml_v730.py
new file mode 100644
index 0000000..84738fd
--- /dev/null
+++ b/bfillings/tests/test_raxml_v730.py
@@ -0,0 +1,236 @@
+#!/usr/bin/env python
+
+from os import getcwd, remove, rmdir, mkdir
+from os.path import splitext
+import re
+from random import choice, randint
+from StringIO import StringIO
+from subprocess import Popen, PIPE, STDOUT
+from unittest import TestCase, main
+
+from cogent.util.misc import flatten
+from cogent.parse.phylip import get_align_for_phylip
+from cogent.core.tree import PhyloNode
+from cogent.core.moltype import RNA, DNA
+from cogent.util.misc import app_path
+from cogent.core.alignment import Alignment
+
+from burrito.util import ApplicationError, get_tmp_filename
+
+from bfillings.raxml_v730 import (Raxml, raxml_alignment,
+ build_tree_from_alignment,
+ insert_sequences_into_tree)
+
+
+class GenericRaxml(TestCase):
+
+ def setUp(self):
+ """Check if Raxml version is supported for this test"""
+ acceptable_version = (7,3,0)
+ self.assertTrue(app_path('raxmlHPC'),
+ "raxmlHPC not found. This may or may not be a problem depending on "+\
+ "which components of QIIME you plan to use.")
+ command = "raxmlHPC -v | grep version"
+ proc = Popen(command,shell=True,universal_newlines=True,\
+ stdout=PIPE,stderr=STDOUT)
+ stdout = proc.stdout.read()
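+ # the version line reads e.g. "This is RAxML version 7.3.0 released by ...": token 4 is the version number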
+ version_string = stdout.strip().split(' ')[4].strip()
+ try:
+ version = tuple(map(int,version_string.split('.')))
+ pass_test = version == acceptable_version
+ except ValueError:
+ pass_test = False
+ version_string = stdout
+ self.assertTrue(pass_test,\
+ "Unsupported raxmlHPC version. %s is required, but running %s." \
+ % ('.'.join(map(str,acceptable_version)), version_string))
+
+
+ """Setup data for raxml tests"""
+ self.seqs1 = ['ACUGCUAGCUAGUAGCGUACGUA','GCUACGUAGCUAC',
+ 'GCGGCUAUUAGAUCGUA']
+ self.labels1 = ['>1','>2','>3']
+ self.lines1 = flatten(zip(self.labels1,self.seqs1))
+
+ self.test_model = "GTRCAT"
+
+ self.align1 = get_align_for_phylip(StringIO(PHYLIP_FILE))
+
+ self.test_fn1 = "/tmp/raxml_test1.txt"
+ self.test_fn2 = "raxml_test1.txt"
+ self.test_fn1_space = "/tmp/raxml test1.txt"
+
+ def writeTmp(self, outname):
+ """Write data to temp file"""
+ t = open(outname, "w+")
+ t.write(PHYLIP_FILE)
+ t.close()
+
+
+class RaxmlTests(GenericRaxml):
+ """Tests for the Raxml application controller"""
+
+ def test_raxml(self):
+ """raxml BaseCommand should return the correct BaseCommand"""
+ r = Raxml()
+ self.assertEqual(r.BaseCommand, \
+ ''.join(['cd \"',getcwd(),'/\"; ','raxmlHPC -f d -# 1']))
+ r.Parameters['-s'].on('seq.nexus')
+ self.assertEqual(r.BaseCommand,\
+ ''.join(['cd \"',getcwd(),'/\"; ',\
+ 'raxmlHPC -f d -s seq.nexus -# 1']))
+
+
+ def test_raxml_params(self):
+ """raxml should raise exception if missing required params"""
+
+ r = Raxml(WorkingDir="/tmp")
+
+ r.SuppressStdout = True
+ r.SuppressStderr = True
+ # raise error by default
+ self.assertRaises(ValueError, r)
+
+ # specify output name
+ r.Parameters['-n'].on("test_name")
+ r.Parameters["-p"].on(randint(1,100000))
+ self.assertRaises(ApplicationError, r)
+
+ # specify model
+ r.Parameters['-m'].on("GTRCAT")
+ self.assertRaises(ApplicationError, r)
+
+ r.Parameters['-s'].on(self.test_fn1)
+ self.assertRaises(ApplicationError, r)
+
+
+ self.writeTmp(self.test_fn1)
+
+ o = r()
+ o.cleanUp()
+
+ remove(self.test_fn1)
+
+
+ def test_raxml_from_file(self):
+ """raxml should run correctly using filename"""
+ r = Raxml(WorkingDir="/tmp")
+
+ r.Parameters['-s'].on(self.test_fn1)
+ r.Parameters['-m'].on("GTRCAT")
+ r.Parameters['-n'].on("test_me")
+ r.Parameters["-p"].on(randint(1,100000))
+
+ # test with abs filename
+ cur_out = self.test_fn1
+ self.writeTmp(cur_out)
+ out = r()
+ out.cleanUp()
+ remove(cur_out)
+
+ # test with rel + working dir
+ r.Parameters['-s'].on(self.test_fn2)
+ r.Parameters['-n'].on("test_me2")
+ r.Parameters['-w'].on("/tmp/")
+ r.Parameters["-p"].on(randint(1,100000))
+ self.writeTmp(self.test_fn1)
+ out = r()
+ out.cleanUp()
+ remove(self.test_fn1)
+
+ r.Parameters['-s'].on("\"%s\"" % self.test_fn1_space)
+ r.Parameters['-n'].on("test_me3")
+ r.Parameters['-w'].on("/tmp/")
+ r.Parameters["-p"].on(randint(1,100000))
+ #print r.BaseCommand
+ self.writeTmp(self.test_fn1_space)
+ out = r()
+ out.cleanUp()
+ remove(self.test_fn1_space)
+
+ def test_raxml_alignment(self):
+ """raxml_alignment should work as expected"""
+ phy_node, parsimony_phy_node, log_likelihood, total_exec \
+ = raxml_alignment(self.align1)
+
+ def test_build_tree_from_alignment(self):
+ """Builds a tree from an alignment"""
+
+ tree = build_tree_from_alignment(self.align1, RNA, False)
+
+ self.assertTrue(isinstance(tree, PhyloNode))
+ self.assertEqual(len(tree.tips()), 7)
+ self.assertRaises(NotImplementedError, build_tree_from_alignment, \
+ self.align1, RNA, True)
+
+ def test_insert_sequences_into_tree(self):
+ """Inserts sequences into Tree using params - test handles tree-insertion"""
+
+ # generate temp filename for output
+ outfname=splitext(get_tmp_filename('/tmp/'))[0]
+
+ # create starting tree
+ outtreefname=outfname+'.tre'
+ outtree=open(outtreefname,'w')
+ outtree.write(REF_TREE)
+ outtree.close()
+
+ # set params for tree-insertion
+ params={}
+ params["-w"]="/tmp/"
+ params["-n"] = get_tmp_filename().split("/")[-1]
+ params["-f"] = 'v'
+ #params["-G"] = '0.25'
+ params["-t"] = outtreefname
+ params["-m"] = 'GTRGAMMA'
+
+ aln_ref_query=get_align_for_phylip(StringIO(PHYLIP_FILE_DNA_REF_QUERY))
+ aln = Alignment(aln_ref_query)
+ seqs, align_map = aln.toPhylip()
+
+ tree = insert_sequences_into_tree(seqs, DNA, params=params,
+ write_log=False)
+
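+ # RAxML's EPA labels inserted query tips as "QUERY___<name>___<n>"; strip that decoration before mapping back to the original names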
+ for node in tree.tips():
+ removed_query_str=re.sub('QUERY___','',str(node.Name))
+ new_node_name=re.sub('___\d+','',str(removed_query_str))
+ if new_node_name in align_map:
+ node.Name = align_map[new_node_name]
+
+ self.assertTrue(isinstance(tree, PhyloNode))
+ self.assertEqual(tree.getNewick(with_distances=True),RESULT_TREE)
+ self.assertEqual(len(tree.tips()), 7)
+ self.assertRaises(NotImplementedError, build_tree_from_alignment, \
+ self.align1, RNA, True)
+
+ remove(outtreefname)
+
+PHYLIP_FILE= """ 7 50
+Species001 UGCAUGUCAG UAUAGCUUUA GUGAAACUGC GAAUGGCUCA UUAAAUCAGU
+Species002 UGCAUGUCAG UAUAGCUUUA GUGAAACUGC GAAUGGCUNN UUAAAUCAGU
+Species003 UGCAUGUCAG UAUAGCAUUA GUGAAACUGC GAAUGGCUCA UUAAAUCAGU
+Species004 UCCAUGUCAG UAUAACUUUG GUGAAACUGC GAAUGGCUCA UUAAAUCAGG
+Species005 NNNNNNNNNN UAUAUCUUAU GUGAAACUUC GAAUGCCUCA UUAAAUCAGU
+Species006 UGCAUGUCAG UAUAGCUUUG GUGAAACUGC GAAUGGCUCA UUAAAUCAGU
+Species007 UGCAUGUCAG UAUAACUUUG GUGAAACUGC GAAUGGCUCA UUAAAUCAGU
+"""
+
+
+PHYLIP_FILE_DNA_REF_QUERY= """ 7 50
+Species001 TGCATGTCAG TATAGCTTTA GTGAAACTGC GAATGGCTCA TTAAATCAGT
+Species002 TGCATGTCAG TATAGCTTTA GTGAAACTGC GAATGGCTNN TTAAATCAGT
+Species003 TGCATGTCAG TATAGCATTA GTGAAACTGC GAATGGCTCA TTAAATCAGT
+Species004 TCCATGTCAG TATAACTTTG GTGAAACTGC GAATGGCTCA TTAAATCAGG
+Species005 NNNNNNNNNN TATATCTTAT GTGAAACTTC GAATGCCTCA TTAAATCAGT
+Species006 TGCATGTCAG TATAGCTTTG GTGAAACTGC GAATGGCTCA TTAAATCAGT
+Species007 TGCATGTCAG TATAACTTTG GTGAAACTGC GAATGGCTCA TTAAATCAGT
+"""
+
+REF_TREE="""((seq0000004:0.08408,seq0000005:0.13713)0.609:0.00215,seq0000003:0.02032,(seq0000001:0.00014,seq0000002:0.00014)0.766:0.00015);
+"""
+
+RESULT_TREE="""(Species003:0.0194919169324,(Species001:4.34281710439e-07,Species002:4.34281710439e-07):4.34281710439e-07,(((Species006:0.0,Species007:0.0):0.0,Species004:0.0438017433031):0.0438017433031,Species005:0.171345128781):0.00331197405878);"""
+
+if __name__ == '__main__':
+ main()
diff --git a/bfillings/tests/test_rdp_classifier.py b/bfillings/tests/test_rdp_classifier.py
new file mode 100644
index 0000000..5efed73
--- /dev/null
+++ b/bfillings/tests/test_rdp_classifier.py
@@ -0,0 +1,398 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+"""Tests for the rdp_classifier_2.0.1 application controller"""
+
+from cStringIO import StringIO
+from os import getcwd, environ, remove, listdir
+from shutil import rmtree
+import tempfile
+from unittest import TestCase, main
+
+from bfillings.rdp_classifier import (RdpClassifier, RdpTrainer, assign_taxonomy,
+ train_rdp_classifier,
+ train_rdp_classifier_and_assign_taxonomy,
+ parse_rdp_assignment)
+
+
+class RdpClassifierTests(TestCase):
+ def setUp(self):
+ # fetch user's RDP_JAR_PATH
+ if 'RDP_JAR_PATH' in environ:
+ self.user_rdp_jar_path = environ['RDP_JAR_PATH']
+ else:
+ self.user_rdp_jar_path = 'rdp_classifier-2.2.jar'
+ self.output_file = tempfile.NamedTemporaryFile()
+
+ def test_default_java_vm_parameters(self):
+ """RdpClassifier should store default arguments to Java VM."""
+ a = RdpClassifier()
+ self.assertTrue('-Xmx' in a.Parameters)
+ self.assertEqual(a.Parameters['-Xmx'].Value, '1000m')
+
+ def test_parameters_list(self):
+ a = RdpClassifier()
+ parameters = a.Parameters.keys()
+ parameters.sort()
+ self.assertEqual(parameters, ['-Xmx', '-f', '-o', '-t'])
+
+ def test_assign_jvm_parameters(self):
+ """RdpCalssifier should pass alternate parameters to Java VM."""
+ app = RdpClassifier()
+ app.Parameters['-Xmx'].on('75M')
+ exp = ''.join([
+ 'cd "', getcwd(), '/"; java -Xmx75M -jar "',
+ self.user_rdp_jar_path, '" -q'])
+ self.assertEqual(app.BaseCommand, exp)
+
+ def test_basecommand_property(self):
+ """RdpClassifier BaseCommand property should use overridden method."""
+ app = RdpClassifier()
+ self.assertEqual(app.BaseCommand, app._get_base_command())
+
+ def test_base_command(self):
+ """RdpClassifier should return expected shell command."""
+ app = RdpClassifier()
+ exp = ''.join([
+ 'cd "', getcwd(), '/"; java -Xmx1000m -jar "',
+ self.user_rdp_jar_path, '" -q'])
+ self.assertEqual(app.BaseCommand, exp)
+
+ def test_change_working_dir(self):
+ """RdpClassifier should run program in expected working directory."""
+ test_dir = '/tmp/RdpTest'
+
+ app = RdpClassifier(WorkingDir=test_dir)
+ exp = ''.join([
+ 'cd "', test_dir, '/"; java -Xmx1000m -jar "',
+ self.user_rdp_jar_path, '" -q'])
+ self.assertEqual(app.BaseCommand, exp)
+
+ rmtree(test_dir)
+
+ def test_sample_fasta(self):
+ """RdpClassifier should classify its own sample data correctly"""
+ test_dir = '/tmp/RdpTest'
+ app = RdpClassifier(WorkingDir=test_dir)
+ _, output_fp = tempfile.mkstemp(dir=test_dir)
+ app.Parameters['-o'].on(output_fp)
+
+ results = app(StringIO(rdp_sample_fasta))
+
+ assignment_toks = results['Assignments'].readline().split('\t')
+
+ self.assertEqual(assignment_toks[0], 'X67228')
+ lineage = [x.strip('"') for x in assignment_toks[2::3]]
+ self.assertEqual(lineage, [
+ 'Root', 'Bacteria', 'Proteobacteria', 'Alphaproteobacteria',
+ 'Rhizobiales', 'Rhizobiaceae', 'Rhizobium'])
+ rmtree(test_dir)
+
+
+class RdpTrainerTests(TestCase):
+ """Tests of the trainer for the RdpClassifier app
+ """
+
+ def setUp(self):
+ self.reference_file = StringIO(rdp_training_sequences)
+ self.reference_file.seek(0)
+
+ self.taxonomy_file = tempfile.NamedTemporaryFile(
+ prefix="RdpTaxonomy", suffix=".txt")
+ self.taxonomy_file.write(rdp_training_taxonomy)
+ self.taxonomy_file.seek(0)
+
+ self.training_dir = tempfile.mkdtemp(prefix='RdpTrainer_output_')
+
+ def tearDown(self):
+ rmtree(self.training_dir)
+
+ def test_call(self):
+ app = RdpTrainer()
+ app.Parameters['taxonomy_file'].on(self.taxonomy_file.name)
+ app.Parameters['model_output_dir'].on(self.training_dir)
+ results = app(self.reference_file)
+
+ exp_file_list = [
+ 'bergeyTrainingTree.xml', 'genus_wordConditionalProbList.txt',
+ 'logWordPrior.txt', 'RdpClassifier.properties',
+ 'wordConditionalProbIndexArr.txt',
+ ]
+ obs_file_list = listdir(self.training_dir)
+ exp_file_list.sort()
+ obs_file_list.sort()
+ self.assertEqual(obs_file_list, exp_file_list)
+
+ autogenerated_headers = {
+ 'bergeyTree': 'bergeyTrainingTree',
+ 'probabilityList': 'genus_wordConditionalProbList',
+ 'wordPrior': 'logWordPrior',
+ 'probabilityIndex': 'wordConditionalProbIndexArr',
+ }
+ for id, basename in autogenerated_headers.iteritems():
+ obs_header = results[id].readline()
+ exp_header = exp_training_header_template % basename
+ self.assertEqual(exp_header, obs_header)
+
+
+class RdpWrapperTests(TestCase):
+ """ Tests of RDP classifier wrapper functions
+ """
+ def setUp(self):
+ self.num_trials = 10
+
+ self.test_input1 = rdp_test_fasta.split('\n')
+ self.expected_assignments1 = rdp_expected_out
+
+ # Files for training
+ self.reference_file = StringIO(rdp_training_sequences)
+ self.reference_file.seek(0)
+
+ self.taxonomy_file = StringIO(rdp_training_taxonomy)
+ self.taxonomy_file.seek(0)
+
+ self.training_dir = tempfile.mkdtemp(prefix='RdpTrainer_output_')
+
+ # Sequences for trained classifier
+ self.test_trained_input = rdp_trained_fasta.split("\n")
+
+ def tearDown(self):
+ rmtree(self.training_dir)
+
+ def test_parse_rdp_assignment(self):
+ seqid, direction, assignments = parse_rdp_assignment(
+ "X67228\t\t"
+ "Root\tnorank\t1.0\t"
+ "Bacteria\tdomain\t1.0\t"
+ "\"Proteobacteria\"\tphylum\t1.0\t"
+ "Alphaproteobacteria\tclass\t0.9\t"
+ "Rhizobiales\torder\t0.9\t"
+ "Rhizobiaceae\tfamily\t0.47\t"
+ "Rhizobium\tgenus\t0.46")
+ self.assertEqual(seqid, "X67228")
+
+ def test_assign_taxonomy_short_sequence(self):
+ """assign_taxonomy should return Unclassifiable if sequence is too short
+ """
+ assignments = assign_taxonomy([
+ '>MySeq 1',
+ 'TTCCGGTTGATCCTGCCGGACCCGACTGCTATCCGGA',
+ ])
+ self.assertEqual(assignments, {'MySeq 1': ('Unassignable', 1.0)})
+
+ def test_assign_taxonomy(self):
+ """assign_taxonomy wrapper functions as expected
+
+ This test may fail periodically, but failure should be rare.
+ """
+ unverified_seq_ids = set(self.expected_assignments1.keys())
+ for i in range(self.num_trials):
+ obs_assignments = assign_taxonomy(self.test_input1)
+ for seq_id in list(unverified_seq_ids):
+ obs_lineage, obs_confidence = obs_assignments[seq_id]
+ exp_lineage = self.expected_assignments1[seq_id]
+ if (obs_lineage == exp_lineage):
+ unverified_seq_ids.remove(seq_id)
+ if not unverified_seq_ids:
+ break
+
+ messages = []
+ for seq_id in unverified_seq_ids:
+ messages.append("Unable to verify %s trials" % self.num_trials)
+ messages.append(" Sequence ID: %s" % seq_id)
+ messages.append(" Expected: %s" % self.expected_assignments1[seq_id])
+ messages.append(" Observed: %s" % obs_assignments[seq_id][0])
+ messages.append(" Confidence: %s" % obs_assignments[seq_id][1])
+
+ # make sure all taxonomic results were correct at least once
+ self.assertFalse(unverified_seq_ids, msg='\n'.join(messages))
+
+ def test_assign_taxonomy_alt_confidence(self):
+ """assign_taxonomy wrapper functions as expected with alt confidence
+ """
+ obs_assignments = assign_taxonomy(
+ self.test_input1, min_confidence=0.95)
+
+ for seq_id, assignment in obs_assignments.items():
+ obs_lineage, obs_confidence = assignment
+ exp_lineage = self.expected_assignments1[seq_id]
+ message = "Sequence ID: %s, assignment: %s" % (seq_id, assignment)
+ self.assertTrue(
+ exp_lineage.startswith(obs_lineage) or \
+ (obs_lineage == "Unclassified"),
+ msg=message,
+ )
+ self.assertTrue(obs_confidence >= 0.95, msg=message)
+
+ def test_assign_taxonomy_file_output(self):
+ """ assign_taxonomy wrapper writes correct file output when requested
+
+ This test checks for successful completion of assign_taxonomy
+ when writing to a file, that roughly the right number of lines are
+ written (verified by zipping with the expected headers), and that
+ each line starts with the correct seq id. Actual testing of
+ taxonomy data is performed elsewhere.
+
+ """
+ _, output_fp = tempfile.mkstemp(prefix='RDPAssignTaxonomyTests',
+ suffix='.txt')
+ # convert the expected dict to a list of lines to match
+ # file output
+ expected_file_headers = self.expected_assignments1.keys()
+ expected_file_headers.sort()
+
+ actual_return_value = assign_taxonomy(\
+ self.test_input1,min_confidence=0.95,output_fp=output_fp)
+
+ actual_file_output = list(open(output_fp))
+ actual_file_output.sort()
+
+ # remove the output_fp before running the tests, so if they
+ # fail the output file is still cleaned-up
+ remove(output_fp)
+
+ # None return value on write to file
+ self.assertEqual(actual_return_value,None)
+
+ # check that each line starts with the correct seq_id -- not
+ # checking the taxonomies or confidences here as these are variable and
+ # tested elsewhere
+ for a,e in zip(actual_file_output,expected_file_headers):
+ self.assertTrue(a.startswith(e))
+
+ def test_train_rdp_classifier(self):
+ results = train_rdp_classifier(
+ self.reference_file, self.taxonomy_file, self.training_dir)
+
+ exp_file_list = [
+ 'bergeyTrainingTree.xml', 'genus_wordConditionalProbList.txt',
+ 'logWordPrior.txt', 'RdpClassifier.properties',
+ 'wordConditionalProbIndexArr.txt',
+ ]
+ obs_file_list = listdir(self.training_dir)
+ exp_file_list.sort()
+ obs_file_list.sort()
+ self.assertEqual(obs_file_list, exp_file_list)
+
+ autogenerated_headers = {
+ 'bergeyTree': 'bergeyTrainingTree',
+ 'probabilityList': 'genus_wordConditionalProbList',
+ 'wordPrior': 'logWordPrior',
+ 'probabilityIndex': 'wordConditionalProbIndexArr',
+ }
+ for id, basename in autogenerated_headers.iteritems():
+ obs_header = results[id].readline()
+ exp_header = exp_training_header_template % basename
+ self.assertEqual(exp_header, obs_header)
+
+ def test_train_rdp_classifier_and_assign_taxonomy(self):
+ obs = train_rdp_classifier_and_assign_taxonomy(self.reference_file,
+ self.taxonomy_file, self.test_trained_input, min_confidence=0.80,
+ model_output_dir=self.training_dir)
+ exp = {'X67228': (
+ 'Bacteria;Proteobacteria;Alphaproteobacteria;Rhizobiales;'
+ 'Rhizobiaceae;Rhizobium', 1.0
+ )}
+ self.assertEqual(obs, exp)
+
+ def test_train_rdp_classifier_and_assign_taxonomy_no_model_output(self):
+ obs = train_rdp_classifier_and_assign_taxonomy(
+ self.reference_file, self.taxonomy_file, self.test_trained_input)
+ exp = {'X67228': (
+ 'Bacteria;Proteobacteria;Alphaproteobacteria;Rhizobiales;'
+ 'Rhizobiaceae;Rhizobium', 1.0
+ )}
+ self.assertEqual(obs, exp)
+
+# Sample data copied from rdp_classifier-2.0, which is licensed under
+# the GPL 2.0 and Copyright 2008 Michigan State University Board of
+# Trustees
+
+rdp_training_sequences = """>X67228 Bacteria;Proteobacteria;Alphaproteobacteria;Rhizobiales;Rhizobiaceae;Rhizobium
+aacgaacgctggcggcaggcttaacacatgcaagtcgaacgctccgcaaggagagtggcagacgggtgagtaacgcgtgggaatctacccaaccctgcggaatagctctgggaaactggaattaataccgcatacgccctacgggggaaagatttatcggggatggatgagcccgcgttggattagctagttggtggggtaaaggcctaccaaggcgacgatccatagctggtctgagaggatgatcagccacattgggactgagacacggcccaaa
+>X73443 Bacteria;Firmicutes;Clostridia;Clostridiales;Clostridiaceae;Clostridium
+nnnnnnngagatttgatcctggctcaggatgaacgctggccggccgtgcttacacatgcagtcgaacgaagcgcttaaactggatttcttcggattgaagtttttgctgactgagtggcggacgggtgagtaacgcgtgggtaacctgcctcatacagggggataacagttagaaatgactgctaataccnnataagcgcacagtgctgcatggcacagtgtaaaaactccggtggtatgagatggacccgcgtctgattagctagttggtggggt
+>AB004750 Bacteria;Proteobacteria;Gammaproteobacteria;Enterobacteriales;Enterobacteriaceae;Enterobacter
+acgctggcggcaggcctaacacatgcaagtcgaacggtagcagaaagaagcttgcttctttgctgacgagtggcggacgggtgagtaatgtctgggaaactgcccgatggagggggataactactggaaacggtagctaataccgcataacgtcttcggaccaaagagggggaccttcgggcctcttgccatcggatgtgcccagatgggattagctagtaggtggggtaacggctcacctaggcgacgatccctagctggtctgagaggatgaccagccacactggaactgagacacggtccagactcctacgggaggcagcagtggggaatattgca
+>xxxxxx Bacteria;Proteobacteria;Gammaproteobacteria;Pseudomonadales;Pseudomonadaceae;Pseudomonas
+ttgaacgctggcggcaggcctaacacatgcaagtcgagcggcagcannnncttcgggaggctggcgagcggcggacgggtgagtaacgcatgggaacttacccagtagtgggggatagcccggggaaacccggattaataccgcatacgccctgagggggaaagcgggctccggtcgcgctattggatgggcccatgtcggattagttagttggtggggtaatggcctaccaaggcgacgatccgtagctggtctgagaggatgatcagccacaccgggactgagacacggcccggactcctacgggaggcagcagtggggaatattggacaatgggggcaaccctgatccagccatgccg
+>AB004748 Bacteria;Proteobacteria;Gammaproteobacteria;Enterobacteriales;Enterobacteriaceae;Enterobacter
+acgctggcggcaggcctaacacatgcaagtcgaacggtagcagaaagaagcttgcttctttgctgacgagtggcggacgggtgagtaatgtctgggaaactgcccgatggagggggataactactggaaacggtagctaataccgcataacgtcttcggaccaaagagggggaccttcgggcctcttgccatcggatgtgcccagatgggattagctagtaggtggggtaacggctcacctaggcgacgatccctagctggtctgagaggatgaccagccacactggaactgagacacggtccagactcctacgggaggcagcagtggggaatattgcacaatgggcgcaagcctgatgcagccatgccgcgtgtatgaagaaggccttcgggttg
+>AB000278 Bacteria;Proteobacteria;Gammaproteobacteria;Vibrionales;Vibrionaceae;Photobacterium
+caggcctaacacatgcaagtcgaacggtaanagattgatagcttgctatcaatgctgacgancggcggacgggtgagtaatgcctgggaatataccctgatgtgggggataactattggaaacgatagctaataccgcataatctcttcggagcaaagagggggaccttcgggcctctcgcgtcaggattagcccaggtgggattagctagttggtggggtaatggctcaccaaggcgacgatccctagctggtctgagaggatgatcagccacactggaactgagacacggtccagactcctacgggaggcagcagtggggaatattgcacaatgggggaaaccctgatgcagccatgccgcgtgta
+>AB000390 Bacteria;Proteobacteria;Gammaproteobacteria;Vibrionales;Vibrionaceae;Vibrio
+tggctcagattgaacgctggcggcaggcctaacacatgcaagtcgagcggaaacgantnntntgaaccttcggggnacgatnacggcgtcgagcggcggacgggtgagtaatgcctgggaaattgccctgatgtgggggataactattggaaacgatagctaataccgcataatgtctacggaccaaagagggggaccttcgggcctctcgcttcaggatatgcccaggtgggattagctagttggtgaggtaatggctcaccaaggcgacgatccctagctggtctgagaggatgatcagccacactggaactgag
+"""
+
+rdp_training_taxonomy = """\
+1*Bacteria*0*0*domain
+765*Firmicutes*1*1*phylum
+766*Clostridia*765*2*class
+767*Clostridiales*766*3*order
+768*Clostridiaceae*767*4*family
+769*Clostridium*768*5*genus
+160*Proteobacteria*1*1*phylum
+433*Gammaproteobacteria*160*2*class
+586*Vibrionales*433*3*order
+587*Vibrionaceae*586*4*family
+588*Vibrio*587*5*genus
+592*Photobacterium*587*5*genus
+552*Pseudomonadales*433*3*order
+553*Pseudomonadaceae*552*4*family
+554*Pseudomonas*553*5*genus
+604*Enterobacteriales*433*3*order
+605*Enterobacteriaceae*604*4*family
+617*Enterobacter*605*5*genus
+161*Alphaproteobacteria*160*2*class
+260*Rhizobiales*161*3*order
+261*Rhizobiaceae*260*4*family
+262*Rhizobium*261*5*genus"""
+
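+ # every training file written by RDP begins with this one-line header; %s is the file's basename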
+exp_training_header_template = "<trainsetNo>1</trainsetNo><version>version1</version><modversion>cogent</modversion><file>%s</file>\n"
+
+rdp_trained_fasta = """>X67228
+aacgaacgctggcggcaggcttaacacatgcaagtcgaacgctccgcaaggagagtggcagacgggtgagtaacgcgtgggaatctacccaaccctgcggaatagctctgggaaactggaattaataccgcatacgccctacgggggaaagatttatcggggatggatgagcccgcgttggattagctagttggtggggtaaaggcctaccaaggcgacgatccatagctggtctgagaggatgatcagccacattgggactgagacacggcccaaa
+"""
+
+rdp_sample_fasta = """>X67228 Bacteria;Proteobacteria;Alphaproteobacteria;Rhizobiales;Rhizobiaceae;Rhizobium
+aacgaacgctggcggcaggcttaacacatgcaagtcgaacgctccgcaaggagagtggcagacgggtgagtaacgcgtgggaatctacccaaccctgcggaatagctctgggaaactggaattaataccgcatacgccctacgggggaaagatttatcggggatggatgagcccgcgttggattagctagttggtggggtaaaggcctaccaaggcgacgatccatagctggtctgagaggatgatcagccacattgggactgagacacggcccaaa
+"""
+
+rdp_sample_classification = """>X67228 reverse=false
+Root; 1.0; Bacteria; 1.0; Proteobacteria; 1.0; Alphaproteobacteria; 1.0; Rhizobiales; 1.0; Rhizobiaceae; 1.0; Rhizobium; 0.95;
+"""
+
+rdp_test_fasta = """>AY800210 description field
+TTCCGGTTGATCCTGCCGGACCCGACTGCTATCCGGATGCGACTAAGCCATGCTAGTCTAACGGATCTTCGGATCCGTGGCATACCGCTCTGTAACACGTAGATAACCTACCCTGAGGTCGGGGAAACTCCCGGGAAACTGGGCCTAATCCCCGATAGATAATTTGTACTGGAATGTCTTTTTATTGAAACCTCCGAGGCCTCAGGATGGGTCTGCGCCAGATTATGGTCGTAGGTGGGGTAACGGCCCACCTAGCCTTTGATCTGTACCGGACATGAGAGTGTGTGCCGGGAGATGGCCACTGAGACAAGGGGCCAGGCCCTACGGGGCGCAGCAGGCGCGAAAACTTCACAATGCCCGCAAGGGTGATGAGGGTATCCGAGTGCTACCTTAGCCGGTAGCTTTTATTCAGTGTAAATAGCTAGATGAATAAGGGGAGGGCAAGGCTGGTGCCAGCCGCCGCGGTAAAACCAGCTCCCGAGTGGTCGGGAT [...]
+>EU883771
+TGGCGTACGGCTCAGTAACACGTGGATAACTTACCCTTAGGACTGGGATAACTCTGGGAAACTGGGGATAATACTGGATATTAGGCTATGCCTGGAATGGTTTGCCTTTGAAATGTTTTTTTTCGCCTAAGGATAGGTCTGCGGCTGATTAGGTCGTTGGTGGGGTAATGGCCCACCAAGCCGATGATCGGTACGGGTTGTGAGAGCAAGGGCCCGGAGATGGAACCTGAGACAAGGTTCCAGACCCTACGGGGTGCAGCAGGCGCGAAACCTCCGCAATGTACGAAAGTGCGACGGGGGGATCCCAAGTGTTATGCTTTTTTGTATGACTTTTCATTAGTGTAAAAAGCTTTTAGAATAAGAGCTGGGCAAGACCGGTGCCAGCCGCCGCGGTAACACCGGCAGCTCGAGTGGTGACCACTTTTATTGGGCTTAAAGCGTTCGTAGCTTGATTTTTAAGTCTCTTGGGAAATCTCACGGCTTAACTGTGAG [...]
+>EF503699
+AAGAATGGGGATAGCATGCGAGTCACGCCGCAATGTGTGGCATACGGCTCAGTAACACGTAGTCAACATGCCCAGAGGACGTGGACACCTCGGGAAACTGAGGATAAACCGCGATAGGCCACTACTTCTGGAATGAGCCATGACCCAAATCTATATGGCCTTTGGATTGGACTGCGGCCGATCAGGCTGTTGGTGAGGTAATGGCCCACCAAACCTGTAACCGGTACGGGCTTTGAGAGAAGGAGCCCGGAGATGGGCACTGAGACAAGGGCCCAGGCCCTATGGGGCGCAGCAGGCACGAAACCTCTGCAATAGGCGAAAGCTTGACAGGGTTACTCTGAGTGATGCCCGCTAAGGGTATCTTTTGGCACCTCTAAAAATGGTGCAGAATAAGGGGTGGGCAAGTCTGGTGTCAGCCGCCGCGGTAATACCAGCACCCCGAGTTGTCGGGACGATTATTGGGCCTAAAGCATCCGTAGCCTGTTCTGCAAG [...]
+>random_seq
+AAGCTCCGTCGCGTGAGCTAAAAACCATGCTGACTTATGAGACCTAAAAGCGATGCGCCGACCTGACGATGCTCTGTTCAGTTTCATCACGATCACCGGTAGTCAGGGTACCCTCCAGACCGCGCATAGTGACTATGTTCCCGCACCTGTATATGTAATTCCCATTATACGTCTACGTTATGTAGTAAAGTTGCTCACGCCAGGCACAGTTTGTCTTGATACATAGGGTAGCTTAAGTCCCGTCCATTTCACCGCGATTGTAATAGACGAATCAGCAGTGGTGCAATCAAGTCCCAACAGTTATATTTCAAAAATCTTCCGATAGTCGTGGGCGAAGTTGTCAACCTACCTACCATGGCTATAAGGCCCAGTTTACTTCAGTTGAACGTGACGGTAACCCTACTGAGTGCACGATACCTGCTCAACAACGGCCCAAAACCCGTGCGACACATTGGGCACTACAATAATCTTAGAGGACCATGGATCTGGTGG [...]
+>DQ260310
+GATACCCCCGGAAACTGGGGATTATACCGGATATGTGGGGCTGCCTGGAATGGTACCTCATTGAAATGCTCCCGCGCCTAAAGATGGATCTGCCGCAGAATAAGTAGTTTGCGGGGTAAATGGCCACCCAGCCAGTAATCCGTACCGGTTGTGAAAACCAGAACCCCGAGATGGAAACTGAAACAAAGGTTCAAGGCCTACCGGGCACAACAAGCGCCAAAACTCCGCCATGCGAGCCATCGCGACGGGGGAAAACCAAGTACCACTCCTAACGGGGTGGTTTTTCCGAAGTGGAAAAAGCCTCCAGGAATAAGAACCTGGGCCAGAACCGTGGCCAGCCGCCGCCGTTACACCCGCCAGCTCGAGTTGTTGGCCGGTTTTATTGGGGCCTAAAGCCGGTCCGTAGCCCGTTTTGATAAGGTCTCTCTGGTGAAATTCTACAGCTTAACCTGTGGGAATTGCTGGAGGATACTATTCAAGCTTGAAGCCGGG [...]
+>EF503697
+TAAAATGACTAGCCTGCGAGTCACGCCGTAAGGCGTGGCATACAGGCTCAGTAACACGTAGTCAACATGCCCAAAGGACGTGGATAACCTCGGGAAACTGAGGATAAACCGCGATAGGCCAAGGTTTCTGGAATGAGCTATGGCCGAAATCTATATGGCCTTTGGATTGGACTGCGGCCGATCAGGCTGTTGGTGAGGTAATGGCCCACCAAACCTGTAACCGGTACGGGCTTTGAGAGAAGTAGCCCGGAGATGGGCACTGAGACAAGGGCCCAGGCCCTATGGGGCGCAGCAGGCGCGAAACCTCTGCAATAGGCGAAAGCCTGACAGGGTTACTCTGAGTGATGCCCGCTAAGGGTATCTTTTGGCACCTCTAAAAATGGTGCAGAATAAGGGGTGGGCAAGTCTGGTGTCAGCCGCCGCGGTAATACCAGCACCCCGAGTTGTCGGGACGATTATTGGGCCTAAAGCATCCGTAGCCTGTTCTGCAAG [...]
+>short_seq
+TAAAATGACTAGCCTGCGAGTCAC
+"""
+
+rdp_expected_out = {
+ 'AY800210 description field': 'Archaea;Euryarchaeota',
+ 'EU883771': 'Archaea;Euryarchaeota;Methanomicrobia;Methanomicrobiales;Methanomicrobiaceae;Methanomicrobium',
+ 'EF503699': 'Archaea;Crenarchaeota;Thermoprotei',
+ 'random_seq': 'Bacteria',
+ 'DQ260310': 'Archaea;Euryarchaeota;Methanobacteria;Methanobacteriales;Methanobacteriaceae;Methanosphaera',
+ 'EF503697': 'Archaea;Crenarchaeota;Thermoprotei',
+ 'short_seq': 'Unassignable',
+ }
+
+if __name__ == '__main__':
+ main()
diff --git a/bfillings/tests/test_rtax.py b/bfillings/tests/test_rtax.py
new file mode 100644
index 0000000..572caac
--- /dev/null
+++ b/bfillings/tests/test_rtax.py
@@ -0,0 +1,228 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+from unittest import TestCase, main
+from tempfile import mkstemp
+
+from skbio.util import remove_files
+
+from bfillings.rtax import Rtax, assign_taxonomy
+
+
+class RtaxClassifierTests(TestCase):
+ """ Tests of the RTAX classifier module """
+
+ def setUp(self):
+ self.maxDiff = None
+
+ _, self.id_to_taxonomy_fp = mkstemp(prefix='RtaxTaxonAssignerTests_',
+ suffix='.txt')
+ _, self.input_seqs_fp = mkstemp(prefix='RtaxTaxonAssignerTests_',
+ suffix='.fasta')
+ _, self.reference_seqs_fp = mkstemp(prefix='RtaxTaxonAssignerTests_',
+ suffix='.fasta')
+ _, self.read_1_seqs_fp = mkstemp(prefix='RtaxTaxonAssignerTests_',
+ suffix='.fasta')
+ _, self.read_2_seqs_fp = mkstemp(prefix='RtaxTaxonAssignerTests_',
+ suffix='.fasta')
+
+ self._paths_to_clean_up = [self.id_to_taxonomy_fp,self.input_seqs_fp,self.reference_seqs_fp, self.read_1_seqs_fp,self.read_2_seqs_fp]
+
+ a = open(self.id_to_taxonomy_fp,'w')
+ a.write(rtax_reference_taxonomy)
+ a.close()
+ b = open(self.reference_seqs_fp,'w')
+ b.write(rtax_reference_fasta)
+ b.close()
+ c = open(self.input_seqs_fp,'w')
+ c.write(rtax_test_repset_fasta)
+ c.close()
+ d = open(self.read_1_seqs_fp,'w')
+ d.write(rtax_test_read1_fasta)
+ d.close()
+ e = open(self.read_2_seqs_fp,'w')
+ e.write(rtax_test_read2_fasta)
+ e.close()
+
+ def tearDown(self):
+ remove_files(set(self._paths_to_clean_up),error_on_missing=False)
+
+ def test_paired_end_classification(self):
+ self._paths_to_clean_up += cleanAll(self.read_1_seqs_fp)
+ self._paths_to_clean_up += cleanAll(self.read_2_seqs_fp)
+ result = assign_taxonomy(self.input_seqs_fp, self.reference_seqs_fp, self.id_to_taxonomy_fp, self.read_1_seqs_fp, self.read_2_seqs_fp,single_ok=False,header_id_regex="\\S+\\s+(\\S+?)\/")
+ self.assertEqual(result, rtax_expected_result_paired)
+
+ def test_paired_end_classification_with_fallback(self):
+ self._paths_to_clean_up += cleanAll(self.read_1_seqs_fp)
+ self._paths_to_clean_up += cleanAll(self.read_2_seqs_fp)
+ result = assign_taxonomy(self.input_seqs_fp, self.reference_seqs_fp, self.id_to_taxonomy_fp, self.read_1_seqs_fp, self.read_2_seqs_fp,single_ok=True,header_id_regex="\\S+\\s+(\\S+?)\/")
+ self.assertEqual(result, rtax_expected_result_paired_with_fallback)
+
+ def test_single_end_classification(self):
+ self._paths_to_clean_up += cleanAll(self.read_1_seqs_fp)
+ result = assign_taxonomy(self.input_seqs_fp, self.reference_seqs_fp, self.id_to_taxonomy_fp, self.read_1_seqs_fp, None ,header_id_regex="\\S+\\s+(\\S+?)\/")
+ self.assertEqual(result, rtax_expected_result_single)
+
+ # I'd like to add tests here that involve the TOOMANYHITS case. However, that requires either a reference
+ # database with >16,000 sequences, which we don't have handy for tests, or adjusting the maxMaxAccepts
+ # parameter to rtaxSearch.pl. The "rtax" wrapper shell script currently doesn't allow setting that option,
+ # and I'd prefer to leave it as is unless someone actually wants to use it. Thus the TOOMANYHITS situation
+ # is not easily testable at the moment.
+
+
+def cleanAll(path):
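+ # rtax leaves DBM-style index files (.pos.*, .lines.*) beside each read file; list them all so they can be removed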
+ return [path, path + ".pos.db", path + ".pos.dir", path + ".pos.pag", path + ".lines.db", path + ".lines.dir", path + ".lines.pag"]
+
+
+# sample data copied from GreenGenes
+
+
+rtax_reference_taxonomy = """508720 99.0 k__Bacteria p__Actinobacteria c__Actinobacteria o__Actinomycetales f__Propionibacteriaceae g__Propionibacterium s__Propionibacterium acnes
+508050 99.0 k__Bacteria p__Proteobacteria c__Betaproteobacteria o__Burkholderiales f__Comamonadaceae g__Diaphorobacter s__
+502492 99.0 k__Bacteria p__Proteobacteria c__Betaproteobacteria o__Burkholderiales f__ g__Aquabacterium s__
+"""
+
+rtax_reference_fasta = """>508720
+GACGAACGCTGGCGGCGTGCTTAACACATGCAAGTCGAACGGAAAGGCCCTGCTTTTGTGGGGTGCTCGAGTGGCGAACG
+GGTGAGTAACACGTGAGTAACCTGCCCTTGACTTTGGGATAACTTCAGGAAACTGGGGCTAATACCGGATAGGAGCTCCT
+GCTGCATGGTGGGGGTTGGAAAGTTTCGGCGGTTGGGGATGGACTCGCGGCTTATCAGCTTGTTGGTGGGGTAGTGGCTT
+ACCAAGGCTTTGACGGGTAGCCGGCCTGAGAGGGTGACCGGCCACATTGGGACTGAGATACGGCCCAGACTCCTACGGGA
+GGCAGCAGTGGGGAATATTGCACAATGGGCGGAAGCCTGATGCAGCAACGCCGCGTGCGGGATGACGGCCTTCGGGTTGT
+AAACCGCTTTCGCCTGTGACGAAGCGTGAGTGACGGTAATGGGTAAAGAAGCACCGGCTAACTACGTGCCAGCAGCCGCG
+GTGATACGTAGGGTGCGAGCGTTGTCCGGATTTATTGGGCGTAAAGGGCTCGTAGGTGGTTGATCGCGTCGGAAGTGTAA
+TCTTGGGGCTTAACCCTGAGCGTGCTTTCGATACGGGTTGACTTGAGGAAGGTAGGGGAGAATGGAATTCCTGGTGGAGC
+GGTGGAATGCGCAGATATCAGGAGGAACACCAGTGGCGAAGGCGGTTCTCTGGGCCTTTCCTGACGCTGAGGAGCGAAAG
+CGTGGGGAGCGAACAGGCTTAGATACCCTGGTAGTCCACGCTGTAAACGGTGGGTACTAGGTGTGGGGTCCATTCCACGG
+GTTCCGTGCCGTAGCTAACGCTTTAAGTACCCCGCCTGGGGAGTACGGCCGCAAGGCTAAAACTCAAAGGAATTGACGGG
+GCCCCGCACAAGCGGCGGAGCATGCGGATTAATTCGATGCAACGCGTAGAACCTTACCTGGGTTTGACATGGATCGGGAG
+TGCTCAGAGATGGGTGTGCCTCTTTTGGGGTCGGTTCACAGGTGGTGCATGGCTGTCGTCAGCTCGTGTCGTGAGATGTT
+GGGTTAAGTCCCGCAACGAGCGCAACCCTTGTTCACTGTTGCCAGCACGTTATGGTGGGGACTCAGTGGAGACCGCCGGG
+GTCAACTCGGAGGAAGGTGGGGATGACGTCAAGTCATCATGCCCCTTATGTCCAGGGCTTCACGCATGCTACAATGGCTG
+GTACAGAGAGTGGCGAGCCTGTGAGGGTGAGCGAATCTCGGAAAGCCGGTCTCAGTTCGGATTGGGGTCTGCAACTCGAC
+CTCATGAAGTCGGAGTCGCTAGTAATCGCAGATCAGCAACGCTGCGGTGAATACGTTCCCGGGGCT
+>508050
+ATTGAACGCTGGCGGCATGCCTTACACATGCAAGTCGAACGGTAACAGGTCTTCGGATGCTGACGAGTGGCGAACGGGTG
+AGTAATACATCGGAACGTGCCCGATCGTGGGGGATAACGAGGCGAAAGCTTTGCTAATACCGCATACGATCTACGGATGA
+AAGCGGGGGATCTTCGGACCTCGCGCGGACGGAGCGGCCGATGGCAGATTAGGTAGTTGGTGGGATAAAAGCTTACCAAG
+CCGACGATCTGTAGCTGGTCTGAGAGGATGATCAGCCACACTGGGACTGAGACACGGCCCAGACTCCTACGGGAGGCAGC
+AGTGGGGAATTTTGGACAATGGGCGAAAGCCTGATCCAGCCATGCCGCGTGCAGGATGAAGGCCTTCGGGTTGTAAACTG
+CTTTTGTACGGAACGAAAAGCCTCTTTCTAATAAAGAGGGGTCATGACGGTACCGTAAGAATAAGCACCGGCTAACTACG
+TGCCAGCAGCCGCGGTAATACGTAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGTGCGCAGGCGGTTTTGTA
+AGACAGAGGTGAAATCCCCGGGCTCAACCTGGGAACTGCCTTTGTGACTGCAAGGCTGGAGTGCGGCAGAGGGGGATGGA
+ATTCCGCGTGTAGCAGTGAAATGCGTAGATATGCGGAGGAACACCGATGGCGAAGGCAATCCCCTGGGCCTGCACTGACG
+CTCATGCACGAAAGCGTGGGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCCTAAACGATGTCAACTGGTTGTTG
+GGTCTTCACTGACTCAGTAACGAAGCTAACGCGTGAAGTTGACCGCCTGGGGAGTACGGCCGCAAGGTTGAAACTCAAAG
+GAATTGACGGGGACCCGCACAAGCGGTGGATGATGTGGTTTAATTCGATGCAACGCGAAAAACCTTACCCACCTTTGACA
+TGGCAGGAAGTTTCCAGAGATGGATTCGTGCCCGAAAGGGAACCTGCACACAGGTGCTGCATGGCTGTCGTCAGCTCGTG
+TCGTGAGATGTTGGGTTAAGTCCCGCAACGAGCGCAACCCTTGCCATTAGTTGCTACGAAAGGGCACTCTAATGGGACTG
+CCGGTGACAAACCGGAGGAAGGTGGGGATGACGTCAAGTCCTCATGGCCCTTATAGGTGGGGCTACACACGTCATACAAT
+GGCTGGTACAGAGGGTTGCCAACCCGCGAGGGGGAGCTAATCCCATAAAGCCAGTCGTAGTCCGGATCGCAGTCTGCAAC
+TCGACTGCGTGAAGTCGGAATCGCTAGTAATCGCGGATCAGAATGTCGCGGTGAATACGTTCCCGGGTCT
+>502492
+ATTGAACGCTGGCGGCATGCCTTACACATGCAAGTCGAACGGTAACGGGTCCTTCGGGATGCCGACGAGTGGCGAACGGG
+TGAGTAATATATCGGAACGTGCCCAGTAGTGGGGGATAACTGCTCGAAAGAGCAGCTAATACCGCATACGACCTGAGGGT
+GAAAGGGGGGGATCGCAAGACCTCTCGCTATTGGAGCGGCCGATATCAGATTAGCTAGTTGGTGGGGTAAAGGCCTACCA
+AGGCAACGATCTGTAGTTGGTCTGAGAGGACGACCAGCCACACTGGGACTGAGACACGGCCCAGACTCCTACGGGAGGCA
+GCAGTGGGGAATTTTGGACAATGGGCGCAAGCCTGATCCAGCAATGCCGCGTGCAGGAAGAAGGCCTTCGGGTTGTAAAC
+TGCTTTTGTCAGGGAAGAAATCTTCTGGGCTAATACCCCGGGAGGATGACGGTACCTGAAGAATAAGCACCGGCTAACTA
+CGTGCCAGCAGCCGCGGTAATACGTAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGTGCGCAGGCGGCTTTG
+CAAGACAGATGTGAAATCCCCGGGCTCAACCTGGGAACTGCATTTGTGACTGCAAGGCTAGAGTACGGCAGAGGGGGATG
+GAATTCCGCGTGTAGCAGTGAAATGCGTAGATATGCGGAGGAACACCAATGGCGAAGGCAATCCCCTGGGCCTGTACTGA
+CGCTCATGCACGAAAGCGTGGGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCCTAAACGATGTCAACTGGTTGT
+TGGACGGCTTGCTGTTCAGTAACGAAGCTAACGCGTGAAGTTGACCGCCTGGGGAGTACGGCCGCAAGGTTGAAACTCAA
+AGGAATTGACGGGGACCCGCACAAGCGGTGGATGATGTGGTTTAATTCGATGCAACGCGAAAAACCTTACCTACCCTTGA
+CATGTCAAGAATTCTGCAGAGATGTGGAAGTGCTCGAAAGAGAACTTGAACACAGGTGCTGCATGGCCGTCGTCAGCTCG
+TGTCGTGAGATGTTGGGTTAAGTCCCGCAACGAGCGCAACCCTTGTCATTAGTTGCTACGCAAGAGCACTCTAATGAGAC
+TGCCGGTGACAAACCGGAGGAAGGTGGGGATGACGTCAGGTCCTCATGGCCCTTATGGGTAGGGCTACACACGTCATACA
+ATGGCCGGTACAGAGGGCTGCCAACCCGCGAGGGGGAGCCAATCCCAGAAAACCGGTCGTAGTCCGGATCGTAGTCTGCA
+ACTCGACTGCGTGAAGTCGGAATCGCTAGTAATCGCGGATCAGCTTGCCGCGGTGAATACGTTCCCGGGTCT
+"""
+
+
+rtax_test_repset_fasta = """>clusterIdA splitRead1IdA
+ACCAAGGCTTTGACGGGTAGCCGGCCTGAGTGGGTGACCGGCCACATTGGGACTGAGATACGGCCCAGACTCCTACGGGA
+>clusterIdB splitRead1IdB
+CCGACGATCTGTAGCTGGTCTGAGAGGATGTTCAGCCACACTGGGACTGAGACACGGCCCAGACTCCTACGGGAGGCAGC
+>clusterIdC splitRead1IdC
+AGGCAACGATCTGTAGTTGGTCTGAGAGGAGGACCAGCCACACTGGGACTGAGACACGGCCCAGACTCCTACGGGAGGCA
+>clusterIdD splitRead1IdD
+AGGCAACGATCTGTAGTTGGTCTGAGAGGAGGACCAGCCACACTGGGACGGGGGGGGGGCCCAGACTCCTACGGGAGGCA
+"""
+
+
+# These reads are the 4th and 14th lines from the reference seqs, with one
+# nucleotide changed in each, except D and E, which are unique to one read
+# file or the other, and F and G, which are just decoys.
+
+rtax_test_read1_fasta = """>splitRead1IdA ampliconId_34563456/1
+ACCAAGGCTTTGACGGGTAGCCGGCCTGAGTGGGTGACCGGCCACATTGGGACTGAGATACGGCCCAGACTCCTACGGGA
+>splitRead1IdB ampliconId_12341234/1
+CCGACGATCTGTAGCTGGTCTGAGAGGATGTTCAGCCACACTGGGACTGAGACACGGCCCAGACTCCTACGGGAGGCAGC
+>splitRead1IdC ampliconId_23452345/1
+AGGCAACGATCTGTAGTTGGTCTGAGAGGAGGACCAGCCACACTGGGACTGAGACACGGCCCAGACTCCTACGGGAGGCA
+>splitRead1IdD ampliconId_45674567/1
+AGGCAACGATCTGTAGTTGGTCTGAGAGGAGGACCAAAAAAAAAAAGACTGAGACACGGCCCAGACTCCTACGGGAGGCA
+>splitRead1IdF ampliconId_56785678/1
+AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+"""
+
+rtax_test_read2_fasta = """>splitRead2IdA ampliconId_34563456/3
+GGGTTAAGTCCCGCAACGAGCGCAACCCTTATTCACTGTTGCCAGCACGTTATGGTGGGGACTCAGTGGAGACCGCCGGG
+>splitRead2IdB ampliconId_12341234/3
+TCGTGAGATGTTGGGTTAAGTCCCGCAACGTGCGCAACCCTTGCCATTAGTTGCTACGAAAGGGCACTCTAATGGGACTG
+>splitRead2IdC ampliconId_23452345/3
+TGTCGTGAGATGTTGGGTTAAGTCCCGCAAAGAGCGCAACCCTTGTCATTAGTTGCTACGCAAGAGCACTCTAATGAGAC
+>splitRead2IdE ampliconId_67896789/3
+TGTCGTGAGATGTTGGGTTAAAAAAAAAAAAAAACGCAACCCTTGTCATTAGTTGCTACGCAAGAGCACTCTAATGAGAC
+>splitRead2IdG ampliconId_78907890/3
+TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT
+"""
+
+
+rtax_expected_result_paired = {
+ 'clusterIdA splitRead1IdA': ('k__Bacteria; p__Actinobacteria; c__Actinobacteria; o__Actinomycetales; f__Propionibacteriaceae; g__Propionibacterium; s__Propionibacterium acnes', 1.0),
+ 'clusterIdB splitRead1IdB': ('k__Bacteria; p__Proteobacteria; c__Betaproteobacteria; o__Burkholderiales; f__Comamonadaceae; g__Diaphorobacter; s__', 1.0),
+ 'clusterIdC splitRead1IdC': ('k__Bacteria; p__Proteobacteria; c__Betaproteobacteria; o__Burkholderiales; f__; g__Aquabacterium; s__', 1.0),
+ 'clusterIdD splitRead1IdD': ('NOMATEPAIR', 1.0),
+ }
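+# clusterIdD maps to NOMATEPAIR above because splitRead1IdD's amplicon ID
+# (45674567) never occurs in the read 2 file; with single_ok=True the same
+# cluster falls back to single-read classification, as in the next dict.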
+
+rtax_expected_result_paired_with_fallback = {
+ 'clusterIdA splitRead1IdA': ('k__Bacteria; p__Actinobacteria; c__Actinobacteria; o__Actinomycetales; f__Propionibacteriaceae; g__Propionibacterium; s__Propionibacterium acnes', 1.0),
+ 'clusterIdB splitRead1IdB': ('k__Bacteria; p__Proteobacteria; c__Betaproteobacteria; o__Burkholderiales; f__Comamonadaceae; g__Diaphorobacter; s__', 1.0),
+ 'clusterIdC splitRead1IdC': ('k__Bacteria; p__Proteobacteria; c__Betaproteobacteria; o__Burkholderiales; f__; g__Aquabacterium; s__', 1.0),
+ 'clusterIdD splitRead1IdD': ('k__Bacteria; p__Proteobacteria; c__Betaproteobacteria; o__Burkholderiales; f__; g__Aquabacterium; s__', 1.0),
+ }
+
+rtax_expected_result_single = {
+ 'clusterIdA splitRead1IdA': ('k__Bacteria; p__Actinobacteria; c__Actinobacteria; o__Actinomycetales; f__Propionibacteriaceae; g__Propionibacterium; s__Propionibacterium acnes', 1.0),
+ 'clusterIdB splitRead1IdB': ('k__Bacteria; p__Proteobacteria; c__Betaproteobacteria; o__Burkholderiales; f__Comamonadaceae; g__Diaphorobacter; s__', 1.0),
+ 'clusterIdC splitRead1IdC': ('k__Bacteria; p__Proteobacteria; c__Betaproteobacteria; o__Burkholderiales; f__; g__Aquabacterium; s__', 1.0),
+ 'clusterIdD splitRead1IdD': ('k__Bacteria; p__Proteobacteria; c__Betaproteobacteria; o__Burkholderiales; f__; g__Aquabacterium; s__', 1.0),
+ }
+
+if __name__ == "__main__":
+ main()
diff --git a/bfillings/tests/test_sortmerna_v2.py b/bfillings/tests/test_sortmerna_v2.py
new file mode 100644
index 0000000..1945ebe
--- /dev/null
+++ b/bfillings/tests/test_sortmerna_v2.py
@@ -0,0 +1,855 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+"""
+Unit tests for the SortMeRNA version 2.0 Application controller
+===============================================================
+"""
+
+
+from unittest import TestCase, main
+import re
+from os import close
+from os.path import abspath, exists, join, dirname
+from tempfile import mkstemp, mkdtemp
+from shutil import rmtree
+
+from skbio.util import remove_files
+from skbio.parse.sequences import parse_fasta
+
+from bfillings.sortmerna_v2 import (build_database_sortmerna,
+ sortmerna_ref_cluster,
+ sortmerna_map)
+
+
+# Test class and cases
+class SortmernaV2Tests(TestCase):
+ """ Tests for SortMeRNA version 2.0 functionality """
+
+ def setUp(self):
+ self.output_dir = mkdtemp()
+ self.reference_seq_fp = reference_seqs_fp
+ self.read_seqs_fp = read_seqs_fp
+
+ # create temporary file with reference sequences defined
+ # in reference_seqs_fp
+ f, self.file_reference_seq_fp = mkstemp(prefix='temp_references_',
+ suffix='.fasta')
+ close(f)
+
+ # write _reference_ sequences to tmp file
+        with open(self.file_reference_seq_fp, 'w') as tmp:
+            tmp.write(self.reference_seq_fp)
+
+ # create temporary file with read sequences defined in read_seqs_fp
+ f, self.file_read_seqs_fp = mkstemp(prefix='temp_reads_',
+ suffix='.fasta')
+ close(f)
+
+ # write _read_ sequences to tmp file
+        with open(self.file_read_seqs_fp, 'w') as tmp:
+            tmp.write(self.read_seqs_fp)
+
+ # list of files to remove
+ self.files_to_remove = [self.file_reference_seq_fp,
+ self.file_read_seqs_fp]
+
+ def tearDown(self):
+ remove_files(self.files_to_remove)
+ rmtree(self.output_dir)
+
+ def test_indexdb_default_param(self):
+ """ Test indexing a database using SortMeRNA
+ """
+ sortmerna_db, db_files_to_remove = build_database_sortmerna(
+ abspath(self.file_reference_seq_fp),
+ max_pos=250,
+ output_dir=self.output_dir)
+
+ expected_db_files = set(sortmerna_db + ext
+ for ext in ['.bursttrie_0.dat', '.kmer_0.dat',
+ '.pos_0.dat', '.stats'])
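+        # indexdb_rna is expected to write four index parts per database:
+        # a burst trie, a k-mer table, k-mer positions and a stats file
+        # (hence the four extensions above).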
+
+ # Make sure all db_files exist
+ for fp in expected_db_files:
+ self.assertTrue(exists(fp))
+
+        # Add the database files to be removed
+ self.files_to_remove.extend(db_files_to_remove)
+
+ def test_empty_fasta_path(self):
+ """ Indexdb should fail with an empty fasta path
+ """
+ self.assertRaises(ValueError,
+ build_database_sortmerna,
+ fasta_path=None,
+ max_pos=250,
+ output_dir=self.output_dir)
+
+ def test_empty_inputs(self):
+ """ (1) Indexdb should set output_dir to the same directory
+ as where the input FASTA file is located;
+ (2) SortMeRNA should fail if an empty result path is
+ passed;
+ (3) SortMeRNA should fail if an empty seq path is passed
+ """
+ sortmerna_db, db_files_to_remove = build_database_sortmerna(
+ abspath(self.file_reference_seq_fp),
+ max_pos=250,
+ output_dir=None)
+
+ self.files_to_remove.extend(db_files_to_remove)
+
+ fasta_dir = dirname(abspath(self.file_reference_seq_fp))
+ out_dir = dirname(sortmerna_db)
+
+ self.assertEqual(fasta_dir, out_dir)
+
+ self.assertRaises(ValueError,
+ sortmerna_ref_cluster,
+ seq_path=self.file_read_seqs_fp,
+ sortmerna_db=sortmerna_db,
+ refseqs_fp=self.file_reference_seq_fp,
+ result_path=None)
+
+ self.assertRaises(ValueError,
+ sortmerna_ref_cluster,
+ seq_path=None,
+ sortmerna_db=sortmerna_db,
+ refseqs_fp=self.file_reference_seq_fp,
+ result_path=join(self.output_dir,
+ "sortmerna_otus.txt"))
+
+ def test_tabular_output(self):
+ """ SortMeRNA should output a BLAST tabular output
+ """
+ sortmerna_db, db_files_to_remove = build_database_sortmerna(
+ abspath(self.file_reference_seq_fp),
+ max_pos=250,
+ output_dir=self.output_dir)
+
+ self.files_to_remove.extend(db_files_to_remove)
+
+ # Run SortMeRNA
+ clusters, failures, smr_files_to_remove = sortmerna_ref_cluster(
+ seq_path=self.file_read_seqs_fp,
+ sortmerna_db=sortmerna_db,
+ refseqs_fp=self.file_reference_seq_fp,
+ result_path=join(self.output_dir, "sortmerna_otus.txt"),
+ tabular=True)
+
+ self.assertTrue(exists(join(self.output_dir,
+ "sortmerna_otus.blast")))
+
+    def test_empty_sortmerna_db(self):
+        """ SortMeRNA should fail with an empty indexed database
+ """
+ self.assertRaises(ValueError,
+ sortmerna_ref_cluster,
+ seq_path=self.file_read_seqs_fp,
+ sortmerna_db=None,
+ refseqs_fp=self.file_reference_seq_fp,
+ result_path=join(self.output_dir,
+ "sortmerna_otus.txt")
+ )
+
+ def test_sortmerna_default_param(self):
+ """ SortMeRNA version 2.0 reference OTU picking works with default settings
+ """
+ # rebuild the index
+ sortmerna_db, db_files_to_remove = build_database_sortmerna(
+ abspath(self.file_reference_seq_fp),
+ max_pos=250,
+ output_dir=self.output_dir)
+
+ # Files created by indexdb_rna to be deleted
+ self.files_to_remove.extend(db_files_to_remove)
+
+ # Run SortMeRNA
+ cluster_map, failures, smr_files_to_remove = sortmerna_ref_cluster(
+ seq_path=self.file_read_seqs_fp,
+ sortmerna_db=sortmerna_db,
+ refseqs_fp=self.file_reference_seq_fp,
+ result_path=join(self.output_dir, "sortmerna_otus.txt"))
+
+ # Check all sortmerna output files exist
+ output_files = [join(self.output_dir, ext)
+ for ext in ['sortmerna_otus_otus.txt',
+ 'sortmerna_otus.log',
+ 'sortmerna_otus_denovo.fasta',
+ 'sortmerna_otus.fasta']]
+
+ # Check output files exist
+ for fp in output_files:
+ self.assertTrue(exists(fp))
+
+        # Files created by sortmerna to be deleted (stderr and stdout
+        # were already removed in sortmerna_ref_cluster)
+ self.files_to_remove.extend(output_files)
+
+ # Random reads that should not appear in any output file
+ random_reads = ['simulated_random_reads.fa.000000000',
+ 'simulated_random_reads.fa.000000001',
+ 'simulated_random_reads.fa.000000002',
+ 'simulated_random_reads.fa.000000003',
+ 'simulated_random_reads.fa.000000004',
+ 'simulated_random_reads.fa.000000005',
+ 'simulated_random_reads.fa.000000006',
+ 'simulated_random_reads.fa.000000007',
+ 'simulated_random_reads.fa.000000008',
+ 'simulated_random_reads.fa.000000009']
+
+ # Reads passing E-value threshold and with similarity/coverage >=97%
+ otu_reads = ['HMPMockV1.2.Staggered2.673827_47',
+ 'HMPMockV1.2.Staggered2.673827_115',
+ 'HMPMockV1.2.Staggered2.673827_122',
+ 'HMPMockV1.2.Staggered2.673827_161',
+ 'HMPMockV1.2.Staggered2.673827_180',
+ 'HMPMockV1.2.Staggered2.673827_203',
+ 'HMPMockV1.2.Staggered2.673827_207',
+ 'HMPMockV1.2.Staggered2.673827_215',
+ 'HMPMockV1.2.Staggered2.673827_218',
+ 'HMPMockV1.2.Staggered2.673827_220']
+
+ # Reads passing E-value threshold and with similarity/coverage <97%
+ denovo_reads = ['HMPMockV1.2.Staggered2.673827_0',
+ 'HMPMockV1.2.Staggered2.673827_1',
+ 'HMPMockV1.2.Staggered2.673827_2',
+ 'HMPMockV1.2.Staggered2.673827_3',
+ 'HMPMockV1.2.Staggered2.673827_4',
+ 'HMPMockV1.2.Staggered2.673827_5',
+ 'HMPMockV1.2.Staggered2.673827_6',
+ 'HMPMockV1.2.Staggered2.673827_7',
+ 'HMPMockV1.2.Staggered2.673827_8',
+ 'HMPMockV1.2.Staggered2.673827_9']
+
+ # Check correct number of OTU clusters in file
+ otu_clusters = ['295053']
+
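+        # output_files order: [0] OTU map, [1] log, [2] de novo FASTA,
+        # [3] aligned FASTA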
+ f_aligned = open(output_files[3], "U")
+ f_otumap = open(output_files[0], "U")
+ f_denovo = open(output_files[2], "U")
+
+ # Verify the aligned FASTA file
+ for label, seq in parse_fasta(f_aligned):
+            seq_id = label.split()[0]
+            # Read is not random
+            self.assertNotIn(seq_id, random_reads)
+            # Read is either in otu_reads or denovo_reads
+            self.assertIn(seq_id, otu_reads + denovo_reads)
+ f_aligned.close()
+
+ # Verify the de novo reads FASTA file
+ for label, seq in parse_fasta(f_denovo):
+            seq_id = label.split()[0]
+            # Read is not random
+            self.assertNotIn(seq_id, random_reads)
+            # Read is not an OTU read
+            self.assertNotIn(seq_id, otu_reads)
+            # Read is a de novo read
+            self.assertIn(seq_id, denovo_reads)
+ f_denovo.close()
+
+ # Check the OTU map
+ for line in f_otumap:
+ otu_entry = line.split()
+ # Cluster ID is correct
+ self.assertIn(otu_entry[0], otu_clusters)
+ # Each read in the cluster must exclusively be an OTU read
+ for read in otu_entry[1:]:
+ self.assertNotIn(read, random_reads)
+ self.assertNotIn(read, denovo_reads)
+ self.assertIn(read, otu_reads)
+ f_otumap.close()
+
+ # Check returned list of lists of clusters
+ expected_cluster = ['HMPMockV1.2.Staggered2.673827_47',
+ 'HMPMockV1.2.Staggered2.673827_115',
+ 'HMPMockV1.2.Staggered2.673827_122',
+ 'HMPMockV1.2.Staggered2.673827_161',
+ 'HMPMockV1.2.Staggered2.673827_180',
+ 'HMPMockV1.2.Staggered2.673827_203',
+ 'HMPMockV1.2.Staggered2.673827_207',
+ 'HMPMockV1.2.Staggered2.673827_215',
+ 'HMPMockV1.2.Staggered2.673827_218',
+ 'HMPMockV1.2.Staggered2.673827_220']
+
+ # Should only have 1 cluster
+ self.assertEqual(1, len(cluster_map))
+ for actual_cluster in cluster_map.itervalues():
+ actual_cluster.sort()
+ expected_cluster.sort()
+ self.assertEqual(actual_cluster, expected_cluster)
+
+ # Check log file number of clusters and failures corresponds to
+ # the results in the output files
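+        # (the log is assumed to contain summary lines of the form
+        # "    Total OTUs = 1" and "    Total reads for de novo
+        # clustering = 10"; the value is whatever follows " = ")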
+ f_log = open(output_files[1], "U")
+ num_clusters = 0
+ num_failures = 0
+ for line in f_log:
+ if line.startswith(" Total OTUs"):
+ num_clusters = (re.split(' = ', line)[1]).strip()
+ elif line.startswith(" Total reads for de novo clustering"):
+ num_failures = (re.split(' = ', line)[1]).strip()
+ f_log.close()
+
+ self.assertEqual(int(num_clusters), len(otu_clusters))
+ self.assertEqual(int(num_failures), len(denovo_reads))
+
+ def test_sortmerna_map_default(self):
+ """ SortMeRNA version 2.0 for mapping sequences onto a reference
+ using default parameters
+ """
+
+ # Rebuild the index
+ sortmerna_db, db_files_to_remove = build_database_sortmerna(
+ abspath(self.file_reference_seq_fp),
+ max_pos=250,
+ output_dir=self.output_dir)
+
+ # Files created by indexdb_rna to be deleted
+ self.files_to_remove.extend(db_files_to_remove)
+
+ # Run SortMeRNA mapper
+ app_result = sortmerna_map(seq_path=self.file_read_seqs_fp,
+ output_dir=self.output_dir,
+ refseqs_fp=self.file_reference_seq_fp,
+ sortmerna_db=sortmerna_db)
+
+ # Check all sortmerna output files exist
+ output_files = [join(self.output_dir, ext)
+ for ext in ['sortmerna_map.blast',
+ 'sortmerna_map.log']]
+
+ # Check output files exist
+ for fp in output_files:
+ self.assertTrue(exists(fp))
+
+ blast_alignments_fp = app_result['BlastAlignments'].name
+
+ # Check there are 30 alignments (1 per read)
+ with open(blast_alignments_fp, 'U') as blast_actual:
+ entries = (line.strip().split('\t') for line in blast_actual)
+ actual_alignments = {r[0]: r[1:] for r in entries}
+
+ self.assertEqual(30, len(actual_alignments))
+
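+        # Each dict value holds the remaining tab-separated columns: index
+        # 1 is the percent identity and index 12 is assumed to be
+        # SortMeRNA's appended query-coverage column in its extended
+        # BLAST-like tabular format.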
+ # Check this alignment exists
+ self.assertTrue("HMPMockV1.2.Staggered2.673827_47"
+ in actual_alignments)
+ self.assertEqual("97.3", actual_alignments[
+ "HMPMockV1.2.Staggered2.673827_47"][1])
+ self.assertEqual("100", actual_alignments[
+ "HMPMockV1.2.Staggered2.673827_47"][12])
+
+ # Check alignment for random read is NULL
+ self.assertTrue("simulated_random_reads.fa.000000000"
+ in actual_alignments)
+ self.assertEqual("*", actual_alignments[
+ "simulated_random_reads.fa.000000000"][0])
+
+ def test_sortmerna_map_sam_alignments(self):
+ """ SortMeRNA version 2.0 for mapping sequences onto a reference
+ outputting Blast and SAM alignments
+ """
+
+ # Rebuild the index
+ sortmerna_db, db_files_to_remove = build_database_sortmerna(
+ abspath(self.file_reference_seq_fp),
+ max_pos=250,
+ output_dir=self.output_dir)
+
+ # Files created by indexdb_rna to be deleted
+ self.files_to_remove.extend(db_files_to_remove)
+
+ # Run SortMeRNA mapper
+ app_result = sortmerna_map(seq_path=self.file_read_seqs_fp,
+ output_dir=self.output_dir,
+ refseqs_fp=self.file_reference_seq_fp,
+ sortmerna_db=sortmerna_db,
+ output_sam=True)
+
+ # Check all sortmerna output files exist
+ output_files = [join(self.output_dir, ext)
+ for ext in ['sortmerna_map.blast',
+ 'sortmerna_map.sam',
+ 'sortmerna_map.log']]
+
+ # Check output files exist
+ for fp in output_files:
+ self.assertTrue(exists(fp))
+
+ sam_alignments_fp = app_result['SAMAlignments'].name
+
+        # Check the SAM output holds 30 alignments (1 per read) plus headers
+ with open(sam_alignments_fp, 'U') as sam_actual:
+ entries = (line.strip().split('\t') for line in sam_actual)
+ actual_alignments = {r[0]: r[1:] for r in entries}
+
+ # 30 alignments expected + 2 lines for @HD and @PG fields
+ self.assertEqual(32, len(actual_alignments))
+
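+        # After the QNAME key, index 1 is the SAM RNAME column (the
+        # reference hit) and index 10 lands on the first optional tag,
+        # here the AS:i alignment score.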
+ # Check this alignment exists
+ self.assertTrue("HMPMockV1.2.Staggered2.673827_47"
+ in actual_alignments)
+ self.assertEqual("295053", actual_alignments[
+ "HMPMockV1.2.Staggered2.673827_47"][1])
+ self.assertEqual("AS:i:418", actual_alignments[
+ "HMPMockV1.2.Staggered2.673827_47"][10])
+
+ # Check alignment for random read is NULL
+ self.assertTrue("simulated_random_reads.fa.000000000"
+ in actual_alignments)
+ self.assertEqual("*", actual_alignments[
+ "simulated_random_reads.fa.000000000"][1])
+
+ def test_sortmerna_map_sam_alignments_with_tags(self):
+ """ SortMeRNA version 2.0 for mapping sequences onto a reference
+ outputting SAM alignments with @SQ tags
+ """
+
+ # Rebuild the index
+ sortmerna_db, db_files_to_remove = build_database_sortmerna(
+ abspath(self.file_reference_seq_fp),
+ max_pos=250,
+ output_dir=self.output_dir)
+
+ # Files created by indexdb_rna to be deleted
+ self.files_to_remove.extend(db_files_to_remove)
+
+ # Run SortMeRNA mapper
+ app_result = sortmerna_map(seq_path=self.file_read_seqs_fp,
+ output_dir=self.output_dir,
+ refseqs_fp=self.file_reference_seq_fp,
+ sortmerna_db=sortmerna_db,
+ output_sam=True,
+ sam_SQ_tags=True,
+ blast_format=None)
+
+ # Check all sortmerna output files exist
+ output_files = [join(self.output_dir, ext)
+ for ext in ['sortmerna_map.sam',
+ 'sortmerna_map.log']]
+
+ # Check output files exist
+ for fp in output_files:
+ self.assertTrue(exists(fp))
+
+ sam_alignments_fp = app_result['SAMAlignments'].name
+
+        # Check the SAM output holds 30 alignments (1 per read) plus headers
+ with open(sam_alignments_fp, 'U') as sam_actual:
+ actual_entries = [line.strip().split('\t') for line in sam_actual]
+
+ # 30 alignments expected + 2 lines for @HD and @PG fields + 5 lines
+ # for the @SQ tags
+ self.assertEqual(37, len(actual_entries))
+
+ # Check all expected @SQ tags have been included
+ SQ_array = [['@SQ', 'SN:42684', 'LN:1501'],
+ ['@SQ', 'SN:342684', 'LN:1486'],
+ ['@SQ', 'SN:426848', 'LN:1486'],
+ ['@SQ', 'SN:295053', 'LN:1389'],
+ ['@SQ', 'SN:879972', 'LN:1371']]
+ for entry in SQ_array:
+ self.assertTrue(entry in actual_entries)
+
+ def test_sortmerna_map_blast_no_null_alignments(self):
+ """ SortMeRNA version 2.0 for mapping sequences onto a reference
+ using Blast with --print_all_reads option set to False
+ (no NULL alignments output)
+ """
+
+ # Rebuild the index
+ sortmerna_db, db_files_to_remove = build_database_sortmerna(
+ abspath(self.file_reference_seq_fp),
+ max_pos=250,
+ output_dir=self.output_dir)
+
+ # Files created by indexdb_rna to be deleted
+ self.files_to_remove.extend(db_files_to_remove)
+
+ # Run SortMeRNA mapper
+ app_result = sortmerna_map(seq_path=self.file_read_seqs_fp,
+ output_dir=self.output_dir,
+ refseqs_fp=self.file_reference_seq_fp,
+ sortmerna_db=sortmerna_db,
+ print_all_reads=False)
+
+ # Check all sortmerna output files exist
+ output_files = [join(self.output_dir, ext)
+ for ext in ['sortmerna_map.blast',
+ 'sortmerna_map.log']]
+
+ # Check output files exist
+ for fp in output_files:
+ self.assertTrue(exists(fp))
+
+ blast_alignments_fp = app_result['BlastAlignments'].name
+
+        # Check there are 20 alignments (the random reads produce none)
+ with open(blast_alignments_fp, 'U') as blast_actual:
+ entries = (line.strip().split('\t') for line in blast_actual)
+ actual_alignments = {r[0]: r[1:] for r in entries}
+
+ self.assertEqual(20, len(actual_alignments))
+
+ # Check this alignment exists
+ self.assertTrue("HMPMockV1.2.Staggered2.673827_47"
+ in actual_alignments)
+ self.assertEqual("97.3", actual_alignments[
+ "HMPMockV1.2.Staggered2.673827_47"][1])
+ self.assertEqual("100", actual_alignments[
+ "HMPMockV1.2.Staggered2.673827_47"][12])
+
+ # Check alignment for random read does not exist
+ self.assertFalse("simulated_random_reads.fa.000000000"
+ in actual_alignments)
+
+ def test_sortmerna_map_num_alignments(self):
+ """ SortMeRNA version 2.0 for mapping sequences onto a reference
+ outputting first INT num_alignments passing the E-value threshold
+ (rather than first INT best alignments)
+ """
+
+ # Rebuild the index
+ sortmerna_db, db_files_to_remove = build_database_sortmerna(
+ abspath(self.file_reference_seq_fp),
+ max_pos=250,
+ output_dir=self.output_dir)
+
+ # Files created by indexdb_rna to be deleted
+ self.files_to_remove.extend(db_files_to_remove)
+
+ # Run SortMeRNA mapper
+ app_result = sortmerna_map(seq_path=self.file_read_seqs_fp,
+ output_dir=self.output_dir,
+ refseqs_fp=self.file_reference_seq_fp,
+ sortmerna_db=sortmerna_db,
+ num_alignments=1)
+
+ # Check all sortmerna output files exist
+ output_files = [join(self.output_dir, ext)
+ for ext in ['sortmerna_map.blast',
+ 'sortmerna_map.log']]
+
+ # Check output files exist
+ for fp in output_files:
+ self.assertTrue(exists(fp))
+
+ blast_alignments_fp = app_result['BlastAlignments'].name
+
+ # Check there are 30 alignments (1 per read)
+ with open(blast_alignments_fp, 'U') as blast_actual:
+ entries = (line.strip().split('\t') for line in blast_actual)
+ actual_alignments = {r[0]: r[1:] for r in entries}
+
+ self.assertEqual(30, len(actual_alignments))
+
+ # Check this alignment exists
+ self.assertTrue("HMPMockV1.2.Staggered2.673827_47"
+ in actual_alignments)
+ self.assertEqual("97.3", actual_alignments[
+ "HMPMockV1.2.Staggered2.673827_47"][1])
+ self.assertEqual("100", actual_alignments[
+ "HMPMockV1.2.Staggered2.673827_47"][12])
+
+ # Check alignment for random read is NULL
+ self.assertTrue("simulated_random_reads.fa.000000000"
+ in actual_alignments)
+ self.assertEqual("*", actual_alignments[
+ "simulated_random_reads.fa.000000000"][0])
+
+ def test_blast_or_sam(self):
+ """ SortMeRNA should fail with output_sam and blast_format both
+ set to False
+ """
+ # Rebuild the index
+ sortmerna_db, db_files_to_remove = build_database_sortmerna(
+ abspath(self.file_reference_seq_fp),
+ max_pos=250,
+ output_dir=self.output_dir)
+
+ # Files created by indexdb_rna to be deleted
+ self.files_to_remove.extend(db_files_to_remove)
+
+ self.assertRaises(ValueError,
+ sortmerna_map,
+ seq_path=self.file_read_seqs_fp,
+ output_dir=self.output_dir,
+ refseqs_fp=self.file_reference_seq_fp,
+ sortmerna_db=sortmerna_db,
+ output_sam=False,
+ blast_format=None)
+
+ def test_best_or_num_alignments(self):
+ """ SortMeRNA should fail with "best" and "num_alignments" both
+ set to True
+ """
+ # Rebuild the index
+ sortmerna_db, db_files_to_remove = build_database_sortmerna(
+ abspath(self.file_reference_seq_fp),
+ max_pos=250,
+ output_dir=self.output_dir)
+
+ # Files created by indexdb_rna to be deleted
+ self.files_to_remove.extend(db_files_to_remove)
+
+ self.assertRaises(ValueError,
+ sortmerna_map,
+ seq_path=self.file_read_seqs_fp,
+ output_dir=self.output_dir,
+ refseqs_fp=self.file_reference_seq_fp,
+ sortmerna_db=sortmerna_db,
+ best=1,
+ num_alignments=1)
+
+
+# Reference sequence database
+reference_seqs_fp = """>426848
+AGAGTTTGATCCTGGCTCAGGATGAACGCTAGCGGCAGGCTTAATACATGCAAGTCGAGGGGCAGCACTGGTAGCAATAC
+CTGGTGGCGACCGGCGGACGGGTGCGTAACACGTATGCAACCTACCCTGTACAGGGGGATAGCCCGAGGAAATTCGGATT
+AATACCCCATACGATAAGAATCGGCATCGATTTTTATTGAAAGCTCCGGCGGTACAGGATGGGCATGCGCCCCATTAGCT
+AGTTGGTGAGGTAACGGCTCACCAAGGCTACGATGGGTAGGGGGCCTGAGAGGGTGATCCCCCACACTGGAACTGAGACA
+CGGTCCAGACTCCTACGGGAGGCAGCAGTAAGGAATATTGGTCAATGGGCGCAAGCCTGAACCAGCCATGCCGCGTGCAG
+GAAGACTGCCATTATGGTTGTAAACTGCTTTTATATGGGAAGAAACCTCCGGACGTGTCCGGAGCTGACGGTACCATGTG
+AATAAGGATCGGCTAACTCCGTGCCAGCAGCCGCGGTAATACGGAGGATCCAAGCGTTATCCGGATTTATTGGGTTTAAA
+GGGTGCGTAGGCGGCGTGTTAAGTCAGAGGTGAAATTCGGCAGCTCAACTGTCAAATTGCCTTTGATACTGGCACACTTG
+AATGCGATTGAGGTAGGCGGAATGTGACATGTAGCGGTGAAATGCTTAGACATGTGACAGAACACCGATTGCGAAGGCAG
+CTTACCAAGTCGTTATTGACGCTGAGGCACGAAAGCGTGGGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTA
+AACGATGATAACTCGACGTTAGCGATACACTGTTAGCGTCCAAGCGAAAGCGTTAAGTTATCCACCTGGGAAGTACGATC
+GCAAGGTTGAAACTCAAAGGAATTGACGGGGGCCCGCACAAGCGGTGGAGCATGTGGTTTAATTCGATGATACGCGAGGA
+ACCTTACCAGGGCTTAAATGGGGAACGACCTTCTGGGAAACCAGAATTTCTTTTAGACGGTCCTCAAGGTGCTGCATGGT
+TGTCGTCAGCTCGTGCCGTGAGGTGTTGGGTTAAGTCCCGCAACGAGCGCAACCCCTACTGTTAGTTGCCAGCGGATAAT
+GCCGGGGACTCTAGCGGAACTGCCTGTGCAAACAGAGAGGAAGGTGGGGATGACGTCAAATCATCACGGCCCTTACGTCC
+TGGGCTACACACGTGCTACAATGGCCGGTACAGAGGGCAGCCACTTCGTGAGAAGGAGCGAATCCTTAAAGCCGGTCTCA
+GTTCGGATTGTAGTCTGCAACTCGACTACATGAAGCTGGAATCGCTAGTAATCGCGTATCAGCCATGACGCGGTGAATAC
+GTTCCCGGGCCTTGTACACACCGCCCGTCAAGCCATGGGAATTGGGAGTACCTAAAGTCGGTAACCGCAAGGAGCCGCCT
+AAGGTAATACCAGTGACTGGGGCTAAGTCGTAACAAGGTAGCCGTA
+>42684
+AGAGTTTGATCCTGGCTCAGATTGAACGCTGGCGGCATGCTTTACACATGCAAGTCGGACGGCAGCACAGAGGAGCTTGC
+TTCTTGGGTGGCGAGTGGCGAACGGGTGAGTGACGCATCGGAACGTACCGAGTAATGGGGGATAACTGTCCGAAAGGACA
+GCTAATACCGCATACGCCCTGAGGGGGAAAGCGGGGGATCTTAGGACCTCGCGTTATTCGAGCGGCCGATGTCTGATTAG
+CTGGTTGGCGGGGTAAAGGCCCACCAAGGCGACGATCAGTAGCGGGTCTGAGAGGATGATCCGCCACACTGGGACTGAGA
+CACGGCCCAGACTCCTACGGGAGGCAGCAGTGGGGAATTTTGGACAATGGGCGCAAGCCTGATCCAGCCATGCCGCGTGT
+CTGAAGAAGGCCTTCGGGTTGTAAAGGACTTTTGTCAGGGAAGAAAAGGAACGTGTTAATACCATGTTCTGATGACGGTA
+CCTGAAGAATAAGCACCGGCTAACTACGTGCCAGCAGCCGCGGTAATACGTAGGGTGCGAGCGTTAATCGGAATTACTGG
+GCGTAAAGCGGGCGCAGACGGTTACTTAAGCGGGATGTGAAATCCCCGGGCTCAACCCGGGAACTGCGTTCCGAACTGGG
+TGGCTAGAGTGTGTCAGAGGGGGGTAGAATTCCACGTGTAGCAGTGAAATGCGTAGAGATGTGGAGGAATACCGATGGCG
+AAGGCAGCCCCCTGGGATAACACTGACGTTCATGCCCGAAAGCGTGGGTAGCAAACAGGGTTAGATACCCTGGTAGTCCA
+CGCCCTAAACGATGTCGATTAGCTGTTGGGGCACTTGATGCCTTAGTAGCGTAGCTAACGCGTGAAATCGACCGCCTGGG
+GAGTACGGTCGCAAGATTAAAACTCAAAGGAATTGACGGGGACCCGCACAAGCGGTGGATGATGTGGATTAATTCGATGC
+AACGCGAAGAACCTTACCTGGTCTTGACATGTACGGAATCTTCCAGAGACGGAAGGGTGCCTTCGGGAGCCGTAACACAG
+GTGCTGCATGGCTGTCGTCAGCTCGTGTCGTGAGATGTTGGGTTAAGTCCCGCAACGAGCGCAACCCTTGTCATTAGTTG
+CCATCACTTGGTTGGGCACTCTAATGAGACTGCCGGTGACAAACCGGAGGAAGGTGGGGATGACGTCAAGTCCTCATGGC
+CCTTATGACCAGGGCTTCACACGTCATACAATGGTCGGTACAGAGGGTAGCCAAGCCGCGAGGCGGAGCCAATCCCAGAA
+AACCGATCGTAGTCCGGATTGCACTCTGCAACTCGAGTGCATGAAGTCGGAATCGCTAGTAATCGCAGGTCAGCATACTG
+CGGTGAATACGTTCCCGGGTCTTGTACACACCGCCCGTCACACCATGGGAGTGGGGGATACCAGAAGCAGGTAGGCTAAC
+CGCAAGGAGGCCGCTTGCCACGGTATGCTTCATGACTGGGGTGAAGTCGTAACAAGGTAAC
+>342684
+AGAGTTTGATCCTGGCTCAGGATGAACGCTAGCGGCAGGCTTAACACATGCAAGTCGAGGGGCATCGCGGGTAGCAATAC
+CTGGCGGCGACCGGCGGAAGGGTGCGTAACGCGTGAGCGACATACCCGTGACAGGGGGATAACAGATGGAAACGTCTCCT
+AATACCCCATAAGATCATATATCGCATGGTATGTGATTGAAAGGTGAGAACCGGTCACGGATTGGCTCGCGTCCCATCAG
+GTAGACGGCGGGGCAGCGGCCCGCCGTGCCGACGACGGGTAGGGGCTCTGAGAGGAGTGACCCCCACAATGGAACTGAGA
+CACGGTCCATACTCCTACGGGAGGCAGCAGTGAGGAATATTGGTCAATGGGCGGAAGCCTGAACCAGCCATGCCGCGTGC
+GGGAGGACGGCCCTATGGGTTGTAAACCGCTTTTGAGTGAGAGCAATAAGGTTCACGTGTGGACCGATGAGAGTATCATT
+CGAATAAGCATCGGCTAACTCCGTGCCAGCAGCCGCGGTAATACGGAGGATGCGAGCGTTATCCGGATTCATTGGGTTTA
+AAGGGTGCGTAGGCGGACATGTAAGTCCGAGGTGAAAGACCGGGGCCCAACCCCGGGGTTGCCTCGGATACTGTGTGTCT
+GGAGTGGACGTGCCGCCGGGGGAATGAGTGGTGTAGCGGTGAAATGCATAGATGTCACTCAGAACACCGATTGCGAAGGC
+ACCTGGCGAATGTCTTACTGACGCTGAGGCACGAAAGCGTGGGGATCGAACAGGATTAGATACCCTGGTAGTCCACGCAG
+TAAACGATGATGGCTGTCCGTTCGCTCCGATAGGAGTGAGTAGACAAGCGAAAGCGCTAAGCCATCCACCTGGGGAGTAC
+GGCCGCAAGGCTGAAACTCAAAGGAATTGACGGGGGCCCGCACAAGCGGAGGAACATGTGGTTTAATTCGATGATACGCG
+AGGAACCTTACCCGGGCTCGAACGGCAGGTGAACGATGCAGAGATGCAAAGGCCCTTCGGGGCGTCTGTCGAGGTGCTGC
+ATGGTTGTCGTCAGCTCGTGCCGTGAGGTGTCGGCTCAAGTGCCATAACGAGCGCAACCCTTGCCTGCAGTTGCCATCGG
+GTAAAGCCGGGGACTCTGCAGGGACTGCCACCGCAAGGTGAGAGGAGGGGGGGGATGACGTCAAATCAGCACGGCCCTTA
+CGTCCGGGGCGACACACGTGTTACAATGGCGGCCACAGCGGGAAGCCACCCAGTGATGGGGCGCGGATCCCAAAAAAGCC
+GCCTCAGTTCGGATCGGAGTCTGCAACCCGACTCCGTGAAGCTGGATTCGCTAGTAATCGCGCATCAGCCATGGCGCGGT
+GAATACGTTCCCGGGCCTTGTACACACCGCCCGTCAAGCCATGGGAGTCGTGGGCGCCTGAAGGCCGTGACCGCGAGGAG
+CGGCCTAGGGCGAACGCGGTGACTGGGGCTAAGTCGTAACAAGGTA
+>295053
+AGAGTTTGATCCTGGCTCAGGACGAACGCTGGCGGCGTGCCTAACACATGCAAGTCGAACGGAGATGCTCCTTCGGGAGT
+ATCTTAGTGGCGAACGGGTGAGTAACGCGTGAGCAACCTGACCTTCACAGGGGGATAACCGCTGGAAACAGCAGCTAATA
+CCGCATAACGTCGCAAGACCAAAGAGGGGGACCTTCGGGCCTCTTGCCATCGGATGTGCCCAGATGGGATTAGCTTGTTG
+GTGGGGTAACGGCTCACCAAGGCGACGATCCCTAGCTGGTCTGAGAGGATGACCAGCCACACTGGAACTGAGACACGGTC
+CAGACTCCTACGGGAGGCAGCAGTGGGGAATATTGCACAATGGGCGCAAGCCTGATGCAGCCATGCCGCGTGTATGAAGA
+AGGCCTTCGGGTTGTAAAGTACTTTCAGCGGGGAGGAAGGGAGTAAAGTTAATACCTTTGCTCATTGACGTTACCCGCAG
+AAGAAGCACCGGCTAACTCCGTGCCAGCAGCCGCGGTAATACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAA
+GCGCACGCAGGCGGTTTGTTAAGTCAGATGTGAAATCCCCGGGCTCAACCTGGGAACTGCATCTGATACTGGCAAGCTTG
+AGTCTCGTAGAGGGGGGTAGAATTCCAGGTGTAGCGGTGAAATGCGTAGAGATCTGGAGGAATACCGGTGGCGAAGGCGG
+CCCCCTGGACGAAGACTGACGCTCAGGTGCGAAAGCGTGGGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTA
+AACGATGTCGACTTGGAGGTTGTGCCCTTGAGGCGTGGCTTCCGGAGCTAACGCGTTAAGTCGACCGCCTGGGGAGTACG
+GCCGCAAGGTTAAAACTCAAATGAATTGACGGGGGCCCGCACAAGCGGTGGAGCATGTGGTTTAATTCGATGCAACGCGA
+AGAACCTTACCTGGTCTTGACATCCACAGAACTTTCCAGAGATGGATTGGTGCCTTCGGGAACTGTGAGACAGGTGCTGC
+ATGGCTGTCGTCAGCTCGTGTTGTGAAATGTTGGGTTAAGTCCCGCAACGAGCGCAACCCTTGTCCTTTGTTGCCAGCGG
+TCCGGCCGGGAACTCAAAGGAGACTGCCAGTGATAAACTGGAGGAAGGTGGGGATGACGTCAAGTCATCATGGCCCTTAC
+GACCAGGGCTACACACGTGCTACAATGGCGCATACAAAGAGAAGCGACCTCGCGAGAGCAAGCGGACCTCATAAAGTGCG
+TCGTAGTCCGGATTGGAGTCTGCAACTCGACTCCATGAAGTCGGAATCGCTAGTAATCGTGGATCAGAATGCCACGGTGA
+ATACGTTCCCGGGCCTTGCACACACCGCC
+>879972
+GACGAACGCTGGCGGCGTGCCTAATACATGCAAGTCGAACGAGATTGACCGGTGCTTGCACTGGTCAATCTAGTGGCGAA
+CGGGTGAGTAACACGTGGGTAACCTGCCCATCAGAGGGGGATAACATTCGGAAACGGATGCTAAAACCGCATAGGTCTTC
+GAACCGCATGGTTTGAAGAGGAAAAGAGGCGCAAGCTTCTGCTGATGGATGGACCCGCGGTGTATTAGCTAGTTGGTGGG
+GTAACGGCTCACCAAGGCGACGATACATAGCCGACCTGAGAGGGTGATCGGCCACACTGGGACTGAGACACGGCCCAGAC
+TCCTACGGGAGGCAGCAGTAGGGAATCTTCGGCAATGGACGGAAGTCTGACCGAGCAACGCCGCGTGAGTGAAGAAGGTT
+TTCGGATCGTAAAGCTCTGTTGTAAGAGAAGAACGAGTGTGAGAGTGGAAAGTTCACACTGTGACGGTATCTTACCAGAA
+AGGGACGGCTAACTACGTGCCAGCAGCCGCGGTAATACGTAGGTCCCGAGCGTTGTCCGGATTTATTGGGCGTAAAGCGA
+GCGCAGGCGGTTAGATAAGTCTGAAGTTAAAGGCTGTGGCTTAACCATAGTACGCTTTGGAAACTGTTTAACTTGAGTGC
+AAGAGGGGAGAGTGGAATTCCATGTGTAGCGGTGAAATGCGTAGATATATGGAGGAACACCGGTGGCGAAAGCGGCTCTC
+TGGCTTGTAACTGACGCTGAGGCTCGAAAGCGTGGGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGA
+TGAGTGCTAGGTGTTAGACCCTTTCCGGGGTTTAGTGCCGCAGCTAACGCATTAAGCACTCCGCCTGGGGAGTACGACCG
+CAGGGTTGAAACTCAAAGGAATTGACGGGGGCCCGCACAAGCGGTGGAGCATGTGGTTTAATTCGAAGCAACGCGAAGAA
+CCTTACCAGGTCTTGACATCCCTCTGACCGCTCTAGAGATAGAGCTTTCCTTCGGGACAGAGGTGACAGGTGGTGCATGG
+TTGTCGTCAGCTCGTGTCGTGAGATGTTGGGTTAAGTCCCGCAACGAGCGCAACCCCTATTGTTAGTTGCCATCATTCAG
+TTGGGCACTCTAGCGAGACTGCCGGTAATAAACCGGAGGAAGGTGGGGATGACGTCAAATCATCATGCCCCTTATGACCT
+GGGCTACACACGTGCTACAATGGCTGGTACAACGAGTCGCAAGCCGGTGACGGCAAGCTAATCTCTTAAAGCCAGTCTCA
+GTTCGGATTGTAGGCTGCAACTCGCCTACATGAAGTCGGAATCGCTAGTAATCGCGGATCAGCACGCCGCGGTGAATACG
+TTCCCGGGCCT
+"""
+
+# Reads to search against the database
+# - 10 rRNA reads: amplicon reads were taken from Qiime study 1685
+# - 10 random reads: simulated using mason with the following command:
+# mason illumina -N 10 -snN -o simulated_random_reads.fa -n
+# 150 random.fasta
+# - 10 rRNA reads with id < 97: amplicon reads were taken from
+# Qiime study 1685
+read_seqs_fp = """>HMPMockV1.2.Staggered2.673827_47 M141:79:749142:1:1101:16169:1589
+TACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGCAAGCAGGCGGTTTGTTAAGTCAGATGTGAAATCCC
+CGGGCTCAACCTGGGAACTGCATTTGATACTGGCAAGCTTGAGTCTCGTAGAGGAGGGTAGAATTCCAGGTGTAGCGGGG
+AAATGCGTAGAGATCTGGAGGAATACCGGTGGCGAAGGCGGCTCCATGGACGAAGACTGACGCT
+>HMPMockV1.2.Staggered2.673827_115 M141:79:749142:1:1101:14141:1729
+TACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGCACGCAGGCGGTTTGTTAAGTCAGATGTGAAATCCC
+CCGGCTCAACCTTGGAACTGCATCTGATACGGGCAAGCTTGAGTCTCGTAGAGGGGGGTAGAATTCCAGGTGTAGCGGTG
+AAATGCGTAGAGATCTGGAGGAATACCGGTGGCGAAGGCGGCCCTCTGGACGAAGACTGACGCTCAGGTGCGAAAGCGTG
+GGGAGCAAACA
+>HMPMockV1.2.Staggered2.673827_122 M141:79:749142:1:1101:16032:1739
+TACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGCACGCAGGCGGTTTGTTAAGTCAGATGTGAAATCCC
+CGGGCTCAACCTGGGAACTGCATCTGATACTGGCAAGCTTGAGTCTCGTAGAGGGGGGTAGAATTCCAGGTGTAGCGGTG
+AAATGCGTAGAGATCTGGAGGAATACCGGTGGCGAAGGCGGCCCCCTGGACGAAGACTGACGCTCAGGTGCGAAAGCGTG
+GTGATCAAACA
+>HMPMockV1.2.Staggered2.673827_161 M141:79:749142:1:1101:17917:1787
+TACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGCACGCAGGCGGTTTGTTAAGTCAGATGTGAAATCCC
+CGGGCTCAACCTGGGAACTGCATCTGATACTGGCAAGCTTGAGTCTCGTAGAGGGGGGTAGAATTCCAGGTGTAGCGGTG
+AAATGCGTAGAGATCTGGAGGAATACCGGTGGCGAAGGCGGCTCCCTGGACGAAGACTGACGCTCAGGTGCGAAAGCGTG
+GGGAGCAAACA
+>HMPMockV1.2.Staggered2.673827_180 M141:79:749142:1:1101:16014:1819
+TACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGCACGCAGGTGGTTTGTTAAGTCAGATGTGAAATCCC
+CGGGCTCAACCTGGGAACTGCATCTGATACTGGCAAGCTTGAGTCTCGTAGAGGGGGGTAGAATTCCAGGTGTAGCGGTG
+AAATGCGTAGAGATCTGGAGGAATACCGGTGGCGAAGGCGGCCCCCTGGACGAAGACTGACGCTCAGGTGCGAAAGCGTG
+>HMPMockV1.2.Staggered2.673827_203 M141:79:749142:1:1101:17274:1859
+TACGGAGGTTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGCACGCAGGCGGTTTGTTAAGTCAGATGTGAAATCCC
+CCGGCTCAACCTGGGAACTGCATCTGATACTGGCAAGCTTGAGTCTCGTAGAGGGGGGTAGAATTCCAGGTGTAGCGGTG
+AAATGCGTAGAGATCTGGAGGAATACCGGTGGCGAAGGCGGCCTCCTGGACGAAGACTGACGCTCAGGTGCGAAAGCGTG
+GGGATCAAACA
+>HMPMockV1.2.Staggered2.673827_207 M141:79:749142:1:1101:17460:1866
+TACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGCACGCAGGCGGTTTGTTAAGTCAGATGTGAAATCCC
+CGGGCTCAACCTGGGAACTGCATCTGATACTGGCAAGCTTGAGTCTCGTAGAGGGGGGTAGAATTCCAGGTGTAGCGGTG
+AAATGCGTAGAGATCTGGAGGAATACCGGTGGCGAAGGCGGCCCCCTGGACGAAGACTGACGCTCAGGTGCGAAAGCGTG
+GGGAGCAAACA
+>HMPMockV1.2.Staggered2.673827_215 M141:79:749142:1:1101:18390:1876
+TACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGCACGCAGGCGGTTTGTTAAGTCAGATGTGAAATCCC
+CGGGCTCAACCTGGGAACTGCATCTGATACTGGCAAGCTTGAGTCTCGTAGAGGGGGGTAGAATTCCAGGTGTAGCGGTG
+AAATGCGTAGAGATCTGGAGGAATACCGGTGGCGAAGGCGGCCCCCTGGACGAAGACTGACG
+>HMPMockV1.2.Staggered2.673827_218 M141:79:749142:1:1101:18249:1879
+TACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGCACGCAGGCGGTTTGTTAAGTCAGATGTGAAATCCC
+CGGGCTCAACCTGGGAACTTCATCTGATACTGGCAAGCTTGAGTCTCGTAGAGGGGGGTAGAATTCCAGGTGTAGCGGTG
+AAATGCGTAGAGATCTGGAGGAATACCGGTGGCGAAGGCGGCCCCCTGGACGAAGACTGACGCTCAGGTGCGAAAGCGTG
+GGGAGCACACA
+>HMPMockV1.2.Staggered2.673827_220 M141:79:749142:1:1101:15057:1880
+TACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGCACGCAGGCGGTTTGTTAAGTCAGATGTGAAATCCC
+CGGGCTCAACCTGGGAACTGCATCTGATACTGGCAAGCTTGAGTCTCGTAGAGGGGGGTAGAATTCCAGGTGTAGCGGTG
+AAATGCGTAGAGATCTGGAGGAATACCGGTGGCGAAGGCGGCCTCCTGGACGAAGACTGACGCTC
+>simulated_random_reads.fa.000000000
+AGCCGGGTGTCTACGGTCAGGTGTGTTCTGACTACGTAGTTTGACAGCACGTGTCCTTTCCCCTTCCCAAGGTAACGAAT
+TGTCGTTATCAACGTTTCGATCCGTAATTTCACGGAACGACATAAAGGCATCAATACTATCGCCAACAGA
+>simulated_random_reads.fa.000000001
+GTGGACGTCGTGGCGGCGTACTAACTTCCTACAGGCATATCCGGAATAACATTCTGCCGCTTGTCGACATAAGCTGTTCC
+CTACATAGACGACGACGGTTGAAGGGTGTATGTATTCTTTGGGTACGGCTCCTCTGGGCGCATGGTAGCA
+>simulated_random_reads.fa.000000002
+CATTCTTTATAGGCCTACAACACTAATCATCGTTAAGCATAAGGGGAGGAGTGTGCGTGGCATCAAGTCCTGGTTCTTCG
+CCTAGTACCACACCGTCTCACACGCAGCCGCCGACGACCAGTGAGGGCGCGTGGGACACCCATTCGGTCC
+>simulated_random_reads.fa.000000003
+TCGCCTTGGTACAAACAGTCGCGGCACGCTGTATGGAGGACCATAGAGGCACAGGCTGAGGACAGGGGCATGGAAGGTTC
+AATCGCCCCCCACAGCTTTAGGTAGGAAGTACTGTTCTAGTGCCAATTTGATTTTAACGGCAGTTACTCG
+>simulated_random_reads.fa.000000004
+CATATTCTAATATCCTACTTCTGATACCCGATTATACACGACACCACCCCAGGACTGTCGTCACATCCTTATCTGGATAA
+ACATCCGGTTCCGTTTGGCCGTGCTCCGCAAGTGATGCGTCTGTGGAATGTACGTGGAGCGTTGACAGTT
+>simulated_random_reads.fa.000000005
+CCGGATTAGGCATGTTTATAGTACAACGGATTCGCAAAAAGGTCAGGGTAACAATTTTGAAATGCTTTCATACTGCGGTC
+TAAATGGACCACCCTTTAGGTGCAGCCAACTATAGTTGGTCGATTCTCTGAACACGTACCGAAGGCAATT
+>simulated_random_reads.fa.000000006
+AACCCATCGGAATAATCTACTGCTTCGTATGGAACGGTCCTACATTTAAATAAACGTGTCCAGTGCCACCCGATACCTCT
+CGTCAATCAGGGGCTCTCCCTGAATCAGCAGTAAACAAACCCAGTACACTGTCGAACACTACTGAGACCG
+>simulated_random_reads.fa.000000007
+CCGAAGGCAAGTCTGTCGTAGAATGGTTTTTGTCGTTGTAACAACCCCGCTCTAGACCCTGAAAACCATAAAGTCAAGCC
+CAACTAATATTAGAGGCATTCTGGCTACTCCCGCTCACCGCAATCTTCACATACTGTGATACCCTCAGCC
+>simulated_random_reads.fa.000000008
+ATATCCGTTAAACCCCGGATTTGACAATTCATCATCAACGCTACTAACGGCTTTCTCAATTTGGGGCTGTGGCCTATCCG
+CATACGGCTACCTGCGCAAGAAGAGAGTACTGTTAGATGTCACGCTGCACTTGCGAAGACCGGTGGGCGT
+>simulated_random_reads.fa.000000009
+AGCGATGAGTACACAAGATGAGTGAAGGGATTAAACTTCAAACCTTGAAGTGTTACCCGATTTCCTACCATTGGGGATTC
+GTTAATGCTTCGAATGGATCTATATCCGGTGTTTAGCTGACTGTTAAAATACTCTCGTTGTACGAAAGTA
+>HMPMockV1.2.Staggered2.673827_0 M141:79:749142:1:1101:17530:1438
+TACGTAGGTGGCAAGCGTTATCCGGAATTATTGGGCGCAAAGCGCGCGTAGGCGGTTTTTTAAGTCTGATGTGAAAGCCC
+ACGGCTCAACCGTGGAGGGTCATTGGAAACTGGAAAACTTGAGTGCAGAAGAGGAAAGTGGAATTCCATGTGTAGCGGTG
+AAATGCGCAGAGATATGGAGGAACACCAGTGGCGAAGGCGACCTTCTGGTCTGTAACTGACGCTGATGTGCGAAAGCGTG
+>HMPMockV1.2.Staggered2.673827_1 M141:79:749142:1:1101:17007:1451
+TACGTAGGTGGCAAGCGTTATCCGGAATTATTGGGCGTAAAGCGCGCGTAGGCGGTTTTTTAAGTCTGATGTGAAAGCCC
+ACGGCTCAACCGTGGAGGGTCATTGGAAACTGGAAAACTTGAGTGCAGAAGAGGAAAGTGGAATTCCATGTGTAGCGGTG
+AAATGCGCAGAGATATGGAGGAACACCAGTGGCGAAGGCGACTTTCTGGTCTGTAACTTACGCTG
+>HMPMockV1.2.Staggered2.673827_2 M141:79:749142:1:1101:16695:1471
+TACGTAGGTGGCAAGCGTTATCCGGAATTATTGGGCGTAAAGCGCGCGTAGGCGGTTTTTTAAGTCTGATGTGAAAGCCC
+ACGGCTCAACCGTGGAGGGTCATTGGAAACTGGAAAACTTGAGTGCAGAAGAGGAAAGTGGAATTCCATGTGTAGCGGTG
+AAATGCGCAGAGATATGGAGGAACACCAGTGGCGAAGGCGACTTTCTGGTCTGTAACTGACGCTGATGTGCGAAAGCGTG
+GGGA
+>HMPMockV1.2.Staggered2.673827_3 M141:79:749142:1:1101:17203:1479
+TACGTAGGTGGCAAGCGTTATCCGGAATTATTGGGCGTAAAGCGCGCGTAGGCGGTTTTTTAAGTCTGATGTGAAAGCCC
+ACGGCTCAACCGTGGAGGGTCATTGGAAACTGGAAAACTTGAGTGCAGAAGAGGAAAGTGGAATTCCATGTGTAGCGGTG
+AAATGCGTAGAGATATGGAGGAACACCAGTGGCGAAGGCGACGTTCTGGTCTGTAACTGACGCTGATGTGCGAAAGCGTG
+G
+>HMPMockV1.2.Staggered2.673827_4 M141:79:749142:1:1101:14557:1490
+TACGTAGGTGGCAAGCGTTATCCGGAATTATTGGGCGTAAAGCGCGCGTAGGCGGTTTTTTAAGTCTGATGTGAAAGCCC
+ACGGCTCAACCGTGGAGGGTCATTGGAAACTGGAAAACTTGAGTGCAGAAGAGGAAAGTGGAATTCCATGTGTAGCGGTG
+AAATGCGCAGAGATATGGAGGAACACCAGTGGCGAAGGCGACTTTCTGGGCTGTAACTGACGCTGATGTGCGCAAGCGTG
+GTGATCAAACA
+>HMPMockV1.2.Staggered2.673827_5 M141:79:749142:1:1101:16104:1491
+TACGTAGGTGGCAAGCGTTATCCGGAATTATTGGGCGTAAAGCGCGCGTAGGCGGTTTTTTAAGTCTGATGTGAAAGCCC
+ACGGCTCAACCGTGGAGGGTCATTGGAAACTGGAAAACTTGAGTGCAGAAGAGGAAAGTGGAATTCCATGTGTAGCGGTG
+AAATGCGCAGAGATATGGAGGAACACCAGTGGCGAAGGCGACTTTCTGGTCTGTAACTGACGC
+>HMPMockV1.2.Staggered2.673827_6 M141:79:749142:1:1101:16372:1491
+TACGTAGGTGGCAAGCGTTATCCGGAATTATTGGGCGTAAAGCGCGCGTAGGCGGTTTTTTAAGTCTGATGTGAAAGCCC
+ACGGCTCAACCGTGGAGGGTCATTGGAAACTGGAAAACTTGAGTGCAGAAGAGGAAAGTGGAATTCCATGTGTAGCGGTG
+AAATGCGCAGAGATATGGAGGAACAACAGTGGCGAAGGCGACTTTCTGGTCTGTAACTGACGCTGATGTGCGTAAG
+>HMPMockV1.2.Staggered2.673827_7 M141:79:749142:1:1101:17334:1499
+TACGTAGGTGGCAAGCGTTATCCGGAATTATTGGGCGTAAAGCGCGCGTAGGCGGTTTTTTAAGTCTGATGTGAAAGCCC
+ACGGCTCAACCGTGGAGGGTCATTGGAAACTGGAAAACTTGAGTGCAGAAGAGGAAAGTGGAATTCCATGTGTAGCGGTG
+AAATGCGCAGAGATATGGAGGAACACCAGTGGCGAAGGCGACTTTCTGGTCTGTAACTGACGCTGATGT
+>HMPMockV1.2.Staggered2.673827_8 M141:79:749142:1:1101:17273:1504
+TACGTAGGTGGCAAGCGTTATCCGGAATTATTGGGCGTAAAGCGCGCGTAGGCGGTTTTTTAAGTCTGATGTGAAAGCCC
+ACGGCTCAACCGTGGAGGGTCATTGGAAACTGGAAAACTTGAGTGCAGAAGAGGAAAGTGGAATTCCATGTGTAGCGGTG
+AAATGCACAGAGATATGGAGGAACACCAGTGGCGAAGGCGACTTTCTGGTCTGTAACTGACGCTGA
+>HMPMockV1.2.Staggered2.673827_9 M141:79:749142:1:1101:16835:1505
+TACGTAGGTGGCAAGCGTTATCCGGAATTATTGGGCGTAAAGCGCGCGTAGGCGGTTTTTTAAGTCTGATGTGAAAGCCC
+ACGGCTCAACCGTGGAGGGTCATTGGAAACTGGAAAACTTGAGTGCAGAAGAGGAAAGTGGAATTCCATGTGTAGCGGTG
+ACATGCGCAGAGATATGGAGGAACACCAGTGGCGAAGGCGACTTTCTGGTCTGTAACTGACGCTGATGTGCGAAAGCGTG
+GGGAT
+"""
+
+if __name__ == '__main__':
+ main()
diff --git a/bfillings/tests/test_sumaclust_v1.py b/bfillings/tests/test_sumaclust_v1.py
new file mode 100644
index 0000000..816ef23
--- /dev/null
+++ b/bfillings/tests/test_sumaclust_v1.py
@@ -0,0 +1,259 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+"""
+Unit tests for the SumaClust version 1.0 Application controller
+===============================================================
+"""
+
+
+from unittest import TestCase, main
+from tempfile import mkstemp, mkdtemp
+from os import close
+from os.path import exists, getsize, join
+from shutil import rmtree
+
+from skbio.util import remove_files
+
+from bfillings.sumaclust_v1 import sumaclust_denovo_cluster
+
+
+class SumaclustV1Tests(TestCase):
+ """ Tests for Sumaclust version 2.0 functionality """
+
+ def setUp(self):
+
+ self.output_dir = mkdtemp()
+ self.read_seqs = reads_seqs
+
+ # create temporary file with read sequences defined in read_seqs
+ f, self.file_read_seqs = mkstemp(prefix='temp_reads_',
+ suffix='.fasta')
+ close(f)
+
+ # write read sequences to tmp file
+ with open(self.file_read_seqs, 'w') as tmp:
+ tmp.write(self.read_seqs)
+
+ # list of files to remove
+ self.files_to_remove = [self.file_read_seqs]
+
+ def tearDown(self):
+ remove_files(self.files_to_remove)
+ rmtree(self.output_dir)
+
+ def check_clusters(self,
+ clusters,
+ result_path):
+
+ # Check the OTU map file exists
+ self.assertTrue(exists(result_path))
+
+        # Check the output file has the correct size
+        size = getsize(result_path)
+        self.assertEqual(size, 270)
+
+ with open(result_path, "U") as f_otumap:
+ otu_map = [line.strip().split('\t') for line in f_otumap]
+
+        self.assertEqual(len(otu_map), 3)
+
+ # Check the returned clusters list of lists is as expected
+ expected_clusters = [['s1_844', 's1_1886', 's1_5347', 's1_5737',
+ 's1_7014', 's1_7881', 's1_7040', 's1_6200',
+ 's1_1271', 's1_8615'],
+ ['s1_8977', 's1_10439', 's1_12366', 's1_15985',
+ 's1_21935', 's1_11650', 's1_11001', 's1_8592',
+ 's1_14735', 's1_4677'],
+ ['s1_630', 's1_4572', 's1_5748', 's1_13961',
+ 's1_2369', 's1_3750', 's1_7634', 's1_8623',
+ 's1_8744', 's1_6846']]
+
+ # Should be 3 clusters
+ self.assertEqual(len(clusters), 3)
+
+ # List of actual clusters matches list of expected clusters
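+        # (member order within a cluster is not guaranteed, so both sides
+        # are sorted before comparison)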
+ for actual_cluster, expected_cluster in zip(clusters,
+ expected_clusters):
+ actual_cluster.sort()
+ expected_cluster.sort()
+ self.assertEqual(actual_cluster, expected_cluster)
+
+ def test_empty_seq_path(self):
+ """ SumaClust should return a ValueError
+ if empty sequence path is passed
+ """
+ result_path = join(self.output_dir, "sumaclust_otus.txt")
+
+ self.assertRaises(ValueError,
+ sumaclust_denovo_cluster,
+ seq_path=None,
+ result_path=result_path)
+
+ def test_empty_result_path(self):
+ """ SumaClust should return a ValueError
+ if empty result path is passed
+ """
+ self.assertRaises(ValueError,
+ sumaclust_denovo_cluster,
+ seq_path=self.file_read_seqs,
+ result_path=None)
+
+ def test_negative_threads(self):
+ """ SumaClust should raise ValueError
+ on negative number of threads
+ """
+ result_path = join(self.output_dir, "sumaclust_otus.txt")
+
+ self.assertRaises(ValueError,
+ sumaclust_denovo_cluster,
+ seq_path=self.file_read_seqs,
+ result_path=result_path,
+ shortest_len=True,
+ similarity=0.97,
+ threads=-2)
+
+ def test_positive_threads(self):
+ """ SumaClust's actual clusters should match
+ the exact clusters when using multithreading
+ """
+ result_path = join(self.output_dir, "sumaclust_otus_exact.txt")
+ clusters = sumaclust_denovo_cluster(seq_path=self.file_read_seqs,
+ result_path=result_path,
+ shortest_len=True,
+ similarity=0.97,
+ threads=3,
+ exact=True)
+
+ self.files_to_remove.append(result_path)
+
+ self.check_clusters(clusters, result_path)
+
+ def test_exact_clustering(self):
+ """ SumaClust's actual clusters should match
+ the exact clusters when using the exact option
+ """
+ result_path = join(self.output_dir, "sumaclust_otus_exact.txt")
+ clusters = sumaclust_denovo_cluster(seq_path=self.file_read_seqs,
+ result_path=result_path,
+ shortest_len=True,
+ similarity=0.97,
+ threads=1,
+ exact=True)
+
+ self.files_to_remove.append(result_path)
+
+ self.check_clusters(clusters, result_path)
+
+ def test_shortest_len_clustering(self):
+ """ SumaClust's actual clusters should match
+ the exact clusters when not using the
+ shortest len option
+ """
+ result_path = join(self.output_dir, "sumaclust_otus_exact.txt")
+ clusters = sumaclust_denovo_cluster(seq_path=self.file_read_seqs,
+ result_path=result_path,
+ shortest_len=False,
+ similarity=0.97,
+ threads=1,
+ exact=True)
+
+ self.files_to_remove.append(result_path)
+
+ self.check_clusters(clusters, result_path)
+
+ def test_sumaclust_denovo_cluster(self):
+ """ Test de novo clustering with SumaClust """
+
+ result_path = join(self.output_dir, "sumaclust_otus.txt")
+
+ clusters = sumaclust_denovo_cluster(seq_path=self.file_read_seqs,
+ result_path=result_path)
+
+ self.files_to_remove.append(result_path)
+
+ self.check_clusters(clusters, result_path)
+
+
+# Reads to cluster
+# there are 30 reads representing 3 species (gives 3 clusters)
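+# The "errors=" field in each header encodes the simulated difference from
+# the reference amplicon: "73%A" is a substitution to A at position 73,
+# "17-" a deletion at position 17 and "95+A" an insertion of A around
+# position 95 (notation inferred from the sequences themselves).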
+reads_seqs = """>s1_630 reference=1049393 amplicon=complement(497..788)
+GTGCCAGCAGCCGCGGTAATACAGAGGTCTCAAGCGTTGTTCGGATTCATTGGGCGTAAAGGGTGCGTAGGTGGCGGGGTAAGTCAGGTGTGAAATCTCG
+>s1_2369 reference=1049393 amplicon=complement(497..788) errors=73%A
+GTGCCAGCAGCCGCGGTAATACAGAGGTCTCAAGCGTTGTTCGGATTCATTGGGCGTAAAGGGTGCGTAGGTAGCGGGGTAAGTCAGGTGTGAAATCTCG
+>s1_3750 reference=1049393 amplicon=complement(497..788) errors=100%A
+GTGCCAGCAGCCGCGGTAATACAGAGGTCTCAAGCGTTGTTCGGATTCATTGGGCGTAAAGGGTGCGTAGGTGGCGGGGTAAGTCAGGTGTGAAATCTCA
+>s1_4572 reference=1049393 amplicon=complement(497..788)
+GTGCCAGCAGCCGCGGTAATACAGAGGTCTCAAGCGTTGTTCGGATTCATTGGGCGTAAAGGGTGCGTAGGTGGCGGGGTAAGTCAGGTGTGAAATCTCG
+>s1_5748 reference=1049393 amplicon=complement(497..788)
+GTGCCAGCAGCCGCGGTAATACAGAGGTCTCAAGCGTTGTTCGGATTCATTGGGCGTAAAGGGTGCGTAGGTGGCGGGGTAAGTCAGGTGTGAAATCTCG
+>s1_6846 reference=1049393 amplicon=complement(497..788) errors=67%A
+GTGCCAGCAGCCGCGGTAATACAGAGGTCTCAAGCGTTGTTCGGATTCATTGGGCGTAAAGGGTGCATAGGTGGCGGGGTAAGTCAGGTGTGAAATCTCG
+>s1_7634 reference=1049393 amplicon=complement(497..788) errors=99%T
+GTGCCAGCAGCCGCGGTAATACAGAGGTCTCAAGCGTTGTTCGGATTCATTGGGCGTAAAGGGTGCGTAGGTGGCGGGGTAAGTCAGGTGTGAAATCTTG
+>s1_8623 reference=1049393 amplicon=complement(497..788) errors=17-
+GTGCCAGCAGCCGCGGAATACAGAGGTCTCAAGCGTTGTTCGGATTCATTGGGCGTAAAGGGTGCGTAGGTGGCGGGGTAAGTCAGGTGTGAAATCTCG
+>s1_8744 reference=1049393 amplicon=complement(497..788) errors=62%A
+GTGCCAGCAGCCGCGGTAATACAGAGGTCTCAAGCGTTGTTCGGATTCATTGGGCGTAAAGAGTGCGTAGGTGGCGGGGTAAGTCAGGTGTGAAATCTCG
+>s1_13961 reference=1049393 amplicon=complement(497..788)
+GTGCCAGCAGCCGCGGTAATACAGAGGTCTCAAGCGTTGTTCGGATTCATTGGGCGTAAAGGGTGCGTAGGTGGCGGGGTAAGTCAGGTGTGAAATCTCG
+>s1_4677 reference=4382408 amplicon=complement(487..778) errors=74%T
+GTGCCAGCAGCCGCGGTAATACGGAGGGTCCAAGCGTTGTCCGGAATCACTGGGTGTAAAGGGTGCGTAGGCGTGTCTGTAAGTCAGAGGTGAAAGCCCA
+>s1_8592 reference=4382408 amplicon=complement(487..778) errors=95+A
+GTGCCAGCAGCCGCGGTAATACGGAGGGTCCAAGCGTTGTCCGGAATCACTGGGTGTAAAGGGTGCGTAGGCGGGTCTGTAAGTCAGAGGTGAAAAGCCCA
+>s1_8977 reference=4382408 amplicon=complement(487..778)
+GTGCCAGCAGCCGCGGTAATACGGAGGGTCCAAGCGTTGTCCGGAATCACTGGGTGTAAAGGGTGCGTAGGCGGGTCTGTAAGTCAGAGGTGAAAGCCCA
+>s1_10439 reference=4382408 amplicon=complement(487..778)
+GTGCCAGCAGCCGCGGTAATACGGAGGGTCCAAGCGTTGTCCGGAATCACTGGGTGTAAAGGGTGCGTAGGCGGGTCTGTAAGTCAGAGGTGAAAGCCCA
+>s1_11001 reference=4382408 amplicon=complement(487..778) errors=91%G
+GTGCCAGCAGCCGCGGTAATACGGAGGGTCCAAGCGTTGTCCGGAATCACTGGGTGTAAAGGGTGCGTAGGCGGGTCTGTAAGTCAGAGGGGAAAGCCCA
+>s1_11650 reference=4382408 amplicon=complement(487..778) errors=78-
+GTGCCAGCAGCCGCGGTAATACGGAGGGTCCAAGCGTTGTCCGGAATCACTGGGTGTAAAGGGTGCGTAGGCGGGTCGTAAGTCAGAGGTGAAAGCCCA
+>s1_12366 reference=4382408 amplicon=complement(487..778)
+GTGCCAGCAGCCGCGGTAATACGGAGGGTCCAAGCGTTGTCCGGAATCACTGGGTGTAAAGGGTGCGTAGGCGGGTCTGTAAGTCAGAGGTGAAAGCCCA
+>s1_14735 reference=4382408 amplicon=complement(487..778) errors=94%C
+GTGCCAGCAGCCGCGGTAATACGGAGGGTCCAAGCGTTGTCCGGAATCACTGGGTGTAAAGGGTGCGTAGGCGGGTCTGTAAGTCAGAGGTGACAGCCCA
+>s1_15985 reference=4382408 amplicon=complement(487..778)
+GTGCCAGCAGCCGCGGTAATACGGAGGGTCCAAGCGTTGTCCGGAATCACTGGGTGTAAAGGGTGCGTAGGCGGGTCTGTAAGTCAGAGGTGAAAGCCCA
+>s1_21935 reference=4382408 amplicon=complement(487..778)
+GTGCCAGCAGCCGCGGTAATACGGAGGGTCCAAGCGTTGTCCGGAATCACTGGGTGTAAAGGGTGCGTAGGCGGGTCTGTAAGTCAGAGGTGAAAGCCCA
+>s1_844 reference=129416 amplicon=complement(522..813)
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCAAGCGTTATTCGGAATTACTGGGCGTAAAGGGCGTGTAGGCGGCTTTGTAAGTCAGATGTGAAAGCCCA
+>s1_1271 reference=129416 amplicon=complement(522..813) errors=94%C
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCAAGCGTTATTCGGAATTACTGGGCGTAAAGGGCGTGTAGGCGGCTTTGTAAGTCAGATGTGACAGCCCA
+>s1_1886 reference=129416 amplicon=complement(522..813)
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCAAGCGTTATTCGGAATTACTGGGCGTAAAGGGCGTGTAGGCGGCTTTGTAAGTCAGATGTGAAAGCCCA
+>s1_5347 reference=129416 amplicon=complement(522..813)
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCAAGCGTTATTCGGAATTACTGGGCGTAAAGGGCGTGTAGGCGGCTTTGTAAGTCAGATGTGAAAGCCCA
+>s1_5737 reference=129416 amplicon=complement(522..813)
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCAAGCGTTATTCGGAATTACTGGGCGTAAAGGGCGTGTAGGCGGCTTTGTAAGTCAGATGTGAAAGCCCA
+>s1_6200 reference=129416 amplicon=complement(522..813) errors=92%C
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCAAGCGTTATTCGGAATTACTGGGCGTAAAGGGCGTGTAGGCGGCTTTGTAAGTCAGATGTCAAAGCCCA
+>s1_7014 reference=129416 amplicon=complement(522..813)
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCAAGCGTTATTCGGAATTACTGGGCGTAAAGGGCGTGTAGGCGGCTTTGTAAGTCAGATGTGAAAGCCCA
+>s1_7040 reference=129416 amplicon=complement(522..813) errors=40%G
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCAAGCGTTAGTCGGAATTACTGGGCGTAAAGGGCGTGTAGGCGGCTTTGTAAGTCAGATGTGAAAGCCCA
+>s1_7881 reference=129416 amplicon=complement(522..813)
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCAAGCGTTATTCGGAATTACTGGGCGTAAAGGGCGTGTAGGCGGCTTTGTAAGTCAGATGTGAAAGCCCA
+>s1_8615 reference=129416 amplicon=complement(522..813) errors=81%G
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCAAGCGTTATTCGGAATTACTGGGCGTAAAGGGCGTGTAGGCGGCTTTGTGAGTCAGATGTGAAAGCCCA
+"""
+
+if __name__ == '__main__':
+ main()
diff --git a/bfillings/tests/test_swarm_v127.py b/bfillings/tests/test_swarm_v127.py
new file mode 100644
index 0000000..4a79b73
--- /dev/null
+++ b/bfillings/tests/test_swarm_v127.py
@@ -0,0 +1,190 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+"""
+Unit tests for the Swarm version 1.2.7 Application controller
+=============================================================
+"""
+
+
+from unittest import TestCase, main
+from tempfile import mkstemp
+from os import close
+
+from skbio.util import remove_files
+
+from bfillings.swarm_v127 import swarm_denovo_cluster
+
+
+class SwarmTests(TestCase):
+ """ Tests for Swarm version 1.2.7 functionality """
+
+ def setUp(self):
+ self.read_seqs = reads_seqs
+
+ # create temporary file with read sequences defined in read_seqs
+ f, self.file_read_seqs = mkstemp(prefix='temp_reads_',
+ suffix='.fasta')
+ close(f)
+
+ # write read sequences to tmp file
+ with open(self.file_read_seqs, 'w') as tmp:
+ tmp.write(self.read_seqs)
+
+ # list of files to remove
+ self.files_to_remove = [self.file_read_seqs]
+
+ def tearDown(self):
+ remove_files(self.files_to_remove)
+
+ def test_default_param(self):
+ """ Swarm should return the correct clusters using
+ default inputs
+ """
+ clusters = swarm_denovo_cluster(seq_path=self.file_read_seqs,
+ d=1,
+ threads=1)
+
+ # Check the returned clusters list of lists is as expected
+ expected_clusters = [['s1_630', 's1_4572', 's1_5748',
+ 's1_13961', 's1_8744', 's1_8623',
+ 's1_7634', 's1_6846', 's1_3750',
+ 's1_2369'],
+ ['s1_8977', 's1_10439', 's1_12366',
+ 's1_15985', 's1_21935', 's1_8592',
+ 's1_4677', 's1_14735', 's1_11650',
+ 's1_11001'],
+ ['s1_844', 's1_1886', 's1_5347',
+ 's1_5737', 's1_7014', 's1_7881',
+ 's1_8615', 's1_7040', 's1_6200',
+ 's1_1271']]
+
+ # Should be 3 clusters
+ self.assertEqual(len(clusters), 3)
+
+ # List of actual clusters matches list of expected clusters
+ for actual_cluster, expected_cluster in zip(clusters,
+ expected_clusters):
+ actual_cluster.sort()
+ expected_cluster.sort()
+ self.assertEqual(actual_cluster, expected_cluster)
+
+ def test_seq_path(self):
+ """ Swarm should raise a ValueError if the sequences
+ filepath does not exist
+ """
+
+ f, tmp_file = mkstemp(prefix='temp_reads_',
+ suffix='.fasta')
+ close(f)
+ remove_files([tmp_file])
+
+ self.assertRaises(ValueError,
+ swarm_denovo_cluster,
+ seq_path=tmp_file,
+ d=1,
+ threads=1)
+
+ def test_negative_resolution(self):
+ """ Swarm should raise a ValueError if the resolution
+ is negative
+ """
+
+ self.assertRaises(ValueError,
+ swarm_denovo_cluster,
+ seq_path=self.file_read_seqs,
+ d=-2,
+ threads=1)
+
+ def test_negative_threads(self):
+ """ Swarm should raise a ValueError if number of threads
+ is negative
+ """
+
+ self.assertRaises(ValueError,
+ swarm_denovo_cluster,
+ seq_path=self.file_read_seqs,
+ d=1,
+ threads=-2)
+
+# Reads to cluster
+# 30 reads representing 3 species (10 reads per species), giving 3 clusters
+reads_seqs = """>s1_630 reference=1049393 amplicon=complement(497..788)
+GTGCCAGCAGCCGCGGTAATACAGAGGTCTCAAGCGTTGTTCGGATTCATTGGGCGTAAAGGGTGCGTAGGTGGCGGGGTAAGTCAGGTGTGAAATCTCG
+>s1_2369 reference=1049393 amplicon=complement(497..788) errors=73%A
+GTGCCAGCAGCCGCGGTAATACAGAGGTCTCAAGCGTTGTTCGGATTCATTGGGCGTAAAGGGTGCGTAGGTAGCGGGGTAAGTCAGGTGTGAAATCTCG
+>s1_3750 reference=1049393 amplicon=complement(497..788) errors=100%A
+GTGCCAGCAGCCGCGGTAATACAGAGGTCTCAAGCGTTGTTCGGATTCATTGGGCGTAAAGGGTGCGTAGGTGGCGGGGTAAGTCAGGTGTGAAATCTCA
+>s1_4572 reference=1049393 amplicon=complement(497..788)
+GTGCCAGCAGCCGCGGTAATACAGAGGTCTCAAGCGTTGTTCGGATTCATTGGGCGTAAAGGGTGCGTAGGTGGCGGGGTAAGTCAGGTGTGAAATCTCG
+>s1_5748 reference=1049393 amplicon=complement(497..788)
+GTGCCAGCAGCCGCGGTAATACAGAGGTCTCAAGCGTTGTTCGGATTCATTGGGCGTAAAGGGTGCGTAGGTGGCGGGGTAAGTCAGGTGTGAAATCTCG
+>s1_6846 reference=1049393 amplicon=complement(497..788) errors=67%A
+GTGCCAGCAGCCGCGGTAATACAGAGGTCTCAAGCGTTGTTCGGATTCATTGGGCGTAAAGGGTGCATAGGTGGCGGGGTAAGTCAGGTGTGAAATCTCG
+>s1_7634 reference=1049393 amplicon=complement(497..788) errors=99%T
+GTGCCAGCAGCCGCGGTAATACAGAGGTCTCAAGCGTTGTTCGGATTCATTGGGCGTAAAGGGTGCGTAGGTGGCGGGGTAAGTCAGGTGTGAAATCTTG
+>s1_8623 reference=1049393 amplicon=complement(497..788) errors=17-
+GTGCCAGCAGCCGCGGAATACAGAGGTCTCAAGCGTTGTTCGGATTCATTGGGCGTAAAGGGTGCGTAGGTGGCGGGGTAAGTCAGGTGTGAAATCTCG
+>s1_8744 reference=1049393 amplicon=complement(497..788) errors=62%A
+GTGCCAGCAGCCGCGGTAATACAGAGGTCTCAAGCGTTGTTCGGATTCATTGGGCGTAAAGAGTGCGTAGGTGGCGGGGTAAGTCAGGTGTGAAATCTCG
+>s1_13961 reference=1049393 amplicon=complement(497..788)
+GTGCCAGCAGCCGCGGTAATACAGAGGTCTCAAGCGTTGTTCGGATTCATTGGGCGTAAAGGGTGCGTAGGTGGCGGGGTAAGTCAGGTGTGAAATCTCG
+>s1_4677 reference=4382408 amplicon=complement(487..778) errors=74%T
+GTGCCAGCAGCCGCGGTAATACGGAGGGTCCAAGCGTTGTCCGGAATCACTGGGTGTAAAGGGTGCGTAGGCGTGTCTGTAAGTCAGAGGTGAAAGCCCA
+>s1_8592 reference=4382408 amplicon=complement(487..778) errors=95+A
+GTGCCAGCAGCCGCGGTAATACGGAGGGTCCAAGCGTTGTCCGGAATCACTGGGTGTAAAGGGTGCGTAGGCGGGTCTGTAAGTCAGAGGTGAAAAGCCCA
+>s1_8977 reference=4382408 amplicon=complement(487..778)
+GTGCCAGCAGCCGCGGTAATACGGAGGGTCCAAGCGTTGTCCGGAATCACTGGGTGTAAAGGGTGCGTAGGCGGGTCTGTAAGTCAGAGGTGAAAGCCCA
+>s1_10439 reference=4382408 amplicon=complement(487..778)
+GTGCCAGCAGCCGCGGTAATACGGAGGGTCCAAGCGTTGTCCGGAATCACTGGGTGTAAAGGGTGCGTAGGCGGGTCTGTAAGTCAGAGGTGAAAGCCCA
+>s1_11001 reference=4382408 amplicon=complement(487..778) errors=91%G
+GTGCCAGCAGCCGCGGTAATACGGAGGGTCCAAGCGTTGTCCGGAATCACTGGGTGTAAAGGGTGCGTAGGCGGGTCTGTAAGTCAGAGGGGAAAGCCCA
+>s1_11650 reference=4382408 amplicon=complement(487..778) errors=78-
+GTGCCAGCAGCCGCGGTAATACGGAGGGTCCAAGCGTTGTCCGGAATCACTGGGTGTAAAGGGTGCGTAGGCGGGTCGTAAGTCAGAGGTGAAAGCCCA
+>s1_12366 reference=4382408 amplicon=complement(487..778)
+GTGCCAGCAGCCGCGGTAATACGGAGGGTCCAAGCGTTGTCCGGAATCACTGGGTGTAAAGGGTGCGTAGGCGGGTCTGTAAGTCAGAGGTGAAAGCCCA
+>s1_14735 reference=4382408 amplicon=complement(487..778) errors=94%C
+GTGCCAGCAGCCGCGGTAATACGGAGGGTCCAAGCGTTGTCCGGAATCACTGGGTGTAAAGGGTGCGTAGGCGGGTCTGTAAGTCAGAGGTGACAGCCCA
+>s1_15985 reference=4382408 amplicon=complement(487..778)
+GTGCCAGCAGCCGCGGTAATACGGAGGGTCCAAGCGTTGTCCGGAATCACTGGGTGTAAAGGGTGCGTAGGCGGGTCTGTAAGTCAGAGGTGAAAGCCCA
+>s1_21935 reference=4382408 amplicon=complement(487..778)
+GTGCCAGCAGCCGCGGTAATACGGAGGGTCCAAGCGTTGTCCGGAATCACTGGGTGTAAAGGGTGCGTAGGCGGGTCTGTAAGTCAGAGGTGAAAGCCCA
+>s1_844 reference=129416 amplicon=complement(522..813)
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCAAGCGTTATTCGGAATTACTGGGCGTAAAGGGCGTGTAGGCGGCTTTGTAAGTCAGATGTGAAAGCCCA
+>s1_1271 reference=129416 amplicon=complement(522..813) errors=94%C
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCAAGCGTTATTCGGAATTACTGGGCGTAAAGGGCGTGTAGGCGGCTTTGTAAGTCAGATGTGACAGCCCA
+>s1_1886 reference=129416 amplicon=complement(522..813)
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCAAGCGTTATTCGGAATTACTGGGCGTAAAGGGCGTGTAGGCGGCTTTGTAAGTCAGATGTGAAAGCCCA
+>s1_5347 reference=129416 amplicon=complement(522..813)
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCAAGCGTTATTCGGAATTACTGGGCGTAAAGGGCGTGTAGGCGGCTTTGTAAGTCAGATGTGAAAGCCCA
+>s1_5737 reference=129416 amplicon=complement(522..813)
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCAAGCGTTATTCGGAATTACTGGGCGTAAAGGGCGTGTAGGCGGCTTTGTAAGTCAGATGTGAAAGCCCA
+>s1_6200 reference=129416 amplicon=complement(522..813) errors=92%C
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCAAGCGTTATTCGGAATTACTGGGCGTAAAGGGCGTGTAGGCGGCTTTGTAAGTCAGATGTCAAAGCCCA
+>s1_7014 reference=129416 amplicon=complement(522..813)
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCAAGCGTTATTCGGAATTACTGGGCGTAAAGGGCGTGTAGGCGGCTTTGTAAGTCAGATGTGAAAGCCCA
+>s1_7040 reference=129416 amplicon=complement(522..813) errors=40%G
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCAAGCGTTAGTCGGAATTACTGGGCGTAAAGGGCGTGTAGGCGGCTTTGTAAGTCAGATGTGAAAGCCCA
+>s1_7881 reference=129416 amplicon=complement(522..813)
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCAAGCGTTATTCGGAATTACTGGGCGTAAAGGGCGTGTAGGCGGCTTTGTAAGTCAGATGTGAAAGCCCA
+>s1_8615 reference=129416 amplicon=complement(522..813) errors=81%G
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCAAGCGTTATTCGGAATTACTGGGCGTAAAGGGCGTGTAGGCGGCTTTGTGAGTCAGATGTGAAAGCCCA
+"""
+
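+# (Illustrative sketch only, not part of the module under test.) The
+# 'errors=' field in the read headers above appears to encode the simulated
+# mutation relative to the error-free reads of the same reference: '73%A' is
+# a substitution to A at 1-based position 73, '17-' a deletion at position
+# 17, and '95+A' an insertion of A at position 95. Assuming that convention
+# holds, a minimal parser looks like:
+import re
+
+def parse_error_annotation(annotation):
+ """Split e.g. '73%A', '17-' or '95+A' into (position, operation, base)."""
+ match = re.match(r'^(\d+)([%+-])([ACGT]?)$', annotation)
+ if match is None:
+ raise ValueError("unrecognised error annotation: %r" % annotation)
+ pos, op, base = match.groups()
+ ops = {'%': 'substitution', '-': 'deletion', '+': 'insertion'}
+ return int(pos), ops[op], base or None
+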
+if __name__ == '__main__':
+ main()
diff --git a/bfillings/tests/test_uclust.py b/bfillings/tests/test_uclust.py
new file mode 100644
index 0000000..0db9856
--- /dev/null
+++ b/bfillings/tests/test_uclust.py
@@ -0,0 +1,758 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+
+"""
+ : provides unit tests for the uclust.py module
+
+Modified from Daniel McDonald's test_cd_hit.py code on Feb-4-2010 """
+
+from subprocess import Popen, PIPE, STDOUT
+from tempfile import mkstemp, gettempdir
+from os.path import join
+
+from unittest import TestCase, main
+
+from skbio.util import remove_files
+
+from bfillings.uclust import (Uclust,
+ uclust_fasta_sort_from_filepath,
+ uclust_cluster_from_sorted_fasta_filepath,
+ get_output_filepaths, clusters_from_uc_file,
+ get_clusters_from_fasta_filepath,
+ uclust_search_and_align_from_fasta_filepath,
+ process_uclust_pw_alignment_results,
+ UclustParseError)
+
+__author__ = "William Walters"
+__copyright__ = "Copyright 2007-2012, The Cogent Project"
+__credits__ = ["Daniel McDonald", "William Walters", "Greg Caporaso",
+ "Jai Ram Rideout"]
+__license__ = "GPL"
+__version__ = "1.5.3-dev"
+__maintainer__ = "William Walters"
+__email__ = "William.A.Walters at colorado.edu"
+__status__ = "Production"
+
+
+class UclustTests(TestCase):
+
+ def setUp(self):
+
+ _, self.tmp_unsorted_fasta_filepath = mkstemp(prefix="uclust_test",
+ suffix=".fasta")
+ tmp_unsorted_fasta = open(self.tmp_unsorted_fasta_filepath, "w")
+ tmp_unsorted_fasta.write('\n'.join(raw_dna_seqs))
+ tmp_unsorted_fasta.close()
+
+ _, self.tmp_sorted_fasta_filepath = mkstemp(prefix="uclust_test",
+ suffix=".fasta")
+ tmp_sorted_fasta = open(self.tmp_sorted_fasta_filepath, "w")
+ tmp_sorted_fasta.write('\n'.join(sorted_dna_seqs))
+ tmp_sorted_fasta.close()
+
+ _, self.tmp_uc_filepath = mkstemp(prefix="uclust_test", suffix=".uc")
+ tmp_uc = open(self.tmp_uc_filepath, "w")
+ tmp_uc.write('\n'.join(uc_dna_clusters))
+ tmp_uc.close()
+
+ _, self.tmp_clstr_filepath = mkstemp(prefix="uclust_test",
+ suffix=".clstr")
+
+ self.tmpdir = gettempdir()
+ self.WorkingDir = join(self.tmpdir, 'uclust_test')
+
+ self.files_to_remove = [self.tmp_unsorted_fasta_filepath,
+ self.tmp_sorted_fasta_filepath,
+ self.tmp_uc_filepath,
+ self.tmp_clstr_filepath]
+
+ def tearDown(self):
+ remove_files(self.files_to_remove, error_on_missing=False)
+
+ def test_fasta_sorting(self):
+ """ Should sort fasta seqs from largest to smallest in outfile
+
+ Since a fasta file has to be passed to the app controller for uclust,
+ a temporary fasta file is created, and the raw fasta seqs supplied
+ in this module are written to it. This file is sent to the app
+ controller, and the resulting sorted file is compared to the expected
+ results to ensure proper function of uclust as called by this app
+ controller."""
+
+ test_app = Uclust({'--tmpdir': self.tmpdir})
+
+ test_app_res = test_app(data=
+ {'--mergesort': self.tmp_unsorted_fasta_filepath,
+ '--output': self.tmp_sorted_fasta_filepath})
+
+ sorted_fasta_actual = [l.strip()
+ for l in open(test_app_res['Output'].name, "U")]
+ sorted_fasta_expected = [l.strip() for l in sorted_dna_seqs if l]
+
+ self.assertEqual(sorted_fasta_actual, sorted_fasta_expected)
+
+ test_app_res.cleanUp()
+
+ def test_parameter_availability(self):
+ """ Often used parameters are accessible
+
+ This is just some basic sanity checking.
+
+ """
+ a = Uclust()
+ # if a parameter is not accessible, trying to turn it on will
+ # raise a KeyError
+ a.Parameters['--allhits'].on()
+ a.Parameters['--libonly'].on()
+ a.Parameters['--maxaccepts'].on(42)
+ a.Parameters['--maxrejects'].on(42)
+ a.Parameters['--rev'].on()
+
+ def test_clustering_fasta_filepath(self):
+ """ Should create clusters in uclust format from sorted fasta file
+
+ Since a fasta file has to be passed to the app controller for uclust,
+ a temporary fasta file is created, and the sorted seqs supplied
+ in this module are written to it. This file is sent to the app
+ controller, and the resulting uclust file is compared to the expected
+ results to ensure proper function of uclust as called by this app
+ controller."""
+
+ test_app = Uclust({'--id': 0.9}, HALT_EXEC=False)
+ test_app_res = test_app(data=
+ {'--input': self.tmp_sorted_fasta_filepath,
+ '--uc': self.tmp_uc_filepath})
+
+ uc_file = open(test_app_res['ClusterFile'].name, "U")
+ # compare the actual and expected uc files, ignoring comment lines
+ uc_file_actual = [l.strip() for l in uc_file
+ if not l.startswith('#')]
+ uc_file_expected = [l.strip() for l in uc_dna_clusters
+ if not l.startswith('#')]
+
+ self.assertEqual(uc_file_actual, uc_file_expected)
+
+ test_app_res.cleanUp()
+
+
+class UclustConvenienceWrappers(TestCase):
+
+ """ Unit tests for uclust convenience wrappers """
+
+ def setUp(self):
+
+ _, self.tmp_unsorted_fasta_filepath = mkstemp(prefix="uclust_test",
+ suffix=".fasta")
+ tmp_unsorted_fasta = open(self.tmp_unsorted_fasta_filepath, "w")
+ tmp_unsorted_fasta.write('\n'.join(raw_dna_seqs))
+ tmp_unsorted_fasta.close()
+
+ _, self.tmp_raw_dna_seqs_rc_filepath = mkstemp(prefix="uclust_test",
+ suffix=".fasta")
+ tmp_rc_fasta = open(self.tmp_raw_dna_seqs_rc_filepath, "w")
+ tmp_rc_fasta.write('\n'.join(raw_dna_seqs_rc))
+ tmp_rc_fasta.close()
+
+ _, self.tmp_sorted_fasta_filepath = mkstemp(prefix="uclust_test",
+ suffix=".fasta")
+ tmp_sorted_fasta = open(self.tmp_sorted_fasta_filepath, "w")
+ tmp_sorted_fasta.write('\n'.join(sorted_dna_seqs))
+ tmp_sorted_fasta.close()
+
+ _, self.tmp_uc_filepath = mkstemp(prefix="uclust_test", suffix=".uc")
+ tmp_uc = open(self.tmp_uc_filepath, "w")
+ tmp_uc.write('\n'.join(uc_dna_clusters))
+ tmp_uc.close()
+
+ _, self.tmp_clstr_filepath = mkstemp(prefix="uclust_test",
+ suffix=".clstr")
+
+ self.search_align_out1_expected = search_align_out1_expected
+ self.search_align_out_fasta_pairs1 = search_align_out_fasta_pairs1
+ self.search_align_out_uc1 = search_align_out_uc1
+ _, self.search_align_query1_fp = mkstemp(prefix="uclust_test",
+ suffix=".clstr")
+ open(self.search_align_query1_fp, 'w').write(search_align_query1)
+ _, self.search_align_template1_fp = mkstemp(prefix="uclust_test",
+ suffix=".clstr")
+ open(self.search_align_template1_fp, 'w').write(search_align_template1)
+
+ self.search_align_out2_expected = search_align_out2_expected
+ _, self.search_align_query2_fp = mkstemp(prefix="uclust_test",
+ suffix=".clstr")
+ open(self.search_align_query2_fp, 'w').write(search_align_query2)
+ _, self.search_align_template2_fp = mkstemp(prefix="uclust_test",
+ suffix=".clstr")
+ open(self.search_align_template2_fp, 'w').write(search_align_template2)
+
+ _, self.ref_dna_seqs_fp = mkstemp(prefix="uclust_test",
+ suffix=".fasta")
+ open(self.ref_dna_seqs_fp, 'w').write(ref_dna_seqs)
+
+ self.files_to_remove = [self.tmp_unsorted_fasta_filepath,
+ self.tmp_raw_dna_seqs_rc_filepath,
+ self.tmp_sorted_fasta_filepath,
+ self.tmp_uc_filepath,
+ self.tmp_clstr_filepath,
+ self.search_align_query1_fp,
+ self.search_align_template1_fp,
+ self.search_align_query2_fp,
+ self.search_align_template2_fp,
+ self.ref_dna_seqs_fp]
+
+ self.ref_test_clusters1 = ref_test_clusters1
+ self.ref_test_failures1 = ref_test_failures1
+ self.ref_test_new_seeds1 = ref_test_new_seeds1
+ self.ref_test_clusters2 = ref_test_clusters2
+ self.ref_test_failures2 = ref_test_failures2
+ self.ref_test_new_seeds2 = ref_test_new_seeds2
+ self.uc_dna_clusters = uc_dna_clusters
+ self.uc_lines1 = uc_lines1
+ self.uc_lines_w_multiple_hits_per_query = \
+ uc_lines_w_multiple_hits_per_query
+ self.uc_lines_overlapping_lib_input_seq_ids = \
+ uc_lines_overlapping_lib_input_seq_ids
+
+ self.tmpdir = gettempdir()
+
+ def tearDown(self):
+ remove_files(self.files_to_remove, error_on_missing=False)
+
+ def test_uclust_fasta_sort_from_filepath(self):
+ """ Given an unsorted fasta filepath, will return sorted file """
+
+ app_res = \
+ uclust_fasta_sort_from_filepath(self.tmp_unsorted_fasta_filepath)
+
+ sorted_fasta_actual = [l.strip()
+ for l in open(app_res['Output'].name, "U")]
+ sorted_fasta_expected = [l.strip() for l in sorted_dna_seqs if l]
+
+ self.assertEqual(sorted_fasta_actual, sorted_fasta_expected)
+
+ app_res.cleanUp()
+
+ def test_clusters_from_uc_file(self):
+ """ clusters_from_uc_file functions as expected """
+
+ expected_clusters = {'s2': ['s2', 's3']}
+ expected_failures = ['s1']
+ expected_new_seeds = ['s2']
+ self.assertEqual(clusters_from_uc_file(self.uc_lines1),
+ (expected_clusters, expected_failures, expected_new_seeds))
+
+ def test_clusters_from_uc_file_multiple_hits(self):
+ """ clusters_from_uc_file handles error_on_multiple_hits correctly
+ """
+ # when a query matches multiple hits and error_on_multiple_hits=True,
+ # an error should be raised
+ self.assertRaises(UclustParseError,
+ clusters_from_uc_file,
+ self.uc_lines_w_multiple_hits_per_query,
+ error_on_multiple_hits=True)
+
+ # when a query matches multiple hits and error_on_multiple_hits=False,
+ # the query should show up in multiple clusters
+ actual = clusters_from_uc_file(self.uc_lines_w_multiple_hits_per_query,
+ error_on_multiple_hits=False)
+ expected_clusters = {'s2': ['s2', 's3'],
+ 's4': ['s4', 's3']}
+ expected_failures = ['s1']
+ expected_new_seeds = ['s2', 's4']
+ self.assertEqual(actual,
+ (expected_clusters, expected_failures, expected_new_seeds))
+
+ def test_clusters_from_uc_file_error(self):
+ """ clusters_from_uc_file raises error when lib/input seq ids overlap
+ """
+ self.assertRaises(UclustParseError,
+ clusters_from_uc_file,
+ self.uc_lines_overlapping_lib_input_seq_ids)
+
+ def test_uclust_cluster_from_sorted_fasta_filepath(self):
+ """ Given a sorted fasta filepath, will return uclust (.uc) file """
+
+ app_res = \
+ uclust_cluster_from_sorted_fasta_filepath(
+ self.tmp_sorted_fasta_filepath,
+ percent_ID=0.90, HALT_EXEC=False)
+
+ uc_file = open(app_res['ClusterFile'].name, "U")
+ # compare the actual and expected uc files, ignoring comment lines
+ uc_file_actual = [l.strip() for l in uc_file
+ if not l.startswith('#')]
+ uc_file_expected = [l.strip() for l in uc_dna_clusters
+ if not l.startswith('#')]
+
+ self.assertEqual(uc_file_actual, uc_file_expected)
+ app_res.cleanUp()
+
+ def test_get_output_filepaths(self):
+ """ Properly generates output filepath names """
+
+ uc_res = \
+ get_output_filepaths(self.tmpdir, "test_seqs.fasta")
+
+ self.assertEqual(uc_res, join(self.tmpdir, "test_seqs_clusters.uc"))
+
+ def test_get_output_filepaths_multiple_dots(self):
+ """Generates filepath names from names with more than one dot"""
+ obs = get_output_filepaths(self.tmpdir, "test_seqs.filtered.fasta")
+ self.assertEqual(obs, join(self.tmpdir, "test_seqs.filtered_clusters.uc"))
+
+ def test_get_clusters_from_fasta_filepath(self):
+ """ Tests for return of lists of OTUs from given fasta filepath """
+
+ clusters_res = \
+ get_clusters_from_fasta_filepath(self.tmp_unsorted_fasta_filepath,
+ original_fasta_path=None, percent_ID=0.90, save_uc_files=False)
+ expected_cluster_list.sort()
+ expected_failure_list.sort()
+ expected_new_seed_list.sort()
+ clusters_res[0].sort()
+ clusters_res[1].sort()
+ clusters_res[2].sort()
+ self.assertEqual(clusters_res, (expected_cluster_list,
+ expected_failure_list,
+ expected_new_seed_list))
+
+ def test_get_clusters_from_fasta_filepath_reference_db_only(self):
+ """ Correct clusters returned when clustering against a database only
+ """
+ clusters_res = get_clusters_from_fasta_filepath(
+ self.tmp_unsorted_fasta_filepath,
+ original_fasta_path=None,
+ save_uc_files=False,
+ max_accepts=7, max_rejects=12,
+ percent_ID=0.90,
+ subject_fasta_filepath=self.ref_dna_seqs_fp,
+ suppress_new_clusters=True,
+ HALT_EXEC=False)
+
+ self.ref_test_clusters1.sort()
+ self.ref_test_failures1.sort()
+ self.ref_test_new_seeds1.sort()
+
+ clusters_res[0].sort()
+ clusters_res[1].sort()
+ clusters_res[2].sort()
+ self.assertEqual(clusters_res, (self.ref_test_clusters1,
+ self.ref_test_failures1,
+ self.ref_test_new_seeds1))
+
+ def test_get_clusters_from_fasta_filepath_extending_reference_db(self):
+ """ Correct clusters when clustering against db and adding new clusters
+ """
+ clusters_res = get_clusters_from_fasta_filepath(
+ self.tmp_unsorted_fasta_filepath,
+ original_fasta_path=None,
+ max_accepts=7, max_rejects=12,
+ percent_ID=0.90,
+ subject_fasta_filepath=self.ref_dna_seqs_fp,
+ suppress_new_clusters=False, enable_rev_strand_matching=True,
+ HALT_EXEC=False,
+ save_uc_files=False)
+
+ self.ref_test_clusters2.sort()
+ self.ref_test_failures2.sort()
+ self.ref_test_new_seeds2.sort()
+
+ clusters_res[0].sort()
+ clusters_res[1].sort()
+ clusters_res[2].sort()
+ self.assertEqual(clusters_res, (self.ref_test_clusters2,
+ self.ref_test_failures2,
+ self.ref_test_new_seeds2))
+
+ def test_get_clusters_from_fasta_filepath_optimal(self):
+ """ Test OTUs from filepath functions with optimal
+ """
+ # TODO: build a small test where optimal actually has an effect;
+ # currently this only checks that running with optimal=True does
+ # not fail
+ clusters_res = \
+ get_clusters_from_fasta_filepath(self.tmp_unsorted_fasta_filepath,
+ original_fasta_path=None, save_uc_files=False,
+ percent_ID=0.90, optimal=True)
+ expected_cluster_list.sort()
+ expected_failure_list.sort()
+ expected_new_seed_list.sort()
+ clusters_res[0].sort()
+ clusters_res[1].sort()
+ clusters_res[2].sort()
+
+ self.assertEqual(clusters_res, (expected_cluster_list,
+ expected_failure_list,
+ expected_new_seed_list))
+
+ def test_get_clusters_from_fasta_filepath_suppress_sort(self):
+ """ Test OTUs from filepath functions with suppress sort
+ """
+ expected = [['uclust_test_seqs_0'], ['uclust_test_seqs_1'],
+ ['uclust_test_seqs_2'], ['uclust_test_seqs_3'],
+ ['uclust_test_seqs_4'], ['uclust_test_seqs_5'],
+ ['uclust_test_seqs_6', 'uclust_test_seqs_8'],
+ ['uclust_test_seqs_7'], ['uclust_test_seqs_9']]
+ clusters_res = \
+ get_clusters_from_fasta_filepath(self.tmp_unsorted_fasta_filepath,
+ original_fasta_path=None,
+ percent_ID=0.90, suppress_sort=True, save_uc_files=False)
+ expected_cluster_list.sort()
+ expected_failure_list.sort()
+ expected_new_seed_list.sort()
+ clusters_res[0].sort()
+ clusters_res[1].sort()
+ clusters_res[2].sort()
+
+ self.assertEqual(clusters_res, (expected_cluster_list,
+ expected_failure_list,
+ expected_new_seed_list))
+
+ def test_get_clusters_from_fasta_filepath_rev_strand_match(self):
+ """ Test OTUs from filepath functions with rev strand match
+ """
+ # seq and its rc don't cluster when enable_rev_strand_matching = False
+ expected_cluster_list = [['uclust_test_seqs_0'],
+ ['uclust_test_seqs_0_rc']]
+ expected_failure_list = []
+ expected_new_seed_list = [
+ 'uclust_test_seqs_0',
+ 'uclust_test_seqs_0_rc']
+ clusters_res = \
+ get_clusters_from_fasta_filepath(self.tmp_raw_dna_seqs_rc_filepath,
+ original_fasta_path=None, save_uc_files=False,
+ percent_ID=0.90, enable_rev_strand_matching=False)
+
+ expected_cluster_list.sort()
+ expected_failure_list.sort()
+ expected_new_seed_list.sort()
+ clusters_res[0].sort()
+ clusters_res[1].sort()
+ clusters_res[2].sort()
+ self.assertEqual(clusters_res, (expected_cluster_list,
+ expected_failure_list,
+ expected_new_seed_list))
+
+ # seq and its rc cluster when enable_rev_strand_matching = True
+ expected_cluster_list = [
+ ['uclust_test_seqs_0', 'uclust_test_seqs_0_rc']]
+ expected_failure_list = []
+ expected_new_seed_list = ['uclust_test_seqs_0']
+ clusters_res = \
+ get_clusters_from_fasta_filepath(self.tmp_raw_dna_seqs_rc_filepath,
+ original_fasta_path=None, save_uc_files=False,
+ percent_ID=0.90, enable_rev_strand_matching=True)
+
+ expected_cluster_list.sort()
+ expected_failure_list.sort()
+ expected_new_seed_list.sort()
+ clusters_res[0].sort()
+ clusters_res[1].sort()
+ clusters_res[2].sort()
+ self.assertEqual(clusters_res, (expected_cluster_list,
+ expected_failure_list,
+ expected_new_seed_list))
+
+ def test_process_uclust_pw_alignment_results(self):
+ """parsing of pairwise alignment fasta pairs file functions as expected
+ """
+ actual = list(process_uclust_pw_alignment_results(
+ self.search_align_out_fasta_pairs1, self.search_align_out_uc1))
+ expected = self.search_align_out1_expected
+
+ # iterate over results so error output will highlight the bad match
+ for a, e in zip(actual, expected):
+ self.assertEqual(a, e)
+
+ # make sure the full result objects are the same
+ self.assertEqual(actual, expected)
+
+ def test_uclust_search_and_align_from_fasta_filepath(self):
+ """ uclust_search_and_align_from_fasta_filepath functions as expected """
+ # rev comp matches allowed (default)
+ actual = list(uclust_search_and_align_from_fasta_filepath(
+ self.search_align_query1_fp, self.search_align_template1_fp))
+ self.assertEqual(actual, self.search_align_out1_expected)
+
+ # rev comp matches not allowed
+ actual = list(uclust_search_and_align_from_fasta_filepath(
+ self.search_align_query1_fp, self.search_align_template1_fp,
+ enable_rev_strand_matching=False))
+ self.assertEqual(actual, self.search_align_out1_expected[:2])
+
+ def test_uclust_search_and_align_from_fasta_filepath_protein(self):
+ """ uclust_search_and_align_from_fasta_filepath functions with protein """
+ # rev comp matches allowed (default)
+ actual = list(uclust_search_and_align_from_fasta_filepath(
+ self.search_align_query2_fp, self.search_align_template2_fp))
+ self.assertEqual(actual, self.search_align_out2_expected)
+
+ def test_uclust_supported_version(self):
+ """uclust version is supported """
+ command = 'uclust --version'
+ proc = Popen(command, shell=True, universal_newlines=True,
+ stdout=PIPE, stderr=STDOUT)
+ stdout = proc.stdout.read()
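+ # the reported version looks like e.g. 'uclust v1.2.22q'; take the
+ # text after 'v' and drop a trailing 'q' before comparing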
+ version_string = stdout.strip().split('v')[-1].strip('q')
+ try:
+ version = tuple(map(int, version_string.split('.')))
+ acceptable_version = version >= (1, 2, 22)
+ except ValueError:
+ acceptable_version = False
+
+ self.assertTrue(acceptable_version,
+ "Unsupported uclust version. 1.2.22 or later " +
+ "is required, but running %s." % version_string)
+
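+# A minimal sketch of the naming rule asserted by the get_output_filepaths
+# tests above (an illustration under a splitext-based assumption, not the
+# bfillings.uclust implementation): only the final extension is replaced
+# when deriving the .uc filepath.
+from os.path import splitext
+
+def expected_uc_filepath(output_dir, fasta_name):
+ """e.g. 'a.filtered.fasta' -> '<output_dir>/a.filtered_clusters.uc'"""
+ base = splitext(fasta_name)[0] # strips only the last extension
+ return join(output_dir, base + '_clusters.uc')
+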
+raw_dna_seqs = """>uclust_test_seqs_0
+ACGGTGGCTACAAGACGTCCCATCCAACGGGTTGGATACTTAAGGCACATCACGTCAGTTTTGTGTCAGAGCT
+>uclust_test_seqs_1
+GCCACGGTGGGTACAACACGTCCACTACATCGGCTTGGAAGGTAAAGACACGTCGCGTCAGTATTGCGTCAGGGCT
+>uclust_test_seqs_2
+CCCCCACGGTGGCAGCAACACGTCACATACAACGGGTTGGATTCTAAAGACAAACCGCGTCAAAGTTGTGTCAGAACT
+>uclust_test_seqs_3
+CCCCACGGTAGCTGCAACACGTCCCATACCACGGGTAGGATGCTAAAGACACATCGGGTCTGTTTTGTGTCAGGGCT
+>uclust_test_seqs_4
+ACCCACACGGTGGATGCAACAGATCCCATACACCGAGTTGGATGCTTAAGACGCATCGCGTGAGTTTTGCGTCAAGGCT
+>uclust_test_seqs_5
+CCGCGGTAGGTGCAACACGTCCCATACAACGGGTTGGAAGGTTAAGACACAACGCGTTAATTTTGTGTCAGGGCA
+>uclust_test_seqs_6
+CGCGGTGGCTGCAAGACGTCCCATACAACGGGTTGGATGCTTAAGACACATCGCAACAGTTTTGAGTCAGGGCT
+>uclust_test_seqs_7
+AACCCCCACGGTGGATGCCACACGCCCCATACAAAGGGTAGGATGCTTAAGACACATCGCGTCAGGTTTGTGTCAGGCCT
+>uclust_test_seqs_8
+CGGTGGCTGCAACACGTGGCATACAACGGGTTGGATGCTTAAGACACATCGCCTCAGTTTTGTGTCAGGGCT
+>uclust_test_seqs_9
+GGTGGCTGAAACACATCCCATACAACGGGTTGGATGCTTAAGACACATCGCATCAGTTTTATGTCAGGGGA
+""".split('\n')
+
+ref_dna_seqs = """>ref1 25 random bases appended to uclust_test_seqs_0 and one mismatch
+ACGGTGGCTACAAGACGTCCCATCCAACGGGTTGGATATTTAAGGCACATCACGTCAGTTTTGTGTCAGAGCTATAGCAGCCCCAGCGTTTACTTCTA
+>ref2 15 random bases prepended to uclust_test_seqs_1 and one mismatch
+GCTGCGGCGTCCTGCGCCACGGTGGGTACAACACGTCCACTACATCTGCTTGGAAGGTAAAGACACGTCGCGTCAGTATTGCGTCAGGGCT
+>ref3 5 random bases prepended and 10 random bases appended to uclust_test_seqs_2
+ATAGGCCCCCACGGTGGCAGCAACACGTCACATACAACGGGTTGGATTCTAAAGACAAACCGCGTCAAAGTTGTGTCAGAACTGCCTGATTCA
+>ref4 exact match to uclust_test_seqs_3
+CCCCACGGTAGCTGCAACACGTCCCATACCACGGGTAGGATGCTAAAGACACATCGGGTCTGTTTTGTGTCAGGGCT
+"""
+
+ref_test_clusters1 = [['uclust_test_seqs_0'], ['uclust_test_seqs_1'],
+ ['uclust_test_seqs_2'], ['uclust_test_seqs_3']]
+ref_test_failures1 = ['uclust_test_seqs_4', 'uclust_test_seqs_5',
+ 'uclust_test_seqs_6', 'uclust_test_seqs_7',
+ 'uclust_test_seqs_8', 'uclust_test_seqs_9']
+ref_test_new_seeds1 = []
+
+ref_test_clusters2 = [['uclust_test_seqs_0'], ['uclust_test_seqs_1'],
+ ['uclust_test_seqs_2'], ['uclust_test_seqs_3'],
+ ['uclust_test_seqs_4'], ['uclust_test_seqs_5'],
+ ['uclust_test_seqs_6', 'uclust_test_seqs_8'],
+ ['uclust_test_seqs_7'], ['uclust_test_seqs_9']]
+ref_test_failures2 = []
+ref_test_new_seeds2 = [
+ 'uclust_test_seqs_4', 'uclust_test_seqs_5', 'uclust_test_seqs_6',
+ 'uclust_test_seqs_7', 'uclust_test_seqs_9']
+
+
+raw_dna_seqs_rc = """>uclust_test_seqs_0
+ACGGTGGCTACAAGACGTCCCATCCAACGGGTTGGATACTTAAGGCACATCACGTCAGTTTTGTGTCAGAGCT
+>uclust_test_seqs_0_rc
+AGCTCTGACACAAAACTGACGTGATGTGCCTTAAGTATCCAACCCGTTGGATGGGACGTCTTGTAGCCACCGT
+""".split('\n')
+
+sorted_dna_seqs = """>uclust_test_seqs_7
+AACCCCCACGGTGGATGCCACACGCCCCATACAAAGGGTAGGATGCTTAAGACACATCGCGTCAGGTTTGTGTCAGGCCT
+>uclust_test_seqs_4
+ACCCACACGGTGGATGCAACAGATCCCATACACCGAGTTGGATGCTTAAGACGCATCGCGTGAGTTTTGCGTCAAGGCT
+>uclust_test_seqs_2
+CCCCCACGGTGGCAGCAACACGTCACATACAACGGGTTGGATTCTAAAGACAAACCGCGTCAAAGTTGTGTCAGAACT
+>uclust_test_seqs_3
+CCCCACGGTAGCTGCAACACGTCCCATACCACGGGTAGGATGCTAAAGACACATCGGGTCTGTTTTGTGTCAGGGCT
+>uclust_test_seqs_1
+GCCACGGTGGGTACAACACGTCCACTACATCGGCTTGGAAGGTAAAGACACGTCGCGTCAGTATTGCGTCAGGGCT
+>uclust_test_seqs_5
+CCGCGGTAGGTGCAACACGTCCCATACAACGGGTTGGAAGGTTAAGACACAACGCGTTAATTTTGTGTCAGGGCA
+>uclust_test_seqs_6
+CGCGGTGGCTGCAAGACGTCCCATACAACGGGTTGGATGCTTAAGACACATCGCAACAGTTTTGAGTCAGGGCT
+>uclust_test_seqs_0
+ACGGTGGCTACAAGACGTCCCATCCAACGGGTTGGATACTTAAGGCACATCACGTCAGTTTTGTGTCAGAGCT
+>uclust_test_seqs_8
+CGGTGGCTGCAACACGTGGCATACAACGGGTTGGATGCTTAAGACACATCGCCTCAGTTTTGTGTCAGGGCT
+>uclust_test_seqs_9
+GGTGGCTGAAACACATCCCATACAACGGGTTGGATGCTTAAGACACATCGCATCAGTTTTATGTCAGGGGA
+""".split('\n')
+
+# Clusters are created at 90% identity (percent_ID=0.90)
+uc_dna_clusters = """# uclust --input /tmp/uclust_testBGwZvcikrbNefYGRTk0u.fasta --id 0.9 --uc /tmp/uclust_testrbcO0CyBVpV9AwH3OIK1.uc
+# version=1.1.577
+# Tab-separated fields:
+# 1=Type, 2=ClusterNr, 3=SeqLength or ClusterSize, 4=PctId, 5=Strand, 6=QueryStart, 7=SeedStart, 8=Alignment, 9=QueryLabel, 10=TargetLabel
+# Record types (field 1): L=LibSeed, S=NewSeed, H=Hit, R=Reject, D=LibCluster, C=NewCluster, N=NoHit
+# For C and D types, PctId is average id with seed.
+# QueryStart and SeedStart are zero-based relative to start of sequence.
+# If minus strand, SeedStart is relative to reverse-complemented seed.
+S 0 80 * * * * * uclust_test_seqs_7 *
+S 1 79 * * * * * uclust_test_seqs_4 *
+S 2 78 * * * * * uclust_test_seqs_2 *
+S 3 77 * * * * * uclust_test_seqs_3 *
+S 4 76 * * * * * uclust_test_seqs_1 *
+S 5 75 * * * * * uclust_test_seqs_5 *
+S 6 74 * * * * * uclust_test_seqs_6 *
+S 7 73 * * * * * uclust_test_seqs_0 *
+H 6 72 91.7 + 0 0 2I72M uclust_test_seqs_8 uclust_test_seqs_6
+S 8 71 * * * * * uclust_test_seqs_9 *
+C 0 1 * * * * * uclust_test_seqs_7 *
+C 1 1 * * * * * uclust_test_seqs_4 *
+C 2 1 * * * * * uclust_test_seqs_2 *
+C 3 1 * * * * * uclust_test_seqs_3 *
+C 4 1 * * * * * uclust_test_seqs_1 *
+C 5 1 * * * * * uclust_test_seqs_5 *
+C 6 2 91.7 * * * * uclust_test_seqs_6 *
+C 7 1 * * * * * uclust_test_seqs_0 *
+C 8 1 * * * * * uclust_test_seqs_9 *""".split('\n')
+
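+# The .uc records above are tab-separated with the field layout described in
+# their header comments (field 1 = record type, 9 = query label, 10 = target
+# label). A minimal sketch of reading the H (hit) records into a
+# seed -> members map -- an illustration only, not the clusters_from_uc_file
+# implementation:
+def hits_by_seed(uc_lines):
+ """Map seed label -> list of hit labels from the H records of a .uc file."""
+ hits = {}
+ for line in uc_lines:
+ if not line or line.startswith('#'):
+ continue
+ fields = line.split('\t')
+ if fields[0] == 'H':
+ hits.setdefault(fields[9], []).append(fields[8])
+ return hits
+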
+expected_cluster_list = [['uclust_test_seqs_7'],
+ ['uclust_test_seqs_4'],
+ ['uclust_test_seqs_2'],
+ ['uclust_test_seqs_3'],
+ ['uclust_test_seqs_1'],
+ ['uclust_test_seqs_5'],
+ ['uclust_test_seqs_6',
+ 'uclust_test_seqs_8'],
+ ['uclust_test_seqs_0'],
+ ['uclust_test_seqs_9']]
+expected_failure_list = []
+expected_new_seed_list = [
+ 'uclust_test_seqs_7', 'uclust_test_seqs_4', 'uclust_test_seqs_2',
+ 'uclust_test_seqs_3', 'uclust_test_seqs_1', 'uclust_test_seqs_5', 'uclust_test_seqs_6',
+ 'uclust_test_seqs_0', 'uclust_test_seqs_9']
+
+search_align_query1 = """>1_like
+TACGGCTACCTTGTTACGACTTCATCCCAATCATTTGTTCCACCTTCGACGGCTA
+>2_like
+ATGATGATTTGACGTCATCCCCACCTTCCTCCGGTTTGTCACCGGGATGGCAACTAAG
+>2_like_rc
+CTTAGTTGCCATCCCGGTGACAAACCGGAGGAAGGTGGGGATGACGTCAAATCATCAT
+>rand
+TTGCGACGAGCGGACGGCCGGGTGTATGTCGTCATATATATGTGTCTGCCTATCGTTACGTACACTCGTCGTCT
+"""
+
+search_align_template1 = """>1
+AGAAAGGAGGTGATCCAGCCGCACCTTCCGATACGGCTACCTTGTTACGACTTCACCCCAATCATTTGTTCCACCTTCGACGGCTAGCTCCAAATGGTTACTCCACCGGCTTCGGGTGTTACAAACTC
+>2
+AGCCCAAATCATAAGGGGCATGATGATTTGACGTCATCCCCACCTTCCTCCGGTTTGTCACCGGGATGGCAACTAAGCTTAAGGGTTGCGCT
+"""
+
+search_align_query2 = """>1_like
+PRTEINACYYPL
+>2_like
+AGGYTPPLVN
+>rand
+GGTYPARREE
+"""
+
+search_align_template2 = """>1
+PRTELNACYYPL
+>2
+AGGYTRPPLVN
+"""
+
+search_align_out2_expected = [
+ ('1_like', '1', 'PRTEINACYYPL', 'PRTELNACYYPL', 91.70000),
+ ('2_like', '2', 'AGGYT-PPLVN', 'AGGYTRPPLVN', 100.0)]
+
+search_align_out_fasta_pairs1 = """>1_like
+-------------------------------TACGGCTACCTTGTTACGACTTCATCCCAATCATTTGTTCCACCTTCGACGGCTA------------------------------------------
+>1+
+AGAAAGGAGGTGATCCAGCCGCACCTTCCGATACGGCTACCTTGTTACGACTTCACCCCAATCATTTGTTCCACCTTCGACGGCTAGCTCCAAATGGTTACTCCACCGGCTTCGGGTGTTACAAACTC
+
+>2_like
+-------------------ATGATGATTTGACGTCATCCCCACCTTCCTCCGGTTTGTCACCGGGATGGCAACTAAG---------------
+>2+
+AGCCCAAATCATAAGGGGCATGATGATTTGACGTCATCCCCACCTTCCTCCGGTTTGTCACCGGGATGGCAACTAAGCTTAAGGGTTGCGCT
+
+>2_like_rc
+---------------CTTAGTTGCCATCCCGGTGACAAACCGGAGGAAGGTGGGGATGACGTCAAATCATCAT-------------------
+>2-
+AGCGCAACCCTTAAGCTTAGTTGCCATCCCGGTGACAAACCGGAGGAAGGTGGGGATGACGTCAAATCATCATGCCCCTTATGATTTGGGCT
+""".split('\n')
+
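+# The --fastapairs output above interleaves records two at a time: the
+# aligned query, then the aligned template, whose label carries a '+'/'-'
+# strand suffix, with a blank line between pairs. A minimal pairing reader,
+# sketched under that assumption (the real parsing, including percent
+# identities from the .uc file, is done by
+# process_uclust_pw_alignment_results):
+def iter_alignment_pairs(fasta_pair_lines):
+ """Yield (query_label, hit_label, query_aln, hit_aln) tuples."""
+ records = []
+ for line in fasta_pair_lines:
+ line = line.strip()
+ if not line:
+ continue
+ if line.startswith('>'):
+ records.append([line[1:], []])
+ else:
+ records[-1][1].append(line)
+ for i in range(0, len(records) - 1, 2):
+ (q_label, q_seq), (h_label, h_seq) = records[i], records[i + 1]
+ yield q_label, h_label, ''.join(q_seq), ''.join(h_seq)
+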
+search_align_out_uc1 = """# uclust --input sm_query.fasta --lib sm_template.fasta --id 0.75 --libonly --rev --maxaccepts 8 --maxrejects 32 --fastapairs sm_pw.fasta --uc sm_result.uc
+# version=1.1.577
+# Tab-separated fields:
+# 1=Type, 2=ClusterNr, 3=SeqLength or ClusterSize, 4=PctId, 5=Strand, 6=QueryStart, 7=SeedStart, 8=Alignment, 9=QueryLabel, 10=TargetLabel
+# Record types (field 1): L=LibSeed, S=NewSeed, H=Hit, R=Reject, D=LibCluster, C=NewCluster, N=NoHit
+# For C and D types, PctId is average id with seed.
+# QueryStart and SeedStart are zero-based relative to start of sequence.
+# If minus strand, SeedStart is relative to reverse-complemented seed.
+L 0 128 * * * * * 1 *
+H 0 55 98.2 + 0 0 31I55M42I 1_like 1
+L 1 92 * * * * * 2 *
+H 1 58 100.0 + 0 0 19I58M15I 2_like 2
+H 1 58 100.0 - 0 0 15I58M19I 2_like_rc 2
+N * 74 * * * * * rand *
+D 0 2 * * * * 98.2 1 *
+D 1 3 * * * * 100.0 2 *
+""".split('\n')
+
+search_align_out1_expected = [
+ ('1_like', '1', '-------------------------------TACGGCTACCTTGTTACGACTTCATCCCAATCATTTGTTCCACCTTCGACGGCTA------------------------------------------',
+ 'AGAAAGGAGGTGATCCAGCCGCACCTTCCGATACGGCTACCTTGTTACGACTTCACCCCAATCATTTGTTCCACCTTCGACGGCTAGCTCCAAATGGTTACTCCACCGGCTTCGGGTGTTACAAACTC', 98.2),
+
+ ('2_like', '2', '-------------------ATGATGATTTGACGTCATCCCCACCTTCCTCCGGTTTGTCACCGGGATGGCAACTAAG---------------',
+ 'AGCCCAAATCATAAGGGGCATGATGATTTGACGTCATCCCCACCTTCCTCCGGTTTGTCACCGGGATGGCAACTAAGCTTAAGGGTTGCGCT', 100.0),
+
+ ('2_like_rc RC', '2', '-------------------ATGATGATTTGACGTCATCCCCACCTTCCTCCGGTTTGTCACCGGGATGGCAACTAAG---------------', 'AGCCCAAATCATAAGGGGCATGATGATTTGACGTCATCCCCACCTTCCTCCGGTTTGTCACCGGGATGGCAACTAAGCTTAAGGGTTGCGCT', 100.0)]
+
+uc_lines1 = """# uclust --input q.fasta --lib r.fasta --uc results.uc --id 0.90 --libonly --rev
+# version=1.1.579
+# Tab-separated fields:
+# 1=Type, 2=ClusterNr, 3=SeqLength or ClusterSize, 4=PctId, 5=Strand, 6=QueryStart, 7=SeedStart, 8=Alignment, 9=QueryLabel, 10=TargetLabel
+# Record types (field 1): L=LibSeed, S=NewSeed, H=Hit, R=Reject, D=LibCluster, C=NewCluster, N=NoHit
+# For C and D types, PctId is average id with seed.
+# QueryStart and SeedStart are zero-based relative to start of sequence.
+# If minus strand, SeedStart is relative to reverse-complemented seed.
+N * 80 * * * * * s1 some comment *
+S 4 80 * * * * * s2 some other comment *
+H 2 78 100.0 + 0 0 5I78M10I s3 yet another comment s2""".split('\n')
+
+uc_lines_w_multiple_hits_per_query = """# uclust --input q.fasta --lib r.fasta --uc results.uc --id 0.90 --libonly --rev
+# version=1.1.579
+# Tab-separated fields:
+# 1=Type, 2=ClusterNr, 3=SeqLength or ClusterSize, 4=PctId, 5=Strand, 6=QueryStart, 7=SeedStart, 8=Alignment, 9=QueryLabel, 10=TargetLabel
+# Record types (field 1): L=LibSeed, S=NewSeed, H=Hit, R=Reject, D=LibCluster, C=NewCluster, N=NoHit
+# For C and D types, PctId is average id with seed.
+# QueryStart and SeedStart are zero-based relative to start of sequence.
+# If minus strand, SeedStart is relative to reverse-complemented seed.
+N * 80 * * * * * s1 some comment *
+S 4 80 * * * * * s2 some other comment *
+S 4 80 * * * * * s4 *
+H 2 78 100.0 + 0 0 5I78M10I s3 yet another comment s2
+H 2 78 98.0 + 0 0 5I78M10I s3 yet another comment s4
+""".split('\n')
+
+uc_lines_overlapping_lib_input_seq_ids = """# uclust --maxrejects 32 --input /tmp/OtuPickerbb092OWRWLWqlBR2BmTZ.fasta --id 0.97 --uc /tmp/uclust_clustersLf5Oqv0SvGTZo1mVWBqK.uc --rev --usersort --maxaccepts 8 --lib r.fasta
+# version=1.1.16
+# Tab-separated fields:
+# 1=Type, 2=ClusterNr, 3=SeqLength or ClusterSize, 4=PctId, 5=Strand, 6=QueryStart, 7=SeedStart, 8=Alignment, 9=QueryLabel, 10=TargetLabel
+# Record types (field 1): L=LibSeed, S=NewSeed, H=Hit, R=Reject, D=LibCluster, C=NewCluster, N=NoHit
+# For C and D types, PctId is average id with seed.
+# QueryStart and SeedStart are zero-based relative to start of sequence.
+# If minus strand, SeedStart is relative to reverse-complemented seed.
+S 1 24 * * * * * 3 *
+H 1 24 100.0 + 0 0 24M 4 3
+L 0 54 * * * * * 3 *
+H 0 54 100.0 + 0 0 54M 2 3
+D 0 2 * * * * 100.0 3 *
+C 1 2 100.0 * * * * 3 *
+""".split('\n')
+
+if __name__ == '__main__':
+ main()
diff --git a/bfillings/tests/test_usearch.py b/bfillings/tests/test_usearch.py
new file mode 100755
index 0000000..5e0ba0b
--- /dev/null
+++ b/bfillings/tests/test_usearch.py
@@ -0,0 +1,2000 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+
+"""
+Provides unit tests for the usearch.py module.
+"""
+
+from os import close
+from os.path import basename, join, exists
+from shutil import rmtree
+from glob import glob
+from unittest import TestCase, main
+from tempfile import mkstemp, mkdtemp
+
+from skbio.util import remove_files
+from skbio.parse.sequences import parse_fasta
+
+from bfillings.usearch import (clusters_from_blast_uc_file,
+ usearch_fasta_sort_from_filepath,
+ usearch_dereplicate_exact_subseqs,
+ usearch_dereplicate_exact_seqs,
+ usearch_sort_by_abundance,
+ usearch_cluster_error_correction,
+ usearch_chimera_filter_de_novo,
+ usearch_chimera_filter_ref_based,
+ usearch_cluster_seqs,
+ enumerate_otus, assign_reads_to_otus,
+ usearch_qf, concatenate_fastas,
+ get_retained_chimeras,
+ assign_dna_reads_to_protein_database,
+ usearch61_ref_cluster,
+ usearch61_denovo_cluster,
+ sort_by_abundance_usearch61,
+ sort_by_length_usearch61,
+ usearch61_cluster_ref,
+ usearch61_fast_cluster,
+ usearch61_smallmem_cluster,
+ parse_dereplicated_uc,
+ parse_usearch61_clusters,
+ merge_clusters_dereplicated_seqs,
+ merge_failures_dereplicated_seqs,
+ parse_usearch61_failures,
+ usearch61_chimera_check_denovo,
+ usearch61_chimera_check_ref)
+
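+
+# Several tests below (test_parse_dereplicated_uc through
+# test_merge_failures_dereplicated_seqs) exercise how duplicates collapsed
+# during dereplication are folded back into the final clusters. A minimal
+# sketch of that expansion -- an illustration only, not the
+# merge_clusters_dereplicated_seqs implementation -- where each cluster
+# member expands to itself plus its dereplicated duplicates:
+def merge_sketch(clustered_ids, derep_ids):
+ merged = {}
+ for otu_id, members in clustered_ids.items():
+ expanded = []
+ for seq_id in members:
+ expanded.append(seq_id)
+ expanded.extend(derep_ids.get(seq_id, []))
+ merged[otu_id] = expanded
+ return merged
+
+# e.g. merge_sketch({'seq4': ['seq2'], 'seq1': ['seq1']},
+# {'seq2': ['seq3', 'seq4'], 'seq1': []})
+# == {'seq4': ['seq2', 'seq3', 'seq4'], 'seq1': ['seq1']}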
+
+class Usearch61Tests(TestCase):
+
+ """ Tests for usearch 6.1 functionality """
+
+ def setUp(self):
+ # create the temporary input files
+
+ self.output_dir = '/tmp/'
+
+ self.dna_seqs_1 = dna_seqs_1
+ self.usearch_ref_seqs1 = usearch_ref_seqs1
+ self.dna_seqs_1_subset = dna_seqs_1_subset
+ self.dna_seqs_with_dups = dna_seqs_with_dups2
+ self.usearch61_dereplicated_uc_lines = usearch61_dereplicated_uc_lines
+ self.usearch61_clustered_uc_lines = usearch61_clustered_uc_lines
+ self.usearch61_clustered_uc_lines_ref =\
+ usearch61_clustered_uc_lines_ref
+ self.usearch61_clustered_ref_lines = usearch61_clustered_ref_lines
+ self.de_novo_chimera_seqs = de_novo_chimera_seqs
+ self.expected_usearch61_denovo_uchime_file =\
+ expected_usearch61_denovo_uchime_file
+ self.reference_seqs_fp = reference_seqs_fp
+ self.expected_usearch61_ref_uchime_file =\
+ expected_usearch61_ref_uchime_file
+
+ f, self.tmp_dna_seqs_1 = mkstemp(prefix='UsearchOtuPickerTest_',
+ suffix='.fasta')
+ close(f)
+ seq_file = open(self.tmp_dna_seqs_1, 'w')
+ seq_file.write(self.dna_seqs_1)
+ seq_file.close()
+
+ f, self.tmp_usearch_ref_seqs1 = mkstemp(prefix='UsearchOtuPickerTest_',
+ suffix='.fasta')
+ close(f)
+ seq_file = open(self.tmp_usearch_ref_seqs1, 'w')
+ seq_file.write(self.usearch_ref_seqs1)
+ seq_file.close()
+
+ f, self.tmp_dna_seqs_1_subset = mkstemp(prefix='UsearchOtuPickerTest_',
+ suffix='.fasta')
+ close(f)
+ seq_file = open(self.tmp_dna_seqs_1_subset, 'w')
+ seq_file.write(self.dna_seqs_1_subset)
+ seq_file.close()
+
+ f, self.tmp_dna_seqs_with_dups = \
+ mkstemp(prefix='UsearchOtuPickerTest_', suffix='.fasta')
+ close(f)
+ seq_file = open(self.tmp_dna_seqs_with_dups, "w")
+ seq_file.write(self.dna_seqs_with_dups)
+ seq_file.close()
+
+ f, self.tmp_de_novo_chimera_seqs = \
+ mkstemp(prefix='Usearch61denovoChimera_', suffix='.fasta')
+ close(f)
+ seq_file = open(self.tmp_de_novo_chimera_seqs, 'w')
+ seq_file.write(self.de_novo_chimera_seqs)
+ seq_file.close()
+
+ f, self.tmp_ref_chimera_seqs = mkstemp(prefix="Usearch61refChimera_",
+ suffix='.fasta')
+ close(f)
+ seq_file = open(self.tmp_ref_chimera_seqs, "w")
+ seq_file.write(self.reference_seqs_fp)
+ seq_file.close()
+
+ self._files_to_remove =\
+ [self.tmp_dna_seqs_1, self.tmp_usearch_ref_seqs1,
+ self.tmp_dna_seqs_1_subset, self.tmp_dna_seqs_with_dups,
+ self.tmp_de_novo_chimera_seqs, self.tmp_ref_chimera_seqs]
+
+ self._dirs_to_remove = []
+
+ def tearDown(self):
+ remove_files(self._files_to_remove)
+ if self._dirs_to_remove:
+ for curr_dir in self._dirs_to_remove:
+ rmtree(curr_dir)
+
+ def test_usearch61_ref_default_params(self):
+ """ usearch61 reference OTU picking works with default settings """
+
+ clusters, failures = usearch61_ref_cluster(self.tmp_dna_seqs_1,
+ self.tmp_usearch_ref_seqs1, output_dir=self.output_dir,
+ save_intermediate_files=False, remove_usearch_logs=True)
+
+ # Should all fall into single, de novo clusters
+
+ expected_failures = []
+
+ self.assertEqual(failures, expected_failures)
+
+ expected_clusters = [['uclust_test_seqs_9'], ['uclust_test_seqs_8'],
+ ['uclust_test_seqs_3'], ['uclust_test_seqs_5'], ['uclust_test_seqs_4'],
+ ['uclust_test_seqs_1'], ['uclust_test_seqs_0'], ['uclust_test_seqs_2'],
+ ['uclust_test_seqs_7'], ['uclust_test_seqs_6']]
+
+ self.assertEqual(len(clusters), 10)
+
+ for curr_cluster in clusters.values():
+ self.assertTrue(curr_cluster in expected_clusters)
+
+ def test_usearch61_ref_default_params_suppressed_clusters(self):
+ """ usearch61 reference OTU picking, suppressed clusters """
+
+ clusters, failures = usearch61_ref_cluster(self.tmp_dna_seqs_1,
+ self.tmp_usearch_ref_seqs1, suppress_new_clusters=True,
+ output_dir=self.output_dir,
+ save_intermediate_files=False, remove_usearch_logs=True)
+
+ # Should all fail as the reference database does not match.
+
+ expected_clusters = {}
+
+ expected_failures = ['uclust_test_seqs_0', 'uclust_test_seqs_9',
+ 'uclust_test_seqs_4', 'uclust_test_seqs_7', 'uclust_test_seqs_2',
+ 'uclust_test_seqs_1', 'uclust_test_seqs_3', 'uclust_test_seqs_8',
+ 'uclust_test_seqs_6', 'uclust_test_seqs_5']
+
+ self.assertEqual(clusters, expected_clusters)
+
+ for curr_failure in failures:
+ self.assertTrue(curr_failure in expected_failures)
+
+ def test_usearch61_ref_default_params_matches_ref(self):
+ """ usearch61 reference OTU picking, matches ref OTU IDs """
+
+ clusters, failures = usearch61_ref_cluster(self.tmp_dna_seqs_1,
+ self.tmp_dna_seqs_1, suppress_new_clusters=True,
+ output_dir=self.output_dir,
+ save_intermediate_files=False, remove_usearch_logs=True)
+
+ # Should all fall into single, ref-based clusters
+
+ expected_clusters = {'uclust_test_seqs_5': ['uclust_test_seqs_5'],
+ 'uclust_test_seqs_4': ['uclust_test_seqs_4'],
+ 'uclust_test_seqs_7': ['uclust_test_seqs_7'],
+ 'uclust_test_seqs_6': ['uclust_test_seqs_6'],
+ 'uclust_test_seqs_1': ['uclust_test_seqs_1'],
+ 'uclust_test_seqs_0': ['uclust_test_seqs_0'],
+ 'uclust_test_seqs_3': ['uclust_test_seqs_3'],
+ 'uclust_test_seqs_2': ['uclust_test_seqs_2'],
+ 'uclust_test_seqs_9': ['uclust_test_seqs_9'],
+ 'uclust_test_seqs_8': ['uclust_test_seqs_8']}
+
+ expected_failures = []
+
+ self.assertEqual(clusters, expected_clusters)
+ self.assertEqual(failures, expected_failures)
+
+ def test_usearch61_ref_open_ref(self):
+ """ usearch61 does open reference OTU picking """
+
+ clusters, failures = usearch61_ref_cluster(self.tmp_dna_seqs_1,
+ self.tmp_dna_seqs_1_subset, percent_id=0.98, rev=True,
+ save_intermediate_files=False, minlen=44,
+ output_dir=self.output_dir, remove_usearch_logs=True,
+ verbose=False, wordlength=12, usearch_fast_cluster=False,
+ usearch61_sort_method='abundance', otu_prefix="denovo",
+ usearch61_maxrejects=100, usearch61_maxaccepts=4,
+ sizeorder=True)
+
+ # Should all fall into single, ref-based & denovo clusters
+
+ expected_ref_results = {'uclust_test_seqs_1': ['uclust_test_seqs_1'],
+ 'uclust_test_seqs_0': ['uclust_test_seqs_0'],
+ 'uclust_test_seqs_3': ['uclust_test_seqs_3'],
+ 'uclust_test_seqs_2': ['uclust_test_seqs_2']}
+
+ expected_denovo_results = [['uclust_test_seqs_5'],
+ ['uclust_test_seqs_7'], ['uclust_test_seqs_8'], ['uclust_test_seqs_4'],
+ ['uclust_test_seqs_6'], ['uclust_test_seqs_9']]
+
+ self.assertEqual(len(clusters), 10)
+
+ for curr_ref_result in expected_ref_results:
+ self.assertEqual(clusters[curr_ref_result],
+ expected_ref_results[curr_ref_result])
+ for curr_denovo_result in expected_denovo_results:
+ self.assertTrue(curr_denovo_result in clusters.values())
+
+ expected_failures = []
+
+ self.assertEqual(failures, expected_failures)
+
+ def test_usearch61_denovo_default_params(self):
+ """ usearch61 denovo OTU picking works with default settings """
+
+ clusters = usearch61_denovo_cluster(self.tmp_dna_seqs_1,
+ output_dir=self.output_dir, save_intermediate_files=False,
+ remove_usearch_logs=True)
+
+ # Should all fall into single, de novo clusters
+
+ expected_clusters = [['uclust_test_seqs_9'], ['uclust_test_seqs_8'],
+ ['uclust_test_seqs_3'], ['uclust_test_seqs_5'], ['uclust_test_seqs_4'],
+ ['uclust_test_seqs_1'], ['uclust_test_seqs_0'], ['uclust_test_seqs_2'],
+ ['uclust_test_seqs_7'], ['uclust_test_seqs_6']]
+
+ self.assertEqual(len(clusters), 10)
+
+ for curr_cluster in clusters.values():
+ self.assertTrue(curr_cluster in expected_clusters)
+
+ def test_usearch61_denovo_length_sorting(self):
+ """ usearch61 denovo OTU picking works with length sorting """
+
+ clusters = usearch61_denovo_cluster(self.tmp_dna_seqs_1,
+ output_dir=self.output_dir, save_intermediate_files=False,
+ remove_usearch_logs=True, usearch61_sort_method='length')
+
+ # Should all fall into single, de novo clusters
+
+ expected_clusters = [['uclust_test_seqs_9'], ['uclust_test_seqs_8'],
+ ['uclust_test_seqs_3'], ['uclust_test_seqs_5'], ['uclust_test_seqs_4'],
+ ['uclust_test_seqs_1'], ['uclust_test_seqs_0'], ['uclust_test_seqs_2'],
+ ['uclust_test_seqs_7'], ['uclust_test_seqs_6']]
+
+ self.assertEqual(len(clusters), 10)
+
+ for curr_cluster in clusters.values():
+ self.assertTrue(curr_cluster in expected_clusters)
+
+ def test_usearch61_denovo_no_sorting(self):
+ """ usearch61 denovo OTU picking works with no sorting """
+
+ clusters = usearch61_denovo_cluster(self.tmp_dna_seqs_1,
+ output_dir=self.output_dir, save_intermediate_files=False,
+ remove_usearch_logs=True, usearch61_sort_method='None')
+
+ # Should all fall into single, de novo clusters
+
+ expected_clusters = [['uclust_test_seqs_9'], ['uclust_test_seqs_8'],
+ ['uclust_test_seqs_3'], ['uclust_test_seqs_5'], ['uclust_test_seqs_4'],
+ ['uclust_test_seqs_1'], ['uclust_test_seqs_0'], ['uclust_test_seqs_2'],
+ ['uclust_test_seqs_7'], ['uclust_test_seqs_6']]
+
+ self.assertEqual(len(clusters), 10)
+
+ for curr_cluster in clusters.values():
+ self.assertTrue(curr_cluster in expected_clusters)
+
+ def test_usearch61_denovo_fast_cluster(self):
+ """ usearch61 denovo OTU picking works with fast_cluster sorting """
+
+ clusters = usearch61_denovo_cluster(self.tmp_dna_seqs_1,
+ output_dir=self.output_dir, save_intermediate_files=False,
+ remove_usearch_logs=True, usearch61_sort_method='length',
+ usearch_fast_cluster=True)
+
+ # Should all fall into single, de novo clusters
+
+ expected_clusters = [['uclust_test_seqs_9'], ['uclust_test_seqs_8'],
+ ['uclust_test_seqs_3'], ['uclust_test_seqs_5'], ['uclust_test_seqs_4'],
+ ['uclust_test_seqs_1'], ['uclust_test_seqs_0'], ['uclust_test_seqs_2'],
+ ['uclust_test_seqs_7'], ['uclust_test_seqs_6']]
+
+ self.assertEqual(len(clusters), 10)
+
+ for curr_cluster in clusters.values():
+ self.assertTrue(curr_cluster in expected_clusters)
+
+ def test_sort_by_abundance_usearch61(self):
+ """ usearch61 sorts by abundance successfully """
+
+ f, sorted_fna_fp = mkstemp(prefix='UsearchOtuPickerTest_',
+ suffix='.fasta')
+ close(f)
+ f, sorted_uc_fp = mkstemp(prefix='UsearchOtuPickerTest_', suffix='.uc')
+ close(f)
+
+ output_fna_filepath, output_uc_filepath, app_result =\
+ sort_by_abundance_usearch61(self.tmp_dna_seqs_with_dups,
+ self.output_dir, remove_usearch_logs=True,
+ output_fna_filepath=sorted_fna_fp,
+ output_uc_filepath=sorted_uc_fp, log_name="abundance_sorted.log")
+
+ output_fna = [
+ line for line in parse_fasta(open(output_fna_filepath, "U"))]
+
+ expected_fna = [('seq2;size=3;',
+ 'TTGGGCCGTGTCTCAGTCCCAATGTGGCCGTCACCCTCTCAGGCCGGCTACTGATCGTCGCCTTGGTGGGCCTTTACCCC'),
+ ('seq1;size=1;',
+ 'GCCAACCAGCTAATCAGACGCGGGTCCATCTTGCACCACCGGAGTTTTTCACACTGCTTCATGCGAAGCTGTGCGCTTAA')]
+
+ self._files_to_remove.append(sorted_fna_fp)
+ self._files_to_remove.append(sorted_uc_fp)
+
+ self.assertEqual(output_fna, expected_fna)
+
+ def test_sort_by_length_usearch61(self):
+ """ usearch61 sorts by length successfully """
+
+ f, sorted_fna_fp = mkstemp(prefix='UsearchOtuPickerTest_',
+ suffix='.fasta')
+ close(f)
+
+ output_fna_filepath, app_result =\
+ sort_by_length_usearch61(self.tmp_usearch_ref_seqs1,
+ self.output_dir, remove_usearch_logs=True,
+ output_fna_filepath=sorted_fna_fp)
+
+ output_fna = [
+ line for line in parse_fasta(open(output_fna_filepath, "U"))]
+
+ expected_fna = [('ref1',
+ 'CGCGTGTATGAAGAAGGCCTTCGGGTTGTAAAGTACTTTCAGCGGGGAGGAGGGAGTAAAGTTAATACCTTTGCTCATTGACGTTACCCGCAGAAGAAGCACCGGCTAACTCCGTGCCAGCAGCCGCGGTAATACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGCACGCAGGCGGTTTGTTAAGTCA'),
+ ('L07864',
+ 'GGCTCAGATTGAACGCTGGCGGCATGCCTAACACATGCAAGTCGAACGGTAACAGGCGGAGCTTGCTCTGCGCTGACGAGTGGCGGACGGGTGAGTAATGCATGGGAATCTGCCATATAGTGGGGGACAACTGGGGAAACCCAGGCTAATACCGCATAATCTCTACGGAGGAAAGGCTTC'),
+ ('EU199232',
+ 'TACGCGCGGAAATCGAGCGAGATTGGGAACGCAAGTTCCTGAGTATTGCGGCGAACGGGTGAGTAAGACGTGGGTGATCTACCCCTAGGGTGGGAATAACCCGGGGAAACCCGGGCTAATACCGAATAAGACCACAGGAGGCGACTCCAGAGGGTCAAAGGGAGCCTTGGCCTCCCCC')]
+ self._files_to_remove.append(sorted_fna_fp)
+
+ self.assertEqual(output_fna, expected_fna)
+
+ def test_usearch61_cluster_ref(self):
+ """ usearch61 reference OTU picking application call successful """
+
+ f, output_uc_fp = mkstemp(prefix='UsearchOtuPickerTest_', suffix='.uc')
+ close(f)
+
+ uc_fp, failures = usearch61_cluster_ref(self.tmp_dna_seqs_1,
+ self.tmp_dna_seqs_1, output_dir=self.output_dir,
+ remove_usearch_logs=True, output_uc_filepath=output_uc_fp)
+
+ self._files_to_remove.append(uc_fp)
+
+ actual_uc_lines = [line.strip() for line in open(uc_fp, "U")]
+
+ # Exact output is difficult to test, as the numbers and the order of
+ # lines change between runs, so for now just check the line count.
+
+ self.assertEqual(len(actual_uc_lines), 10)
+
+ def test_usearch61_fast_cluster(self):
+ """ usearch61 fast cluster OTU picking application call successful """
+
+ f, output_uc_fp = mkstemp(prefix='UsearchOtuPickerTest_', suffix='.uc')
+ close(f)
+
+ uc_fp, failures = usearch61_fast_cluster(self.tmp_dna_seqs_1,
+ output_dir=self.output_dir,
+ remove_usearch_logs=True, output_uc_filepath=output_uc_fp)
+
+ self._files_to_remove.append(uc_fp)
+
+ actual_uc_lines = [line.strip() for line in open(uc_fp, "U")]
+
+ # Exact output is difficult to test, as the numbers and the order of
+ # lines change between runs, so for now just check the line count.
+
+ self.assertEqual(len(actual_uc_lines), 20)
+
+ def test_usearch61_cluster_smallmem(self):
+ """ usearch61 smallmem OTU picking application call successful """
+
+ f, output_uc_fp = mkstemp(prefix='UsearchOtuPickerTest_', suffix='.uc')
+ close(f)
+
+ uc_fp, failures = usearch61_smallmem_cluster(self.tmp_dna_seqs_1,
+ output_dir=self.output_dir,
+ remove_usearch_logs=True, output_uc_filepath=output_uc_fp)
+
+ self._files_to_remove.append(uc_fp)
+
+ actual_uc_lines = [line.strip() for line in open(uc_fp, "U")]
+
+ # Exact output is difficult to test, as the numbers and the order of
+ # lines change between runs, so for now just check the line count.
+
+ self.assertEqual(len(actual_uc_lines), 20)
+
+ def test_parse_dereplicated_uc(self):
+ """ Parses dereplicated usearch61 uc file successfully """
+
+ actual_derep_ids =\
+ parse_dereplicated_uc(self.usearch61_dereplicated_uc_lines)
+
+ expected_derep_ids = {'seq2': ['seq3', 'seq4'], 'seq1': []}
+
+ self.assertEqual(actual_derep_ids, expected_derep_ids)
+
+ def test_parse_usearch61_clusters_denovo(self):
+ """ Parses usearch61 de novo clusters uc file correctly """
+
+ actual_parsed_clusters, failures =\
+ parse_usearch61_clusters(self.usearch61_clustered_uc_lines,
+ ref_clustered=False)
+
+ expected_parsed_clusters =\
+ ({'denovo0': ['seq2'], 'denovo1': ['seq1']})
+
+ self.assertEqual(actual_parsed_clusters, expected_parsed_clusters)
+
+ def test_parse_usearch61_clusters_ref(self):
+ """ Parses usearch61 ref clusters uc file correctly """
+
+ actual_parsed_clusters, failures =\
+ parse_usearch61_clusters(self.usearch61_clustered_uc_lines_ref,
+ otu_prefix='', ref_clustered=True)
+
+ expected_parsed_clusters =\
+ ({'seq4': ['seq2'], 'seq1': ['seq1']})
+
+ self.assertEqual(actual_parsed_clusters, expected_parsed_clusters)
+
+ def test_merge_clusters_dereplicated_seqs(self):
+ """ Properly merges dereplicated and clustered sequences """
+
+ derep_ids = {'seq2': ['seq3', 'seq4'], 'seq1': []}
+
+ clustered_ids = ({'seq4': ['seq2'], 'seq1': ['seq1']})
+
+ merged_ids = merge_clusters_dereplicated_seqs(clustered_ids,
+ derep_ids)
+
+ expected_ids = {'seq1': ['seq1'], 'seq4': ['seq2', 'seq3', 'seq4']}
+
+ self.assertEqual(merged_ids, expected_ids)
+
+ def test_merge_failures_dereplicated_seqs(self):
+ """ Usearch61 properly merges dereplicated seqs, ref based failures """
+
+ failures = ['seq2']
+ derep_ids = {'seq2': ['seq3', 'seq4'], 'seq1': []}
+
+ merged_failures = merge_failures_dereplicated_seqs(failures,
+ derep_ids)
+
+ expected_failures = ['seq2', 'seq3', 'seq4']
+
+ self.assertEqual(merged_failures, expected_failures)
+
+ def test_parse_usearch61_failures(self):
+ """ Writes failures out to fasta file """
+
+ failures = ['seq2', 'seq3', 'seq4']
+ f, filtered_fna_fp = mkstemp(prefix='UsearchOtuPickerTest_',
+ suffix='.fasta')
+ close(f)
+ output_fp = parse_usearch61_failures(self.tmp_dna_seqs_with_dups,
+ failures, filtered_fna_fp)
+
+ self._files_to_remove.append(output_fp)
+
+ output_fna = [
+ line for line in parse_fasta(open(output_fp, "U"))]
+
+ expected_fna = [(
+ 'seq2',
+ 'TTGGGCCGTGTCTCAGTCCCAATGTGGCCGTCACCCTCTCAGGCCGGCTACTGATCGTCGCCTTGGTGGGCCTTTACCCC'),
+ ('seq3',
+ 'TTGGGCCGTGTCTCAGTCCCAATGTGGCCGTCACCCTCTCAGGCCGGCTACTGATCGTCGCCTTGGTGGGCCTTTACCCC'),
+ ('seq4',
+ 'TTGGGCCGTGTCTCAGTCCCAATGTGGCCGTCACCCTCTCAGGCCGGCTACTGATCGTCGCCTTGGTGGGCCTTTACCCC')]
+ self.assertEqual(output_fna, expected_fna)
+
+ # Chimera tests
+
+ def test_usearch61_denovo_chimera_detection(self):
+ """ usearch61 denovo chimera detection correctly flags chimeras """
+
+ uchime_fp = join(self.output_dir, "uchime_denovo.uchime")
+
+ uchime_fp, app_result =\
+ usearch61_chimera_check_denovo(self.tmp_de_novo_chimera_seqs,
+ uchime_denovo_fp=uchime_fp,
+ output_dir=self.output_dir,
+ remove_usearch_logs=True)
+
+ uchime_f = open(uchime_fp, "U")
+
+ actual_lines = [line.strip() for line in uchime_f]
+
+ # Chimera calculations have a system-dependent stochastic component,
+ # so only check the Y/N chimera flag on each line
+
+ expected_chimera_ixs = [11, 16]
+
+ for ix, line in enumerate(actual_lines):
+ curr_chimera_flag = line.split('\t')[-1]
+ if ix in expected_chimera_ixs:
+ self.assertEqual(curr_chimera_flag, "Y")
+ else:
+ self.assertEqual(curr_chimera_flag, "N")
+
+ self._files_to_remove.append(uchime_fp)
+
+ def test_usearch61_ref_chimera_detection(self):
+ """ usearch61 ref chimera detection correctly flags chimeras """
+
+ uchime_fp = join(self.output_dir, "uchime_ref.uchime")
+
+ uchime_fp, app_result =\
+ usearch61_chimera_check_ref(self.tmp_de_novo_chimera_seqs,
+ uchime_ref_fp=uchime_fp,
+ reference_seqs_fp=
+ self.tmp_ref_chimera_seqs,
+ output_dir=self.output_dir,
+ remove_usearch_logs=True)
+
+ uchime_f = open(uchime_fp, "U")
+
+ actual_lines = [line.strip() for line in uchime_f]
+
+ self.assertEqual(actual_lines,
+ self.expected_usearch61_ref_uchime_file)
+
+ self._files_to_remove.append(uchime_fp)
+
+
+class UsearchTests(TestCase):
+
+ def setUp(self):
+ # create the temporary input files
+ self.dna_seqs_1 = dna_seqs_1
+ self.dna_seqs_2 = dna_seqs_usearch
+ self.dna_seqs_3 = dna_seqs_3
+ self.dna_seqs_4 = dna_seqs_4
+ self.protein_ref_seqs1 = protein_ref_seqs1
+ self.ref_database = usearch_ref_seqs1
+ self.dna_seqs_with_abundance = dna_seqs_with_abundance
+ self.de_novo_chimera_seqs = de_novo_chimera_seqs
+ self.dna_seqs_with_dups = dna_seqs_with_dups
+ self.dna_seqs_reference_otu_picking = dna_seqs_reference_otu_picking
+
+ # Expected output files
+ self.uc_lines1 = uc_lines1
+ self.expected_otu_assignments = expected_otu_assignments
+ self.expected_enumerated_fasta = expected_enumerated_fasta
+ self.expected_enumerated_fasta_added_options =\
+ expected_enumerated_fasta_added_options
+ self.expected_clusters_w_abundance_default_settings =\
+ expected_clusters_w_abundance_default_settings
+ self.expected_clusters_w_abundance_low_setting =\
+ expected_clusters_w_abundance_low_setting
+ self.expected_reference_filtered_seqs =\
+ expected_reference_filtered_seqs
+ self.expected_de_novo_chimeras_default =\
+ expected_de_novo_chimeras_default
+ self.expected_de_novo_chimera_filtered_skew11 =\
+ expected_de_novo_chimera_filtered_skew11
+ self.expected_cluster_err_seqs =\
+ expected_cluster_err_seqs
+ self.expected_sorted_by_abundance_no_filter =\
+ expected_sorted_by_abundance_no_filter
+ self.expected_derep_seqs = expected_derep_seqs
+ self.expected_abundance_sort_filtered = expected_abundance_sort_filtered
+ self.expected_len_sorted_seqs = expected_len_sorted_seqs
+ self.expected_combined_dna_seqs_1_seqs_usearch =\
+ expected_combined_dna_seqs_1_seqs_usearch
+ self.retained_chimeras_seqs1 = retained_chimeras_seqs1
+ self.retained_chimeras_seqs2 = retained_chimeras_seqs2
+ self.expected_retained_chimeras_union =\
+ expected_retained_chimeras_union
+ self.expected_retained_chimeras_intersection =\
+ expected_retained_chimeras_intersection
+ self.expected_derep_seqs_full_len =\
+ expected_derep_seqs_full_len
+
+ # Create temporary files for use with unit tests
+
+ self.tmp_dir = '/tmp/'
+
+ f, self.tmp_seq_filepath1 = mkstemp(prefix='UsearchOtuPickerTest_',
+ suffix='.fasta')
+ close(f)
+ seq_file = open(self.tmp_seq_filepath1, 'w')
+ seq_file.write(self.dna_seqs_1)
+ seq_file.close()
+
+ f, self.tmp_seq_filepath2 = mkstemp(prefix='UsearchOtuPickerTest_',
+ suffix='.fasta')
+ close(f)
+ seq_file = open(self.tmp_seq_filepath2, 'w')
+ seq_file.write(self.dna_seqs_2)
+ seq_file.close()
+
+ f, self.dna_seqs3_filepath = mkstemp(prefix='UsearchOtuPickerTest_',
+ suffix='.fasta')
+ close(f)
+ seq_file = open(self.dna_seqs3_filepath, 'w')
+ seq_file.write(self.dna_seqs_3)
+ seq_file.close()
+
+ f, self.dna_seqs4_filepath = mkstemp(prefix='UsearchOtuPickerTest_',
+ suffix='.fasta')
+ close(f)
+ seq_file = open(self.dna_seqs4_filepath, 'w')
+ seq_file.write(self.dna_seqs_4)
+ seq_file.close()
+
+ f, self.protein_ref_seqs1_filepath = \
+ mkstemp(prefix='UsearchOtuPickerTest_', suffix='.fasta')
+ close(f)
+ seq_file = open(self.protein_ref_seqs1_filepath, 'w')
+ seq_file.write(self.protein_ref_seqs1)
+ seq_file.close()
+
+ f, self.tmp_ref_database = mkstemp(prefix='UsearchRefDatabase_',
+ suffix='.fasta')
+ close(f)
+ seq_file = open(self.tmp_ref_database, 'w')
+ seq_file.write(self.ref_database)
+ seq_file.close()
+
+ f, self.tmp_seqs_w_abundance = mkstemp(prefix='UsearchSeqsAbundance_',
+ suffix='.fasta')
+ close(f)
+ seq_file = open(self.tmp_seqs_w_abundance, 'w')
+ seq_file.write(self.dna_seqs_with_abundance)
+ seq_file.close()
+
+ f, self.tmp_de_novo_chimera_seqs = \
+ mkstemp(prefix='UsearchdenovoChimera_', suffix='.fasta')
+ close(f)
+ seq_file = open(self.tmp_de_novo_chimera_seqs, 'w')
+ seq_file.write(self.de_novo_chimera_seqs)
+ seq_file.close()
+
+ f, self.tmp_dna_seqs_with_dups = mkstemp(prefix='UsearchDupDNASeqs_',
+ suffix='.fasta')
+ close(f)
+ seq_file = open(self.tmp_dna_seqs_with_dups, 'w')
+ seq_file.write(self.dna_seqs_with_dups)
+ seq_file.close()
+
+ f, self.tmp_retained_chimeras_seqs1 = \
+ mkstemp(prefix="UsearchRetainedChimeras1_", suffix=".fasta")
+ close(f)
+ seq_file = open(self.tmp_retained_chimeras_seqs1, 'w')
+ seq_file.write(self.retained_chimeras_seqs1)
+ seq_file.close()
+
+ f, self.tmp_retained_chimeras_seqs2 = \
+ mkstemp(prefix="UsearchRetainedChimeras1_", suffix=".fasta")
+ close(f)
+ seq_file = open(self.tmp_retained_chimeras_seqs2, 'w')
+ seq_file.write(self.retained_chimeras_seqs2)
+ seq_file.close()
+
+ f, self.tmp_dna_seqs_ref_otu_picking = \
+ mkstemp(prefix="UsearchRefOtuPicking_", suffix=".fasta")
+ close(f)
+ seq_file = open(self.tmp_dna_seqs_ref_otu_picking, "w")
+ seq_file.write(self.dna_seqs_reference_otu_picking)
+ seq_file.close()
+
+ self._files_to_remove =\
+ [self.tmp_seq_filepath1, self.tmp_seq_filepath2,
+ self.tmp_ref_database, self.tmp_seqs_w_abundance,
+ self.tmp_de_novo_chimera_seqs, self.tmp_dna_seqs_with_dups,
+ self.tmp_retained_chimeras_seqs1, self.tmp_retained_chimeras_seqs2,
+ self.tmp_dna_seqs_ref_otu_picking, self.dna_seqs3_filepath,
+ self.protein_ref_seqs1_filepath, self.dna_seqs4_filepath]
+
+ self._dirs_to_remove = []
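+
+ # The nine mkstemp/write/close blocks above repeat one pattern; a
+ # hypothetical helper (a sketch, not part of this module) could
+ # express it in one place:
+ #
+ # def _write_tmp_fasta(self, data, prefix):
+ #     """Write data to a new temp fasta file and return its path."""
+ #     f, fp = mkstemp(prefix=prefix, suffix='.fasta')
+ #     close(f)
+ #     with open(fp, 'w') as seq_file:
+ #         seq_file.write(data)
+ #     self._files_to_remove.append(fp)
+ #     return fp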
+
+ def tearDown(self):
+ remove_files(self._files_to_remove)
+ if self._dirs_to_remove:
+ for curr_dir in self._dirs_to_remove:
+ rmtree(curr_dir)
+
+ def test_usearch_qf(self):
+ """ Main program loop test, with default parameters """
+
+ # cluster size filtering set to 1 instead of default 4
+ clusters, failures = usearch_qf(self.tmp_seq_filepath2,
+ output_dir=self.tmp_dir,
+ db_filepath=self.tmp_ref_database,
+ minsize=1,
+ remove_usearch_logs=True,
+ chimeras_retention='intersection')
+
+ expected_clusters = {'1': ['Solemya', 'Solemya_seq2'],
+ '0': ['usearch_ecoli_seq', 'usearch_ecoli_seq2']}
+ expected_failures = ['chimera']
+
+ self.assertEqual(clusters, expected_clusters)
+ self.assertEqual(failures, expected_failures)
+
+ def test_usearch_qf_minlen(self):
+ """ Main program loop test, with longer minlen """
+
+ # cluster size filtering set to 1 instead of default 4
+ clusters, failures = usearch_qf(self.tmp_seq_filepath2,
+ output_dir=self.tmp_dir,
+ db_filepath=self.tmp_ref_database,
+ minsize=1,
+ remove_usearch_logs=True,
+ chimeras_retention='intersection',
+ minlen=110)
+
+ expected_clusters = {'0': ['usearch_ecoli_seq', 'usearch_ecoli_seq2']}
+ expected_failures = ['Solemya', 'Solemya_seq2', 'chimera']
+
+ self.assertEqual(clusters, expected_clusters)
+ self.assertEqual(failures, expected_failures)
+
+ def test_usearch_qf_reference_otu_picking(self):
+ """ Main program loop test, with reference + new clusters """
+
+ # cluster size filtering set to 1 instead of default 4
+ clusters, failures = usearch_qf(self.tmp_dna_seqs_ref_otu_picking,
+ output_dir=self.tmp_dir,
+ refseqs_fp=self.tmp_ref_database,
+ reference_chimera_detection=False,
+ minsize=1,
+ remove_usearch_logs=True,
+ suppress_new_clusters=False)
+
+ # Will cluster everything, including RandomCrap, as new clusters
+ # are allowed.
+ expected_clusters = {'1': ['Solemya', 'Solemya_seq2'],
+ '0': ['usearch_ecoli_seq', 'usearch_ecoli_seq2'],
+ '2': ['RandomCrap']}
+ expected_failures = []
+
+ self.assertEqual(clusters, expected_clusters)
+ self.assertEqual(failures, expected_failures)
+
+ def test_usearch_qf_reference_otu_picking_no_new_clusters(self):
+ """ Main program loop test, with reference and no new clusters """
+
+ # cluster size filtering set to 1 instead of default 4
+ clusters, failures = usearch_qf(self.tmp_dna_seqs_ref_otu_picking,
+ output_dir=self.tmp_dir,
+ refseqs_fp=self.tmp_ref_database,
+ reference_chimera_detection=False,
+ minsize=1,
+ remove_usearch_logs=True,
+ suppress_new_clusters=True)
+
+ # Will cluster everything but RandomCrap, as no new clusters are allowed.
+ expected_clusters = {'L07864': ['Solemya', 'Solemya_seq2'],
+ 'ref1': ['usearch_ecoli_seq', 'usearch_ecoli_seq2']}
+ expected_failures = ['RandomCrap']
+
+ self.assertEqual(clusters, expected_clusters)
+ self.assertEqual(failures, expected_failures)
+
+ def test_usearch_qf_no_ref_database(self):
+ """ Main program loop with no reference chimera testing """
+
+ # cluster size filtering set to 1 instead of default 4
+ clusters, failures = usearch_qf(self.tmp_seq_filepath2,
+ output_dir=self.tmp_dir,
+ reference_chimera_detection=False,
+ minsize=1,
+ remove_usearch_logs=True)
+
+ # Chimera sequence should not be detected without reference test.
+ expected_clusters = {'1': ['Solemya', 'Solemya_seq2'],
+ '0': ['usearch_ecoli_seq', 'usearch_ecoli_seq2'],
+ '2': ['chimera']}
+
+ expected_failures = []
+
+ self.assertEqual(clusters, expected_clusters)
+ self.assertEqual(failures, expected_failures)
+
+ def test_usearch_qf_union(self):
+ """ Main program loop with union nonchimera retention """
+
+ # cluster size filtering set to 1 instead of default 4
+ clusters, failures = usearch_qf(self.tmp_seq_filepath2,
+ output_dir=self.tmp_dir,
+ reference_chimera_detection=False,
+ minsize=1,
+ remove_usearch_logs=True,
+ chimeras_retention='union')
+
+ # Chimera sequence retained as passes de novo test
+ expected_clusters = {'1': ['Solemya', 'Solemya_seq2'],
+ '0': ['usearch_ecoli_seq', 'usearch_ecoli_seq2'],
+ '2': ['chimera']}
+
+ expected_failures = []
+
+ self.assertEqual(clusters, expected_clusters)
+ self.assertEqual(failures, expected_failures)
+
+ def test_usearch_qf_disabled_filters(self):
+ """ Returns expected clustering with no filtering """
+
+ # cluster size filtering set to 1 instead of default 4
+ clusters, failures = usearch_qf(self.tmp_seq_filepath2,
+ output_dir=self.tmp_dir,
+ de_novo_chimera_detection=False,
+ reference_chimera_detection=False,
+ cluster_size_filtering=False,
+ remove_usearch_logs=True)
+
+ # Chimera sequence should not be detected without reference test.
+ expected_clusters = {'1': ['Solemya', 'Solemya_seq2'],
+ '0': ['usearch_ecoli_seq', 'usearch_ecoli_seq2'],
+ '2': ['chimera']}
+
+ expected_failures = []
+
+ self.assertEqual(clusters, expected_clusters)
+ self.assertEqual(failures, expected_failures)
+
+ def test_usearch_qf_generates_logs(self):
+ """ Generates expected log files """
+ curr_output_dir = mkdtemp(dir=self.tmp_dir)
+
+ self._dirs_to_remove.append(curr_output_dir)
+
+ # cluster size filtering set to 1 instead of default 4
+ clusters, failures = usearch_qf(self.tmp_seq_filepath2,
+ output_dir=curr_output_dir,
+ db_filepath=self.tmp_ref_database,
+ minsize=1,
+ remove_usearch_logs=False,
+ chimeras_retention='intersection')
+
+ expected_clusters = {'1': ['Solemya', 'Solemya_seq2'],
+ '0': ['usearch_ecoli_seq', 'usearch_ecoli_seq2']}
+ expected_failures = ['chimera']
+
+ self.assertEqual(clusters, expected_clusters)
+ self.assertEqual(failures, expected_failures)
+
+ # Only checking for creation of files, as file contents contain
+ # tmp file names.
+ expected_log_names = ['assign_reads_to_otus.log',
+ 'uchime_de_novo_chimera_filtering.log',
+ 'derep.log',
+ 'uchime_reference_chimera_filtering.log',
+ 'minsize_0_abundance_sort.log',
+ 'usearch_cluster_err_corrected.log',
+ 'minsize_1_abundance_sort.log',
+ 'usearch_cluster_seqs.log',
+ 'sortlen.log']
+
+ actual_logs =\
+ [basename(curr_file)
+ for curr_file in glob(curr_output_dir + "/*.*")]
+
+ self.assertItemsEqual(actual_logs, expected_log_names)
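+
+ # The log names above appear to trace the usearch_qf pipeline stages:
+ # length sort, dereplication, abundance sorts, error-correction
+ # clustering, de novo and reference chimera filtering, final
+ # clustering, and read assignment (ordering inferred from the log
+ # names, not asserted by this test).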
+
+ def test_concatenate_fastas(self):
+ """ Properly concatenates two fasta files """
+
+ f, out_f = mkstemp(prefix='UsearchConcatFileTest_', suffix='.fasta')
+ close(f)
+
+ actual_concatenated_seqs = concatenate_fastas(self.tmp_seq_filepath1,
+ self.tmp_seq_filepath2, out_f)
+
+ self._files_to_remove.append(out_f)
+
+ actual_lines =\
+ [line.strip() for line in open(actual_concatenated_seqs, "U")]
+
+ self.assertEqual(actual_lines,
+ expected_combined_dna_seqs_1_seqs_usearch)
+
+ def test_assign_reads_to_otus(self):
+ """ Properly assigns reads back to original ID """
+
+ app_result, output_filepath =\
+ assign_reads_to_otus(original_fasta=self.tmp_ref_database,
+ filtered_fasta=self.tmp_seq_filepath2,
+ remove_usearch_logs=True,
+ working_dir=self.tmp_dir)
+
+ self._files_to_remove.append(output_filepath)
+
+ # Stripping off the first two lines (the command line, which contains
+ # tmp file names, and the version line), retaining the actual results.
+ actual_assignments =\
+ [line.strip() for line in open(output_filepath, "U")][2:]
+
+ self.assertEqual(actual_assignments, self.expected_otu_assignments)
+
+ def test_enumerate_otus(self):
+ """ Enumerates OTUs properly """
+
+ output_filepath = enumerate_otus(self.tmp_seq_filepath1)
+
+ self._files_to_remove.append(output_filepath)
+
+ actual_fasta = [line.strip() for line in open(output_filepath, "U")]
+
+ self.assertEqual(actual_fasta, self.expected_enumerated_fasta)
+
+ def test_enumerate_otus_added_options(self):
+ """ Enumerates with all options properly """
+
+ output_filepath = enumerate_otus(self.tmp_seq_filepath1,
+ label_prefix="Big",
+ label_suffix="Ern",
+ retain_label_as_comment=True,
+ count_start=255)
+
+ self._files_to_remove.append(output_filepath)
+
+ actual_fasta = [line.strip() for line in open(output_filepath, "U")]
+
+ self.assertEqual(actual_fasta,
+ self.expected_enumerated_fasta_added_options)
+
+ def test_usearch_cluster_seqs(self):
+ """ Clusters sequences correctly """
+
+ # clusters all seqs with default 97% identity
+ app_result, output_filepath =\
+ usearch_cluster_seqs(self.tmp_seqs_w_abundance,
+ save_intermediate_files=False,
+ remove_usearch_logs=True,
+ percent_id=0.97,
+ working_dir=self.tmp_dir)
+
+ self._files_to_remove.append(output_filepath)
+
+ actual_clusters = [line.strip() for line in open(output_filepath, "U")]
+
+ self.assertEqual(actual_clusters,
+ self.expected_clusters_w_abundance_default_settings)
+
+ def test_usearch_cluster_seqs_high_identity(self):
+ """ Clusters sequences correctly """
+
+ # Should get two clusters with 99.9% identity
+ app_result, output_filepath =\
+ usearch_cluster_seqs(self.tmp_seqs_w_abundance,
+ save_intermediate_files=False,
+ remove_usearch_logs=True,
+ percent_id=0.999,
+ working_dir=self.tmp_dir)
+
+ self._files_to_remove.append(output_filepath)
+
+ actual_clusters = [line.strip() for line in open(output_filepath, "U")]
+
+ self.assertEqual(actual_clusters,
+ self.expected_clusters_w_abundance_low_setting)
+
+ def test_usearch_chimera_filter_ref_based(self):
+ """ Properly detects chimeras against reference database """
+
+ app_result, output_filepath =\
+ usearch_chimera_filter_ref_based(self.tmp_seq_filepath2,
+ self.tmp_ref_database,
+ remove_usearch_logs=True,
+ working_dir=self.tmp_dir)
+
+ self._files_to_remove.append(output_filepath)
+
+ actual_filtered_chimeras =\
+ [line.strip() for line in open(output_filepath, "U")]
+
+ self.assertEqual(actual_filtered_chimeras,
+ self.expected_reference_filtered_seqs)
+
+ def test_usearch_chimera_filter_de_novo(self):
+ """ Properly detects de novo chimeras """
+
+ app_result, output_filepath =\
+ usearch_chimera_filter_de_novo(self.tmp_de_novo_chimera_seqs,
+ remove_usearch_logs=True,
+ abundance_skew=2,
+ working_dir=self.tmp_dir)
+
+ self._files_to_remove.append(output_filepath)
+
+ actual_seqs = \
+ [line.strip() for line in open(output_filepath, "U")]
+
+ self.assertEqual(actual_seqs, self.expected_de_novo_chimeras_default)
+
+ def test_usearch_chimera_filter_de_novo_abundance_skew(self):
+ """ Properly detects de novo chimeras with skew changes """
+
+ app_result, output_filepath =\
+ usearch_chimera_filter_de_novo(self.tmp_de_novo_chimera_seqs,
+ remove_usearch_logs=True,
+ abundance_skew=11,
+ working_dir=self.tmp_dir)
+
+ self._files_to_remove.append(output_filepath)
+
+ actual_seqs = \
+ [line.strip() for line in open(output_filepath, "U")]
+
+ self.assertEqual(actual_seqs,
+ self.expected_de_novo_chimera_filtered_skew11)
+
+ def test_usearch_cluster_error_correction(self):
+ """ Properly clusters seqs for chimera testing/filtering """
+
+ # clusters all seqs with default 97% identity
+ app_result, output_filepath =\
+ usearch_cluster_error_correction(self.tmp_seqs_w_abundance,
+ save_intermediate_files=False,
+ remove_usearch_logs=True,
+ percent_id_err=0.97,
+ working_dir=self.tmp_dir)
+
+ self._files_to_remove.append(output_filepath)
+
+ actual_clusters = [line.strip() for line in open(output_filepath, "U")]
+
+ self.assertEqual(actual_clusters,
+ self.expected_cluster_err_seqs)
+
+ def test_usearch_sort_by_abundance(self):
+ """ Properly sorts fasta by abundance """
+
+ app_result, output_filepath =\
+ usearch_sort_by_abundance(self.tmp_de_novo_chimera_seqs,
+ remove_usearch_logs=True,
+ working_dir=self.tmp_dir)
+
+ self._files_to_remove.append(output_filepath)
+
+ actual_seqs = [line.strip() for line in open(output_filepath, "U")]
+
+ self.assertEqual(actual_seqs,
+ self.expected_sorted_by_abundance_no_filter)
+
+ def test_usearch_sort_by_abundance_filter(self):
+ """ Properly sorts fasta by abundance, filters low count otus """
+
+ app_result, output_filepath =\
+ usearch_sort_by_abundance(self.tmp_de_novo_chimera_seqs,
+ remove_usearch_logs=True,
+ minsize=40,
+ working_dir=self.tmp_dir)
+
+ self._files_to_remove.append(output_filepath)
+
+ actual_seqs = [line.strip() for line in open(output_filepath, "U")]
+
+ self.assertEqual(actual_seqs,
+ self.expected_abundance_sort_filtered)
+
+ def test_usearch_dereplicate_exact_subseqs(self):
+ """ Properly dereplicates fasta file """
+
+ app_result, output_filepath =\
+ usearch_dereplicate_exact_subseqs(self.tmp_dna_seqs_with_dups,
+ remove_usearch_logs=True,
+ working_dir=self.tmp_dir)
+
+ self._files_to_remove.append(output_filepath)
+
+ actual_seqs = [line.strip() for line in open(output_filepath, "U")]
+
+ self.assertEqual(actual_seqs, self.expected_derep_seqs)
+
+ def test_usearch_dereplicate_exact_seqs(self):
+ """ Properly dereplicates fasta file """
+
+ app_result, output_filepath =\
+ usearch_dereplicate_exact_seqs(self.tmp_dna_seqs_with_dups,
+ remove_usearch_logs=True,
+ working_dir=self.tmp_dir)
+
+ self._files_to_remove.append(output_filepath)
+
+ actual_seqs = [line.strip() for line in open(output_filepath, "U")]
+
+ self.assertEqual(actual_seqs, self.expected_derep_seqs_full_len)
+
+ def test_usearch_fasta_sort_from_filepath(self):
+ """ Properly sorts fasta according to seq length """
+
+ app_result, output_filepath =\
+ usearch_fasta_sort_from_filepath(self.tmp_seq_filepath2,
+ remove_usearch_logs=True,
+ working_dir=self.tmp_dir)
+
+ self._files_to_remove.append(output_filepath)
+
+ actual_seqs = [line.strip() for line in open(output_filepath, "U")]
+
+ self.assertEqual(actual_seqs, self.expected_len_sorted_seqs)
+
+ def test_clusters_from_blast_uc_file(self):
+ """ clusters_from_uc_file functions as expected """
+
+ expected_clusters = {'19': ['PC.634_4'], '42': ['PC.test2_1',
+ 'PC.test1_2', 'PC.634_3'], '6': ['PC.269_5']}
+ expected_failures = ['PC.481_6']
+
+ self.assertEqual(clusters_from_blast_uc_file(self.uc_lines1),
+ (expected_clusters, expected_failures))
+
+ def test_get_retained_chimeras_union(self):
+ """ Properly returns union of two fastas """
+
+ f, out_f = mkstemp(prefix='UsearchUnionTest_', suffix='.fasta')
+ close(f)
+
+ actual_out_fp = get_retained_chimeras(self.tmp_retained_chimeras_seqs1,
+ self.tmp_retained_chimeras_seqs2, out_f, chimeras_retention='union')
+
+ self._files_to_remove.append(out_f)
+
+ actual_out_f = [line.strip() for line in open(actual_out_fp, "U")]
+
+ self.assertEqual(actual_out_f, self.expected_retained_chimeras_union)
+
+ def test_get_retained_chimeras_intersection(self):
+ """ Properly returns intersection of two fastas """
+
+ f, out_f = mkstemp(prefix='UsearchIntersectionTest_', suffix='.fasta')
+ close(f)
+
+ actual_out_fp = get_retained_chimeras(self.tmp_retained_chimeras_seqs1,
+ self.tmp_retained_chimeras_seqs2, out_f,
+ chimeras_retention='intersection')
+
+ self._files_to_remove.append(out_f)
+
+ actual_out_f = [line.strip() for line in open(actual_out_fp, "U")]
+
+ self.assertEqual(actual_out_f,
+ self.expected_retained_chimeras_intersection)
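+
+ # A minimal sketch of the retention logic the two tests above exercise
+ # (illustrative; the real implementation is get_retained_chimeras):
+ #
+ # def retained_labels(labels1, labels2, chimeras_retention='union'):
+ #     """Combine two collections of retained sequence labels."""
+ #     if chimeras_retention == 'union':
+ #         return set(labels1) | set(labels2)
+ #     return set(labels1) & set(labels2)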
+
+ def test_assign_dna_reads_to_protein_database(self):
+ """assign_dna_reads_to_protein_database wrapper functions as expected
+ """
+ output_dir = mkdtemp(dir=self.tmp_dir)
+ self._dirs_to_remove.append(output_dir)
+ output_fp = join(output_dir, 'out.uc')
+ assign_dna_reads_to_protein_database(self.dna_seqs3_filepath,
+ self.protein_ref_seqs1_filepath,
+ output_fp,
+ temp_dir=self.tmp_dir)
+
+ self.assertTrue(exists(output_fp))
+ self.assertTrue(exists(output_fp.replace('.uc', '.bl6')))
+
+ # confirm that the clusters look like what we expect
+ expected_clusters = sorted(
+ [['eco:b0015'], ['eco:b0122', 'eco:b0122-like']])
+ actual_clusters = sorted(clusters_from_blast_uc_file(
+ open(output_fp))[0].values())
+ self.assertEqual(actual_clusters, expected_clusters)
+
+ def test_assign_dna_reads_to_protein_database_alt_params(self):
+ """assign_dna_reads_to_protein_database wrapper functions with alt params
+ """
+ output_dir = mkdtemp(dir=self.tmp_dir)
+ self._dirs_to_remove.append(output_dir)
+ output_fp = join(output_dir, 'out.uc')
+ assign_dna_reads_to_protein_database(self.dna_seqs3_filepath,
+ self.protein_ref_seqs1_filepath,
+ output_fp,
+ temp_dir=self.tmp_dir,
+ params={'--id': 1.0})
+
+ self.assertTrue(exists(output_fp))
+ self.assertTrue(exists(output_fp.replace('.uc', '.bl6')))
+
+ # confirm that the clusters look like what we expect
+ expected_clusters = sorted([['eco:b0015'], ['eco:b0122']])
+ actual_clusters = sorted(clusters_from_blast_uc_file(
+ open(output_fp))[0].values())
+ self.assertEqual(actual_clusters, expected_clusters)
+
+ def test_assign_dna_reads_to_dna_database(self):
+ """assign_dna_reads_to_protein_database wrapper functions as expected
+ """
+ output_dir = mkdtemp(dir=self.tmp_dir)
+ self._dirs_to_remove.append(output_dir)
+ output_fp = join(output_dir, 'out.uc')
+ assign_dna_reads_to_dna_database(self.dna_seqs3_filepath,
+ self.dna_seqs4_filepath,
+ output_fp,
+ temp_dir=self.tmp_dir)
+
+ self.assertTrue(exists(output_fp))
+ self.assertTrue(exists(output_fp.replace('.uc', '.bl6')))
+
+ # confirm that the clusters look like what we expect
+ expected_clusters = sorted(
+ [['eco:b0015'], ['eco:b0122', 'eco:b0122-like']])
+ actual_clusters = sorted(clusters_from_blast_uc_file(
+ open(output_fp))[0].values())
+ self.assertEqual(actual_clusters, expected_clusters)
+
+# Long strings for test files, output, etc.
+# *************************************************
+
+retained_chimeras_seqs1 = """>seq1
+ACAGGCC
+>seq2
+ACAGGCCCCC
+>seq3
+TTATCCATT"""
+
+retained_chimeras_seqs2 = """>seq3
+ACAGGCC
+>seq4
+ACAGGCCCCC
+>seq5
+TTATCCATT"""
+
+dna_seqs_1 = """>uclust_test_seqs_0 some comment0
+AACCCCCACGGTGGATGCCACACGCCCCATACAAAGGGTAGGATGCTTAAGACACATCGCGTCAGGTTTGTGTCAGGCCT
+>uclust_test_seqs_1 some comment1
+ACCCACACGGTGGATGCAACAGATCCCATACACCGAGTTGGATGCTTAAGACGCATCGCGTGAGTTTTGCGTCAAGGCT
+>uclust_test_seqs_2 some comment2
+CCCCCACGGTGGCAGCAACACGTCACATACAACGGGTTGGATTCTAAAGACAAACCGCGTCAAAGTTGTGTCAGAACT
+>uclust_test_seqs_3 some comment3
+CCCCACGGTAGCTGCAACACGTCCCATACCACGGGTAGGATGCTAAAGACACATCGGGTCTGTTTTGTGTCAGGGCT
+>uclust_test_seqs_4 some comment4
+GCCACGGTGGGTACAACACGTCCACTACATCGGCTTGGAAGGTAAAGACACGTCGCGTCAGTATTGCGTCAGGGCT
+>uclust_test_seqs_5 some comment4_again
+CCGCGGTAGGTGCAACACGTCCCATACAACGGGTTGGAAGGTTAAGACACAACGCGTTAATTTTGTGTCAGGGCA
+>uclust_test_seqs_6 some comment6
+CGCGGTGGCTGCAAGACGTCCCATACAACGGGTTGGATGCTTAAGACACATCGCAACAGTTTTGAGTCAGGGCT
+>uclust_test_seqs_7 some comment7
+ACGGTGGCTACAAGACGTCCCATCCAACGGGTTGGATACTTAAGGCACATCACGTCAGTTTTGTGTCAGAGCT
+>uclust_test_seqs_8 some comment8
+CGGTGGCTGCAACACGTGGCATACAACGGGTTGGATGCTTAAGACACATCGCCTCAGTTTTGTGTCAGGGCT
+>uclust_test_seqs_9 some comment9
+GGTGGCTGAAACACATCCCATACAACGGGTTGGATGCTTAAGACACATCGCATCAGTTTTATGTCAGGGGA"""
+
+dna_seqs_1_subset = """>uclust_test_seqs_0 some comment0
+AACCCCCACGGTGGATGCCACACGCCCCATACAAAGGGTAGGATGCTTAAGACACATCGCGTCAGGTTTGTGTCAGGCCT
+>uclust_test_seqs_1 some comment1
+ACCCACACGGTGGATGCAACAGATCCCATACACCGAGTTGGATGCTTAAGACGCATCGCGTGAGTTTTGCGTCAAGGCT
+>uclust_test_seqs_2 some comment2
+CCCCCACGGTGGCAGCAACACGTCACATACAACGGGTTGGATTCTAAAGACAAACCGCGTCAAAGTTGTGTCAGAACT
+>uclust_test_seqs_3 some comment3
+CCCCACGGTAGCTGCAACACGTCCCATACCACGGGTAGGATGCTAAAGACACATCGGGTCTGTTTTGTGTCAGGGCT"""
+
+dna_seqs_3 = """>eco:b0001 thrL; thr operon leader peptide; K08278 thr operon leader peptide (N)
+atgaaacgcattagcaccaccattaccaccaccatcaccattaccacaggtaacggtgcg
+ggctga
+>eco:b0015 dnaJ; chaperone Hsp40, co-chaperone with DnaK; K03686 molecular chaperone DnaJ (N)
+atggctaagcaagattattacgagattttaggcgtttccaaaacagcggaagagcgtgaa
+atcagaaaggcctacaaacgcctggccatgaaataccacccggaccgtaaccagggtgac
+aaagaggccgaggcgaaatttaaagagatcaaggaagcttatgaagttctgaccgactcg
+caaaaacgtgcggcatacgatcagtatggtcatgctgcgtttgagcaaggtggcatgggc
+ggcggcggttttggcggcggcgcagacttcagcgatatttttggtgacgttttcggcgat
+atttttggcggcggacgtggtcgtcaacgtgcggcgcgcggtgctgatttacgctataac
+atggagctcaccctcgaagaagctgtacgtggcgtgaccaaagagatccgcattccgact
+ctggaagagtgtgacgtttgccacggtagcggtgcaaaaccaggtacacagccgcagact
+tgtccgacctgtcatggttctggtcaggtgcagatgcgccagggattcttcgctgtacag
+cagacctgtccacactgtcagggccgcggtacgctgatcaaagatccgtgcaacaaatgt
+catggtcatggtcgtgttgagcgcagcaaaacgctgtccgttaaaatcccggcaggggtg
+gacactggagaccgcatccgtcttgcgggcgaaggtgaagcgggcgagcatggcgcaccg
+gcaggcgatctgtacgttcaggttcaggttaaacagcacccgattttcgagcgtgaaggc
+aacaacctgtattgcgaagtcccgatcaacttcgctatggcggcgctgggtggcgaaatc
+gaagtaccgacccttgatggtcgcgtcaaactgaaagtgcctggcgaaacccagaccggt
+aagctattccgtatgcgcggtaaaggcgtcaagtctgtccgcggtggcgcacagggtgat
+ttgctgtgccgcgttgtcgtcgaaacaccggtaggcctgaacgaaaggcagaaacagctg
+ctgcaagagctgcaagaaagcttcggtggcccaaccggcgagcacaacagcccgcgctca
+aagagcttctttgatggtgtgaagaagttttttgacgacctgacccgctaa
+>eco:b0122 yacC; conserved protein, PulS_OutS family (N)
+atgaagacgtttttcagaacagtgttattcggcagcctgatggccgtctgcgcaaacagt
+tacgcgctcagcgagtctgaagccgaagatatggccgatttaacggcagtttttgtcttt
+ctgaagaacgattgtggttaccagaacttacctaacgggcaaattcgtcgcgcactggtc
+tttttcgctcagcaaaaccagtgggacctcagtaattacgacaccttcgacatgaaagcc
+ctcggtgaagacagctaccgcgatctcagcggcattggcattcccgtcgctaaaaaatgc
+aaagccctggcccgcgattccttaagcctgcttgcctacgtcaaataa
+>eco:b0122-like
+atgaagaaaattttcagaacagtgttattcggcagcctgatggccgtctgcgcaaacagt
+tacgcgctcagcgagtctgaagccgaagatatggccgatttaacggcagtttttgtcttt
+ctgaagaacgattgtggttaccagaacttacctaacgggcaaattcgtcgcgcactggtc
+tttttcgctcagcaaaaccagtgggacctcagtaattacgacaccttcgacatgaaagcc
+ctcggtgaagacagctaccgcgatctcagcggcattggcattcccgtcgctaaaaaatgc
+aaagccctggcccgcgattccttaagcctgcttgcctacgtcaaatcc"""
+
+dna_seqs_4 = """>eco:b0015 dnaJ; chaperone Hsp40, co-chaperone with DnaK; K03686 molecular chaperone DnaJ (N)
+atggctaagcaagattattacgagattttaggcgtttccaaaacagcggaagagcgtgaa
+atcagaaaggcctacaaacgcctggccatgaaataccacccggaccgtaaccagggtgac
+aaagaggccgaggcgaaatttaaagagatcaaggaagcttatgaagttctgaccgactcg
+caaaaacgtgcggcatacgatcagtatggtcatgctgcgtttgagcaaggtggcatgggc
+ggcggcggttttggcggcggcgcagacttcagcgatatttttggtgacgttttcggcgat
+atttttggcggcggacgtggtcgtcaacgtgcggcgcgcggtgctgatttacgctataac
+atggagctcaccctcgaagaagctgtacgtggcgtgaccaaagagatccgcattccgact
+ctggaagagtgtgacgtttgccacggtagcggtgcaaaaccaggtacacagccgcagact
+tgtccgacctgtcatggttctggtcaggtgcagatgcgccagggattcttcgctgtacag
+cagacctgtccacactgtcagggccgcggtacgctgatcaaagatccgtgcaacaaatgt
+catggtcatggtcgtgttgagcgcagcaaaacgctgtccgttaaaatcccggcaggggtg
+gacactggagaccgcatccgtcttgcgggcgaaggtgaagcgggcgagcatggcgcaccg
+gcaggcgatctgtacgttcaggttcaggttaaacagcacccgattttcgagcgtgaaggc
+aacaacctgtattgcgaagtcccgatcaacttcgctatggcggcgctgggtggcgaaatc
+gaagtaccgacccttgatggtcgcgtcaaactgaaagtgcctggcgaaacccagaccggt
+aagctattccgtatgcgcggtaaaggcgtcaagtctgtccgcggtggcgcacagggtgat
+ttgctgtgccgcgttgtcgtcgaaacaccggtaggcctgaacgaaaggcagaaacagctg
+ctgcaagagctgcaagaaagcttcggtggcccaaccggcgagcacaacagcccgcgctca
+aagagcttctttgatggtgtgaagaagttttttgacgacctgacccgctaa
+>eco:b0122 yacC; conserved protein, PulS_OutS family (N)
+atgaagacgtttttcagaacagtgttattcggcagcctgatggccgtctgcgcaaacagt
+tacgcgctcagcgagtctgaagccgaagatatggccgatttaacggcagtttttgtcttt
+ctgaagaacgattgtggttaccagaacttacctaacgggcaaattcgtcgcgcactggtc
+tttttcgctcagcaaaaccagtgggacctcagtaattacgacaccttcgacatgaaagcc
+ctcggtgaagacagctaccgcgatctcagcggcattggcattcccgtcgctaaaaaatgc
+aaagccctggcccgcgattccttaagcctgcttgcctacgtcaaataa
+>eco:b0122-like
+atgaagacgtttttcagaacagtgttattcggcagcctgatggccgtctgcgcaaacagt
+tacgcgctcagcgagtctgaagccgaagatatggccgatttaacggcagtttttgtcttt
+ctgaagaacgattgtggttaccagaacttacctaacgggcaaattcgtcgcgcactggtc
+tttttcgctcagcaaaaccagtgggacctcagtaattacgacaccttcgacatgaaagcc
+ctcggtgaagacagctaccgcgatctcagcggcattggcattcccgtcgctaaaaaatgc
+aaagccctggcccgcgattccttaagcctgcttgcctacgtcaaatcc"""
+
+protein_ref_seqs1 = """>eco:b0001 thrL; thr operon leader peptide; K08278 thr operon leader peptide (A)
+MKRISTTITTTITITTGNGAG
+>eco:b0015 dnaJ; chaperone Hsp40, co-chaperone with DnaK; K03686 molecular chaperone DnaJ (A)
+MAKQDYYEILGVSKTAEEREIRKAYKRLAMKYHPDRNQGDKEAEAKFKEIKEAYEVLTDS
+QKRAAYDQYGHAAFEQGGMGGGGFGGGADFSDIFGDVFGDIFGGGRGRQRAARGADLRYN
+MELTLEEAVRGVTKEIRIPTLEECDVCHGSGAKPGTQPQTCPTCHGSGQVQMRQGFFAVQ
+QTCPHCQGRGTLIKDPCNKCHGHGRVERSKTLSVKIPAGVDTGDRIRLAGEGEAGEHGAP
+AGDLYVQVQVKQHPIFEREGNNLYCEVPINFAMAALGGEIEVPTLDGRVKLKVPGETQTG
+KLFRMRGKGVKSVRGGAQGDLLCRVVVETPVGLNERQKQLLQELQESFGGPTGEHNSPRS
+KSFFDGVKKFFDDLTR
+>eco:b0015:rep
+MAKQDYYEILGVSKTAEEREIRKAYKRLAMKYHPDRNQGDKEAEAKFKEIKEAYEVLTDS
+QKRAAYDQYGHAAFEQGGMGGGGFGGGADFSDIFGDVFGDIFGGGRGRQRAARGADLRYN
+MELTLEEAVRGVTKEIRIPTLEECDVCHGSGAKPGTQPQTCPTCHGSGQVQMRQGFFAVQ
+QTCPHCQGRGTLIKDPCNKCHGHGRVERSKTLSVKIPAGVDTGDRIRLAGEGEAGEHGAP
+AGDLYVQVQVKQHPIFEREGNNLYCEVPINFAMAALGGEIEVPTLDGRVKLKVPGETQTG
+KLFRMRGKGVKSVRGGAQGDLLCRVVVETPVGLNERQKQLLQELQESFGGPTGEHNSPRS
+KSFFDGVKKFFDDLTR
+>eco:b0122 yacC; conserved protein, PulS_OutS family (A)
+MKTFFRTVLFGSLMAVCANSYALSESEAEDMADLTAVFVFLKNDCGYQNLPNGQIRRALV
+FFAQQNQWDLSNYDTFDMKALGEDSYRDLSGIGIPVAKKCKALARDSLSLLAYVK"""
+
+usearch_ref_seqs1 = """>ref1 ecoli sequence
+CGCGTGTATGAAGAAGGCCTTCGGGTTGTAAAGTACTTTCAGCGGGGAGGAGGGAGTAAAGTTAATACCTTTGCTCATTGACGTTACCCGCAGAAGAAGCACCGGCTAACTCCGTGCCAGCAGCCGCGGTAATACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGCACGCAGGCGGTTTGTTAAGTCA
+>EU199232 1 1236 Bacteria/Deltaproteobacteria/Desulfurella - Hippea/uncultured
+TACGCGCGGAAATCGAGCGAGATTGGGAACGCAAGTTCCTGAGTATTGCGGCGAACGGGTGAGTAAGACGTGGGTGATCTACCCCTAGGGTGGGAATAACCCGGGGAAACCCGGGCTAATACCGAATAAGACCACAGGAGGCGACTCCAGAGGGTCAAAGGGAGCCTTGGCCTCCCCC
+>L07864 1 1200 Bacteria/Beta Gammaproteobacteria/Solemya symbiont
+GGCTCAGATTGAACGCTGGCGGCATGCCTAACACATGCAAGTCGAACGGTAACAGGCGGAGCTTGCTCTGCGCTGACGAGTGGCGGACGGGTGAGTAATGCATGGGAATCTGCCATATAGTGGGGGACAACTGGGGAAACCCAGGCTAATACCGCATAATCTCTACGGAGGAAAGGCTTC
+"""
+
+dna_seqs_usearch = """>usearch_ecoli_seq
+CGCGTGTATGAAGAAGGCCTTCGGGTTGTAAAGTACTTTCAGCGGGGAGGAGGGAGTAAAGTTAATACCTTTGCTCATTGACGTTACCCGCAGAAGAAGCACCGGCTAACTCCGT
+>Solemya seq
+GGCTCAGATTGAACGCTGGCGGCATGCCTAACACATGCAAGTCGAACGGTAACAGGCGGAGCTTGCTCTGCGCTGACGAGTGGCGGACGGGTGAGTA
+>usearch_ecoli_seq2
+CGCGTGTATGAAGAAGGCCTTCGGGTTGTAAAGTACTTTCAGCGGGGAGGAGGGAGTAAAGTTAATACCTTTGCTCATTGACGTTACCCGCAGAAGAAGCACCGGCTAACTCCGTCCAT
+>Solemya_seq2
+GGCTCAGATTGAACGCTGGCGGCATGCCTAACACATGCAAGTCGAACGGTAACAGGCGGAGCTTGCTCTGCGCTGACGAGTGGCGGACGGGTGAGTATCAAG
+>chimera
+CGCGTGTATGAAGAAGGCCTTCGGGTTGTAAAGTACTTTCAGCGGGGAGGAGGGAGTAAAGTTAATACCTTTGCTCATTGACCCCTAGGGTGGGAATAACCCGGGGAAACCCGGGCTAATACCGAATAAGACCACAGGAGGCGACTCCAGAGGGTCAAAGGGAGCCTTGGCCTCCCCC
+"""
+
+dna_seqs_reference_otu_picking = """>usearch_ecoli_seq
+CGCGTGTATGAAGAAGGCCTTCGGGTTGTAAAGTACTTTCAGCGGGGAGGAGGGAGTAAAGTTAATACCTTTGCTCATTGACGTTACCCGCAGAAGAAGCACCGGCTAACTCCGT
+>Solemya seq
+GGCTCAGATTGAACGCTGGCGGCATGCCTAACACATGCAAGTCGAACGGTAACAGGCGGAGCTTGCTCTGCGCTGACGAGTGGCGGACGGGTGAGTA
+>usearch_ecoli_seq2
+CGCGTGTATGAAGAAGGCCTTCGGGTTGTAAAGTACTTTCAGCGGGGAGGAGGGAGTAAAGTTAATACCTTTGCTCATTGACGTTACCCGCAGAAGAAGCACCGGCTAACTCCGTCCAT
+>Solemya_seq2
+GGCTCAGATTGAACGCTGGCGGCATGCCTAACACATGCAAGTCGAACGGTAACAGGCGGAGCTTGCTCTGCGCTGACGAGTGGCGGACGGGTGAGTATCAAG
+>RandomCrap
+ACACAAACAGTATATTATATCCCCAGACAGGGACCGAGATTTACCACACCCAAAAAAAAAAAAAACACACCCCCCCCCCCCCCACACACACACTTATTTT
+"""
+
+dna_seqs_with_abundance = """>Cluster1;size=114
+AACCCCCACGGTGGATGCCACACGCCCCATACAAAGGGTAGGATGCTTAAGACACATCGCGTCAGGTTTGTGTCAGGCCTAACCCCCACGGTGGATGCCACACGCCCCATACAAAGGGTAGGATGCTTAAGACACATCGCGTCAGGTTTGTGTCAGGCCT
+>Cluster2;size=45
+AACCCCCACGGTGGATGCCACACGCCCCATACAAAGGGTAGGATGCTTAAGACACATCGCGTCAGGTTTGTGTCAGGCCTAACCCCCACGGTGGATGCCACACGCCCCATACAAAGGGTAGGATGCTTAAGACACATCGCGTCAGGTTTGTGTCAGGCC
+>Cluster0;size=37
+AACCCCCACGGTGGATGCCACACGCCCCATACAAAGGGTAGGATGCTTAAGACACATCGCGTCAGGTTTGTGTCAGGCCTAACCCCCACGGTGGATGCCACACGCCCCATACAAAGGGTAGGATGCTTAAGAACATCGCGTCAGGTTTGTGTCAGGCCT
+>Cluster7;size=33
+AACCCCCACGGTGGATGCCACACGCCCCATACAAAGGGTAGGATGCTTAAGACACATCGCGTCAGGTTTGTGTCAGGCCTAACCCCCACGGTGGATGCCACACGCCCCATACAAAGTAGGATGCTTAAGACACATCGCGTCAGGTTTGTGTCAGGCCT
+>Cluster6;size=32
+AACCCCCACGGTGGATGCCACACGCCCCATACAAAGGGTAGGATGCTTAAGACACATCGCGTCAGGTTTGTGTCAGGCCTAACCCCCACGGTGGATGCCACACGCCCCATACAAAGGGGATGCTTAAGACACATCGCGTCAGGTTTGTGTCAGGCCT
+>Cluster5;size=25
+AACCCCCACGGTGGATGCCACACGCCCCATACAAAGGGTAGGATGCTTAAGACACATCGCGTCAGGTTTGTGTCAGGCCTAACCCCCACGGTGGATGCCACACGCCCCATACAGTAGGATGCTTAAGACACATCGCGTCAGGTTTGTGTCAGGCCT
+>Cluster11;size=22
+AACCCCCACGGTGGATGCCACACGCCCCATACAAAGGGTGCTTAAGACACATCGCGTCAGGTTTGTGTCAGGCCTAACCCCCACGGTGGATGCCACACGCCCCATACAAAGGGTAGGATGCTTAAGACACATCGCGTCAGGTTTGTGTCAGGCCT
+>Cluster12;size=15
+AACCCCCACGGTGGATGCCACACGCCCCATACAAAGGGTAGGATGCTTAAGACACATCGCGTCAGGTTTGTGTCAGGCCTAACCCCCACGGTGGATGCCACACGCCCCATACAAGGATGCTTAAGACACATCGCGTCAGGTTTGTGTCAGGCCT
+>Cluster13;size=2
+AACCCCCACGGTGGATGCCACACGCCCCATACAAAGGGTAGGATGCTTAAGACACATCGCGTCAGGTTTGTGTCAGGCCTAACCCCCACGGTGGATGCCACACGCCCCATACAAAGGGTAGGATGCTTAAGACACATCGCGTGTGTCAGGCCT
+>Cluster14;size=1
+AACCCCCACGGTGGATGCCACACGCCCCATACAAAGGGTAGGATGCTTAAGACACATCGCGTCAGGTTTGTGTCAGGCCTAACCCCCACGGTGGATGCCACACGCCCCATACAAAGGGTAGGATGCTTAAGACACATCGCGTCAGGTTTGTG"""
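+
+# The ';size=N' suffix on the labels above is the usearch abundance
+# annotation. A minimal parsing sketch (illustrative only, not part of
+# this module):
+#
+# def parse_abundance(label):
+#     """Return (cluster_id, size) from a label like 'Cluster1;size=114'."""
+#     cluster_id, size_field = label.split(';size=')
+#     return cluster_id, int(size_field)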
+
+de_novo_chimera_seqs = """>Cluster1;size=52
+CTGGTCCGTGTCTCAGTACCAGTGTGGGGGACCTTCCTCTCAGAACCCCTACGCATCGTCGGCTTGGTGGTCCGTTACAC
+CGCCAACTACCTAATGCGACGCATGCCCATCCGCTACCGGATCGCTCCTTTGGAATCCCGGGGATGTCCCCGGAACTCGT
+TATGCGGTATTAGACGGAATTTCTTCCGCTTATCCCCCTGTAGCGGGCAGGTTGCATACGTGTTACTCACCCGTGCGCCG
+GTCGCCGG
+>Cluster0;size=50
+TTGGACCGTGTCTCAGTTCCAATGTGGGGGACCTTCCTCTCAGAACCCCTATCCATCGAAGACTAGGTGGGCCGTTACCC
+CGCCTACTATCTAATGGAACGCATCCCCATCGTCTACCGGAATACCTTTAATCATGTGAACATGCGGACTCATGATGCCA
+TCTTGTATTAATCTTCCTTTCAGAAGGCTGTCCAAGAGTAGACGGCAGGTTGGATACGTGTTACTCACCCGG
+>Cluster2;size=45
+TTGGGCCGTGTCTCAGTCCCAATGTGGCCGTTCACCCTCTCAGGCCGGCTACTGATCGTCGCCTTGGTGGGCTGTTACCC
+CGCCAACCAGCTAATCAGACGCGGATCCATCGTATACCACCGGAGTTTTTCACACTGCTTCATGCGAAGCTGTGCGCTTA
+TGCGGTATTAGCACCTATTTCTAAGTGTTATCCCCCAGTATACGGCAGGTTCTCCACGCGTT
+>Cluster10;size=43
+CTGGGCCGTGTCTCAGTCCCAATGTGGCCGTCCGCCCTCTCAGGCCGGCTATGCATCATCGTCTTGGTGGGCCTTTACCC
+CGCCAACCAACTAATGCACCGCAGGTCCATCCGCGCCCCATCCCCTAAAGGATGTTTCACAGAAAGAAGATGCCTCCTTC
+CTGTACATCGGGATTTGTTCTCCGTTTCCAGAGCGTATTCCCGGTGCGCGGGCAGGTTCCCTACGTGTTACTCACCCG
+>Cluster4;size=40
+TTGGGCCGTGTCTCAGTCCCAATGTGGCCGTCCACCCTCTCAGGCCGGCTACTGATCGTCGCCTTGGTGGGCCTTTACCC
+CGCCAACCAGCTAATCAGACGCGGGTCCATCTTGCACCACCGGAGTTTTTCACACTGTCCCATGCAGGACCGTGCGCTTA
+TGCGGTATTAGCACCTATTTCTAAGTGTTATCCCCCAGTGCAAGGCAGGTTACCCACGCGTTACTCACCCGTCCG
+>Cluster6;size=40
+CTGGTCCGTGTCTCAGTACCAGTGTGGGGGACCTTCCTCTCAGAACCCCTACGCATCGTCGCCTCGGTGGGCCGTTACCC
+CGCCGACTAGCTAATGCGCCGCATGGCCATCCGCAGCCGATAAATCTTTAAACATCGGGAGATGCCTCCCAACGTTGTTA
+CGCGGTATTAGACGGAATTTCTTCCGCTTATCCCCCTGCTGCGGGCAGGTTCCATACGTGTTACTCACCCGTGCGCCGG
+>Cluster3;size=30
+TTGGGCCGTGTCTCAGTCCCAATGTGGCCGATCAGTCTCTCAACTCGGCTATGCATCATTGCCTTGGTAAGCCGTTACCT
+TACCAACTAGCTAATGCACCGCAGGTCCATCCAAGAGTGATAGCAGAACCATCTTTCAAACTCTAGACATGCGTCTAGTG
+TTGTTATCCGGTATTAGCATCTGTTTCCAGGTGTTATCCCAGTCTCTTGGG
+>Cluster12;size=19
+TTGGTCCGTGTCTCAGTACCAATGTGGGGGGTTAACCTCTCAGTCCCCCTATGTATCGTGGTCTTGGTGAGCCGTTACCC
+CACCAACTAACTAATACAACGCATGCCCATCCATTACCACCGGAGTTTTCAACCCAAGAAGATGCCTCCCTGGATGTTAT
+GGGGTATTAGTACCGATTTCTCAGTGTTATCCCCCTGTAATGGGTAGGTTGCATACGCGTTACGCACCCGTGCGCCGGTC
+GCCGACAAT
+>Cluster30;size=18
+CTGGTCCGTGTCTCAGTACCAGTGTGGGGGACCTTCCTCTCAGAACCCCTACGCATCGTCGCCTTGGTGGGCCGTTACCC
+CGCCAACTAGCTAATGCGCCGCATGGCCATCCGTAGCCGGTGTTACCCTTTAAACCCCAAGAGATGCCTCTCGGAGTTAT
+TACGCGGTATTAGACGGAATTTCTTCCGCTTATCCCCCTGCTACGGGCAGGTTCCATACGTGTTACTCACCCGTGCGCCG
+GTCGCCGG
+>Cluster29;size=18
+CTGGTCCGTGTCTCAGTACCAGTGTGGGGGACCTTCCTCTCAGAACCCCTACGCATCGTCGCCTCGGTGGGCCGTTACCC
+CGCCGACTAGCTAATGCGCCGCATGCCCATCCGCCACCGGTAATCCCTTTGGCGGCACCGGGATGCCCCGACGCCGCGTC
+ACGCGGTATTAGACGGAATTTCTTCCGCTTATCCCCCTGTGGCGGGCAGGTTGCATACGTGTTACTCACCCGTGCGCCGG
+TCGCCGG
+>Cluster16;size=16
+CTGGGCCGTGTCTCAGTCCCAATGTGGCCGTTCACCCTCTCAGGCCGGCTACTGATCGTCGCCTTGGTGGGCCGTTACCC
+CTCCAACCAGCTAATCAGACGCGGGTCCATCCTGTACCACCGGAGTTTTTCACACTGTACCATGCGGTACTGTGCGCTTA
+TGCGGTTTTAGCACCTATTTCTAAGTGTTATCCCCCTGTACAGGGCAGGTTACCCACGCGTTACTCACCCGTCCGCCACT
+>Cluster222;size=1
+TTGGGCCGTGTCTCAGTCCCAATGTGGCCGTCACCCTCTCAGGCCGGCTACTGATCGTCGCCTTGGTGGGCCTTTACCCC
+GCCAACCAGCTAATCAGACGCGGGTCCATCTTGCACCACCGGAGTTTTTCACACTGCTTCATGCGAAGCTGTGCGCTTAT
+GCGGTATTAGCACCTATTTCTAAGTGTTATCCCCCAGTATACGGCAGGTTCTCCACGCGTTACT
+>Cluster221;size=1
+CTGGGCCGTATCTCAGTCCCAATGTGGCCGTTCAACCTCTCAGTCCGGCTACTGATCGTCGCCTTGGTAGGCCGTTGCCC
+CGCCAACTACCTAATCGGACGCGAGCCCATCTTTCAGCGGATTGCTCCTTTGATTATCTCACCATGCGGCAAAATAATGT
+CATGCGGTATTAGCGTTCGTTTCCAAACGTTATCCCCCTCTGAAAGGCAGGTTGCTCACGCGTT
+>Cluster218;size=1
+TTGGGCCGTGTCTCAGTCCCAATGTGGCCGGCCACCCTCTCAGGTCGGCTACTGATCGTCACCTTGGTAGGCCGTTACCC
+CACCAACTAGCTAATCAGACGCAAGCCCATCTATCAGCGGATTGCTCCTTTTCTAGCTATATCATGCGATACTACTAGCT
+TATGCGGTATTAGCAATGATTTCTCACTGTTATTCCCCTCTGATAGGCAGG
+>Cluster217;size=1
+TTGGGCCGTGTCTCAGTCCCAATGTGGCCGTCCACCCTCTCAGGCCGGCTACTGATCGTCGCCTTGGTGGGCCTTTACCC
+CGCCAACCAGCTAATCAGACGCGAGTCCATCTCAGAGCGATAAATCTTTGATATCCAGAGCCATGCGACCCAGATATATT
+ATGCGGTATTAGCAGCTGTTTCCAGCTGTTATTCCCCATCCAAGGCAGGTT
+>Cluster216;size=1
+CTGGGCCGTGTCTCAGTCCCAGTGTGGCCGTCCGCCCTCTCAGGTCAGCTACTGATCGTCGCCTTGGTAGGCCATTACCC
+TACCAACTAGCTAATCAGACGCGAGGCCATCTCTCAGCGATAAATCTTTGATATATCTGCCATGCGACAAACATATATTA
+TGCGGTATTAGCAGTCGTTTCCAACTGTTGTCCCCCTCTGAAAGGCAGGTT
+>Cluster522;size=10
+TTGGGCCGTGTCTCAGTCCCAATGTGGCCGTCACCCTCTCAGGCCGGCTACTGATCGTCGCCTTGGTGGGCCTTTACCCC
+GCCAACCAGCTAATCAGACGCGGGTCCATCTTGCACCACCGGAGTTTTTCACACTGCTTCATGCGAAGCTGTGCGCTTAT
+GCGGTATTAGCACCTATTTCTAAGTGTTATCCCCCAGTATACGGCAGGTTCTCCACGCGTTACT"""
+
+reference_seqs_fp = """>seq1
+CTGGTCCGTGTCTCAGTACCAGTGTGGGGGACCTTCCTCTCAGAACCCCTACGCATCGTCGGCTTGGTGGTCCGTTACAC
+CGCCAACTACCTAATGCGACGCATGCCCATCCGCTACCGGATCGCTCCTTTGGAATCCCGGGGATGTCCCCGGAACTCGT
+TATGCGGTATTAGACGGAATTTCTTCCGCTTATCCCCCTGTAGCGGGCAGGTTGCATACGTGTTACTCACCCGTGCGCCG
+GTCGCCGG
+>seq2
+TTGGACCGTGTCTCAGTTCCAATGTGGGGGACCTTCCTCTCAGAACCCCTATCCATCGAAGACTAGGTGGGCCGTTACCC
+CGCCTACTATCTAATGGAACGCATCCCCATCGTCTACCGGAATACCTTTAATCATGTGAACATGCGGACTCATGATGCCA
+TCTTGTATTAATCTTCCTTTCAGAAGGCTGTCCAAGAGTAGACGGCAGGTTGGATACGTGTTACTCACCCGG
+>seq3
+TTGGGCCGTGTCTCAGTCCCAATGTGGCCGTTCACCCTCTCAGGCCGGCTACTGATCGTCGCCTTGGTGGGCTGTTACCC
+CGCCAACCAGCTAATCAGACGCGGATCCATCGTATACCACCGGAGTTTTTCACACTGCTTCATGCGAAGCTGTGCGCTTA
+TGCGGTATTAGCACCTATTTCTAAGTGTTATCCCCCAGTATACGGCAGGTTCTCCACGCGTT
+>mixed_seq
+TTGGGCCGTGTCTCAGTCCCAATGTGGCCGTCCACCCTCTCAGGCCGGCTACTGATCGTCGCCTTGGTGGGCCTTTACCC
+CGCCAACCAGCTAATCAGACGCGGGTCCATCTTGCAACATATTTCGGGACAGATTAACACACAAAGGATTTACACAAAAT
+ACATTAGACCAAACCCCAAGATTTAGACAGGATTACAGGATTTACAGATTTTTACCAACATTAGACAGGGG"""
+
+dna_seqs_with_dups = """>seq1
+GCCAACCAGCTAATCAGACGCGGGTCCATCTTGCACCACCGGAGTTTTTCACACTGCTTCATGCGAAGCTGTGCGCTTAA
+>seq2
+TTGGGCCGTGTCTCAGTCCCAATGTGGCCGTCACCCTCTCAGGCCGGCTACTGATCGTCGCCTTGGTGGGCCTTTACCCC
+>seq3
+TTGGGCCGTGTCTCAGTCCCAATGTGGCCGTCACCCTCTCAGGCCGGCTACTGATCGTCGCCTTGGTGGGCCTTTACCCC
+>seq4
+GCCAACCAGCTAATCAGACGCGGGTCCATCTTGCACCACCGGAGTTTTTCACACTGCTTCATGCGAAGCTGTGCGCTT"""
+
+dna_seqs_with_dups2 = """>seq1
+GCCAACCAGCTAATCAGACGCGGGTCCATCTTGCACCACCGGAGTTTTTCACACTGCTTCATGCGAAGCTGTGCGCTTAA
+>seq2
+TTGGGCCGTGTCTCAGTCCCAATGTGGCCGTCACCCTCTCAGGCCGGCTACTGATCGTCGCCTTGGTGGGCCTTTACCCC
+>seq3
+TTGGGCCGTGTCTCAGTCCCAATGTGGCCGTCACCCTCTCAGGCCGGCTACTGATCGTCGCCTTGGTGGGCCTTTACCCC
+>seq4
+TTGGGCCGTGTCTCAGTCCCAATGTGGCCGTCACCCTCTCAGGCCGGCTACTGATCGTCGCCTTGGTGGGCCTTTACCCC"""
+
+
+# Expected output file data
+uc_lines1 = """# usearch --id 0.97 --uc usearch_picked_otus/assign_reads_to_otus.uc --query seqs.fna --global --db usearch_picked_otus/enumerated_otus.fasta
+# version=4.2.66
+# Tab-separated fields:
+# 1=Type, 2=ClusterNr, 3=SeqLength or ClusterSize, 4=PctId, 5=Strand, 6=QueryStart, 7=SeedStart, 8=Alignment, 9=QueryLabel, 10=TargetLabel
+# Record types (field 1): L=LibSeed, S=NewSeed, H=Hit, R=Reject, D=LibCluster, C=NewCluster, N=NoHit
+# For C and D types, PctId is average id with seed.
+# QueryStart and SeedStart are zero-based relative to start of sequence.
+# If minus strand, SeedStart is relative to reverse-complemented seed.
+H\t42\t217\t99.1\t+\t0\t0\t217MI\tPC.test2_1 FLP3FBN02xELBSXx orig_bc=ACAGAGTCGGCG new_bc=ACAGAGTCGGCG,FLP3FBN02x bc_diffs=0\t42
+H\t42\t217\t99.1\t+\t0\t0\t217MI\tPC.test1_2 FLP3FBN03ELBSXx orig_bc=ACAGAGTCGGCG new_bc=ACAGAGTCGGCG,FLP3FBN03 bc_diffs=0\t42
+H\t42\t217\t99.1\t+\t0\t0\t217MI\tPC.634_3 FLP3FBN01ELBSX orig_bc=TCAGAGTCGGCT new_bc=ACAGAGTCGGCT,FLP3FBN01 bc_diffs=1\t42
+H\t19\t243\t100.0\t+\t0\t0\t25MI218M\tPC.634_4 FLP3FBN01EG8AX orig_bc=ACAGAGTCGGCT new_bc=ACAGAGTCGGCT,FLP3FBN01 bc_diffs=0\t19
+N\t*\t219\t*\t*\t*\t*\t*\tPC.481_6\tFLP3FBN01DEHK3 orig_bc=ACCAGCGACTAG new_bc=ACCAGCGACTAG,FLP3FBN01 bc_diffs=0\t*
+H\t6\t211\t99.5\t+\t0\t0\t211M\tPC.269_5 FLP3FBN01EEWKD orig_bc=AGCACGAGCCTA new_bc=AGCACGAGCCTA,FLP3FBN01 bc_diffs=0\t6
+""".split('\n')
+
+expected_otu_assignments = """# Tab-separated fields:
+# 1=Type, 2=ClusterNr, 3=SeqLength or ClusterSize, 4=PctId, 5=Strand, 6=QueryStart, 7=SeedStart, 8=Alignment, 9=QueryLabel, 10=TargetLabel
+# Record types (field 1): L=LibSeed, S=NewSeed, H=Hit, R=Reject, D=LibCluster, C=NewCluster, N=NoHit
+# For C and D types, PctId is average id with seed.
+# QueryStart and SeedStart are zero-based relative to start of sequence.
+# If minus strand, SeedStart is relative to reverse-complemented seed.
+H\t2\t199\t97.5\t.\t0\t0\t119M80D\tref1 ecoli sequence\tusearch_ecoli_seq2
+N\t*\t178\t*\t*\t*\t*\t*\tEU199232 1 1236 Bacteria/Deltaproteobacteria/Desulfurella - Hippea/uncultured\t*
+H\t1\t180\t100.0\t.\t0\t0\t97M83D\tL07864 1 1200 Bacteria/Beta Gammaproteobacteria/Solemya symbiont\tSolemya seq""".split('\n')
+
+expected_enumerated_fasta = """>0
+AACCCCCACGGTGGATGCCACACGCCCCATACAAAGGGTAGGATGCTTAAGACACATCGCGTCAGGTTTGTGTCAGGCCT
+>1
+ACCCACACGGTGGATGCAACAGATCCCATACACCGAGTTGGATGCTTAAGACGCATCGCGTGAGTTTTGCGTCAAGGCT
+>2
+CCCCCACGGTGGCAGCAACACGTCACATACAACGGGTTGGATTCTAAAGACAAACCGCGTCAAAGTTGTGTCAGAACT
+>3
+CCCCACGGTAGCTGCAACACGTCCCATACCACGGGTAGGATGCTAAAGACACATCGGGTCTGTTTTGTGTCAGGGCT
+>4
+GCCACGGTGGGTACAACACGTCCACTACATCGGCTTGGAAGGTAAAGACACGTCGCGTCAGTATTGCGTCAGGGCT
+>5
+CCGCGGTAGGTGCAACACGTCCCATACAACGGGTTGGAAGGTTAAGACACAACGCGTTAATTTTGTGTCAGGGCA
+>6
+CGCGGTGGCTGCAAGACGTCCCATACAACGGGTTGGATGCTTAAGACACATCGCAACAGTTTTGAGTCAGGGCT
+>7
+ACGGTGGCTACAAGACGTCCCATCCAACGGGTTGGATACTTAAGGCACATCACGTCAGTTTTGTGTCAGAGCT
+>8
+CGGTGGCTGCAACACGTGGCATACAACGGGTTGGATGCTTAAGACACATCGCCTCAGTTTTGTGTCAGGGCT
+>9
+GGTGGCTGAAACACATCCCATACAACGGGTTGGATGCTTAAGACACATCGCATCAGTTTTATGTCAGGGGA""".split('\n')
+
+expected_enumerated_fasta_added_options = """>Big255Ern\tuclust_test_seqs_0 some comment0
+AACCCCCACGGTGGATGCCACACGCCCCATACAAAGGGTAGGATGCTTAAGACACATCGCGTCAGGTTTGTGTCAGGCCT
+>Big256Ern\tuclust_test_seqs_1 some comment1
+ACCCACACGGTGGATGCAACAGATCCCATACACCGAGTTGGATGCTTAAGACGCATCGCGTGAGTTTTGCGTCAAGGCT
+>Big257Ern\tuclust_test_seqs_2 some comment2
+CCCCCACGGTGGCAGCAACACGTCACATACAACGGGTTGGATTCTAAAGACAAACCGCGTCAAAGTTGTGTCAGAACT
+>Big258Ern\tuclust_test_seqs_3 some comment3
+CCCCACGGTAGCTGCAACACGTCCCATACCACGGGTAGGATGCTAAAGACACATCGGGTCTGTTTTGTGTCAGGGCT
+>Big259Ern\tuclust_test_seqs_4 some comment4
+GCCACGGTGGGTACAACACGTCCACTACATCGGCTTGGAAGGTAAAGACACGTCGCGTCAGTATTGCGTCAGGGCT
+>Big260Ern\tuclust_test_seqs_5 some comment4_again
+CCGCGGTAGGTGCAACACGTCCCATACAACGGGTTGGAAGGTTAAGACACAACGCGTTAATTTTGTGTCAGGGCA
+>Big261Ern\tuclust_test_seqs_6 some comment6
+CGCGGTGGCTGCAAGACGTCCCATACAACGGGTTGGATGCTTAAGACACATCGCAACAGTTTTGAGTCAGGGCT
+>Big262Ern\tuclust_test_seqs_7 some comment7
+ACGGTGGCTACAAGACGTCCCATCCAACGGGTTGGATACTTAAGGCACATCACGTCAGTTTTGTGTCAGAGCT
+>Big263Ern\tuclust_test_seqs_8 some comment8
+CGGTGGCTGCAACACGTGGCATACAACGGGTTGGATGCTTAAGACACATCGCCTCAGTTTTGTGTCAGGGCT
+>Big264Ern\tuclust_test_seqs_9 some comment9
+GGTGGCTGAAACACATCCCATACAACGGGTTGGATGCTTAAGACACATCGCATCAGTTTTATGTCAGGGGA""".split('\n')
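+
+# In expected_enumerated_fasta_added_options above, each label is
+# label_prefix + count + label_suffix ('Big' + n + 'Ern', counting up
+# from count_start=255), with the original label kept after a tab
+# because retain_label_as_comment=True is passed to enumerate_otus.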
+
+expected_clusters_w_abundance_default_settings = """>Cluster1;size=326
+AACCCCCACGGTGGATGCCACACGCCCCATACAAAGGGTAGGATGCTTAAGACACATCGCGTCAGGTTTGTGTCAGGCCT
+AACCCCCACGGTGGATGCCACACGCCCCATACAAAGGGTAGGATGCTTAAGACACATCGCGTCAGGTTTGTGTCAGGCCT""".split('\n')
+
+expected_clusters_w_abundance_low_setting = """>Cluster1;size=304
+AACCCCCACGGTGGATGCCACACGCCCCATACAAAGGGTAGGATGCTTAAGACACATCGCGTCAGGTTTGTGTCAGGCCT
+AACCCCCACGGTGGATGCCACACGCCCCATACAAAGGGTAGGATGCTTAAGACACATCGCGTCAGGTTTGTGTCAGGCCT
+>Cluster11;size=22
+AACCCCCACGGTGGATGCCACACGCCCCATACAAAGGGTGCTTAAGACACATCGCGTCAGGTTTGTGTCAGGCCTAACCC
+CCACGGTGGATGCCACACGCCCCATACAAAGGGTAGGATGCTTAAGACACATCGCGTCAGGTTTGTGTCAGGCCT""".split('\n')
+
+expected_reference_filtered_seqs = """>usearch_ecoli_seq
+CGCGTGTATGAAGAAGGCCTTCGGGTTGTAAAGTACTTTCAGCGGGGAGGAGGGAGTAAAGTTAATACCTTTGCTCATTG
+ACGTTACCCGCAGAAGAAGCACCGGCTAACTCCGT
+>Solemya seq
+GGCTCAGATTGAACGCTGGCGGCATGCCTAACACATGCAAGTCGAACGGTAACAGGCGGAGCTTGCTCTGCGCTGACGAG
+TGGCGGACGGGTGAGTA
+>usearch_ecoli_seq2
+CGCGTGTATGAAGAAGGCCTTCGGGTTGTAAAGTACTTTCAGCGGGGAGGAGGGAGTAAAGTTAATACCTTTGCTCATTG
+ACGTTACCCGCAGAAGAAGCACCGGCTAACTCCGTCCAT
+>Solemya_seq2
+GGCTCAGATTGAACGCTGGCGGCATGCCTAACACATGCAAGTCGAACGGTAACAGGCGGAGCTTGCTCTGCGCTGACGAG
+TGGCGGACGGGTGAGTATCAAG""".split('\n')
+
+expected_de_novo_chimeras_default = """>Cluster1;size=52
+CTGGTCCGTGTCTCAGTACCAGTGTGGGGGACCTTCCTCTCAGAACCCCTACGCATCGTCGGCTTGGTGGTCCGTTACAC
+CGCCAACTACCTAATGCGACGCATGCCCATCCGCTACCGGATCGCTCCTTTGGAATCCCGGGGATGTCCCCGGAACTCGT
+TATGCGGTATTAGACGGAATTTCTTCCGCTTATCCCCCTGTAGCGGGCAGGTTGCATACGTGTTACTCACCCGTGCGCCG
+GTCGCCGG
+>Cluster0;size=50
+TTGGACCGTGTCTCAGTTCCAATGTGGGGGACCTTCCTCTCAGAACCCCTATCCATCGAAGACTAGGTGGGCCGTTACCC
+CGCCTACTATCTAATGGAACGCATCCCCATCGTCTACCGGAATACCTTTAATCATGTGAACATGCGGACTCATGATGCCA
+TCTTGTATTAATCTTCCTTTCAGAAGGCTGTCCAAGAGTAGACGGCAGGTTGGATACGTGTTACTCACCCGG
+>Cluster2;size=45
+TTGGGCCGTGTCTCAGTCCCAATGTGGCCGTTCACCCTCTCAGGCCGGCTACTGATCGTCGCCTTGGTGGGCTGTTACCC
+CGCCAACCAGCTAATCAGACGCGGATCCATCGTATACCACCGGAGTTTTTCACACTGCTTCATGCGAAGCTGTGCGCTTA
+TGCGGTATTAGCACCTATTTCTAAGTGTTATCCCCCAGTATACGGCAGGTTCTCCACGCGTT
+>Cluster10;size=43
+CTGGGCCGTGTCTCAGTCCCAATGTGGCCGTCCGCCCTCTCAGGCCGGCTATGCATCATCGTCTTGGTGGGCCTTTACCC
+CGCCAACCAACTAATGCACCGCAGGTCCATCCGCGCCCCATCCCCTAAAGGATGTTTCACAGAAAGAAGATGCCTCCTTC
+CTGTACATCGGGATTTGTTCTCCGTTTCCAGAGCGTATTCCCGGTGCGCGGGCAGGTTCCCTACGTGTTACTCACCCG
+>Cluster4;size=40
+TTGGGCCGTGTCTCAGTCCCAATGTGGCCGTCCACCCTCTCAGGCCGGCTACTGATCGTCGCCTTGGTGGGCCTTTACCC
+CGCCAACCAGCTAATCAGACGCGGGTCCATCTTGCACCACCGGAGTTTTTCACACTGTCCCATGCAGGACCGTGCGCTTA
+TGCGGTATTAGCACCTATTTCTAAGTGTTATCCCCCAGTGCAAGGCAGGTTACCCACGCGTTACTCACCCGTCCG
+>Cluster6;size=40
+CTGGTCCGTGTCTCAGTACCAGTGTGGGGGACCTTCCTCTCAGAACCCCTACGCATCGTCGCCTCGGTGGGCCGTTACCC
+CGCCGACTAGCTAATGCGCCGCATGGCCATCCGCAGCCGATAAATCTTTAAACATCGGGAGATGCCTCCCAACGTTGTTA
+CGCGGTATTAGACGGAATTTCTTCCGCTTATCCCCCTGCTGCGGGCAGGTTCCATACGTGTTACTCACCCGTGCGCCGG
+>Cluster3;size=30
+TTGGGCCGTGTCTCAGTCCCAATGTGGCCGATCAGTCTCTCAACTCGGCTATGCATCATTGCCTTGGTAAGCCGTTACCT
+TACCAACTAGCTAATGCACCGCAGGTCCATCCAAGAGTGATAGCAGAACCATCTTTCAAACTCTAGACATGCGTCTAGTG
+TTGTTATCCGGTATTAGCATCTGTTTCCAGGTGTTATCCCAGTCTCTTGGG
+>Cluster12;size=19
+TTGGTCCGTGTCTCAGTACCAATGTGGGGGGTTAACCTCTCAGTCCCCCTATGTATCGTGGTCTTGGTGAGCCGTTACCC
+CACCAACTAACTAATACAACGCATGCCCATCCATTACCACCGGAGTTTTCAACCCAAGAAGATGCCTCCCTGGATGTTAT
+GGGGTATTAGTACCGATTTCTCAGTGTTATCCCCCTGTAATGGGTAGGTTGCATACGCGTTACGCACCCGTGCGCCGGTC
+GCCGACAAT
+>Cluster29;size=18
+CTGGTCCGTGTCTCAGTACCAGTGTGGGGGACCTTCCTCTCAGAACCCCTACGCATCGTCGCCTCGGTGGGCCGTTACCC
+CGCCGACTAGCTAATGCGCCGCATGCCCATCCGCCACCGGTAATCCCTTTGGCGGCACCGGGATGCCCCGACGCCGCGTC
+ACGCGGTATTAGACGGAATTTCTTCCGCTTATCCCCCTGTGGCGGGCAGGTTGCATACGTGTTACTCACCCGTGCGCCGG
+TCGCCGG
+>Cluster30;size=18
+CTGGTCCGTGTCTCAGTACCAGTGTGGGGGACCTTCCTCTCAGAACCCCTACGCATCGTCGCCTTGGTGGGCCGTTACCC
+CGCCAACTAGCTAATGCGCCGCATGGCCATCCGTAGCCGGTGTTACCCTTTAAACCCCAAGAGATGCCTCTCGGAGTTAT
+TACGCGGTATTAGACGGAATTTCTTCCGCTTATCCCCCTGCTACGGGCAGGTTCCATACGTGTTACTCACCCGTGCGCCG
+GTCGCCGG
+>Cluster16;size=16
+CTGGGCCGTGTCTCAGTCCCAATGTGGCCGTTCACCCTCTCAGGCCGGCTACTGATCGTCGCCTTGGTGGGCCGTTACCC
+CTCCAACCAGCTAATCAGACGCGGGTCCATCCTGTACCACCGGAGTTTTTCACACTGTACCATGCGGTACTGTGCGCTTA
+TGCGGTTTTAGCACCTATTTCTAAGTGTTATCCCCCTGTACAGGGCAGGTTACCCACGCGTTACTCACCCGTCCGCCACT
+>Cluster221;size=1
+CTGGGCCGTATCTCAGTCCCAATGTGGCCGTTCAACCTCTCAGTCCGGCTACTGATCGTCGCCTTGGTAGGCCGTTGCCC
+CGCCAACTACCTAATCGGACGCGAGCCCATCTTTCAGCGGATTGCTCCTTTGATTATCTCACCATGCGGCAAAATAATGT
+CATGCGGTATTAGCGTTCGTTTCCAAACGTTATCCCCCTCTGAAAGGCAGGTTGCTCACGCGTT
+>Cluster218;size=1
+TTGGGCCGTGTCTCAGTCCCAATGTGGCCGGCCACCCTCTCAGGTCGGCTACTGATCGTCACCTTGGTAGGCCGTTACCC
+CACCAACTAGCTAATCAGACGCAAGCCCATCTATCAGCGGATTGCTCCTTTTCTAGCTATATCATGCGATACTACTAGCT
+TATGCGGTATTAGCAATGATTTCTCACTGTTATTCCCCTCTGATAGGCAGG
+>Cluster217;size=1
+TTGGGCCGTGTCTCAGTCCCAATGTGGCCGTCCACCCTCTCAGGCCGGCTACTGATCGTCGCCTTGGTGGGCCTTTACCC
+CGCCAACCAGCTAATCAGACGCGAGTCCATCTCAGAGCGATAAATCTTTGATATCCAGAGCCATGCGACCCAGATATATT
+ATGCGGTATTAGCAGCTGTTTCCAGCTGTTATTCCCCATCCAAGGCAGGTT
+>Cluster216;size=1
+CTGGGCCGTGTCTCAGTCCCAGTGTGGCCGTCCGCCCTCTCAGGTCAGCTACTGATCGTCGCCTTGGTAGGCCATTACCC
+TACCAACTAGCTAATCAGACGCGAGGCCATCTCTCAGCGATAAATCTTTGATATATCTGCCATGCGACAAACATATATTA
+TGCGGTATTAGCAGTCGTTTCCAACTGTTGTCCCCCTCTGAAAGGCAGGTT""".split('\n')
+
+expected_de_novo_chimera_filtered_skew11 = """>Cluster1;size=52
+CTGGTCCGTGTCTCAGTACCAGTGTGGGGGACCTTCCTCTCAGAACCCCTACGCATCGTCGGCTTGGTGGTCCGTTACAC
+CGCCAACTACCTAATGCGACGCATGCCCATCCGCTACCGGATCGCTCCTTTGGAATCCCGGGGATGTCCCCGGAACTCGT
+TATGCGGTATTAGACGGAATTTCTTCCGCTTATCCCCCTGTAGCGGGCAGGTTGCATACGTGTTACTCACCCGTGCGCCG
+GTCGCCGG
+>Cluster0;size=50
+TTGGACCGTGTCTCAGTTCCAATGTGGGGGACCTTCCTCTCAGAACCCCTATCCATCGAAGACTAGGTGGGCCGTTACCC
+CGCCTACTATCTAATGGAACGCATCCCCATCGTCTACCGGAATACCTTTAATCATGTGAACATGCGGACTCATGATGCCA
+TCTTGTATTAATCTTCCTTTCAGAAGGCTGTCCAAGAGTAGACGGCAGGTTGGATACGTGTTACTCACCCGG
+>Cluster2;size=45
+TTGGGCCGTGTCTCAGTCCCAATGTGGCCGTTCACCCTCTCAGGCCGGCTACTGATCGTCGCCTTGGTGGGCTGTTACCC
+CGCCAACCAGCTAATCAGACGCGGATCCATCGTATACCACCGGAGTTTTTCACACTGCTTCATGCGAAGCTGTGCGCTTA
+TGCGGTATTAGCACCTATTTCTAAGTGTTATCCCCCAGTATACGGCAGGTTCTCCACGCGTT
+>Cluster10;size=43
+CTGGGCCGTGTCTCAGTCCCAATGTGGCCGTCCGCCCTCTCAGGCCGGCTATGCATCATCGTCTTGGTGGGCCTTTACCC
+CGCCAACCAACTAATGCACCGCAGGTCCATCCGCGCCCCATCCCCTAAAGGATGTTTCACAGAAAGAAGATGCCTCCTTC
+CTGTACATCGGGATTTGTTCTCCGTTTCCAGAGCGTATTCCCGGTGCGCGGGCAGGTTCCCTACGTGTTACTCACCCG
+>Cluster4;size=40
+TTGGGCCGTGTCTCAGTCCCAATGTGGCCGTCCACCCTCTCAGGCCGGCTACTGATCGTCGCCTTGGTGGGCCTTTACCC
+CGCCAACCAGCTAATCAGACGCGGGTCCATCTTGCACCACCGGAGTTTTTCACACTGTCCCATGCAGGACCGTGCGCTTA
+TGCGGTATTAGCACCTATTTCTAAGTGTTATCCCCCAGTGCAAGGCAGGTTACCCACGCGTTACTCACCCGTCCG
+>Cluster6;size=40
+CTGGTCCGTGTCTCAGTACCAGTGTGGGGGACCTTCCTCTCAGAACCCCTACGCATCGTCGCCTCGGTGGGCCGTTACCC
+CGCCGACTAGCTAATGCGCCGCATGGCCATCCGCAGCCGATAAATCTTTAAACATCGGGAGATGCCTCCCAACGTTGTTA
+CGCGGTATTAGACGGAATTTCTTCCGCTTATCCCCCTGCTGCGGGCAGGTTCCATACGTGTTACTCACCCGTGCGCCGG
+>Cluster3;size=30
+TTGGGCCGTGTCTCAGTCCCAATGTGGCCGATCAGTCTCTCAACTCGGCTATGCATCATTGCCTTGGTAAGCCGTTACCT
+TACCAACTAGCTAATGCACCGCAGGTCCATCCAAGAGTGATAGCAGAACCATCTTTCAAACTCTAGACATGCGTCTAGTG
+TTGTTATCCGGTATTAGCATCTGTTTCCAGGTGTTATCCCAGTCTCTTGGG
+>Cluster12;size=19
+TTGGTCCGTGTCTCAGTACCAATGTGGGGGGTTAACCTCTCAGTCCCCCTATGTATCGTGGTCTTGGTGAGCCGTTACCC
+CACCAACTAACTAATACAACGCATGCCCATCCATTACCACCGGAGTTTTCAACCCAAGAAGATGCCTCCCTGGATGTTAT
+GGGGTATTAGTACCGATTTCTCAGTGTTATCCCCCTGTAATGGGTAGGTTGCATACGCGTTACGCACCCGTGCGCCGGTC
+GCCGACAAT
+>Cluster29;size=18
+CTGGTCCGTGTCTCAGTACCAGTGTGGGGGACCTTCCTCTCAGAACCCCTACGCATCGTCGCCTCGGTGGGCCGTTACCC
+CGCCGACTAGCTAATGCGCCGCATGCCCATCCGCCACCGGTAATCCCTTTGGCGGCACCGGGATGCCCCGACGCCGCGTC
+ACGCGGTATTAGACGGAATTTCTTCCGCTTATCCCCCTGTGGCGGGCAGGTTGCATACGTGTTACTCACCCGTGCGCCGG
+TCGCCGG
+>Cluster30;size=18
+CTGGTCCGTGTCTCAGTACCAGTGTGGGGGACCTTCCTCTCAGAACCCCTACGCATCGTCGCCTTGGTGGGCCGTTACCC
+CGCCAACTAGCTAATGCGCCGCATGGCCATCCGTAGCCGGTGTTACCCTTTAAACCCCAAGAGATGCCTCTCGGAGTTAT
+TACGCGGTATTAGACGGAATTTCTTCCGCTTATCCCCCTGCTACGGGCAGGTTCCATACGTGTTACTCACCCGTGCGCCG
+GTCGCCGG
+>Cluster16;size=16
+CTGGGCCGTGTCTCAGTCCCAATGTGGCCGTTCACCCTCTCAGGCCGGCTACTGATCGTCGCCTTGGTGGGCCGTTACCC
+CTCCAACCAGCTAATCAGACGCGGGTCCATCCTGTACCACCGGAGTTTTTCACACTGTACCATGCGGTACTGTGCGCTTA
+TGCGGTTTTAGCACCTATTTCTAAGTGTTATCCCCCTGTACAGGGCAGGTTACCCACGCGTTACTCACCCGTCCGCCACT
+>Cluster522;size=10
+TTGGGCCGTGTCTCAGTCCCAATGTGGCCGTCACCCTCTCAGGCCGGCTACTGATCGTCGCCTTGGTGGGCCTTTACCCC
+GCCAACCAGCTAATCAGACGCGGGTCCATCTTGCACCACCGGAGTTTTTCACACTGCTTCATGCGAAGCTGTGCGCTTAT
+GCGGTATTAGCACCTATTTCTAAGTGTTATCCCCCAGTATACGGCAGGTTCTCCACGCGTTACT
+>Cluster221;size=1
+CTGGGCCGTATCTCAGTCCCAATGTGGCCGTTCAACCTCTCAGTCCGGCTACTGATCGTCGCCTTGGTAGGCCGTTGCCC
+CGCCAACTACCTAATCGGACGCGAGCCCATCTTTCAGCGGATTGCTCCTTTGATTATCTCACCATGCGGCAAAATAATGT
+CATGCGGTATTAGCGTTCGTTTCCAAACGTTATCCCCCTCTGAAAGGCAGGTTGCTCACGCGTT
+>Cluster218;size=1
+TTGGGCCGTGTCTCAGTCCCAATGTGGCCGGCCACCCTCTCAGGTCGGCTACTGATCGTCACCTTGGTAGGCCGTTACCC
+CACCAACTAGCTAATCAGACGCAAGCCCATCTATCAGCGGATTGCTCCTTTTCTAGCTATATCATGCGATACTACTAGCT
+TATGCGGTATTAGCAATGATTTCTCACTGTTATTCCCCTCTGATAGGCAGG
+>Cluster217;size=1
+TTGGGCCGTGTCTCAGTCCCAATGTGGCCGTCCACCCTCTCAGGCCGGCTACTGATCGTCGCCTTGGTGGGCCTTTACCC
+CGCCAACCAGCTAATCAGACGCGAGTCCATCTCAGAGCGATAAATCTTTGATATCCAGAGCCATGCGACCCAGATATATT
+ATGCGGTATTAGCAGCTGTTTCCAGCTGTTATTCCCCATCCAAGGCAGGTT
+>Cluster216;size=1
+CTGGGCCGTGTCTCAGTCCCAGTGTGGCCGTCCGCCCTCTCAGGTCAGCTACTGATCGTCGCCTTGGTAGGCCATTACCC
+TACCAACTAGCTAATCAGACGCGAGGCCATCTCTCAGCGATAAATCTTTGATATATCTGCCATGCGACAAACATATATTA
+TGCGGTATTAGCAGTCGTTTCCAACTGTTGTCCCCCTCTGAAAGGCAGGTT""".split('\n')
+
+expected_cluster_err_seqs = """>Cluster0;size=326
+AACCCCCACGGTGGATGCCACACGCCCCATACAAAGGGTAGGATGCTTAAGACACATCGCGTCAGGTTTGTGTCAGGCCT
+AACCCCCACGGTGGATGCCACACGCCCCATACAAAGGGTAGGATGCTTAAGACACATCGCGTCAGGTTTGTGTCAGGCCT""".split('\n')
+
+expected_sorted_by_abundance_no_filter = """>Cluster1;size=52
+CTGGTCCGTGTCTCAGTACCAGTGTGGGGGACCTTCCTCTCAGAACCCCTACGCATCGTCGGCTTGGTGGTCCGTTACAC
+CGCCAACTACCTAATGCGACGCATGCCCATCCGCTACCGGATCGCTCCTTTGGAATCCCGGGGATGTCCCCGGAACTCGT
+TATGCGGTATTAGACGGAATTTCTTCCGCTTATCCCCCTGTAGCGGGCAGGTTGCATACGTGTTACTCACCCGTGCGCCG
+GTCGCCGG
+>Cluster0;size=50
+TTGGACCGTGTCTCAGTTCCAATGTGGGGGACCTTCCTCTCAGAACCCCTATCCATCGAAGACTAGGTGGGCCGTTACCC
+CGCCTACTATCTAATGGAACGCATCCCCATCGTCTACCGGAATACCTTTAATCATGTGAACATGCGGACTCATGATGCCA
+TCTTGTATTAATCTTCCTTTCAGAAGGCTGTCCAAGAGTAGACGGCAGGTTGGATACGTGTTACTCACCCGG
+>Cluster2;size=45
+TTGGGCCGTGTCTCAGTCCCAATGTGGCCGTTCACCCTCTCAGGCCGGCTACTGATCGTCGCCTTGGTGGGCTGTTACCC
+CGCCAACCAGCTAATCAGACGCGGATCCATCGTATACCACCGGAGTTTTTCACACTGCTTCATGCGAAGCTGTGCGCTTA
+TGCGGTATTAGCACCTATTTCTAAGTGTTATCCCCCAGTATACGGCAGGTTCTCCACGCGTT
+>Cluster10;size=43
+CTGGGCCGTGTCTCAGTCCCAATGTGGCCGTCCGCCCTCTCAGGCCGGCTATGCATCATCGTCTTGGTGGGCCTTTACCC
+CGCCAACCAACTAATGCACCGCAGGTCCATCCGCGCCCCATCCCCTAAAGGATGTTTCACAGAAAGAAGATGCCTCCTTC
+CTGTACATCGGGATTTGTTCTCCGTTTCCAGAGCGTATTCCCGGTGCGCGGGCAGGTTCCCTACGTGTTACTCACCCG
+>Cluster4;size=40
+TTGGGCCGTGTCTCAGTCCCAATGTGGCCGTCCACCCTCTCAGGCCGGCTACTGATCGTCGCCTTGGTGGGCCTTTACCC
+CGCCAACCAGCTAATCAGACGCGGGTCCATCTTGCACCACCGGAGTTTTTCACACTGTCCCATGCAGGACCGTGCGCTTA
+TGCGGTATTAGCACCTATTTCTAAGTGTTATCCCCCAGTGCAAGGCAGGTTACCCACGCGTTACTCACCCGTCCG
+>Cluster6;size=40
+CTGGTCCGTGTCTCAGTACCAGTGTGGGGGACCTTCCTCTCAGAACCCCTACGCATCGTCGCCTCGGTGGGCCGTTACCC
+CGCCGACTAGCTAATGCGCCGCATGGCCATCCGCAGCCGATAAATCTTTAAACATCGGGAGATGCCTCCCAACGTTGTTA
+CGCGGTATTAGACGGAATTTCTTCCGCTTATCCCCCTGCTGCGGGCAGGTTCCATACGTGTTACTCACCCGTGCGCCGG
+>Cluster3;size=30
+TTGGGCCGTGTCTCAGTCCCAATGTGGCCGATCAGTCTCTCAACTCGGCTATGCATCATTGCCTTGGTAAGCCGTTACCT
+TACCAACTAGCTAATGCACCGCAGGTCCATCCAAGAGTGATAGCAGAACCATCTTTCAAACTCTAGACATGCGTCTAGTG
+TTGTTATCCGGTATTAGCATCTGTTTCCAGGTGTTATCCCAGTCTCTTGGG
+>Cluster12;size=19
+TTGGTCCGTGTCTCAGTACCAATGTGGGGGGTTAACCTCTCAGTCCCCCTATGTATCGTGGTCTTGGTGAGCCGTTACCC
+CACCAACTAACTAATACAACGCATGCCCATCCATTACCACCGGAGTTTTCAACCCAAGAAGATGCCTCCCTGGATGTTAT
+GGGGTATTAGTACCGATTTCTCAGTGTTATCCCCCTGTAATGGGTAGGTTGCATACGCGTTACGCACCCGTGCGCCGGTC
+GCCGACAAT
+>Cluster29;size=18
+CTGGTCCGTGTCTCAGTACCAGTGTGGGGGACCTTCCTCTCAGAACCCCTACGCATCGTCGCCTCGGTGGGCCGTTACCC
+CGCCGACTAGCTAATGCGCCGCATGCCCATCCGCCACCGGTAATCCCTTTGGCGGCACCGGGATGCCCCGACGCCGCGTC
+ACGCGGTATTAGACGGAATTTCTTCCGCTTATCCCCCTGTGGCGGGCAGGTTGCATACGTGTTACTCACCCGTGCGCCGG
+TCGCCGG
+>Cluster30;size=18
+CTGGTCCGTGTCTCAGTACCAGTGTGGGGGACCTTCCTCTCAGAACCCCTACGCATCGTCGCCTTGGTGGGCCGTTACCC
+CGCCAACTAGCTAATGCGCCGCATGGCCATCCGTAGCCGGTGTTACCCTTTAAACCCCAAGAGATGCCTCTCGGAGTTAT
+TACGCGGTATTAGACGGAATTTCTTCCGCTTATCCCCCTGCTACGGGCAGGTTCCATACGTGTTACTCACCCGTGCGCCG
+GTCGCCGG
+>Cluster16;size=16
+CTGGGCCGTGTCTCAGTCCCAATGTGGCCGTTCACCCTCTCAGGCCGGCTACTGATCGTCGCCTTGGTGGGCCGTTACCC
+CTCCAACCAGCTAATCAGACGCGGGTCCATCCTGTACCACCGGAGTTTTTCACACTGTACCATGCGGTACTGTGCGCTTA
+TGCGGTTTTAGCACCTATTTCTAAGTGTTATCCCCCTGTACAGGGCAGGTTACCCACGCGTTACTCACCCGTCCGCCACT
+>Cluster522;size=10
+TTGGGCCGTGTCTCAGTCCCAATGTGGCCGTCACCCTCTCAGGCCGGCTACTGATCGTCGCCTTGGTGGGCCTTTACCCC
+GCCAACCAGCTAATCAGACGCGGGTCCATCTTGCACCACCGGAGTTTTTCACACTGCTTCATGCGAAGCTGTGCGCTTAT
+GCGGTATTAGCACCTATTTCTAAGTGTTATCCCCCAGTATACGGCAGGTTCTCCACGCGTTACT
+>Cluster222;size=1
+TTGGGCCGTGTCTCAGTCCCAATGTGGCCGTCACCCTCTCAGGCCGGCTACTGATCGTCGCCTTGGTGGGCCTTTACCCC
+GCCAACCAGCTAATCAGACGCGGGTCCATCTTGCACCACCGGAGTTTTTCACACTGCTTCATGCGAAGCTGTGCGCTTAT
+GCGGTATTAGCACCTATTTCTAAGTGTTATCCCCCAGTATACGGCAGGTTCTCCACGCGTTACT
+>Cluster221;size=1
+CTGGGCCGTATCTCAGTCCCAATGTGGCCGTTCAACCTCTCAGTCCGGCTACTGATCGTCGCCTTGGTAGGCCGTTGCCC
+CGCCAACTACCTAATCGGACGCGAGCCCATCTTTCAGCGGATTGCTCCTTTGATTATCTCACCATGCGGCAAAATAATGT
+CATGCGGTATTAGCGTTCGTTTCCAAACGTTATCCCCCTCTGAAAGGCAGGTTGCTCACGCGTT
+>Cluster218;size=1
+TTGGGCCGTGTCTCAGTCCCAATGTGGCCGGCCACCCTCTCAGGTCGGCTACTGATCGTCACCTTGGTAGGCCGTTACCC
+CACCAACTAGCTAATCAGACGCAAGCCCATCTATCAGCGGATTGCTCCTTTTCTAGCTATATCATGCGATACTACTAGCT
+TATGCGGTATTAGCAATGATTTCTCACTGTTATTCCCCTCTGATAGGCAGG
+>Cluster217;size=1
+TTGGGCCGTGTCTCAGTCCCAATGTGGCCGTCCACCCTCTCAGGCCGGCTACTGATCGTCGCCTTGGTGGGCCTTTACCC
+CGCCAACCAGCTAATCAGACGCGAGTCCATCTCAGAGCGATAAATCTTTGATATCCAGAGCCATGCGACCCAGATATATT
+ATGCGGTATTAGCAGCTGTTTCCAGCTGTTATTCCCCATCCAAGGCAGGTT
+>Cluster216;size=1
+CTGGGCCGTGTCTCAGTCCCAGTGTGGCCGTCCGCCCTCTCAGGTCAGCTACTGATCGTCGCCTTGGTAGGCCATTACCC
+TACCAACTAGCTAATCAGACGCGAGGCCATCTCTCAGCGATAAATCTTTGATATATCTGCCATGCGACAAACATATATTA
+TGCGGTATTAGCAGTCGTTTCCAACTGTTGTCCCCCTCTGAAAGGCAGGTT""".split('\n')
+
+expected_abundance_sort_filtered = """>Cluster1;size=52
+CTGGTCCGTGTCTCAGTACCAGTGTGGGGGACCTTCCTCTCAGAACCCCTACGCATCGTCGGCTTGGTGGTCCGTTACAC
+CGCCAACTACCTAATGCGACGCATGCCCATCCGCTACCGGATCGCTCCTTTGGAATCCCGGGGATGTCCCCGGAACTCGT
+TATGCGGTATTAGACGGAATTTCTTCCGCTTATCCCCCTGTAGCGGGCAGGTTGCATACGTGTTACTCACCCGTGCGCCG
+GTCGCCGG
+>Cluster0;size=50
+TTGGACCGTGTCTCAGTTCCAATGTGGGGGACCTTCCTCTCAGAACCCCTATCCATCGAAGACTAGGTGGGCCGTTACCC
+CGCCTACTATCTAATGGAACGCATCCCCATCGTCTACCGGAATACCTTTAATCATGTGAACATGCGGACTCATGATGCCA
+TCTTGTATTAATCTTCCTTTCAGAAGGCTGTCCAAGAGTAGACGGCAGGTTGGATACGTGTTACTCACCCGG
+>Cluster2;size=45
+TTGGGCCGTGTCTCAGTCCCAATGTGGCCGTTCACCCTCTCAGGCCGGCTACTGATCGTCGCCTTGGTGGGCTGTTACCC
+CGCCAACCAGCTAATCAGACGCGGATCCATCGTATACCACCGGAGTTTTTCACACTGCTTCATGCGAAGCTGTGCGCTTA
+TGCGGTATTAGCACCTATTTCTAAGTGTTATCCCCCAGTATACGGCAGGTTCTCCACGCGTT
+>Cluster10;size=43
+CTGGGCCGTGTCTCAGTCCCAATGTGGCCGTCCGCCCTCTCAGGCCGGCTATGCATCATCGTCTTGGTGGGCCTTTACCC
+CGCCAACCAACTAATGCACCGCAGGTCCATCCGCGCCCCATCCCCTAAAGGATGTTTCACAGAAAGAAGATGCCTCCTTC
+CTGTACATCGGGATTTGTTCTCCGTTTCCAGAGCGTATTCCCGGTGCGCGGGCAGGTTCCCTACGTGTTACTCACCCG
+>Cluster4;size=40
+TTGGGCCGTGTCTCAGTCCCAATGTGGCCGTCCACCCTCTCAGGCCGGCTACTGATCGTCGCCTTGGTGGGCCTTTACCC
+CGCCAACCAGCTAATCAGACGCGGGTCCATCTTGCACCACCGGAGTTTTTCACACTGTCCCATGCAGGACCGTGCGCTTA
+TGCGGTATTAGCACCTATTTCTAAGTGTTATCCCCCAGTGCAAGGCAGGTTACCCACGCGTTACTCACCCGTCCG
+>Cluster6;size=40
+CTGGTCCGTGTCTCAGTACCAGTGTGGGGGACCTTCCTCTCAGAACCCCTACGCATCGTCGCCTCGGTGGGCCGTTACCC
+CGCCGACTAGCTAATGCGCCGCATGGCCATCCGCAGCCGATAAATCTTTAAACATCGGGAGATGCCTCCCAACGTTGTTA
+CGCGGTATTAGACGGAATTTCTTCCGCTTATCCCCCTGCTGCGGGCAGGTTCCATACGTGTTACTCACCCGTGCGCCGG""".split('\n')
+
+expected_derep_seqs = """>seq1;size=2
+GCCAACCAGCTAATCAGACGCGGGTCCATCTTGCACCACCGGAGTTTTTCACACTGCTTCATGCGAAGCTGTGCGCTTAA
+>seq2;size=2
+TTGGGCCGTGTCTCAGTCCCAATGTGGCCGTCACCCTCTCAGGCCGGCTACTGATCGTCGCCTTGGTGGGCCTTTACCCC""".split('\n')
+
+expected_derep_seqs_full_len = """>Cluster0;size=1
+GCCAACCAGCTAATCAGACGCGGGTCCATCTTGCACCACCGGAGTTTTTCACACTGCTTCATGCGAAGCTGTGCGCTTAA
+>Cluster1;size=2
+TTGGGCCGTGTCTCAGTCCCAATGTGGCCGTCACCCTCTCAGGCCGGCTACTGATCGTCGCCTTGGTGGGCCTTTACCCC
+>Cluster2;size=1
+GCCAACCAGCTAATCAGACGCGGGTCCATCTTGCACCACCGGAGTTTTTCACACTGCTTCATGCGAAGCTGTGCGCTT""".split('\n')
+
+expected_len_sorted_seqs = """>chimera
+CGCGTGTATGAAGAAGGCCTTCGGGTTGTAAAGTACTTTCAGCGGGGAGGAGGGAGTAAAGTTAATACCTTTGCTCATTGACCCCTAGGGTGGGAATAACCCGGGGAAACCCGGGCTAATACCGAATAAGACCACAGGAGGCGACTCCAGAGGGTCAAAGGGAGCCTTGGCCTCCCCC
+>usearch_ecoli_seq2
+CGCGTGTATGAAGAAGGCCTTCGGGTTGTAAAGTACTTTCAGCGGGGAGGAGGGAGTAAAGTTAATACCTTTGCTCATTGACGTTACCCGCAGAAGAAGCACCGGCTAACTCCGTCCAT
+>usearch_ecoli_seq
+CGCGTGTATGAAGAAGGCCTTCGGGTTGTAAAGTACTTTCAGCGGGGAGGAGGGAGTAAAGTTAATACCTTTGCTCATTGACGTTACCCGCAGAAGAAGCACCGGCTAACTCCGT
+>Solemya_seq2
+GGCTCAGATTGAACGCTGGCGGCATGCCTAACACATGCAAGTCGAACGGTAACAGGCGGAGCTTGCTCTGCGCTGACGAGTGGCGGACGGGTGAGTATCAAG
+>Solemya seq
+GGCTCAGATTGAACGCTGGCGGCATGCCTAACACATGCAAGTCGAACGGTAACAGGCGGAGCTTGCTCTGCGCTGACGAGTGGCGGACGGGTGAGTA""".split('\n')
+
+expected_combined_dna_seqs_1_seqs_usearch = """>uclust_test_seqs_0 some comment0
+AACCCCCACGGTGGATGCCACACGCCCCATACAAAGGGTAGGATGCTTAAGACACATCGCGTCAGGTTTGTGTCAGGCCT
+>uclust_test_seqs_1 some comment1
+ACCCACACGGTGGATGCAACAGATCCCATACACCGAGTTGGATGCTTAAGACGCATCGCGTGAGTTTTGCGTCAAGGCT
+>uclust_test_seqs_2 some comment2
+CCCCCACGGTGGCAGCAACACGTCACATACAACGGGTTGGATTCTAAAGACAAACCGCGTCAAAGTTGTGTCAGAACT
+>uclust_test_seqs_3 some comment3
+CCCCACGGTAGCTGCAACACGTCCCATACCACGGGTAGGATGCTAAAGACACATCGGGTCTGTTTTGTGTCAGGGCT
+>uclust_test_seqs_4 some comment4
+GCCACGGTGGGTACAACACGTCCACTACATCGGCTTGGAAGGTAAAGACACGTCGCGTCAGTATTGCGTCAGGGCT
+>uclust_test_seqs_5 some comment4_again
+CCGCGGTAGGTGCAACACGTCCCATACAACGGGTTGGAAGGTTAAGACACAACGCGTTAATTTTGTGTCAGGGCA
+>uclust_test_seqs_6 some comment6
+CGCGGTGGCTGCAAGACGTCCCATACAACGGGTTGGATGCTTAAGACACATCGCAACAGTTTTGAGTCAGGGCT
+>uclust_test_seqs_7 some comment7
+ACGGTGGCTACAAGACGTCCCATCCAACGGGTTGGATACTTAAGGCACATCACGTCAGTTTTGTGTCAGAGCT
+>uclust_test_seqs_8 some comment8
+CGGTGGCTGCAACACGTGGCATACAACGGGTTGGATGCTTAAGACACATCGCCTCAGTTTTGTGTCAGGGCT
+>uclust_test_seqs_9 some comment9
+GGTGGCTGAAACACATCCCATACAACGGGTTGGATGCTTAAGACACATCGCATCAGTTTTATGTCAGGGGA
+>usearch_ecoli_seq
+CGCGTGTATGAAGAAGGCCTTCGGGTTGTAAAGTACTTTCAGCGGGGAGGAGGGAGTAAAGTTAATACCTTTGCTCATTGACGTTACCCGCAGAAGAAGCACCGGCTAACTCCGT
+>Solemya seq
+GGCTCAGATTGAACGCTGGCGGCATGCCTAACACATGCAAGTCGAACGGTAACAGGCGGAGCTTGCTCTGCGCTGACGAGTGGCGGACGGGTGAGTA
+>usearch_ecoli_seq2
+CGCGTGTATGAAGAAGGCCTTCGGGTTGTAAAGTACTTTCAGCGGGGAGGAGGGAGTAAAGTTAATACCTTTGCTCATTGACGTTACCCGCAGAAGAAGCACCGGCTAACTCCGTCCAT
+>Solemya_seq2
+GGCTCAGATTGAACGCTGGCGGCATGCCTAACACATGCAAGTCGAACGGTAACAGGCGGAGCTTGCTCTGCGCTGACGAGTGGCGGACGGGTGAGTATCAAG
+>chimera
+CGCGTGTATGAAGAAGGCCTTCGGGTTGTAAAGTACTTTCAGCGGGGAGGAGGGAGTAAAGTTAATACCTTTGCTCATTGACCCCTAGGGTGGGAATAACCCGGGGAAACCCGGGCTAATACCGAATAAGACCACAGGAGGCGACTCCAGAGGGTCAAAGGGAGCCTTGGCCTCCCCC""".split('\n')
+
+expected_retained_chimeras_union = """>seq1
+ACAGGCC
+>seq2
+ACAGGCCCCC
+>seq3
+TTATCCATT
+>seq4
+ACAGGCCCCC
+>seq5
+TTATCCATT""".split('\n')
+
+expected_retained_chimeras_intersection = """>seq3
+TTATCCATT""".split('\n')
+
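+# Expected uchime tabular output: each tab-separated record starts with the
+# chimera score and the query label, lists the candidate parent labels and
+# alignment statistics, and ends with the Y/N chimera verdict.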
+expected_usearch61_denovo_uchime_file = """0.0000\tCluster1;size=52\t*\t*\t*\t*\t*\t*\t*\t*\t0\t0\t0\t0\t0\t0\t*\tN
+0.0000\tCluster0;size=50\t*\t*\t*\t*\t*\t*\t*\t*\t0\t0\t0\t0\t0\t0\t*\tN
+0.0000\tCluster2;size=45\t*\t*\t*\t*\t*\t*\t*\t*\t0\t0\t0\t0\t0\t0\t*\tN
+0.0000\tCluster10;size=43\t*\t*\t*\t*\t*\t*\t*\t*\t0\t0\t0\t0\t0\t0\t*\tN
+0.0000\tCluster4;size=40\t*\t*\t*\t*\t*\t*\t*\t*\t0\t0\t0\t0\t0\t0\t*\tN
+0.0000\tCluster6;size=40\t*\t*\t*\t*\t*\t*\t*\t*\t0\t0\t0\t0\t0\t0\t*\tN
+0.0000\tCluster3;size=30\t*\t*\t*\t*\t*\t*\t*\t*\t0\t0\t0\t0\t0\t0\t*\tN
+0.0263\tCluster12;size=19\tCluster2;size=45\tCluster1;size=52\tCluster1;size=52\t75.6\t73.3\t76.5\t67.3\t75.6\t20\t21\t26\t6\t1\t3\t*\tN
+0.0000\tCluster30;size=18\t*\t*\tCluster6;size=40\t*\t*\t*\t*\t*\t0\t0\t0\t0\t0\t0\t*\tN
+0.0924\tCluster29;size=18\tCluster6;size=40\tCluster1;size=52\tCluster1;size=52\t92.0\t88.6\t89.0\t86.5\t88.7\t7\t0\t0\t12\t7\t14\t3.3\tN
+0.0187\tCluster16;size=16\tCluster2;size=45\tCluster4;size=40\tCluster4;size=40\t94.5\t92.3\t94.1\t90.9\t94.0\t2\t1\t0\t9\t4\t7\t0.5\tN
+0.4232\tCluster222;size=1\tCluster4;size=40\tCluster2;size=45\tCluster2;size=45\t100.0\t94.1\t97.3\t91.3\t96.8\t7\t1\t0\t13\t0\t0\t3.2\tY
+0.0759\tCluster221;size=1\tCluster16;size=16\tCluster1;size=52\tCluster16;size=16\t74.5\t75.9\t67.3\t66.8\t75.4\t15\t0\t5\t16\t19\t32\t*\tN
+0.0107\tCluster218;size=1\tCluster2;size=45\tCluster4;size=40\tCluster4;size=40\t81.7\t80.7\t80.7\t90.6\t78.7\t6\t5\t28\t2\t0\t3\t3.0\tN
+0.0086\tCluster217;size=1\tCluster4;size=40\tCluster2;size=45\tCluster4;size=40\t83.1\t83.1\t80.7\t90.8\t82.1\t4\t0\t1\t2\t4\t33\t1.0\tN
+0.0000\tCluster216;size=1\t*\t*\tCluster16;size=16\t*\t*\t*\t*\t*\t0\t0\t0\t0\t0\t0\t*\tN
+0.4232\tCluster522;size=10\tCluster4;size=40\tCluster2;size=45\tCluster2;size=45\t100.0\t94.1\t97.3\t91.3\t96.8\t7\t1\t0\t13\t0\t0\t3.2\tY""".split('\n')
+
+expected_usearch61_ref_uchime_file = """0.0000\tCluster1;size=52\t*\t*\tseq1\t*\t*\t*\t*\t*\t0\t0\t0\t0\t0\t0\t*\tN
+0.0000\tCluster0;size=50\t*\t*\tseq2\t*\t*\t*\t*\t*\t0\t0\t0\t0\t0\t0\t*\tN
+0.0000\tCluster2;size=45\t*\t*\tseq3\t*\t*\t*\t*\t*\t0\t0\t0\t0\t0\t0\t*\tN
+0.1074\tCluster10;size=43\tmixed_seq\tseq1\tseq1\t70.3\t67.0\t65.1\t54.1\t65.7\t11\t0\t1\t31\t27\t33\t4.6\tN
+0.6322\tCluster4;size=40\tmixed_seq\tseq3\tseq3\t96.0\t77.6\t92.5\t73.6\t91.0\t6\t0\t0\t38\t2\t5\t5.0\tY
+0.1101\tCluster6;size=40\tseq2\tseq1\tseq1\t82.6\t71.3\t85.2\t69.6\t85.1\t12\t19\t16\t25\t0\t4\t*\tN
+0.0258\tCluster3;size=30\tmixed_seq\tseq3\tseq3\t71.6\t66.0\t68.0\t71.1\t66.4\t12\t7\t36\t16\t7\t5\t5.3\tN
+0.0263\tCluster12;size=19\tseq3\tseq1\tseq1\t75.6\t73.3\t76.5\t67.3\t75.6\t20\t21\t26\t6\t1\t3\t*\tN
+0.0530\tCluster30;size=18\tseq2\tseq1\tseq1\t79.6\t68.3\t85.7\t70.4\t85.9\t8\t24\t16\t25\t0\t6\t*\tN
+0.0534\tCluster29;size=18\tseq2\tseq1\tseq1\t80.9\t70.4\t88.3\t70.0\t88.7\t7\t25\t17\t23\t0\t2\t*\tN
+0.0699\tCluster16;size=16\tmixed_seq\tseq3\tseq3\t94.0\t74.6\t93.5\t73.6\t91.9\t2\t2\t2\t41\t3\t5\t2.1\tN
+1.2277\tCluster222;size=1\tmixed_seq\tseq3\tseq3\t100.0\t78.4\t97.1\t75.5\t96.8\t6\t1\t0\t44\t0\t0\t3.2\tY
+0.0855\tCluster221;size=1\tseq3\tseq1\tseq3\t75.8\t77.2\t68.8\t65.1\t72.9\t14\t0\t4\t17\t18\t28\t2.9\tN
+0.0174\tCluster218;size=1\tmixed_seq\tseq3\tseq3\t81.7\t70.3\t80.7\t70.3\t78.0\t1\t0\t4\t34\t12\t21\t3.6\tN
+0.0713\tCluster217;size=1\tmixed_seq\tseq3\tseq3\t83.3\t77.5\t79.9\t68.6\t79.7\t4\t0\t1\t27\t12\t17\t3.6\tN
+0.0505\tCluster216;size=1\tmixed_seq\tseq3\tseq3\t77.5\t72.5\t71.6\t70.1\t72.0\t14\t4\t27\t15\t5\t8\t5.4\tN
+1.2277\tCluster522;size=10\tmixed_seq\tseq3\tseq3\t100.0\t78.4\t97.1\t75.5\t96.8\t6\t1\t0\t44\t0\t0\t3.2\tY""".split('\n')
+
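+# Expected .uc (USEARCH cluster format) records: the leading field gives the
+# record type -- S opens a new cluster with the query as seed, H maps a query
+# onto an existing seed, C summarises a finished cluster (third field =
+# cluster size) and N flags a query with no hit against the reference.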
+usearch61_dereplicated_uc_lines = """S 0 80 * * * * * seq2 *
+H 0 80 100.0 * 0 0 * seq3 seq2
+H 0 80 100.0 * 0 0 * seq4 seq2
+S 1 80 * * * * * seq1 *
+C 0 3 * * * * * seq2 *
+C 1 1 * * * * * seq1 *""".split('\n')
+
+usearch61_clustered_uc_lines = """S 0 80 * * * * * seq2;size=3; *
+S 1 80 * * * * * seq1;size=1; *
+C 0 1 * * * * * seq2;size=3; *
+C 1 1 * * * * * seq1;size=1; *""".split('\n')
+
+usearch61_clustered_uc_lines_ref = """H 3 80 100.0 + 0 0 80M seq2;size=3; seq4
+H 0 80 100.0 + 0 0 80M seq1;size=1; seq1""".split('\n')
+
+usearch61_clustered_ref_lines = """H 0 80 100.0 + 0 0 80M seq2;size=3; seq2
+N * * * . * * * seq1;size=1; *""".split('\n')
+
+if __name__ == '__main__':
+ main()
diff --git a/bfillings/tests/test_vsearch.py b/bfillings/tests/test_vsearch.py
new file mode 100644
index 0000000..68c87a6
--- /dev/null
+++ b/bfillings/tests/test_vsearch.py
@@ -0,0 +1,1686 @@
+#!/usr/bin/env python
+
+# -----------------------------------------------------------------------------
+# Copyright (c) 2015--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+# -----------------------------------------------------------------------------
+"""
+Unit tests for the VSEARCH version 1.1.1 Application controller
+===============================================================
+"""
+
+
+from unittest import TestCase, main
+from os import close
+from os.path import exists, join, dirname
+from tempfile import mkstemp, mkdtemp
+from shutil import rmtree
+
+from skbio.util import remove_files
+from skbio.parse.sequences import parse_fasta
+
+from bfillings.vsearch import (vsearch_dereplicate_exact_seqs,
+ vsearch_sort_by_abundance,
+ vsearch_chimera_filter_de_novo,
+ vsearch_chimera_filter_ref)
+
+
+# Test class and cases
+class VsearchTests(TestCase):
+ """ Tests for VSEARCH version 1.1.1 functionality """
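+    # NOTE: these tests shell out to an external vsearch executable through
+    # the burrito application controller, so a vsearch binary (version 1.1.1
+    # is assumed) must be installed and on PATH for them to run.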
+
+ def setUp(self):
+ self.output_dir = mkdtemp()
+ self.seqs_to_derep = seqs_to_derep
+ self.seqs_to_derep_max_min_abundance =\
+ seqs_to_derep_max_min_abundance
+ self.seqs_to_derep_merged_derep_files =\
+ seqs_to_derep_merged_derep_files
+ self.seqs_to_sort = seqs_to_sort
+ self.amplicon_reads = amplicon_reads
+ self.single_chimera = single_chimera
+ self.single_chimera_ref = single_chimera_ref
+ self.uchime_ref_db = uchime_ref_db
+ self.uchime_single_ref_db = uchime_single_ref_db
+
+ # temporary file for seqs_to_derep
+ f, self.seqs_to_derep_fp = mkstemp(prefix='tmp_seqs_to_derep_',
+ suffix='.fasta')
+ close(f)
+
+ # write seqs_to_derep to file
+ with open(self.seqs_to_derep_fp, 'w') as tmp:
+ tmp.write(self.seqs_to_derep)
+
+ # temporary file for seqs_to_derep_max_min_abundance
+ f, self.seqs_to_derep_max_min_abundance_fp =\
+ mkstemp(prefix='tmp_seqs_to_derep_abun_',
+ suffix='.fasta')
+ close(f)
+
+ # write seqs_to_derep_max_min_abundance to file
+ with open(self.seqs_to_derep_max_min_abundance_fp, 'w') as tmp:
+ tmp.write(self.seqs_to_derep_max_min_abundance)
+
+ # temporary file for seqs_to_derep_merged_derep_files
+ f, self.seqs_to_derep_merged_derep_files_fp =\
+ mkstemp(prefix='tmp_seqs_to_derep_concat_',
+ suffix='.fasta')
+ close(f)
+
+ # write seqs_to_derep_merged_derep_files to file
+ with open(self.seqs_to_derep_merged_derep_files_fp, 'w') as tmp:
+ tmp.write(self.seqs_to_derep_merged_derep_files)
+
+ # temporary file for seqs_to_sort
+ f, self.seqs_to_sort_fp = mkstemp(prefix='tmp_seqs_to_sort_',
+ suffix='.fasta')
+ close(f)
+
+ # write seqs_to_sort to file
+ with open(self.seqs_to_sort_fp, 'w') as tmp:
+ tmp.write(self.seqs_to_sort)
+
+ # temporary file for amplicon_reads
+ f, self.amplicon_reads_fp = mkstemp(prefix='tmp_amplicon_reads_',
+ suffix='.fasta')
+ close(f)
+
+ # write amplicon_reads to file
+ with open(self.amplicon_reads_fp, 'w') as tmp:
+ tmp.write(self.amplicon_reads)
+
+ # temporary file for single_chimera
+ f, self.single_chimera_fp = mkstemp(prefix='tmp_single_chimera_',
+ suffix='.fasta')
+ close(f)
+
+ # write single_chimera to file
+ # (de novo chimera checking)
+ with open(self.single_chimera_fp, 'w') as tmp:
+ tmp.write(self.single_chimera)
+
+ # temporary file for single_chimera_ref
+ f, self.single_chimera_ref_fp = mkstemp(prefix='tmp_single_chimera_',
+ suffix='.fasta')
+ close(f)
+
+ # write single_chimera_ref to file
+ # (reference chimera checking)
+ with open(self.single_chimera_ref_fp, 'w') as tmp:
+ tmp.write(self.single_chimera_ref)
+
+ # temporary file for uchime_ref_db
+ f, self.uchime_ref_db_fp = mkstemp(prefix='tmp_uchime_ref_db_',
+ suffix='.fasta')
+ close(f)
+
+ # write uchime_ref_db to file
+ with open(self.uchime_ref_db_fp, 'w') as tmp:
+ tmp.write(self.uchime_ref_db)
+
+ # temporary file for uchime_single_ref_db
+ f, self.uchime_single_ref_db_fp =\
+ mkstemp(prefix='tmp_uchime_single_ref_db_',
+ suffix='.fasta')
+ close(f)
+
+ # write uchime_single_ref_db to file
+ with open(self.uchime_single_ref_db_fp, 'w') as tmp:
+ tmp.write(self.uchime_single_ref_db)
+
+ # list of files to remove
+ self.files_to_remove = [self.seqs_to_derep_fp,
+ self.seqs_to_derep_max_min_abundance_fp,
+ self.seqs_to_derep_merged_derep_files_fp,
+ self.seqs_to_sort_fp,
+ self.amplicon_reads_fp,
+ self.single_chimera_fp,
+ self.single_chimera_ref_fp,
+ self.uchime_ref_db_fp,
+ self.uchime_single_ref_db_fp]
+
+ def tearDown(self):
+ remove_files(self.files_to_remove)
+ rmtree(self.output_dir)
+
+ def test_vsearch_chimera_filter_ref(self):
+ """ Test reference chimera filter, output only
+ chimeric sequences and log
+ """
+ chimeras_fp, nonchimeras_fp, alns_fp, tabular_fp, log_fp =\
+ vsearch_chimera_filter_ref(
+ self.amplicon_reads_fp,
+ self.output_dir,
+ self.uchime_ref_db_fp,
+ output_chimeras=True,
+ output_nonchimeras=False,
+ output_alns=False,
+ output_tabular=False,
+ log_name="vsearch_uchime_ref_chimera_filtering.log",
+ HALT_EXEC=False)
+
+ self.assertTrue(nonchimeras_fp is None)
+ self.assertTrue(alns_fp is None)
+ self.assertTrue(tabular_fp is None)
+ self.assertTrue(exists(log_fp))
+
+ expected_chimeras = ['251;size=2;', '320;size=2;', '36;size=2;',
+ '672;size=2;', '142;size=1;', '201;size=1;',
+ '241;size=1;', '279;size=1;', '299;size=1;',
+ '359;size=1;', '375;size=1;', '407;size=1;',
+ '423;size=1;', '516;size=1;', '618;size=1;',
+ '717;size=1;', '902;size=1;', '918;size=1;',
+ '941;size=1;']
+
+ num_seqs = 0
+
+ with open(chimeras_fp, "U") as chimeras_f:
+ for label, seq in parse_fasta(chimeras_f):
+ # check label represents chimeric sequence
+ self.assertTrue(label in expected_chimeras)
+ # check sequence exists
+ self.assertTrue(len(seq) > 0)
+ num_seqs += 1
+
+        self.assertEquals(num_seqs, 19)
+
+ def test_vsearch_chimera_filter_ref_output(self):
+ """ Raise error when no output is selected for
+ reference chimera filtering
+ """
+
+ self.assertRaises(ValueError,
+ vsearch_chimera_filter_ref,
+ fasta_filepath=self.amplicon_reads_fp,
+ working_dir=self.output_dir,
+ db_filepath=self.uchime_ref_db_fp,
+ output_chimeras=False,
+ output_nonchimeras=False,
+ output_alns=False,
+ output_tabular=False,
+ log_name="vsearch_uchime_ref_chimera_filtering.log",
+ HALT_EXEC=False)
+
+ def test_vsearch_chimera_filter_ref_output_nonchimeras(self):
+ """ Test ref chimera filter, output nonchimeric sequences
+ """
+ chimeras_fp, nonchimeras_fp, alns_fp, tabular_fp, log_fp =\
+ vsearch_chimera_filter_ref(
+ self.amplicon_reads_fp,
+ self.output_dir,
+ self.uchime_ref_db_fp,
+ output_chimeras=False,
+ output_nonchimeras=True,
+ output_alns=False,
+ output_tabular=False,
+ log_name="vsearch_uchime_ref_chimera_filtering.log",
+ HALT_EXEC=False)
+
+ self.assertTrue(chimeras_fp is None)
+ self.assertTrue(alns_fp is None)
+ self.assertTrue(tabular_fp is None)
+ self.assertTrue(exists(log_fp))
+
+ expected_nonchimeras =\
+ ['3;size=102;', '16;size=95;', '22;size=93;', '2;size=87;', '39;size=84;',
+ '4;size=79;', '6;size=72;', '11;size=70;', '45;size=67;', '1;size=65;',
+ '425;size=2;', '100;size=1;', '102;size=1;', '10;size=1;', '115;size=1;',
+ '123;size=1;', '132;size=1;', '134;size=1;', '140;size=1;', '144;size=1;',
+ '148;size=1;', '14;size=1;', '156;size=1;', '15;size=1;', '161;size=1;',
+ '162;size=1;', '186;size=1;', '203;size=1;', '217;size=1;', '218;size=1;',
+ '21;size=1;', '221;size=1;', '222;size=1;', '225;size=1;', '233;size=1;',
+ '234;size=1;', '235;size=1;', '249;size=1;', '24;size=1;', '259;size=1;',
+ '266;size=1;', '26;size=1;', '27;size=1;', '296;size=1;', '303;size=1;',
+ '306;size=1;', '307;size=1;', '322;size=1;', '326;size=1;', '32;size=1;',
+ '332;size=1;', '333;size=1;', '338;size=1;', '360;size=1;', '362;size=1;',
+ '364;size=1;', '366;size=1;', '369;size=1;', '371;size=1;', '373;size=1;',
+ '374;size=1;', '37;size=1;', '386;size=1;', '387;size=1;', '392;size=1;',
+ '393;size=1;', '397;size=1;', '405;size=1;', '414;size=1;', '418;size=1;',
+ '431;size=1;', '436;size=1;', '444;size=1;', '445;size=1;', '456;size=1;',
+ '460;size=1;', '469;size=1;', '470;size=1;', '477;size=1;', '479;size=1;',
+ '486;size=1;', '500;size=1;', '515;size=1;', '528;size=1;', '530;size=1;',
+ '531;size=1;', '549;size=1;', '551;size=1;', '557;size=1;', '559;size=1;',
+ '561;size=1;', '562;size=1;', '564;size=1;', '566;size=1;', '568;size=1;',
+ '570;size=1;', '578;size=1;', '57;size=1;', '586;size=1;', '596;size=1;',
+ '600;size=1;', '612;size=1;', '625;size=1;', '632;size=1;', '649;size=1;',
+ '650;size=1;', '651;size=1;', '664;size=1;', '66;size=1;', '673;size=1;',
+ '675;size=1;', '682;size=1;', '690;size=1;', '699;size=1;', '709;size=1;',
+ '73;size=1;', '740;size=1;', '745;size=1;', '746;size=1;', '748;size=1;',
+ '760;size=1;', '766;size=1;', '778;size=1;', '77;size=1;', '791;size=1;',
+ '797;size=1;', '7;size=1;', '809;size=1;', '813;size=1;', '814;size=1;',
+ '816;size=1;', '817;size=1;', '821;size=1;', '824;size=1;', '827;size=1;',
+ '82;size=1;', '83;size=1;', '842;size=1;', '851;size=1;', '853;size=1;',
+ '862;size=1;', '863;size=1;', '866;size=1;', '871;size=1;', '879;size=1;',
+ '886;size=1;', '892;size=1;', '895;size=1;', '897;size=1;', '904;size=1;',
+ '912;size=1;', '916;size=1;', '91;size=1;', '920;size=1;', '921;size=1;',
+ '925;size=1;', '930;size=1;', '942;size=1;', '945;size=1;', '947;size=1;',
+ '948;size=1;', '952;size=1;', '956;size=1;', '958;size=1;', '964;size=1;',
+ '967;size=1;', '984;size=1;', '992;size=1;', '993;size=1;']
+
+ num_seqs = 0
+
+ # check nonchimeras fasta file
+ with open(nonchimeras_fp, "U") as nonchimeras_f:
+ for label, seq in parse_fasta(nonchimeras_f):
+                # check label represents a nonchimeric sequence
+ self.assertTrue(label in expected_nonchimeras)
+ # check sequence exists
+ self.assertTrue(len(seq) > 0)
+ num_seqs += 1
+
+        self.assertEquals(num_seqs, 169)
+
+ def test_vsearch_chimera_filter_ref_output_alns_tab(self):
+ """ Test ref chimera filter, output only
+ chimeric alignments and tabular format
+ """
+ chimeras_fp, nonchimeras_fp, alns_fp, tabular_fp, log_fp =\
+ vsearch_chimera_filter_ref(
+ self.single_chimera_ref_fp,
+ self.output_dir,
+ self.uchime_single_ref_db_fp,
+ output_chimeras=False,
+ output_nonchimeras=False,
+ output_alns=True,
+ output_tabular=True,
+ log_name="vsearch_uchime_ref_chimera_filtering.log",
+ HALT_EXEC=False)
+
+ self.assertTrue(chimeras_fp is None)
+ self.assertTrue(nonchimeras_fp is None)
+ self.assertTrue(exists(log_fp))
+
+ # check alignment is correct
+ with open(alns_fp, 'U') as alns_f:
+ actual_alns = alns_f.read()
+ self.assertEquals(single_chimera_ref_aln, actual_alns)
+
+ # check tabular output is correct
+ with open(tabular_fp, 'U') as tabular_f:
+ actual_tab = tabular_f.read()
+
+ self.assertEquals(single_chimera_ref_tab, actual_tab)
+
+ def test_vsearch_chimera_filter_de_novo(self):
+ """ Test de novo chimera filter, output only
+ chimeric sequences
+ """
+ chimeras_fp, nonchimeras_fp, alns_fp, tabular_fp, log_fp =\
+ vsearch_chimera_filter_de_novo(
+ self.amplicon_reads_fp,
+ self.output_dir,
+ output_chimeras=True,
+ output_nonchimeras=False,
+ output_alns=False,
+ output_tabular=False,
+ log_name="vsearch_uchime_de_novo_chimera_filtering.log",
+ HALT_EXEC=False)
+
+ self.assertTrue(nonchimeras_fp is None)
+ self.assertTrue(alns_fp is None)
+ self.assertTrue(tabular_fp is None)
+ self.assertTrue(exists(log_fp))
+
+ expected_chimeras = ['251;size=2;', '320;size=2;', '36;size=2;',
+ '672;size=2;', '142;size=1;', '201;size=1;',
+ '241;size=1;', '279;size=1;', '299;size=1;',
+ '359;size=1;', '375;size=1;', '407;size=1;',
+ '423;size=1;', '516;size=1;', '618;size=1;',
+ '717;size=1;', '902;size=1;', '918;size=1;',
+ '941;size=1;']
+
+ num_seqs = 0
+
+ with open(chimeras_fp, "U") as chimeras_f:
+ for label, seq in parse_fasta(chimeras_f):
+ # check label represents chimeric sequence
+ self.assertTrue(label in expected_chimeras)
+ # check sequence exists
+ self.assertTrue(len(seq) > 0)
+ num_seqs += 1
+
+        self.assertEquals(num_seqs, 19)
+
+ def test_vsearch_chimera_filter_de_novo_output(self):
+ """ Raise error when no output is selected for
+ de novo chimera filtering
+ """
+
+ self.assertRaises(ValueError,
+ vsearch_chimera_filter_de_novo,
+ fasta_filepath=self.amplicon_reads_fp,
+ working_dir=self.output_dir,
+ output_chimeras=False,
+ output_nonchimeras=False,
+ output_alns=False,
+ output_tabular=False,
+ log_name="vsearch_uchime_de_novo_chimera_filter.log",
+ HALT_EXEC=False)
+
+ def test_vsearch_chimera_filter_de_novo_output_nonchimeras(self):
+ """ Test de novo chimera filter, output nonchimeric sequences
+ """
+ chimeras_fp, nonchimeras_fp, alns_fp, tabular_fp, log_fp =\
+ vsearch_chimera_filter_de_novo(
+ self.amplicon_reads_fp,
+ self.output_dir,
+ output_chimeras=False,
+ output_nonchimeras=True,
+ output_alns=False,
+ output_tabular=False,
+ log_name="vsearch_uchime_de_novo_chimera_filter.log",
+ HALT_EXEC=False)
+
+ self.assertTrue(chimeras_fp is None)
+ self.assertTrue(alns_fp is None)
+ self.assertTrue(tabular_fp is None)
+ self.assertTrue(exists(log_fp))
+
+ expected_nonchimeras =\
+ ['3;size=102;', '16;size=95;', '22;size=93;', '2;size=87;', '39;size=84;',
+ '4;size=79;', '6;size=72;', '11;size=70;', '45;size=67;', '1;size=65;',
+ '425;size=2;', '100;size=1;', '102;size=1;', '10;size=1;', '115;size=1;',
+ '123;size=1;', '132;size=1;', '134;size=1;', '140;size=1;', '144;size=1;',
+ '148;size=1;', '14;size=1;', '156;size=1;', '15;size=1;', '161;size=1;',
+ '162;size=1;', '186;size=1;', '203;size=1;', '217;size=1;', '218;size=1;',
+ '21;size=1;', '221;size=1;', '222;size=1;', '225;size=1;', '233;size=1;',
+ '234;size=1;', '235;size=1;', '249;size=1;', '24;size=1;', '259;size=1;',
+ '266;size=1;', '26;size=1;', '27;size=1;', '296;size=1;', '303;size=1;',
+ '306;size=1;', '307;size=1;', '322;size=1;', '326;size=1;', '32;size=1;',
+ '332;size=1;', '333;size=1;', '338;size=1;', '360;size=1;', '362;size=1;',
+ '364;size=1;', '366;size=1;', '369;size=1;', '371;size=1;', '373;size=1;',
+ '374;size=1;', '37;size=1;', '386;size=1;', '387;size=1;', '392;size=1;',
+ '393;size=1;', '397;size=1;', '405;size=1;', '414;size=1;', '418;size=1;',
+ '431;size=1;', '436;size=1;', '444;size=1;', '445;size=1;', '456;size=1;',
+ '460;size=1;', '469;size=1;', '470;size=1;', '477;size=1;', '479;size=1;',
+ '486;size=1;', '500;size=1;', '515;size=1;', '528;size=1;', '530;size=1;',
+ '531;size=1;', '549;size=1;', '551;size=1;', '557;size=1;', '559;size=1;',
+ '561;size=1;', '562;size=1;', '564;size=1;', '566;size=1;', '568;size=1;',
+ '570;size=1;', '578;size=1;', '57;size=1;', '586;size=1;', '596;size=1;',
+ '600;size=1;', '612;size=1;', '625;size=1;', '632;size=1;', '649;size=1;',
+ '650;size=1;', '651;size=1;', '664;size=1;', '66;size=1;', '673;size=1;',
+ '675;size=1;', '682;size=1;', '690;size=1;', '699;size=1;', '709;size=1;',
+ '73;size=1;', '740;size=1;', '745;size=1;', '746;size=1;', '748;size=1;',
+ '760;size=1;', '766;size=1;', '778;size=1;', '77;size=1;', '791;size=1;',
+ '797;size=1;', '7;size=1;', '809;size=1;', '813;size=1;', '814;size=1;',
+ '816;size=1;', '817;size=1;', '821;size=1;', '824;size=1;', '827;size=1;',
+ '82;size=1;', '83;size=1;', '842;size=1;', '851;size=1;', '853;size=1;',
+ '862;size=1;', '863;size=1;', '866;size=1;', '871;size=1;', '879;size=1;',
+ '886;size=1;', '892;size=1;', '895;size=1;', '897;size=1;', '904;size=1;',
+ '912;size=1;', '916;size=1;', '91;size=1;', '920;size=1;', '921;size=1;',
+ '925;size=1;', '930;size=1;', '942;size=1;', '945;size=1;', '947;size=1;',
+ '948;size=1;', '952;size=1;', '956;size=1;', '958;size=1;', '964;size=1;',
+ '967;size=1;', '984;size=1;', '992;size=1;', '993;size=1;']
+
+ num_seqs = 0
+
+ # check nonchimeras fasta file
+ with open(nonchimeras_fp, "U") as nonchimeras_f:
+ for label, seq in parse_fasta(nonchimeras_f):
+                # check label represents a nonchimeric sequence
+ self.assertTrue(label in expected_nonchimeras)
+ # check sequence exists
+ self.assertTrue(len(seq) > 0)
+ num_seqs += 1
+
+        self.assertEquals(num_seqs, 169)
+
+ def test_vsearch_chimera_filter_de_novo_output_alns_tab(self):
+ """ Test de novo chimera filter, output only
+ chimeric alignments and tabular format
+ """
+ chimeras_fp, nonchimeras_fp, alns_fp, tabular_fp, log_fp =\
+ vsearch_chimera_filter_de_novo(
+ self.single_chimera_fp,
+ self.output_dir,
+ output_chimeras=False,
+ output_nonchimeras=False,
+ output_alns=True,
+ output_tabular=True,
+ log_name="vsearch_uchime_de_novo_chimera_filter.log",
+ HALT_EXEC=False)
+
+ self.assertTrue(chimeras_fp is None)
+ self.assertTrue(nonchimeras_fp is None)
+ self.assertTrue(exists(log_fp))
+
+ # check alignment is correct
+ with open(alns_fp, 'U') as alns_f:
+ actual_alns = alns_f.read()
+ self.assertEquals(single_chimera_aln, actual_alns)
+
+ # check tabular output is correct
+ with open(tabular_fp, 'U') as tabular_f:
+ actual_tab = tabular_f.read()
+ self.assertEquals(single_chimera_tab, actual_tab)
+
+ def test_vsearch_sort_by_abundance(self):
+ """ Test sorting sequences by abundance
+ """
+ tmp_fp = join(self.output_dir, "tmp_sorted_reads.fasta")
+
+ output_sorted, log_fp = vsearch_sort_by_abundance(
+ self.seqs_to_sort_fp,
+ tmp_fp,
+ working_dir=None,
+ minsize=None,
+ maxsize=None,
+ log_name="abundance_sort.log",
+ HALT_EXEC=False)
+
+ self.assertTrue(exists(log_fp))
+
+ expected_order = ['HWI-ST157_0368:1:2107:19923:3944#0/1;size=100;',
+ 'HWI-ST157_0368:1:1201:8401:113582#0/1;size=10;',
+ 'HWI-ST157_0368:1:2204:20491:181552#0/1;size=10;',
+ 'HWI-ST157_0368:1:2105:3428:36721#0/1;size=5;',
+ 'HWI-ST157_0368:1:2105:6731:137157#0/1;size=4;',
+ 'HWI-ST157_0368:1:2106:18272:88408#0/1;size=2;',
+ 'HWI-ST157_0368:1:1106:12200:200135#0/1;size=1;',
+ 'HWI-ST157_0368:1:2208:9135:145970#0/1;size=1;']
+
+ num_seqs = 0
+
+ with open(output_sorted, "U") as tmp_f:
+ for label, seq in parse_fasta(tmp_f):
+ # check label is in correct order
+ self.assertEquals(label, expected_order[num_seqs])
+ # check sequence exists
+ self.assertTrue(len(seq) > 0)
+ num_seqs += 1
+
+        self.assertEquals(num_seqs, 8)
+
+    def test_vsearch_sort_by_abundance_minsize_2_maxsize_10(self):
+        """ Test sorting sequences by abundance,
+            discarding sequences with an abundance value smaller
+            than 2 or greater than 10
+        """
+ tmp_fp = join(self.output_dir, "tmp_sorted_reads.fasta")
+
+ output_sorted, log_fp = vsearch_sort_by_abundance(
+ self.seqs_to_sort_fp,
+ tmp_fp,
+ working_dir=None,
+ minsize=2,
+ maxsize=10,
+ log_name="abundance_sort.log",
+ HALT_EXEC=False)
+
+ self.assertTrue(exists(log_fp))
+
+ expected_order = ['HWI-ST157_0368:1:1201:8401:113582#0/1;size=10;',
+ 'HWI-ST157_0368:1:2204:20491:181552#0/1;size=10;',
+ 'HWI-ST157_0368:1:2105:3428:36721#0/1;size=5;',
+ 'HWI-ST157_0368:1:2105:6731:137157#0/1;size=4;',
+ 'HWI-ST157_0368:1:2106:18272:88408#0/1;size=2;']
+
+ num_seqs = 0
+
+ with open(output_sorted, "U") as tmp_f:
+ for label, seq in parse_fasta(tmp_f):
+ # check label is in correct order
+ self.assertEquals(label, expected_order[num_seqs])
+ # check sequence exists
+ self.assertTrue(len(seq) > 0)
+ num_seqs += 1
+
+        self.assertEquals(num_seqs, 5)
+
+ def test_vsearch_dereplicate_exact_seqs(self):
+ """ Test dereplicating sequences
+ """
+ tmp_fp = join(self.output_dir, "tmp_derep_reads.fasta")
+
+ dereplicated_seqs_fp, uc_fp, log_fp = vsearch_dereplicate_exact_seqs(
+ self.seqs_to_derep_fp,
+ tmp_fp,
+ output_uc=False,
+ working_dir=self.output_dir,
+ strand="both",
+ maxuniquesize=None,
+ minuniquesize=None,
+ sizein=False,
+ sizeout=True)
+
+ # no output for .uc
+ self.assertTrue(uc_fp is None)
+ self.assertTrue(exists(log_fp))
+
+ num_seqs = 0
+ expected_derep = ['HWI-ST157_0368:1:1207:16180:126921#0/1;size=3;',
+ 'HWI-ST157_0368:1:2103:7895:197066#0/1;size=3;',
+ 'HWI-ST157_0368:1:1106:11378:83198#0/1;size=1;',
+ 'HWI-ST157_0368:1:2102:15078:69955#0/1;size=1;']
+
+ with open(tmp_fp, "U") as tmp_f:
+ for label, seq in parse_fasta(tmp_f):
+ num_seqs += 1
+ # check output labels are correct
+ self.assertTrue(label in expected_derep)
+ # check sequence exists
+ self.assertTrue(len(seq) > 0)
+
+ # check there are 4 sequences after dereplication
+ self.assertEquals(num_seqs, 4)
+
+ def test_vsearch_dereplicate_exact_seqs_uc(self):
+ """ Test dereplicating sequences with .uc output
+ """
+ tmp_fp = join(self.output_dir, "tmp_derep_reads.fasta")
+
+ dereplicated_seqs_fp, uc_fp, log_fp = vsearch_dereplicate_exact_seqs(
+ self.seqs_to_derep_fp,
+ tmp_fp,
+ output_uc=True,
+ working_dir=self.output_dir,
+ strand="both",
+ maxuniquesize=None,
+ minuniquesize=None,
+ sizein=False,
+ sizeout=True)
+
+ # .uc exists
+ self.assertTrue(exists(uc_fp))
+ self.assertTrue(exists(log_fp))
+
+ id_to_count = {}
+
+ num_seqs = 0
+ expected_derep = {'HWI-ST157_0368:1:1207:16180:126921#0/1': 3,
+ 'HWI-ST157_0368:1:2103:7895:197066#0/1': 3,
+ 'HWI-ST157_0368:1:1106:11378:83198#0/1': 1,
+ 'HWI-ST157_0368:1:2102:15078:69955#0/1': 1}
+
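+        # walk the .uc records: an S line seeds a new cluster (tab field 9,
+        # index 8, holds its label) and an H line attaches a duplicate to its
+        # seed (tab field 10, index 9), so the tally reconstructs per-seed
+        # abundances for comparison against expected_derep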
+ with open(uc_fp, 'U') as uc_f:
+ for line in uc_f:
+ if line.startswith('S'):
+ num_seqs += 1
+ label = line.strip().split('\t')[8]
+ # check output labels are correct
+ self.assertTrue(label in expected_derep)
+ id_to_count[label] = 1
+ elif line.startswith('H'):
+ seed = line.strip().split('\t')[9]
+ id_to_count[seed] += 1
+
+ # check there are 4 sequences after dereplication
+ self.assertEquals(num_seqs, 4)
+
+ for label in id_to_count:
+ self.assertEquals(expected_derep[label], id_to_count[label])
+
+ def test_vsearch_dereplicate_exact_seqs_empty_working_dir(self):
+ """ Test dereplicating sequences without passing
+ a working directory
+ """
+ tmp_fp = join(self.output_dir, "tmp_derep_reads.fasta")
+
+ dereplicated_seqs_fp, uc_fp, log_fp = vsearch_dereplicate_exact_seqs(
+ self.seqs_to_derep_fp,
+ tmp_fp,
+ output_uc=True,
+ working_dir=None,
+ strand="both",
+ maxuniquesize=None,
+ minuniquesize=None,
+ sizein=False,
+ sizeout=True)
+
+ self.assertTrue(exists(log_fp))
+
+ # check dereplicated seqs and uc file in the same
+ # directory (same path as tmp_fp)
+ self.assertEquals(dirname(tmp_fp), dirname(dereplicated_seqs_fp))
+ self.assertEquals(dirname(tmp_fp), dirname(uc_fp))
+
+ def test_vsearch_dereplicate_exact_seqs_abundance(self):
+        """ Test dereplicating sequences and discarding those with
+        abundance < 2 or abundance > 6
+ """
+ tmp_fp = join(self.output_dir, "tmp_derep_reads.fasta")
+
+ dereplicated_seqs_fp, uc_fp, log_fp = vsearch_dereplicate_exact_seqs(
+ self.seqs_to_derep_max_min_abundance_fp,
+ tmp_fp,
+ output_uc=False,
+ working_dir=self.output_dir,
+ strand="both",
+ maxuniquesize=6,
+ minuniquesize=2,
+ sizein=False,
+ sizeout=True)
+
+ # no output for .uc
+ self.assertTrue(uc_fp is None)
+ self.assertTrue(exists(log_fp))
+
+ num_seqs = 0
+ expected_derep = ['HWI-ST157_0368:1:1106:10560:153880#0/1;size=6;',
+ 'HWI-ST157_0368:1:2103:12440:90119#0/1;size=2;',
+ 'HWI-ST157_0368:1:1106:15269:103850#0/1;size=3;',
+ 'HWI-ST157_0368:1:1205:9745:86166#0/1;size=5;']
+
+ with open(tmp_fp, "U") as tmp_f:
+ for label, seq in parse_fasta(tmp_f):
+ num_seqs += 1
+ # check output labels are correct
+ self.assertTrue(label in expected_derep)
+ # check sequence exists
+ self.assertTrue(len(seq) > 0)
+
+ # check there are 4 sequences after dereplication
+ self.assertEquals(num_seqs, 4)
+
+ def test_vsearch_dereplicate_exact_seqs_merged(self):
+ """ Test dereplicating sequences which already contain
+ abundance information in the label from previous
+        dereplication (e.g. two dereplicated files have been
+ merged into a new file for dereplication)
+ """
+ tmp_fp = join(self.output_dir, "tmp_derep_reads.fasta")
+
+ dereplicated_seqs_fp, uc_fp, log_fp = vsearch_dereplicate_exact_seqs(
+ self.seqs_to_derep_merged_derep_files_fp,
+ tmp_fp,
+ output_uc=False,
+ working_dir=self.output_dir,
+ strand="both",
+ maxuniquesize=None,
+ minuniquesize=None,
+ sizein=True,
+ sizeout=True)
+
+ # no output for .uc
+ self.assertTrue(uc_fp is None)
+ self.assertTrue(exists(log_fp))
+
+ num_seqs = 0
+ expected_derep = ['HWI-ST157_0368:1:1207:16180:126921#0/1;size=6;',
+ 'HWI-ST157_0368:1:2103:7895:197066#0/1;size=6;',
+ 'HWI-ST157_0368:1:1106:11378:83198#0/1;size=2;',
+ 'HWI-ST157_0368:1:2102:15078:69955#0/1;size=2;']
+
+ with open(tmp_fp, "U") as tmp_f:
+ for label, seq in parse_fasta(tmp_f):
+ num_seqs += 1
+ # check output labels are correct
+ self.assertTrue(label in expected_derep)
+ # check sequence exists
+ self.assertTrue(len(seq) > 0)
+
+ # check there are 4 sequences after dereplication
+ self.assertEquals(num_seqs, 4)
+
+ def test_vsearch_dereplicate_exact_seqs_strand(self):
+ """ Raise error when strand parameter is something
+ other than 'plus' or 'both'
+ """
+ tmp_fp = join(self.output_dir, "tmp_derep_reads.fasta")
+
+ self.assertRaises(ValueError,
+ vsearch_dereplicate_exact_seqs,
+ fasta_filepath=self.seqs_to_derep_fp,
+ output_filepath=tmp_fp,
+ output_uc=False,
+ working_dir=None,
+ strand="minus",
+ maxuniquesize=None,
+ minuniquesize=None,
+ sizein=False,
+ sizeout=True,
+ log_name="derep.log",
+ HALT_EXEC=False)
+
+
+# Test dereplicating sequences using default parameters
+seqs_to_derep = """>HWI-ST157_0368:1:2102:15078:69955#0/1
+TACGTAGGGCCCGAGCGTTGTCCGGATTTATTGGGCGTAAAGCGTGCGCAGGCGGTCTGTTAAGTCTGTAGTTAAAGGCTGTGGCTCAACTATGGTTAGTT
+>HWI-ST157_0368:1:2103:7895:197066#0/1
+TACGTAGGGGGCAAGCGTTGTCCGAATTTACTGGGTGTAAAGGGAGCGCAGACGGCACGGCAAGCCAGATGTGAAAGCCCGGGGCTCAACCCCGGGACTGC
+>HWI-ST157_0368:1:1207:16180:126921#0/1
+TACGTAGGTCCCGAGCGTTGTCCGGATTTATTGGGCGTAAAGCGAGCGCAGGCGGATACTTAAGTCTGGTGTGAAAACCTAGGGCTCAACCCTGGGACTGC
+>HWI-ST157_0368:1:1106:11378:83198#0/1
+TACGTAGGGAGCAAGCGTTGTCCGGATTTACTGGGTGTAAAGGGTGCGTAGGCGGCTTTGCAAGTCAGATGTGAAATCTATGGGCTCAACCCATAAACTGC
+>HWI-ST157_0368:1:2103:7895:197066#0/2
+TACGTAGGGGGCAAGCGTTGTCCGAATTTACTGGGTGTAAAGGGAGCGCAGACGGCACGGCAAGCCAGATGTGAAAGCCCGGGGCTCAACCCCGGGACTGC
+>HWI-ST157_0368:1:2103:7895:197066#0/3
+TACGTAGGGGGCAAGCGTTGTCCGAATTTACTGGGTGTAAAGGGAGCGCAGACGGCACGGCAAGCCAGATGTGAAAGCCCGGGGCTCAACCCCGGGACTGC
+>HWI-ST157_0368:1:1207:16180:126921#0/2
+TACGTAGGTCCCGAGCGTTGTCCGGATTTATTGGGCGTAAAGCGAGCGCAGGCGGATACTTAAGTCTGGTGTGAAAACCTAGGGCTCAACCCTGGGACTGC
+>HWI-ST157_0368:1:1207:16180:126921#0/3
+TACGTAGGTCCCGAGCGTTGTCCGGATTTATTGGGCGTAAAGCGAGCGCAGGCGGATACTTAAGTCTGGTGTGAAAACCTAGGGCTCAACCCTGGGACTGC
+"""
+
+# Test dereplicating a file which is a concatenation of two separately
+# dereplicated files. The input fasta file contains abundance information.
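+# With sizein=True the existing ;size=N; annotations are summed into the new
+# cluster sizes instead of each record counting as one, which is why the
+# corresponding test expects the abundances below to double.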
+seqs_to_derep_merged_derep_files = """>HWI-ST157_0368:1:2102:15078:69955#0/1;size=1;
+TACGTAGGGCCCGAGCGTTGTCCGGATTTATTGGGCGTAAAGCGTGCGCAGGCGGTCTGTTAAGTCTGTAGTTAAAGGCTGTGGCTCAACTATGGTTAGTT
+>HWI-ST157_0368:1:2103:7895:197066#0/1;size=3;
+TACGTAGGGGGCAAGCGTTGTCCGAATTTACTGGGTGTAAAGGGAGCGCAGACGGCACGGCAAGCCAGATGTGAAAGCCCGGGGCTCAACCCCGGGACTGC
+>HWI-ST157_0368:1:1207:16180:126921#0/1;size=3;
+TACGTAGGTCCCGAGCGTTGTCCGGATTTATTGGGCGTAAAGCGAGCGCAGGCGGATACTTAAGTCTGGTGTGAAAACCTAGGGCTCAACCCTGGGACTGC
+>HWI-ST157_0368:1:1106:11378:83198#0/1;size=1;
+TACGTAGGGAGCAAGCGTTGTCCGGATTTACTGGGTGTAAAGGGTGCGTAGGCGGCTTTGCAAGTCAGATGTGAAATCTATGGGCTCAACCCATAAACTGC
+>HWI-ST157_0368:1:2102:15078:69955#1/1;size=1;
+TACGTAGGGCCCGAGCGTTGTCCGGATTTATTGGGCGTAAAGCGTGCGCAGGCGGTCTGTTAAGTCTGTAGTTAAAGGCTGTGGCTCAACTATGGTTAGTT
+>HWI-ST157_0368:1:2103:7895:197066#1/1;size=3;
+TACGTAGGGGGCAAGCGTTGTCCGAATTTACTGGGTGTAAAGGGAGCGCAGACGGCACGGCAAGCCAGATGTGAAAGCCCGGGGCTCAACCCCGGGACTGC
+>HWI-ST157_0368:1:1207:16180:126921#1/1;size=3;
+TACGTAGGTCCCGAGCGTTGTCCGGATTTATTGGGCGTAAAGCGAGCGCAGGCGGATACTTAAGTCTGGTGTGAAAACCTAGGGCTCAACCCTGGGACTGC
+>HWI-ST157_0368:1:1106:11378:83198#1/1;size=1;
+TACGTAGGGAGCAAGCGTTGTCCGGATTTACTGGGTGTAAAGGGTGCGTAGGCGGCTTTGCAAGTCAGATGTGAAATCTATGGGCTCAACCCATAAACTGC
+"""
+
+# Sequences to dereplicate with final clusters as follows:
+# 2 clusters with abundance 6 and 7
+# 3 clusters with abundance 1
+# 3 clusters with abundance 2, 3, 5
+seqs_to_derep_max_min_abundance = """>HWI-ST157_0368:1:1106:10560:153880#0/1
+TACGTAGGTGGCGAGCGTTATCCGGATTTACTGGGTGTAAAGGGCGCGTAGGCGGGAATGCAAGTCAGATGTGAAATCCAGGGGCTTAACCCTTGAACTGC
+>HWI-ST157_0368:1:1106:10560:153880#0/2
+TACGTAGGTGGCGAGCGTTATCCGGATTTACTGGGTGTAAAGGGCGCGTAGGCGGGAATGCAAGTCAGATGTGAAATCCAGGGGCTTAACCCTTGAACTGC
+>HWI-ST157_0368:1:1106:10560:153880#0/3
+TACGTAGGTGGCGAGCGTTATCCGGATTTACTGGGTGTAAAGGGCGCGTAGGCGGGAATGCAAGTCAGATGTGAAATCCAGGGGCTTAACCCTTGAACTGC
+>HWI-ST157_0368:1:1106:10560:153880#0/4
+TACGTAGGTGGCGAGCGTTATCCGGATTTACTGGGTGTAAAGGGCGCGTAGGCGGGAATGCAAGTCAGATGTGAAATCCAGGGGCTTAACCCTTGAACTGC
+>HWI-ST157_0368:1:1106:10560:153880#0/5
+TACGTAGGTGGCGAGCGTTATCCGGATTTACTGGGTGTAAAGGGCGCGTAGGCGGGAATGCAAGTCAGATGTGAAATCCAGGGGCTTAACCCTTGAACTGC
+>HWI-ST157_0368:1:1106:10560:153880#0/6
+TACGTAGGTGGCGAGCGTTATCCGGATTTACTGGGTGTAAAGGGCGCGTAGGCGGGAATGCAAGTCAGATGTGAAATCCAGGGGCTTAACCCTTGAACTGC
+>HWI-ST157_0368:1:2104:14337:180515#0/1
+TACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGGGCGCGCAGGCGGTCTGGCAAGTCGGATGTGAAATCCCCGGGCTCAACCTGGGAACTGC
+>HWI-ST157_0368:1:2104:14337:180515#0/2
+TACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGGGCGCGCAGGCGGTCTGGCAAGTCGGATGTGAAATCCCCGGGCTCAACCTGGGAACTGC
+>HWI-ST157_0368:1:2104:14337:180515#0/3
+TACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGGGCGCGCAGGCGGTCTGGCAAGTCGGATGTGAAATCCCCGGGCTCAACCTGGGAACTGC
+>HWI-ST157_0368:1:2104:14337:180515#0/4
+TACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGGGCGCGCAGGCGGTCTGGCAAGTCGGATGTGAAATCCCCGGGCTCAACCTGGGAACTGC
+>HWI-ST157_0368:1:2104:14337:180515#0/5
+TACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGGGCGCGCAGGCGGTCTGGCAAGTCGGATGTGAAATCCCCGGGCTCAACCTGGGAACTGC
+>HWI-ST157_0368:1:2104:14337:180515#0/6
+TACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGGGCGCGCAGGCGGTCTGGCAAGTCGGATGTGAAATCCCCGGGCTCAACCTGGGAACTGC
+>HWI-ST157_0368:1:2104:14337:180515#0/7
+TACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGGGCGCGCAGGCGGTCTGGCAAGTCGGATGTGAAATCCCCGGGCTCAACCTGGGAACTGC
+>HWI-ST157_0368:1:1102:8490:14349#0/1
+AACGTAGGTCACAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGCAGGCGGGAAGACAAGTTGGAAGTGAAATCTATGGGCTCAACCCATAAACTGC
+>HWI-ST157_0368:1:1205:18016:113727#0/1
+TACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGCACGCAGGCGGTTGATTAAGTTAGATGTGAAATCCCCGGGCTTAACCTGGGGATGGC
+>HWI-ST157_0368:1:1201:16382:127646#0/1
+TACAGAGGTCTCAAGCGTTGTTCGGAATCACTGGGCGTAAAGCGTGCGTAGGCGGTTTCGTAAGTCGGGTGTGAAAGGCGGGGGCTTAACGCCCGGACTGG
+>HWI-ST157_0368:1:2103:12440:90119#0/1
+TACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGCACGCCAAGTCAGCGGTGAAATTTCCGGGCTCAACCCGGAGTGTGC
+>HWI-ST157_0368:1:2103:12440:90119#0/2
+TACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGCACGCCAAGTCAGCGGTGAAATTTCCGGGCTCAACCCGGAGTGTGC
+>HWI-ST157_0368:1:1106:15269:103850#0/1
+TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGATGGGTTGTTAAGTCAGTTGTGAAAGTTTGCGGCTCAACCGTAAAATTGC
+>HWI-ST157_0368:1:1106:15269:103850#0/2
+TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGATGGGTTGTTAAGTCAGTTGTGAAAGTTTGCGGCTCAACCGTAAAATTGC
+>HWI-ST157_0368:1:1106:15269:103850#0/3
+TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGATGGGTTGTTAAGTCAGTTGTGAAAGTTTGCGGCTCAACCGTAAAATTGC
+>HWI-ST157_0368:1:1205:9745:86166#0/1
+TACGTAGGTCCCGAGCGTTATCCGGATTTACTGGGCGTAAAGGGAGCGTAGGCGGATGATTAAGTGGGATGTGAAATACCCGGGCTCAACTTGGGTGCTGC
+>HWI-ST157_0368:1:1205:9745:86166#0/2
+TACGTAGGTCCCGAGCGTTATCCGGATTTACTGGGCGTAAAGGGAGCGTAGGCGGATGATTAAGTGGGATGTGAAATACCCGGGCTCAACTTGGGTGCTGC
+>HWI-ST157_0368:1:1205:9745:86166#0/3
+TACGTAGGTCCCGAGCGTTATCCGGATTTACTGGGCGTAAAGGGAGCGTAGGCGGATGATTAAGTGGGATGTGAAATACCCGGGCTCAACTTGGGTGCTGC
+>HWI-ST157_0368:1:1205:9745:86166#0/4
+TACGTAGGTCCCGAGCGTTATCCGGATTTACTGGGCGTAAAGGGAGCGTAGGCGGATGATTAAGTGGGATGTGAAATACCCGGGCTCAACTTGGGTGCTGC
+>HWI-ST157_0368:1:1205:9745:86166#0/5
+TACGTAGGTCCCGAGCGTTATCCGGATTTACTGGGCGTAAAGGGAGCGTAGGCGGATGATTAAGTGGGATGTGAAATACCCGGGCTCAACTTGGGTGCTGC
+"""
+
+# Test sort by abundance functionality in VSEARCH
+seqs_to_sort = """>HWI-ST157_0368:1:2105:3428:36721#0/1;size=5;
+TACGTAGGGTGCAAGCGTTATCCGGAATTATTGGGCGTAAAGGGCTCGTAGGCGGTTCGTCGCGTCCGGTGTGAAAGTCCATCGCTTAACGGTGGATCTGC
+>HWI-ST157_0368:1:2106:18272:88408#0/1;size=2;
+TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGCGGAGCAAGTCTGAAGTGAAAGCCCGGGGCTCAACCCCGGGACTGC
+>HWI-ST157_0368:1:1106:12200:200135#0/1;size=1;
+TACGTAGGTGGCGAGCGTTATCCGGAATTATTGGGCGTAAAGAGGGAGCAGGCGGCACTAAGGGTCTGTGGTGAAAGATCGAAGCTTAACTTCGGTAAGCC
+>HWI-ST157_0368:1:1201:8401:113582#0/1;size=10;
+TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGTGTGGCAAGTCTGATGTGAAAGGCATGGGCTTAACCTGTGGACTGC
+>HWI-ST157_0368:1:2208:9135:145970#0/1;size=1;
+TACGTAGGGGGCGAGCGTTGTCCGGAATTACTGGGCGTAAGGGGAGCGTAGGCGGTCGATTAAGTTAGATGTGAAACCCCCGGGCTTAACTTGGGGACTGC
+>HWI-ST157_0368:1:2204:20491:181552#0/1;size=10;
+TACGTAGGGTGCAAGCGTTATCCGGAATTATTGGGCGTAAAGGGCTCGTAGGCGGTTCGTCGAGTCTGGTGTGAAAGTCCATCGCTTAACGGTGGATCCGC
+>HWI-ST157_0368:1:2105:6731:137157#0/1;size=4;
+TACGTAGGTCCCGAGCGTTGTCCGGATTTATTGGGCGTAAAGCGAGCGCAGGCGGTTTGATAAGTCTGAAGTTAAAGGCTGTGGCTCAACCATAGTTCGCT
+>HWI-ST157_0368:1:2107:19923:3944#0/1;size=100;
+TGCATTTTCTCTTATCGAAAACCTTCAGCGTTCTGATCTGAATCCCGTCGAAGAGGCTAAGGGCTATCGCCAACTCATTGATGCCAGCGGGATGACCCAGG
+"""
+
+# Grinder simulated chimeric reads using Greengenes 13.8 release
+# command used: grinder -random_seed 100 -reference_file 97_otus_gg_13_8.fasta \
+# -forward_reverse ./primers.fna -length_bias 0 -copy_bias 1 \
+# -unidirectional 1 -read_dist 150 -mutation_dist uniform 0.1 \
+# -mutation_ratio 100 0 -total_reads 1000 -diversity 10 -chimera_perc 10 \
+# -od grinder_chimeric_reads_illumina
+# primers.fna contain 515f and 806r primers
+# reads >251 reference=4370324,646991 amplicon=488..779,499..789 position=1..150
+# >320 reference=646991,4370324,646991 amplicon=499..789,488..779,499..789 position=1..150
+# >36 reference=4370324,814974 amplicon=488..779,479..769 position=1..150
+# >672 reference=814974,160832 amplicon=479..769,436..727 position=1..150
+# >142 reference=160832,814974 amplicon=436..727,479..769 position=1..150 errors=2%G
+# >201 reference=4304512,510574 amplicon=451..742,501..793 position=1..150
+# >241 reference=646991,4370324 amplicon=499..789,488..779 position=1..150 errors=13%A
+# >279 reference=311922,160832,510574 amplicon=481..773,436..727,501..793 position=1..150
+# >299 reference=4370324,4304512 amplicon=488..779,451..742 position=1..150
+# >359 reference=646991,4370324 amplicon=499..789,488..779 position=1..150 errors=52%A
+# >375 reference=4304512,769294 amplicon=451..742,504..795 position=1..150
+# >407 reference=4304512,579954 amplicon=451..742,488..779 position=1..150
+# >423 reference=4370324,579954 amplicon=488..779,488..779 position=1..150
+# >516 reference=814974,579954 amplicon=479..769,488..779 position=1..150
+# >618 reference=814974,646991 amplicon=479..769,499..789 position=1..150 errors=32%C
+# >717 reference=814974,510574 amplicon=479..769,501..793 position=1..150
+# >902 reference=510574,579954 amplicon=501..793,488..779 position=1..150
+# >918 reference=814974,4370324 amplicon=479..769,488..779 position=1..150
+# >941 reference=579954,4304512 amplicon=488..779,451..742 position=1..150
+# are chimeric
+amplicon_reads = """>3;size=102;
+GTGCCAGCAGCCGCGGTAATACATAGGTCACAAGCGTTATCCGGATTTATTGGGCGTAAAGCGTTCGTAGGCGGTTTGTT
+AAGTCTAGAGTTAAAGCCTGGGGTTCAACCCCAGCCCGCTTTGGATACTGACAAACTAGAGTTACATAGA
+>16;size=95;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGCACGCAGGCGGTTAATT
+AAGCTAGAAGTGAAAGCCCTGCGCTCAACGTGGGAAGGCCTTTTAGAACTGGTTAACTAGAGTATTGGAG
+>22;size=93;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTGTCCGGATTTATTGGGTTTAAAGGGTACGTAGGCGGTGTATT
+AAGTCAGTGGTGAAAGCCTGCGGCTCAACCGTAGGAGTGCCATTGATACTGGTATACTTGAGTGTTGTAA
+>2;size=87;
+GTGCCAGCAGCCGCGGTAATACGTAGGGCGCAAGCGTTATCCGGAATTATTGGGCGTAAAGAGCTCGTAGGCGGTTTGTC
+GCGTCTGCCGTGAAAGTCCGGGGCTTAACTCCGGATCTGCGGTGGGTACGGGCAGACTTGAGTGATGTAG
+>39;size=84;
+GTGCCAGCAGCCGCGGTAATACGTAGGGGGCAAGCGTTGTCCGGATTTACTGGGTATAAAGGGCTCGCAGGCGGGTTTGT
+AAGTCAGAGGTGAAATCCTACAGCTTAACTGTAGAACTGCCTTTGAAACTGCAAGTCTTGAGATCGGAAG
+>4;size=79;
+GTGCCAGCAGCCGCGGTAATACGTAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGTGCGCAGGCGGCTACAT
+AAGACAGGTGTGAAATCCCCGGGCTCAACCTGGGAATGGCGCTTGTGACTGTGTGGCTTGAGTTCGGCAG
+>6;size=72;
+GTGCCAGCAGCCGCGGTAATACGAAGGGGGCTAGCGTTGCTCGGAATTACTGGGCGTAAAGGGAGCGTAGGCGGACATTT
+AAGTCAGGGGTGAAATCCCAGAGCTCAACTCTGGAACTGCCTTTGATACTGGATGTCTTGAGTGTGAGAG
+>11;size=70;
+GTGCCAGCAGCCGCGGTAATACAGAGGTCTCAAGCGTTGTTCGGATTCATTGGGCGTAAAGGGTGCGTAGGTGGCGGGGT
+AAGTCAGGTGTGAAATCTCGGAGCTCAACTCCGAAACTGCACTTGATACTGCCTTGCTCGAGTACTGGAG
+>45;size=67;
+GTGCCAGCAGCCGCGGTAATACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGAATGGT
+AAGTCAGTGGTGAAATCCTGCAGCTCAACTGTAGAGTTGCCATTGATACTGTCATTCTTGAGTATAGATG
+>1;size=65;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGCATGCAGGCGGTTTGTT
+AAGTCAGATGTGAAAGCCCGGGGCTCAACCTCGGAACCGCATTTGAAACTGGCAGGCTAGAGTCTTGTAG
+>251;size=2;
+GTGCCAGCAGCCGCGGTAATACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGAATGGT
+AAGTCAGTGGTGAAATCCTGCAGCTCAACTGTAGAGTTGCCATTGATACTGGTATACTTGAGTGTTGTAA
+>30;size=2;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTGTCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGAATGGT
+AAGTCAGTGGTGAAATCCTGCAGCTCAACTGTAGAGTTGCCATTGATACTGTCATTCTTGAGTATAGATG
+>320;size=2;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTGTCCGGATTTATTGGGTTTAAAGGGTACGTAGGCGGTGTATT
+AAGTCAGTGGTGAAATCCTGCAGCTCAACTGTAGAGTTGCCATTGATACTGTCATTCTTGAGTATAGATG
+>36;size=2;
+GTGCCAGCAGCCGCGGTAATACGGAGGATGCGAGCGTTATCCGGATTTATTGGGCGTAAAGCGTTCGTAGGCGGTTTGTT
+AAGTCTAGAGTTAAAGCCTGGGGTTCAACCCCAGCCCGCTTTGGATACTGACAAACTAGAGTTACATAGA
+>425;size=2;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGTGCGCAGGCGGCTACAT
+AAGACAGGTGTGAAATCCCCGGGCTCAACCTGGGAATGGCGCTTGTGACTGTGTGGCTTGAGTTCGGCAG
+>672;size=2;
+GTGCCAGCAGCCGCGGTAATACATAGGTCACAAGCGTTATCCGGATTTATTGGGCGTAAAGCGTTCGTAGGCGGTTTGTT
+AAGTCAGATGTGAAAGCCCGGGGCTCAACCTCGGAACCGCATTTGAAACTGGCAGGCTAGAGTCTTGTAG
+>10;size=1;
+GTGCCAGCAGCCGCGGTAATACGTAGGGCGCAAGCGTTATCCGGAATTATTGGGCGTAAAGAGCTCGTAGGCGGTTTGTC
+GGGTCTGCCGTGAAAGTCCGGGGCTTAACTCCGGATCTGCGGTGGGTACGGGCAGACTTGAGTGATGTAG
+>100;size=1;
+TTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTGTCCGGATTTATTGGGTTTAAAGGGTACGTAGGCGGTGTATT
+AAGTCAGTGGTGAAAGCCTGCGGCTCAACCGTAGGAGTGCCATTGATACTGGTATACTTGAGTGTTGTAA
+>102;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTGTCCGGACTTATTGGGTTTAAAGGGTACGTAGGCGGTGTATT
+AAGTCAGTGGTGAAAGCCTGCGGCTCAACCGTAGGAGTGCCATTGATACTGGTATACTTGAGTGTTGTAA
+>115;size=1;
+GTGCCAGCAGCCGCGGTATTACATAGGTCACAAGCGTTATCCGGATTTATTGGGCGTAAAGCGTTCGTAGGCGGTTTGTT
+AAGTCTAGAGTTAAAGCCTGGGGTTCAACCCCAGCCCGCTTTGGATACTGACAAACTAGAGTTACATAGA
+>123;size=1;
+GTGACAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGCATGCAGGCGGTTTGTT
+AAGTCAGATGTGAAAGCCCGGGGCTCAACCTCGGAACCGCATTTGAAACTGGCAGGCTAGAGTCTTGTAG
+>132;size=1;
+GTGCCAGCAGCCGCGGTAATACGTAGGGGGCAAGCGTTGTCCGGATTTACTGGGTATAAAGGGCTCGCAGGCGGGTTTGT
+AAGTCAGAGGTGAAATCCTACAGCTTAACTGTAGAACTGCCTTTGACACTGCAAGTCTTGAGATCGGAAG
+>134;size=1;
+GTGCCGGCAGCCGCGGTAATACGTAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGTGCGCAGGCGGCTACAT
+AAGACAGGTGTGAAATCCCCGGGCTCAACCTGGGAATGGCGCTTGTGACTGTGTGGCTTGAGTTCGGCAG
+>14;size=1;
+GTGCCAGCAGCCGCGGTAATACAGAGGTCTCACGCGTTGTTCGGATTCATTGGGCGTAAAGGGTGCGTAGGTGGCGGGGT
+AAGTCAGGTGTGAAATCTCGGAGCTCAACTCCGAAACTGCACTTGATACTGCCTTGCTCGAGTACTGGAG
+>140;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTAATCGGAATTACTGGGCGGAAAGCGCATGCAGGCGGTTTGTT
+AAGTCAGATGTGAAAGCCCGGGGCTCAACCTCGGAACCGCATTTGAAACTGGCAGGCTAGAGTCTTGTAG
+>142;size=1;
+GGGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGCATGCAGGCGGTTTGTT
+AAGTCTAGAGTTAAAGCCTGGGGTTCAACCCCAGCCCGCTTTGGATACTGACAAACTAGAGTTACATAGA
+>144;size=1;
+GTGTCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGCATGCAGGCGGTTTGTT
+AAGTCAGATGTGAAAGCCCGGGGCTCAACCTCGGAACCGCATTTGAAACTGGCAGGCTAGAGTCTTGTAG
+>148;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGAATGGT
+AAGTCAGTGGTGAAATCCTGCAGCTCAACTGTAGAGTTGCCATTGATACTATCATTCTTGAGTATAGATG
+>15;size=1;
+GTGCCAGCAGCCGCGGTAATACATAGGTCACAAGCGTTATCCGGATTTATTGGGAGTAAAGCGTTCGTAGGCGGTTTGTT
+AAGTCTAGAGTTAAAGCCTGGGGTTCAACCCCAGCCCGCTTTGGATACTGACAAACTAGAGTTACATAGA
+>156;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGCACGCAGACGGTTAATT
+AAGCTAGAAGTGAAAGCCCTGCGCTCAACGTGGGAAGGCCTTTTAGAACTGGTTAACTAGAGTATTGGAG
+>161;size=1;
+GTGCCAGCAGCCGCGGTAATACAGAGGTCTCAAGCGTTGTTCGGATTCATTGGGCGTAAAGGGTGCGTAGGTGGCGGGGT
+AAGTCAGGGGTGAAATCTCGGAGCTCAACTCCGAAACTGCACTTGATACTGCCTTGCTCGAGTACTGGAG
+>162;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGCTAATCGGAATTACTGGGCGTAAAGCGCACGCAGGCGGTTAATT
+AAGCTAGAAGTGAAAGCCCTGCGCTCAACGTGGGAAGGCCTTTTAGAACTGGTTAACTAGAGTATTGGAG
+>186;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTGTCCGGATTAATTGGGTTTAAAGGGTACGTAGGCGGTGTATT
+AAGTCAGTGGTGAAAGCCTGCGGCTCAACCGTAGGAGTGCCATTGATACTGGTATACTTGAGTGTTGTAA
+>201;size=1;
+GTGCCAGCAGCCGCGGTAATACGAAGGGGGCTAGCGTTGCTCGGAATTACTGGGCGTAAAGCGTGCGCAGGCGGCTACAT
+AAGACAGGTGTGAAATCCCCGGGCTCAACCTGGGAATGGCGCTTGTGACTGTGTGGCTTGAGTTCGGCAG
+>203;size=1;
+GTGCCAGCAGCCGCGGTAATACGTAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGTGCGCAGGCGGCTACAT
+AAGACAGGTGTGAAATCCCCGGGCTCAACCTGGGAACGGCGCTTGTGACTGTGTGGCTTGAGTTCGGCAG
+>21;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTCAAGCGCATGCAGGCGGTTTGTT
+AAGTCAGATGTGAAAGCCCGGGGCTCAACCTCGGAACCGCATTTGAAACTGGCAGGCTAGAGTCTTGTAG
+>217;size=1;
+GTGCCAGCAGCCGCGGTAATACGAAGGGGGCTAGCGTTGCTCGGAATTACTGGGCGTAAAGGGAGCGTAGGCGGACATTT
+AAGTCAGGGGTGTAATCCCAGAGCTCAACTCTGGAACTGCCTTTGATACTGGATGTCTTGAGTGTGAGAG
+>218;size=1;
+GTGCCAGCAGCCGCGGTAATACGTAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAACGCGTGCGCAGGCGGCTACAT
+AAGACAGGTGTGAAATCCCCGGGCTCAACCTGGGAATGGCGCTTGTGACTGTGTGGCTTGAGTTCGGCAG
+>221;size=1;
+GTGCCAGCAGCCGCGGTAATACAGAGGTCTCAAGCGTTGTTCGGATTCATTGGGCGTAAAGGGTGCGTAGGTGGCGGGGT
+AAGTCAGGTGTGAAATCTCGGAGCTCAACTCCGAAACTGCACTTGATACTGCCTTGCTCGAATACTGGAG
+>222;size=1;
+GTGCCAGCAGCCGCGGTAATACGAAGGGGGCTAGGGTTGCTCGGAATTACTGGGCGTAAAGGGAGCGTAGGCGGACATTT
+AAGTCAGGGGTGAAATCCCAGAGCTCAACTCTGGAACTGCCTTTGATACTGGATGTCTTGAGTGTGAGAG
+>225;size=1;
+GTGCCAGCAGCCGCGGTAATACGAAGGGGGCTAGCGTTGCTCGGAATTACTGGGAGTAAAGGGAGCGTAGGCGGACATTT
+AAGTCAGGGGTGAAATCCCAGAGCTCAACTCTGGAACTGCCTTTGATACTGGATGTCTTGAGTGTGAGAG
+>233;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTGTCCGGATTTATTGGGTTTAAAGGGTACGTAGGCGGTGTATT
+AAGTCAGTGGTGAAAGCCTGCGGCTCAACCGTAGGAGTGCCATTGATACTGGTGTACTTGAGTGTTGTAA
+>234;size=1;
+GTGCCAGCAGCCTCGGTAATACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGAATGGT
+AAGTCAGTGGTGAAATCCTGCAGCTCAACTGTAGAGTTGCCATTGATACTGTCATTCTTGAGTATAGATG
+>235;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAACGTTAATCGGAATTACTGGGCGTAAAGCGCATGCAGGCGGTTTGTT
+AAGTCAGATGTGAAAGCCCGGGGCTCAACCTCGGAACCGCATTTGAAACTGGCAGGCTAGAGTCTTGTAG
+>24;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTAATCGGAAGTACTGGGCGTAAAGCGCACGCAGGCGGTTAATT
+AAGCTAGAAGTGAAAGCCCTGCGCTCAACGTGGGAAGGCCTTTTAGAACTGGTTAACTAGAGTATTGGAG
+>241;size=1;
+GTGCCAGCAGCCACGGTAATACGGAGGGTGCGAGCGTTGTCCGGATTTATTGGGTTTAAAGGGTACGTAGGCGGTGTATT
+AAGTCAGTGGTGAAAGCCTGCGGCTCAACCGTAGGAGTGCCATTGATACTGTCATTCTTGAGTATAGATG
+>249;size=1;
+GTGCCAGCAGCCGCGGTAATACGTAGGGGGCAAGCGTTGTCCGGATTTACTGGGTATAAAGGGCTCGCAGGCGGGTTTGT
+AAGTCAGAGGTGAAATCCTACAGCTTAACTGTAGAACTGCCTTTCAAACTGCAAGTCTTGAGATCGGAAG
+>259;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGCATGCAGGCGGTTTGTT
+AAGTCAGATGTGAAAGCGCGGGGCTCAACCTCGGAACCGCATTTGAAACTGGCAGGCTAGAGTCTTGTAG
+>26;size=1;
+GTGCCAGCAGCCGCGGTAATACAGAGGTCTCAAGCGTTGTTCGGATTCATTGGGCGTAAAGGGTGCGTAGGTGGCGGGGT
+AAGTCAGGTGTGAAATCTCGGAGCTCAACTCCGAAACTGCACATGATACTGCCTTGCTCGAGTACTGGAG
+>266;size=1;
+GTGCCAGCGGCCGCGGTAATACGTAGGGCGCAAGCGTTATCCGGAATTATTGGGCGTAAAGAGCTCGTAGGCGGTTTGTC
+GCGTCTGCCGTGAAAGTCCGGGGCTTAACTCCGGATCTGCGGTGGGTACGGGCAGACTTGAGTGATGTAG
+>27;size=1;
+GTGCCAGCAGCCGCGGTAATACATAGGTCACAAGCGTTATCCGGATTTATTGGGTGTAAAGCGTTCGTAGGCGGTTTGTT
+AAGTCTAGAGTTAAAGCCTGGGGTTCAACCCCAGCCCGCTTTGGATACTGACAAACTAGAGTTACATAGA
+>279;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGCATGCAGGCGGTTTGTT
+AAGTCAGATGTGAAAGCCCGGGGCTCAACCTGGGAATGGCGCTTGTGACTGTGTGGCTTGAGTTCGGCAG
+>296;size=1;
+GTGCCAGCAGCCGCGGTAATACGTAGGCGGCAAGCGTTGTCCGGATTTACTGGGTATAAAGGGCTCGCAGGCGGGTTTGT
+AAGTCAGAGGTGAAATCCTACAGCTTAACTGTAGAACTGCCTTTGAAACTGCAAGTCTTGAGATCGGAAG
+>299;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGAATGGT
+AAGTCAGTGGTGAAATCCCAGAGCTCAACTCTGGAACTGCCTTTGATACTGGATGTCTTGAGTGTGAGAG
+>303;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGAATGGT
+AAGTCAGTGGTGAAATCCAGCAGCTCAACTGTAGAGTTGCCATTGATACTGTCATTCTTGAGTATAGATG
+>306;size=1;
+GTGCCAGCAGCCGCGGTAATACGTAGGGGGCAAGCGTTGTCCGGATTTACTGGGTATAAAGGGCTCGCAGGCGGGTTTGT
+AAGTCAGAGGTGAAATCCTACAGCTTAACTGTAGAACTGCCTTTGAAACTGCAAGTCTTGAGATCGGAAA
+>307;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTGTCCGGATTTATTGGGTTTAAAGGGTACGTAGGCGGTGTATT
+AAGTCAGTGGTGAAAGCCTGCGGCTCAACCGTAGGAGTGCCATTGTTACTGGTATACTTGAGTGTTGTAA
+>32;size=1;
+GTGCCAGCAGCCGCGGTAATACGTAGGGCGCAAGCGGTATCCGGAATTATTGGGCGTAAAGAGCTCGTAGGCGGTTTGTC
+GCGTCTGCCGTGAAAGTCCGGGGCTTAACTCCGGATCTGCGGTGGGTACGGGCAGACTTGAGTGATGTAG
+>322;size=1;
+GTGCCAGCAGCCGCGGTAATACGTAGGGCGCAAGCGTTATCCGGAATTATTGGGCGTAAAGAGCTCGTAGGCGGTTTGTC
+GCGTCTGGCGTGAAAGTCCGGGGCTTAACTCCGGATCTGCGGTGGGTACGGGCAGACTTGAGTGATGTAG
+>326;size=1;
+GTGCCAGCAGCCGCGGTAATACGGTGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGAATGGT
+AAGTCAGTGGTGAAATCCTGCAGCTCAACTGTAGAGTTGCCATTGATACTGTCATTCTTGAGTATAGATG
+>332;size=1;
+GTGCCAGCAGCCGCGGTAATACGTAGGGCGCAAGCGTTATCCGGAATTATTGGGCGTAAAGAGCTCGTAGGCGGGTTGTC
+GCGTCTGCCGTGAAAGTCCGGGGCTTAACTCCGGATCTGCGGTGGGTACGGGCAGACTTGAGTGATGTAG
+>333;size=1;
+GTGCCAGCAGCCGCGGTAATACGTAGGGCGCAAGCGTTATCCGGAATTATTGGGCGTAAAGAGCTCGTATGCGGTTTGTC
+GCGTCTGCCGTGAAAGTCCGGGGCTTAACTCCGGATCTGCGGTGGGTACGGGCAGACTTGAGTGATGTAG
+>338;size=1;
+GTGCCAGCAGCCGCGGTATTACGGAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGCATGCAGGCGGTTTGTT
+AAGTCAGATGTGAAAGCCCGGGGCTCAACCTCGGAACCGCATTTGAAACTGGCAGGCTAGAGTCTTGTAG
+>359;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTGTCCGGATTTATTAGGTTTAAAGGGTACGTAGGCGGTGTATT
+AAGTCAGTGGTGAAAGCCTGCGGCTCAACCGTAGGAGTGCCATTGATACTGTCATTCTTGAGTATAGATG
+>360;size=1;
+GTGCCAGCAGCCGCGGTAATACGTAGGGGGCAAGCGTTGTCCGGATTTACTGGGTATAAAGGGTTCGCAGGCGGGTTTGT
+AAGTCAGAGGTGAAATCCTACAGCTTAACTGTAGAACTGCCTTTGAAACTGCAAGTCTTGAGATCGGAAG
+>362;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGCATGCAGGCGGTTTGTT
+AAGTCAGATGTGAAAGCCCGGGGCTCAACCTCGGAACCGAATTTGAAACTGGCAGGCTAGAGTCTTGTAG
+>364;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGCACGCAGGCGGTTAATT
+AAGCTAGAAGTGAAAGCCCTGCGCTCAACGTGGGAAGGCCATTTAGAACTGGTTAACTAGAGTATTGGAG
+>366;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCCTTAATCGGAATTACTGGGCGTAAAGCGCATGCAGGCGGTTTGTT
+AAGTCAGATGTGAAAGCCCGGGGCTCAACCTCGGAACCGCATTTGAAACTGGCAGGCTAGAGTCTTGTAG
+>369;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTAATCGGAATTAATGGGCGTAAAGCGCATGCAGGCGGTTTGTT
+AAGTCAGATGTGAAAGCCCGGGGCTCAACCTCGGAACCGCATTTGAAACTGGCAGGCTAGAGTCTTGTAG
+>37;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTGTCCGGATTTATTGGGTTTAAAGGGTACTTAGGCGGTGTATT
+AAGTCAGTGGTGAAAGCCTGCGGCTCAACCGTAGGAGTGCCATTGATACTGGTATACTTGAGTGTTGTAA
+>371;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGCACGCAGGCGGTTAATT
+AAGCTAGAAGTGAAAGCCCTGCGCTCAACGCGGGAAGGCCTTTTAGAACTGGTTAACTAGAGTATTGGAG
+>373;size=1;
+GTGCCAGCAGACGCGGTAATACGTAGGGCGCAAGCGTTATCCGGAATTATTGGGCGTAAAGAGCTCGTAGGCGGTTTGTC
+GCGTCTGCCGTGAAAGTCCGGGGCTTAACTCCGGATCTGCGGTGGGTACGGGCAGACTTGAGTGATGTAG
+>374;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTGTCCGGATTTATTGGGTTTAAAGGCTACGTAGGCGGTGTATT
+AAGTCAGTGGTGAAAGCCTGCGGCTCAACCGTAGGAGTGCCATTGATACTGGTATACTTGAGTGTTGTAA
+>375;size=1;
+GTGCCAGCAGCCGCGGTAATACGAAGGGGGCTAGCGTTGCTCGGAATTACTGGGCGTAAAGGGAGCGTAGGCGGACATTT
+AAGTCAGGGGTGAAATCCCAGAGCTCAACTCTGGAACTGCCTTTGAAACTGCAAGTCTTGAGATCGGAAG
+>386;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGTAATGGT
+AAGTCAGTGGTGAAATCCTGCAGCTCAACTGTAGAGTTGCCATTGATACTGTCATTCTTGAGTATAGATG
+>387;size=1;
+GTGCCAGCAGCCGCGGTAATACGAAGGGGGCTAGCGTTGCTCGGAATTACAGGGCGTAAAGGGAGCGTAGGCGGACATTT
+AAGTCAGGGGTGAAATCCCAGAGCTCAACTCTGGAACTGCCTTTGATACTGGATGTCTTGAGTGTGAGAG
+>392;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTAACCGGAATTACTGGGCGTAAAGCGCATGCAGGCGGTTTGTT
+AAGTCAGATGTGAAAGCCCGGGGCTCAACCTCGGAACCGCATTTGAAACTGGCAGGCTAGAGTCTTGTAG
+>393;size=1;
+GTGCCAGCAGCCGCGGTAATACGTAGGGCGCAAGTGTTATCCGGAATTATTGGGCGTAAAGAGCTCGTAGGCGGTTTGTC
+GCGTCTGCCGTGAAAGTCCGGGGCTTAACTCCGGATCTGCGGTGGGTACGGGCAGACTTGAGTGATGTAG
+>397;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTGTCCGGATTTATTGGGTTTAAAGGGTACGTAGGCGGTGTATT
+AAGTCAGTGGTGAAAGCCTGCGGCTCAACCGTAGGAGTTCCATTGATACTGGTATACTTGAGTGTTGTAA
+>405;size=1;
+GTACCAGCAGCCGCGGTAATACGTAGGGCGCAAGCGTTATCCGGAATTATTGGGCGTAAAGAGCTCGTAGGCGGTTTGTC
+GCGTCTGCCGTGAAAGTCCGGGGCTTAACTCCGGATCTGCGGTGGGTACGGGCAGACTTGAGTGATGTAG
+>407;size=1;
+GTGCCAGCAGCCGCGGTAATACGAAGGGGGCTAGCGTTGCTCGGAATTACTGGGCGTAAAGGGTGCGTAGGTGGCGGGGT
+AAGTCAGGTGTGAAATCTCGGAGCTCAACTCCGAAACTGCACTTGATACTGCCTTGCTCGAGTACTGGAG
+>414;size=1;
+GTGCCAGCAGCCGCGGTAATACGTAGGGTGCGAGCGTTAATCGGAATTACTGGGTGTAAAGCGTGCGCAGGCGGCTACAT
+AAGACAGGTGTGAAATCCCCGGGCTCAACCTGGGAATGGCGCTTGTGACTGTGTGGCTTGAGTTCGGCAG
+>418;size=1;
+GTGCCAGCAGCCGCGGTAATACGTAGGGCGCAAGCGTTATCCGGAATTATTGGGCGTTAAGAGCTCGTAGGCGGTTTGTC
+GCGTCTGCCGTGAAAGTCCGGGGCTTAACTCCGGATCTGCGGTGGGTACGGGCAGACTTGAGTGATGTAG
+>423;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGAATGGT
+AAGTCAGGTGTGAAATCTCGGAGCTCAACTCCGAAACTGCACTTGATACTGCCTTGCTCGAGTACTGGAG
+>431;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGCATGCAGGCGGTTTGTT
+AGGTCAGATGTGAAAGCCCGGGGCTCAACCTCGGAACCGCATTTGAAACTGGCAGGCTAGAGTCTTGTAG
+>436;size=1;
+GTGCCAGCAGCCGCGGTAATACGTAGGGCGTAAGCGTTATCCGGAATTATTGGGCGTAAAGAGCTCGTAGGCGGTTTGTC
+GCGTCTGCCGTGAAAGTCCGGGGCTTAACTCCGGATCTGCGGTGGGTACGGGCAGACTTGAGTGATGTAG
+>444;size=1;
+GTGCCAGCAGCCGCGGTAATACATAGGTCACAAGCGTTATCCGGATTTATTGGGCGTGAAGCGTTCGTAGGCGGTTTGTT
+AAGTCTAGAGTTAAAGCCTGGGGTTCAACCCCAGCCCGCTTTGGATACTGACAAACTAGAGTTACATAGA
+>445;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTGTCCGGATTTATTGGGTTTAAAGCGTACGTAGGCGGTGTATT
+AAGTCAGTGGTGAAAGCCTGCGGCTCAACCGTAGGAGTGCCATTGATACTGGTATACTTGAGTGTTGTAA
+>456;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGAATGGT
+AAGCCAGTGGTGAAATCCTGCAGCTCAACTGTAGAGTTGCCATTGATACTGTCATTCTTGAGTATAGATG
+>460;size=1;
+GTGCCAGCAGCCGCGGTAATACGTAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGTGCGCAGGCGGCTACAT
+CAGACAGGTGTGAAATCCCCGGGCTCAACCTGGGAATGGCGCTTGTGACTGTGTGGCTTGAGTTCGGCAG
+>469;size=1;
+GTGCCAGCAGCCGCGGTAATACGTAGGGGGCAAGCGTTGTCCGGATTTACTGGGTATAAAGGGCTCGCAGGCGGGTTTGT
+AAGTCAGAGGAGAAATCCTACAGCTTAACTGTAGAACTGCCTTTGAAACTGCAAGTCTTGAGATCGGAAG
+>470;size=1;
+GTGCCAGCAGCCGCGGTAATACAGAGGTCTCAAGCGTTGTTCGGATTCATTGGGCGTAAAGGGTGCGTAGGTGGCGGGGT
+AAGTCAGGTGTGAAATCTCGGAGCTCAACTCCGAAACTGCACTTGATACTGCCTTGCGCGAGTACTGGAG
+>477;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGAATGGT
+AAGTCAGTGGTGAAATCCTGCGGCTCAACTGTAGAGTTGCCATTGATACTGTCATTCTTGAGTATAGATG
+>479;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGGGAGCGTTAATCGGAATTACTGGGCGTAAAGCGCATGCAGGCGGTTTGTT
+AAGTCAGATGTGAAAGCCCGGGGCTCAACCTCGGAACCGCATTTGAAACTGGCAGGCTAGAGTCTTGTAG
+>486;size=1;
+GTGCCAGCAGCTGCGGTAATACGTAGGGGGCAAGCGTTGTCCGGATTTACTGGGTATAAAGGGCTCGCAGGCGGGTTTGT
+AAGTCAGAGGTGAAATCCTACAGCTTAACTGTAGAACTGCCTTTGAAACTGCAAGTCTTGAGATCGGAAG
+>500;size=1;
+GTGCCAGCAGCCGCGGTAATACGTAGGGGGCAAGCGTTGTCCGGATTTACTGGGTATAAAGGGCTCGCAGGCGGGTTTGT
+AAGTCAGAGGTGAAATCCTACAGCTTAACTGTAGAACTGCCTTTGAAACTACAAGTCTTGAGATCGGAAG
+>515;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGAATGGT
+AAGTCAGTGGTGCAATCCTGCAGCTCAACTGTAGAGTTGCCATTGATACTGTCATTCTTGAGTATAGATG
+>516;size=1;
+GTGCCAGCAGCCGCGGTAATACATAGGTCACAAGCGTTATCCGGATTTATTGGGCGTAAAGGGTGCGTAGGTGGCGGGGT
+AAGTCAGGTGTGAAATCTCGGAGCTCAACTCCGAAACTGCACTTGATACTGCCTTGCTCGAGTACTGGAG
+>528;size=1;
+GTTCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTGTCCGGATTTATTGGGTTTAAAGGGTACGTAGGCGGTGTATT
+AAGTCAGTGGTGAAAGCCTGCGGCTCAACCGTAGGAGTGCCATTGATACTGGTATACTTGAGTGTTGTAA
+>530;size=1;
+GTGCCAGCAGGCGCGGTAATACGAAGGGGGCTAGCGTTGCTCGGAATTACTGGGCGTAAAGGGAGCGTAGGCGGACATTT
+AAGTCAGGGGTGAAATCCCAGAGCTCAACTCTGGAACTGCCTTTGATACTGGATGTCTTGAGTGTGAGAG
+>531;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGCATGCAGGCGGTTTGTT
+AAGTCAGATGTGAAAGCCCGGGGCTCAACCTCGGAACCGCATTTGAAACTGGCAGGCTAGAGGCTTGTAG
+>549;size=1;
+GTCCCAGCAGCCGCGGTAATACGTAGGGGGCAAGCGTTGTCCGGATTTACTGGGTATAAAGGGCTCGCAGGCGGGTTTGT
+AAGTCAGAGGTGAAATCCTACAGCTTAACTGTAGAACTGCCTTTGAAACTGCAAGTCTTGAGATCGGAAG
+>551;size=1;
+GTGCCAGCAGCCGCGGTAATACAGAGGTCTCAAGCGTTGTTCGGATTCATTGGGCGTAAAGGGTGCGTAGGTGGCGGGGT
+AAGTCAGGTGTGAAATCTCGGAGCTCAACTCCGAAACTGCAGTTGATACTGCCTTGCTCGAGTACTGGAG
+>557;size=1;
+GTGCCAGCAGCCGCTGTAATACGTAGGGCGCAAGCGTTATCCGGAATTATTGGGCGTAAAGAGCTCGTAGGCGGTTTGTC
+GCGTCTGCCGTGAAAGTCCGGGGCTTAACTCCGGATCTGCGGTGGGTACGGGCAGACTTGAGTGATGTAG
+>559;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTGTCCGGATTTATTGGGCTTAAAGGGTACGTAGGCGGTGTATT
+AAGTCAGTGGTGAAAGCCTGCGGCTCAACCGTAGGAGTGCCATTGATACTGGTATACTTGAGTGTTGTAA
+>561;size=1;
+GTGCCAGCAGCCGCGGTAATACGTAGGGGGCAAGCGTTGTCCGGATTTACTGGGTATAAAGGGCTCGCAGGCGGGTTTGT
+AAGTCAGAGGTGAAATCCTACAGCTTAACTGTAGAACTGCCTTTGAAACTGCATGTCTTGAGATCGGAAG
+>562;size=1;
+GTGCCAGCAGCCGCGGTAATACGAAGGGGGCTAGCGTTGCTCGGAATTACTGGGCGTAAAGGGAGCGTAGGCGGACATTT
+AAGTCAGGGGTGAAATCCCAGAGCTCAACTCTGGAACTGCCATTGATACTGGATGTCTTGAGTGTGAGAG
+>564;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGAATGGT
+AAGTCAGTGGTGAAATCCTGCAGCTCAACTGTAGAATTGCCATTGATACTGTCATTCTTGAGTATAGATG
+>566;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTGTCCGGATTTAATGGGTTTAAAGGGTACGTAGGCGGTGTATT
+AAGTCAGTGGTGAAAGCCTGCGGCTCAACCGTAGGAGTGCCATTGATACTGGTATACTTGAGTGTTGTAA
+>568;size=1;
+GTGCCAGCAGCCGCGGTAATACAGAGGTCTCAAGCGTTGTTCGGATTCATTGGGCGTAAAGGGTGCGTAGGTGGCGGGGT
+AAGTCAGGTGTGAAATCTCGGAGCTTAACTCCGAAACTGCACTTGATACTGCCTTGCTCGAGTACTGGAG
+>57;size=1;
+GTGCCAGCAGCCGCGGTAATACAGAGGTCTCAAGCGTTGTTCGGATTCATTGGGCGTAAAGGGTGCGTAGGTGGAGGGGT
+AAGTCAGGTGTGAAATCTCGGAGCTCAACTCCGAAACTGCACTTGATACTGCCTTGCTCGAGTACTGGAG
+>570;size=1;
+GTGCCAGCAGCCGCGGTAATACATAGGTCACAAGCGTTATCCGGATTTATTGGGCGTAAAGCGTTCGTAGGCGGTTTGTT
+AAGTCTAGAGTTAAAGCCTGGGGTTCAACCCCAGCCCGCTTTGGATACTGACAAACTAGCGTTACATAGA
+>578;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGATGCGAGCGTTATCCGGATGTATTGGGTTTAAAGGGTGCGTAGGCGGAATGGT
+AAGTCAGTGGTGAAATCCTGCAGCTCAACTGTAGAGTTGCCATTGATACTGTCATTCTTGAGTATAGATG
+>586;size=1;
+GTGCCAGCAGCCGCGGTAATACGAAGGGGGCTAGCGTTGCTCGGAATTACTGGGCGTAAAGGGAGCGTAGGCGGACATTT
+AAGTCAGGGGTGAAATCCCAGAGCTCAACTCTGGAACTGCCTTTGATACTGGATATCTTGAGTGTGAGAG
+>587;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTACGTAGGCGGTGTATT
+AAGTCAGTGGTGAAAGCCTGCGGCTCAACCGTAGGAGTGCCATTGATACTGGTATACTTGAGTGTTGTAA
+>596;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTGTCCGGATTTATTGTGTTTAAAGGGTACGTAGGCGGTGTATT
+AAGTCAGTGGTGAAAGCCTGCGGCTCAACCGTAGGAGTGCCATTGATACTGGTATACTTGAGTGTTGTAA
+>600;size=1;
+GTGCCAGCAGCCGCGGTAATACGTAGGGGGCAAGCGTTGTCCGGTTTTACTGGGTATAAAGGGCTCGCAGGCGGGTTTGT
+AAGTCAGAGGTGAAATCCTACAGCTTAACTGTAGAACTGCCTTTGAAACTGCAAGTCTTGAGATCGGAAG
+>612;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCAGAATGGT
+AAGTCAGTGGTGAAATCCTGCAGCTCAACTGTAGAGTTGCCATTGATACTGTCATTCTTGAGTATAGATG
+>618;size=1;
+GTGCCAGCAGCCGCGGTAATACATAGGTCACCAGCGTTATCCGGATTTATTGGGCGTAAAGCGTTCGTAGGCGGTGTATT
+AAGTCAGTGGTGAAAGCCTGCGGCTCAACCGTAGGAGTGCCATTGATACTGGTATACTTGAGTGTTGTAA
+>625;size=1;
+GTGCCAGCAGCCGCGGTAATACAGAGGTCTCAAGCGTTATTCGGATTCATTGGGCGTAAAGGGTGCGTAGGTGGCGGGGT
+AAGTCAGGTGTGAAATCTCGGAGCTCAACTCCGAAACTGCACTTGATACTGCCTTGCTCGAGTACTGGAG
+>632;size=1;
+GTGCCAGCAGCCGCGGTAATACGTAGGGCGCAAGCGTTATCCGGAATTATTGGGCGTAAAGAGCTCGTAGGCGGTTTGTC
+GCGTCTGCCGTGAAAGTCCGGGGCTTAACTCCGGATCTGCGGTGGGTACGCGCAGACTTGAGTGATGTAG
+>649;size=1;
+GTGCCAGCAGCCGCGGTAATACGTAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGTGCGCAGGCGGCGACAT
+AAGACAGGTGTGAAATCCCCGGGCTCAACCTGGGAATGGCGCTTGTGACTGTGTGGCTTGAGTTCGGCAG
+>650;size=1;
+GTGCCAGCAGCCGTGGTAATACGTAGGGGGCAAGCGTTGTCCGGATTTACTGGGTATAAAGGGCTCGCAGGCGGGTTTGT
+AAGTCAGAGGTGAAATCCTACAGCTTAACTGTAGAACTGCCTTTGAAACTGCAAGTCTTGAGATCGGAAG
+>651;size=1;
+GTGCCAGCAGCCGCGGTAATACGTAGGGCGCAAGCGTTATCCGGAATTATTGGGCGTAAAGAGCTCGTAGGCGGTTTGTC
+GCGTCTGCCGTGAAAGTCCGGGGCTTAACTCCGGATCTGCGGTGGGTACCGGCAGACTTGAGTGATGTAG
+>66;size=1;
+GTGCCAGCAGCCGCGGTAATACATAGGTCACAAGCGTTATCCGGATTTATTGGGCGTAAAGCGTTGGTAGGCGGTTTGTT
+AAGTCTAGAGTTAAAGCCTGGGGTTCAACCCCAGCCCGCTTTGGATACTGACAAACTAGAGTTACATAGA
+>664;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTGTCCGGATTGATTGGGTTTAAAGGGTACGTAGGCGGTGTATT
+AAGTCAGTGGTGAAAGCCTGCGGCTCAACCGTAGGAGTGCCATTGATACTGGTATACTTGAGTGTTGTAA
+>673;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTAATCGGAATTACTGCGCGTAAAGCGCACGCAGGCGGTTAATT
+AAGCTAGAAGTGAAAGCCCTGCGCTCAACGTGGGAAGGCCTTTTAGAACTGGTTAACTAGAGTATTGGAG
+>675;size=1;
+GTGCCAGCAGCCGCGGTAATACCTAGGTCACAAGCGTTATCCGGATTTATTGGGCGTAAAGCGTTCGTAGGCGGTTTGTT
+AAGTCTAGAGTTAAAGCCTGGGGTTCAACCCCAGCCCGCTTTGGATACTGACAAACTAGAGTTACATAGA
+>682;size=1;
+GTGCCAGCAGCCGCGGTAATACGTAGGGGGCAAGCGTTGTCCGGATTTACTGGGTATAAAGGGCTCGCAGGCGGGTTTTT
+AAGTCAGAGGTGAAATCCTACAGCTTAACTGTAGAACTGCCTTTGAAACTGCAAGTCTTGAGATCGGAAG
+>690;size=1;
+GTGCCAGCAGCCGCGGTAATACAGAGGTCTCAAGCGTTGTTCGGATTCATTGGGCGTAAAGGGTGCGTAGGTGGCGGGGT
+AAGTCAGGTGTGAAATCTCGGAGCTCAACTCCGAAACTGCACTGGATACTGCCTTGCTCGAGTACTGGAG
+>699;size=1;
+GTTCCAGCAGCCGCGGTAATACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGAATGGT
+AAGTCAGTGGTGAAATCCTGCAGCTCAACTGTAGAGTTGCCATTGATACTGTCATTCTTGAGTATAGATG
+>7;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCATAGGCGGAATGGT
+AAGTCAGTGGTGAAATCCTGCAGCTCAACTGTAGAGTTGCCATTGATACTGTCATTCTTGAGTATAGATG
+>709;size=1;
+GTGCCAGCAGCCGCGGTAATACGTAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGTGCGCAGGCGGCTACAT
+AAGACAGGTGTGAAATCCCCGGGCTCAACCTGGGAATGGCGCTTGTGACTGTCTGGCTTGAGTTCGGCAG
+>717;size=1;
+GTGCCAGCAGCCGCGGTAATACATAGGTCACAAGCGTTATCCGGATTTATTGGGCGTAAAGCGTGCGCAGGCGGCTACAT
+AAGACAGGTGTGAAATCCCCGGGCTCAACCTGGGAATGGCGCTTGTGACTGTGTGGCTTGAGTTCGGCAG
+>73;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGCATGCAGGCGGTTTGTT
+AAGTCAGATGTGAAAGCCCGGGGCTCAACCGCGGAACCGCATTTGAAACTGGCAGGCTAGAGTCTTGTAG
+>740;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGCATGCAGGCGGTTTGTT
+AAGTCAGATGTGAAAGCCCGTGGCTCAACCTCGGAACCGCATTTGAAACTGGCAGGCTAGAGTCTTGTAG
+>745;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTAATCGGAATTACTGAGCGTAAAGCGCACGCAGGCGGTTAATT
+AAGCTAGAAGTGAAAGCCCTGCGCTCAACGTGGGAAGGCCTTTTAGAACTGGTTAACTAGAGTATTGGAG
+>746;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTAATCGGAATTACTGGTCGTAAAGCGCATGCAGGCGGTTTGTT
+AAGTCAGATGTGAAAGCCCGGGGCTCAACCTCGGAACCGCATTTGAAACTGGCAGGCTAGAGTCTTGTAG
+>748;size=1;
+GTGCCAGCAGCCGCGGTAATACATAGGTCACAAGCGTTATCCGGATTTATTGGGCGTAAAGCATTCGTAGGCGGTTTGTT
+AAGTCTAGAGTTAAAGCCTGGGGTTCAACCCCAGCCCGCTTTGGATACTGACAAACTAGAGTTACATAGA
+>760;size=1;
+GTGCCAGCAGCCGCGGTAATACAGAGGTCTCAAGCGTTGTTCGGATTCATTGCGCGTAAAGGGTGCGTAGGTGGCGGGGT
+AAGTCAGGTGTGAAATCTCGGAGCTCAACTCCGAAACTGCACTTGATACTGCCTTGCTCGAGTACTGGAG
+>766;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTGTCCGGATTTATTGGGTTTAAAGGGTACGTAGGCGGTGTATT
+AAGTCAGTGGTGAAAGCCTGCGGCTCAACCGTAGGAGTGCCATTAATACTGGTATACTTGAGTGTTGTAA
+>77;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTGTCGGGATTTATTGGGTTTAAAGGGTACGTAGGCGGTGTATT
+AAGTCAGTGGTGAAAGCCTGCGGCTCAACCGTAGGAGTGCCATTGATACTGGTATACTTGAGTGTTGTAA
+>778;size=1;
+GTGCCAGCAGCCGCGGTAATACATAGGTCACAAGCGTTATCCGGATTTATTGGGCGTAAAGCGTTTGTAGGCGGTTTGTT
+AAGTCTAGAGTTAAAGCCTGGGGTTCAACCCCAGCCCGCTTTGGATACTGACAAACTAGAGTTACATAGA
+>791;size=1;
+GTGCCAGCAGCCGCGGTAATACGAAGGGGGCTAGCGTTGCTCGGAATTACTGGGCGTAAAGGGAGCGTAGGCGGACATTT
+TAGTCAGGGGTGAAATCCCAGAGCTCAACTCTGGAACTGCCTTTGATACTGGATGTCTTGAGTGTGAGAG
+>797;size=1;
+GTGCCAGCAGCCGCGGTAATACGTAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGTGCGCAGGCGGCTACAT
+AAGACAGGTGTGAAATCCCCGGGCTCAACCTGGGAATGGCGCTTGTGACTGTTTGGCTTGAGTTCGGCAG
+>809;size=1;
+GTGCCAGCAGCCGCGGTAATACAGAGGTCTCAAGCGTTGTTCGGATTCATTGAGCGTAAAGGGTGCGTAGGTGGCGGGGT
+AAGTCAGGTGTGAAATCTCGGAGCTCAACTCCGAAACTGCACTTGATACTGCCTTGCTCGAGTACTGGAG
+>813;size=1;
+GTGCCAGCAGCCGCGGTAATACGTAGGGGGCAAGCGTTGTCCGGATTTACTGGGTATAAAGGGCTCGCAGGCGGGTTTGT
+AAGTCAGAGGTGAAATCCTACAGCTTAACAGTAGAACTGCCTTTGAAACTGCAAGTCTTGAGATCGGAAG
+>814;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGGGCAAGCGTTGTCCGGATTTACTGGGTATAAAGGGCTCGCAGGCGGGTTTGT
+AAGTCAGAGGTGAAATCCTACAGCTTAACTGTAGAACTGCCTTTGAAACTGCAAGTCTTGAGATCGGAAG
+>816;size=1;
+GTGCCAGCAGCCGCGGTAATACGTAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGTGCGCAGGCGGCTACAT
+AAGACAGGTGTGAAATCCCCGGGCTCAACCTGAGAATGGCGCTTGTGACTGTGTGGCTTGAGTTCGGCAG
+>817;size=1;
+GTGCCAGCAGCCGCGGTAATACGTAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGCATGCAGGCGGTTTGTT
+AAGTCAGATGTGAAAGCCCGGGGCTCAACCTCGGAACCGCATTTGAAACTGGCAGGCTAGAGTCTTGTAG
+>82;size=1;
+GTGCCAGCAGGCGCGGTAATACGGAGGGTGCGAGCGTTGTCCGGATTTATTGGGTTTAAAGGGTACGTAGGCGGTGTATT
+AAGTCAGTGGTGAAAGCCTGCGGCTCAACCGTAGGAGTGCCATTGATACTGGTATACTTGAGTGTTGTAA
+>821;size=1;
+GTGCCAGCAGCCGCGGTAATACAGAGGTCTCAAGCGTTGTTCGGATTCATTGGGCGTAAAGGGTGCGTAGGTGGCGGGGT
+AAGTCAGGTGTGAAATCTCGGAGCTCAACTCCGAAACTGCACTTGATACTGCCTTGCTCGTGTACTGGAG
+>824;size=1;
+GTGCCAGCAGCCGCGGTAATACGTAGGGTGCGAGTGTTAATCGGAATTACTGGGCGTAAAGCGTGCGCAGGCGGCTACAT
+AAGACAGGTGTGAAATCCCCGGGCTCAACCTGGGAATGGCGCTTGTGACTGTGTGGCTTGAGTTCGGCAG
+>827;size=1;
+GTGCCAGCAGCCGCGGTAATACGTAGGGTGCGAGCGTTAATCGGAATTAGTGGGCGTAAAGCGTGCGCAGGCGGCTACAT
+AAGACAGGTGTGAAATCCCCGGGCTCAACCTGGGAATGGCGCTTGTGACTGTGTGGCTTGAGTTCGGCAG
+>83;size=1;
+GTGCCAGCAGCCGCGGTAATACGAAGGGGGCTAGCGTTGCTCGGAATTACTGGGCGTAAAGGGAGCGTAGGCGGACATTT
+AAGTCAGGGGTGAAATCCCAGAGCTCAACTCTGGAAGTGCCTTTGATACTGGATGTCTTGAGTGTGAGAG
+>842;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGGGCGTTGTCCGGATTTATTGGGTTTAAAGGGTACGTAGGCGGTGTATT
+AAGTCAGTGGTGAAAGCCTGCGGCTCAACCGTAGGAGTGCCATTGATACTGGTATACTTGAGTGTTGTAA
+>851;size=1;
+GTGCCAGCAGCCGCGGTAATACAGAGGTCTCAAGCGTTGTTCGGATTCATTGGGCGTAAAGGGTGCGTAGGTGTCGGGGT
+AAGTCAGGTGTGAAATCTCGGAGCTCAACTCCGAAACTGCACTTGATACTGCCTTGCTCGAGTACTGGAG
+>853;size=1;
+GTTCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGCACGCAGGCGGTTAATT
+AAGCTAGAAGTGAAAGCCCTGCGCTCAACGTGGGAAGGCCTTTTAGAACTGGTTAACTAGAGTATTGGAG
+>862;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTGTCCGGATTTATTGGGTTTAAAGGGTACGTAGGCGGTGTATT
+AAGTCAGTGGTGAAAGCCTGCGGCTCAACCGTAGGAGTGCCATTGATACTGGTATACTGGAGTGTTGTAA
+>863;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGAATGGT
+AAGTCAGTGGTGAAATCCTGCAGCTGAACTGTAGAGTTGCCATTGATACTGTCATTCTTGAGTATAGATG
+>866;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTGATCGGAATTACTGGGCGTAAAGCGCATGCAGGCGGTTTGTT
+AAGTCAGATGTGAAAGCCCGGGGCTCAACCTCGGAACCGCATTTGAAACTGGCAGGCTAGAGTCTTGTAG
+>871;size=1;
+GCGCCAGCAGCCGCGGTAATACAGAGGTCTCAAGCGTTGTTCGGATTCATTGGGCGTAAAGGGTGCGTAGGTGGCGGGGT
+AAGTCAGGTGTGAAATCTCGGAGCTCAACTCCGAAACTGCACTTGATACTGCCTTGCTCGAGTACTGGAG
+>879;size=1;
+GTGCCAGCAGCCGCGGTAATACGTAGGGCGCAAGCGTTATCCGGAATTATTGGGCGTAAAGAGCTCGTAGGCGGTTTGTC
+GCGTCTGCTGTGAAAGTCCGGGGCTTAACTCCGGATCTGCGGTGGGTACGGGCAGACTTGAGTGATGTAG
+>886;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGAATGGT
+AAGTCAGTGGTGAAATCCTGCAGCTCAACTGTAGAGTTGCCATTGATACTGTCATTCTTGAGTATAGAGG
+>892;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGCATGCAGGCGGTTTGTT
+AAGTCAGATGTGAAAGCCCGGGGCTCAACCTCGGAACCGCATTTGAAACTGGGAGGCTAGAGTCTTGTAG
+>895;size=1;
+GTGCCAGCAGCCGCGGTAATACGAAGGGGGCTAGCGTTGCTCGGAATTACTGGGCGTAAAGGGATCGTAGGCGGACATTT
+AAGTCAGGGGTGAAATCCCAGAGCTCAACTCTGGAACTGCCTTTGATACTGGATGTCTTGAGTGTGAGAG
+>897;size=1;
+GTGCCAGCAGCCGCGGTAATACATAGGTCACAAGCGTTATCCGGAATTATTGGGCGTAAAGCGTTCGTAGGCGGTTTGTT
+AAGTCTAGAGTTAAAGCCTGGGGTTCAACCCCAGCCCGCTTTGGATACTGACAAACTAGAGTTACATAGA
+>902;size=1;
+GTGCCAGCAGCCGCGGTAATACGTAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGTGCGCAGGCGGCTACAT
+AAGACAGGTGTGAAATCTCGGAGCTCAACTCCGAAACTGCACTTGATACTGCCTTGCTCGAGTACTGGAG
+>904;size=1;
+CTGCCAGCAGCCGCGGTAATACAGAGGTCTCAAGCGTTGTTCGGATTCATTGGGCGTAAAGGGTGCGTAGGTGGCGGGGT
+AAGTCAGGTGTGAAATCTCGGAGCTCAACTCCGAAACTGCACTTGATACTGCCTTGCTCGAGTACTGGAG
+>91;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTGTCCGGATTTATTGGGTTTAAAGGGTACGTAGGCGGTGTATT
+AAGTCAGTGGTGAAAGCCTGCGGCTCAACCGTAGGAGTGCCATTGATACTGGTATACTTGAGTGTTGTAC
+>912;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGCACGCAGGCGGTTAATT
+AAGCTAGAAGTGAAAGCCCTGCGCTCAACGTGGGAAGGCATTTTAGAACTGGTTAACTAGAGTATTGGAG
+>916;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGAATGGT
+AAGTCAGGGGTGAAATCCTGCAGCTCAACTGTAGAGTTGCCATTGATACTGTCATTCTTGAGTATAGATG
+>918;size=1;
+GTGCCAGCAGCCGCGGTAATACATAGGTCACAAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGAATGGT
+AAGTCAGTGGTGAAATCCTGCAGCTCAACTGTAGAGTTGCCATTGATACTGTCATTCTTGAGTATAGATG
+>920;size=1;
+GTGCCAGCAGCCGCGGTAATACGTCGGGGGCAAGCGTTGTCCGGATTTACTGGGTATAAAGGGCTCGCAGGCGGGTTTGT
+AAGTCAGAGGTGAAATCCTACAGCTTAACTGTAGAACTGCCTTTGAAACTGCAAGTCTTGAGATCGGAAG
+>921;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTGTCCGGATTTATTGCGTTTAAAGGGTACGTAGGCGGTGTATT
+AAGTCAGTGGTGAAAGCCTGCGGCTCAACCGTAGGAGTGCCATTGATACTGGTATACTTGAGTGTTGTAA
+>925;size=1;
+GTGCCAGCAGCCGCGGTAATACAGAGGTCTCAAGCGTTGTTCGGATTCATTGGGCGTAAAGGGTGCGGAGGTGGCGGGGT
+AAGTCAGGTGTGAAATCTCGGAGCTCAACTCCGAAACTGCACTTGATACTGCCTTGCTCGAGTACTGGAG
+>930;size=1;
+GTGCCAGCAGCCGCGGTAATACATAGGGGGCAAGCGTTGTCCGGATTTACTGGGTATAAAGGGCTCGCAGGCGGGTTTGT
+AAGTCAGAGGTGAAATCCTACAGCTTAACTGTAGAACTGCCTTTGAAACTGCAAGTCTTGAGATCGGAAG
+>941;size=1;
+GTGCCAGCAGCCGCGGTAATACAGAGGTCTCAAGCGTTGTTCGGATTCATTGGGCGTAAAGGGTGCGTAGGTGGCGGGGT
+AAGTCAGGTGTGAAATCTCGGAGCTCAACTCTGGAACTGCCTTTGATACTGGATGTCTTGAGTGTGAGAG
+>942;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGCACGCAGGCGGTTAATT
+AAGCTAGAAGTGAAAGCCCTGCGCTCAACGTGGGAAGGCCTTTTAGAACTGGTTAACTATAGTATTGGAG
+>945;size=1;
+GTGCCAGCAGCCGTGGTAATACGTAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGTGCGCAGGCGGCTACAT
+AAGACAGGTGTGAAATCCCCGGGCTCAACCTGGGAATGGCGCTTGTGACTGTGTGGCTTGAGTTCGGCAG
+>947;size=1;
+GTGCCAGCAGCCGCGGTAATACGTAGGGCGCAAGCGTCATCCGGAATTATTGGGCGTAAAGAGCTCGTAGGCGGTTTGTC
+GCGTCTGCCGTGAAAGTCCGGGGCTTAACTCCGGATCTGCGGTGGGTACGGGCAGACTTGAGTGATGTAG
+>948;size=1;
+GTGCCAGCAGCCGCGGTAATACATAGGTCACAAGCGTTATCCGGATTTATTGGGCGTAAAGCGTTCGTAGGCGGTTTGTT
+AAGTCTAGAGTTAAAACCTGGGGTTCAACCCCAGCCCGCTTTGGATACTGACAAACTAGAGTTACATAGA
+>952;size=1;
+GTGCCAGCAGCCGCGGTAATACAGAGGTCTCAAGCGTTGTTCGGATTCATTGGGCGTAAAGGGTGCGTAGGTGGCGGGGT
+AAGTCAGGCGTGAAATCTCGGAGCTCAACTCCGAAACTGCACTTGATACTGCCTTGCTCGAGTACTGGAG
+>956;size=1;
+GTGCCAGCAGCCGCGGTAATACGAAGGGGGCTAGCGTTGCTCGGAATTACTGGGCGTAAAGGGAGCGTAGGCGGACATTT
+AAGTCAGGGGTGAAATCCCAGAGCTCAACTCTGGAACTGCGTTTGATACTGGATGTCTTGAGTGTGAGAG
+>958;size=1;
+GTGCCAGCAGCCGCGGTAATACGTAGGGCGCAAGCGTTATCCCGAATTATTGGGCGTAAAGAGCTCGTAGGCGGTTTGTC
+GCGTCTGCCGTGAAAGTCCGGGGCTTAACTCCGGATCTGCGGTGGGTACGGGCAGACTTGAGTGATGTAG
+>964;size=1;
+GTGCCAGCAGCCGCCGTAATACGGAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGCACGCAGGCGGTTAATT
+AAGCTAGAAGTGAAAGCCCTGCGCTCAACGTGGGAAGGCCTTTTAGAACTGGTTAACTAGAGTATTGGAG
+>967;size=1;
+GTGCCAGCAGCCGCGGTAATACAGAGGTCTCAAGCGTTGTTCGGATTCATTGGGCGTAAAGGGTGCGTAGGTGGCGGGGT
+AAGTCAGGTGTGAAATCTCGGAGCTCAACTCCGAAACTGCACTTGATACTGCCTTGCTCGAGTACTAGAG
+>984;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGAATGGT
+AAGTCAGTGGTGAAATCCTGCAGCTCAACTGTAGAGTTGCCATTGATACTGTCATTCTTGAGTAGAGATG
+>992;size=1;
+GTGCCAGCAGCCGCGGTAATACAGAGGTCTCAAGCGTTGTTCGGATTCATTGGGCGTAAAGGGTGCGTAGGTGGCGGGGT
+AAATCAGGTGTGAAATCTCGGAGCTCAACTCCGAAACTGCACTTGATACTGCCTTGCTCGAGTACTGGAG
+>993;size=1;
+GTGCCAGCAGCCGCGGTAATACGTAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGTGCGCAGGCGACTACAT
+AAGACAGGTGTGAAATCCCCGGGCTCAACCTGGGAATGGCGCTTGTGACTGTGTGGCTTGAGTTCGGCAG
+"""
+
+# Single chimeric sequence (251) with both parents to test alignment
+# and tabular output de novo
+single_chimera = """>22;size=93;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTGTCCGGATTTATTGGGTTTAAAGGGTACGTAGGCGGTGTATT
+AAGTCAGTGGTGAAAGCCTGCGGCTCAACCGTAGGAGTGCCATTGATACTGGTATACTTGAGTGTTGTAA
+>45;size=67;
+GTGCCAGCAGCCGCGGTAATACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGAATGGT
+AAGTCAGTGGTGAAATCCTGCAGCTCAACTGTAGAGTTGCCATTGATACTGTCATTCTTGAGTATAGATG
+>251;size=2;
+GTGCCAGCAGCCGCGGTAATACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGAATGGT
+AAGTCAGTGGTGAAATCCTGCAGCTCAACTGTAGAGTTGCCATTGATACTGGTATACTTGAGTGTTGTAA
+"""
+
+# Alignment for chimeric sequence (251) against parents (22 and 45) de novo
+single_chimera_aln = """
+------------------------------------------------------------------------
+Query ( 150 nt) 251;size=2;
+ParentA ( 150 nt) 45;size=67;
+ParentB ( 150 nt) 22;size=93;
+
+A 1 GTGCCAGCAGCCGCGGTAATACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGAATGGT 80
+Q 1 GTGCCAGCAGCCGCGGTAATACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGAATGGT 80
+B 1 GTGCCAGCAGCCGCGGTAATACGGAGGgTGCGAGCGTTgTCCGGATTTATTGGGTTTAAAGGGTaCGTAGGCGGtgTatT 80
+Diffs A A A AA AA
+Votes + + + ++ ++
+Model AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+
+A 81 AAGTCAGTGGTGAAATCCTGCAGCTCAACTGTAGAGTTGCCATTGATACTGtcATtCTTGAGTaTaGatg 150
+Q 81 AAGTCAGTGGTGAAATCCTGCAGCTCAACTGTAGAGTTGCCATTGATACTGGTATACTTGAGTGTTGTAA 150
+B 81 AAGTCAGTGGTGAAAgCCTGCgGCTCAACcGTAGgagTGCCATTGATACTGGTATACTTGAGTGTTGTAA 150
+Diffs A A A AAA BB B B B BBB
+Votes + + + +++ ++ + + + +++
+Model AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAxxxxxxxxxxxxxxBBBBBBBBBBBBBBBBBBB
+
+Ids. QA 94.7%, QB 91.3%, AB 86.0%, QModel 100.0%, Div. +5.6%
+Diffs Left 13: N 0, A 0, Y 13 (100.0%); Right 8: N 0, A 0, Y 8 (100.0%), Score 0.8291
+"""
+
+# Tabular format for UCHIME output
+single_chimera_tab = """0.0000\t22;size=93;\t*\t*\t*\t*\t*\t*\t*\t*\t0\t0\t0\t0\t0\t0\t*\tN
+0.0000\t45;size=67;\t*\t*\t*\t*\t*\t*\t*\t*\t0\t0\t0\t0\t0\t0\t*\tN
+0.8291\t251;size=2;\t45;size=67;\t22;size=93;\t45;size=67;\t100.0\t94.7\t91.3\t86.0\t94.7\t13\t0\t0\t8\t0\t0\t5.3\tY
+"""
+
+# Single chimeric sequence for reference chimera checking
+single_chimera_ref = """>251;size=2;
+GTGCCAGCAGCCGCGGTAATACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGAATGGT
+AAGTCAGTGGTGAAATCCTGCAGCTCAACTGTAGAGTTGCCATTGATACTGGTATACTTGAGTGTTGTAA
+"""
+
+# Reference database for UCHIME ref
+uchime_ref_db = """>4304512
+TGAGTTTGATCCTGGCTCAGAACGAACGCTGGCGGCAGGCCTAACACATGCAAGTCGAACGCGTCCTTCGGGACGAGTGGCAGACGGGTGAGTAACGCGTGGGAACGTACCCTTTGGTTCGGAACAACTCCGGGAAACTGGAGCTAATACCGGATAAGCCCTTCGGGGGAAAGATTTATCGCCTTTAGAGCGGCCCGCGTCTGATTAGCTAGTTGGTGGTGTAATGGACCACCAAGGCGACGATCAGTAGCTGGTCTGAGAGGATGACCAGCCACATTGGGACTGAGACACGGCTCAAACTCCTACGGGAGGCAGCAGTGGGGAATCTTGCGCAATGGGCGAAAGCCTGACGCAGCCATGCCGCGTGTATGATGAAGGTCTTAGGATTGTAAAATACTTTCACCGGTGAAGATAATGACTGTAGCCGGAGAAGAAGCCCCGGCTAACTTCGTGCCAGCAGCCGCGGTAATACGAAGGGGGCTAGCGTTGCTC [...]
+>814974
+GATGAACGCTAGCCGTGTGCCTAATACATGCATGTCGTACGAGAGTACTTGTACTCTAGTGGCGAATGGGTGAGTAACACGTACCTAACCTACTTTTAAGATTGGAATAACTACTGGAAACAGTAGCTAATGCCGAATACGTATTAACTTCGCATGAAGATAATATAAAAGGAGCGTTTGCTCCGCTTAGAAATGGGGGTGCGCCATATTAGTTAGTTGGTAGGGTAATGGCCTACCAAGACGATGATATGTAGCCGGGCTGAGAAGCTGATCGGCCACACTGGGACTGAGATACGGCCCAGACTCCTACGGGAGGCAGCAGTAGGGAATTTTCCGCAATGAGCGAAAGCTTGACGGAGCGACACGGCGTGCAGGATGAAGGTCTTCGGATCGTAAACTGCTGTGGTTAGGGAAGAAAAGCAAAATAGGAAATGATTTTGCCCTGACGGTACCTAACTAGAAAGTGACGGCTAACTATGTGCCAGCAGCCGC [...]
+>160832
+GTCGAGCGGCGGACGGGTGAGTAACGGCTGGGAACCTGCCCTGACGCGGGGGATAACCGTTGGAAACGACGGCTAATACCGCATAATGTCTTAGTTCATTACGAGCTGGGACCAAAGGTGGCCTCTACATGTAAGCTATCGCGTTGGGATGGGCCCAGTTAGGATTAGCTAGTTGGTAAGGTAATGGCTTACCAAGGCRACGATCCTTAKCTGGTTTGAGAGGATGATCAGCCACACTGGGACTGAGACACGGCCCAGACTCCTACGGGAGGCAGCAGTGGGGAATATTGCACAATGGGGGAGACCCTGATGCAGCCATGCCGCGTGTGTGAAGAAGGCCTTCGGGTTGTAAAGCACTTTCAGCAGTGAGGAAGGTGGTGTACTTAATAAGTGCATGGCTTGACGTTAGCTGCAGAAGAAGCACCGGCTAACTCCGTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTAATCGGAATTACTGGGCGT [...]
+>573757
+GCCTAAGGCAGGCAAGTCGAACGATGATCTCCAGCCTGCTGGGGGGATTAGAGGAGAACGGGTGAGTAACACGTGAGTAACCTGCCCTTGACTCTGGGATAAGCCTGGGAAACTGGGTCTAATACTGGATACGACCTTCCCACGCATGTGGTGTTGGTGGAAAGCTTTTGTGGTTTTGGATGGACTCGCGGCCTATCAGCTTGTTGGTGGGGTAATGGCCTACCAAGGCGACGACGGGTAGCCGGCCTGAGAGGGTGGACGGCCACACTGGGACTGAGACACGGCCCAGACTCCTACGGGAGGCAGCAGTGGGGAATATTGCACAATGGGCGCAAGCCTGATGCAGCGACGCCGCGTGAGGGATGAAGGCCTTCGGGTTGTAAACCTCTTTCAGTAGGGAAGAAGCGAAAGTGACGGTACCTGCAGAAGAAGCGCCGGCTAACTACGTGCCAGCAGCCGCGGTAATACGTAGGGCGCAAGCGTTATCCGGAA [...]
+>579954
+TGGCGGCGTGGATAAGACATGCAAGTCGAACGGGATATTGTTTGTAGCAATACAAGCGATGTCTAGTGGCGTAAGGGTGCGTAACACGTGGGGAATCTGCCGAGAAGTGGGGGATAGCTCGCCGAAAGGCGAATTAATACCGCATGTGGTTAGGGAAGACATCTTCCCGACACTAAAGCCGGGGCAACCTGGCGCTTCTTGATGACCCCGCGGCCTATCAGCTAGTCGGTGAGGTAACGGCTCACCAAGGCTATGACGGGTAGCTGGTCTGAGAGGACGACCAGCCACACTGGAACTGAGACACGGTCCAGACACCTACGGGTGGCAGCAGTCGAGAATTTTTCTCAATGGGGGAAACCCTGAAGGAGCGACGCCGCGTGGAGGATGAAGGTCTTCGGATTGTAAACTCCTGTCATGCGGGAACAATTGTCACCGATTAACTGTCGGGGGCTTGATAGTACCAGAAGAGGAAGAGACGGCTAACTCTGTGCC [...]
+>311922
+GATTGAACGCTGGCGGCAGGCCTAACACATGCAAGTCGAACGGTAACATGAAGTGCTTGCACTTTGATGACGAGTGGCGGACGGGTGAGTAATGCTTGGGAATTTGCCTTTGCGCGGGGGATAACCATTGGAAACGATGGCTAATACCGCATAATGTCTACGGACCAAAGGGGGCTTAGGCTCCCACGTGAAGAGAAGCCCAAGTGAGATTAGCTAGTTGGTGGGGTAAAGGCTCACCAAGGCGACGATCTCTAGCTGTTCCGAGAGGAAGATCAGCCACACTGGGACTGAGACACGGCCCAGACTCCTACGGGAGGCAGCAGTGGGGAATATTGCGCAATGGGGGGAACCCTGACGCAGCCATGCCGCGTGTGTGAAGAAGGCCTTCGGGTTGTAAAGCACTTTCAGTTATGAGGAAAGGTTGTTGGTTAATACCCAGCAGCTGTGACGTTAATAACAGAAGAAGCACCGGCTAACTCCGTGCCAGCAGCC [...]
+>4370324
+TCAGGATGAACGCTAGCGACAGGCCTAACACATGCAAGTCGAGGGGTAACATTGGTAGCTTGCTACCAGATGACGACCGGCGCACGGGTGAGTAACGCGTATGCAACCTTCCTTTAACAGGAGAATAGCCCCCGGAAACGGGGATTAATGCTCCATGGCACTCTAATTTCGCATGGAATAAGAGTTAAAGTTCCGACGGTTAAAGATGGGCATGCGTGACATTAGCCAGTTGGCGGGGTAACGGCCCACCAAAGCAACGATGTCTAGGGGTTCTGAGAGGAAGGTCCCCCACACTGGTACTGAGACACGGACCAGACTCCTACGGGAGGCAGCAGTGAGGAATATTGGTCAATGGACGCAAGTCTGAACCAGCCATGTCGCGTGCAGGATGACTGCCCTATGGGTTGTAAACTGCTTTTGTACGGGAAGAAATGTACTTACGAGTAAGTATTTGCCGGTACCGTACGAATAAGCATCGGCTAACTCCGTGCC [...]
+>646991
+AGTTTGATCTTGGCTCAGGATGAACGCTAGCGGCAGGCCTAATACATGCAAGTCGTGGGGCATCAGCGCCTTCGGGCGGCTGGCGACCGGCGCACGGGTGCGTAACGCGTATGCAACCTGCCCACAACAGGGGGACAGCCTTCGGAAACGAGGATTAATACCCCATGATACAGGGGTACCGCATGGTGCCTTTCGTCAAAGGTTTCGGCCGGTTGTGGATGGGCATGCGTCCCATTAGCTAGTAGGCGGGGTAACGGCCCACCTAGGCTATGATGGGTAGGGGTTCTGAGAGGACGATCCCCCACACTGGTACTGAGATACGGACCAGACTCCTACGGGAGGCAGCAGTAGGGAATATTGGGCAATGGGCGGAAGCCTGACCCAGCCATGCCGCGTGCAGGACGAAGGCCCTCGGGTCGTAAACTGCTTTTATACGGGAAGAACTGCGTCCTGCGGGACGCGCTGACGGTACCGTACGAATAAGCACCGGCT [...]
+>510574
+GAGTTTGATCCTGGCTCAGATTGAACGCTGGCGGAATGCTTTACACATGCAAGTCGAGCGGCAGCGCGGGGGCAACCCTGGCGGCGAGCGGCGAACGGGTGAGTAACACATCGGAACGTACCCAATTGAGGGGGATAGCCCGGCGAAAGCCGGATTAATACCGCATAAGTCCTGAGGGAGAAAGCGGGGGACCGCAAGGCCTCGCGCGATTGGAGCGGCCGATGTCGGATTAGCTAGTTGGTGGGGTAAAGGCTCACCAAGGCGACGATCCGTAGCTGGTCTGAGAGGATGATCAGCCACACTGGGACTGAGACACGGCCCAGACTCCTACGGGAGGCAGCAGTGGGGAATTTTGGACAATGGGCGCAAGCCTGATCCAGCCATTCCGCGTGAGTGAAGAAGGCCTTCGGGTTGTAAAGCTCTTTCGGACGGAAAGAAATCGCCCGGGTAAATAATCCGGGTGGATGACGGTACCGTAAGAAGAAGCACCGG [...]
+>769294
+AGAGTTTGATCCTGGCTCAGGACGAACGCTGGCGGCGTGCTTAACACATGCAAGTCAAGGGGAAAGTTTTCTTCGGAGAATTAGTATACTGGCGCACGGGTGAGTAATGTATAAGTAATCTACCTATAGGAAAGGAATAACTCTAAGAAATTGGGGCTAATACCATATAATGCAGCGGCACCGCATGGTGATGTTGTTAAAGTAATTTATTACGCCTATAGATGAGCTTGTATTCGATTAGCTTGTTGGTAAGGTAACGGCTTACCAAGGCGACGATCGATAGCTGGTCTGAGAGGATGATCAGCCACACTGGAACTGAGACACGGTCCAGACTCCTACGGGAGGCAGCAGTGAGGAATATTGGGCAATGGACGAAAGTCTGACCCAGCAACGCCGCGTGGAGGATGAAGGTCGTAAGATCGTAAACTCCTTTTTTGGGGGAAGAAAAAACAGGTTTGTAGCCTGTATTGACTGTACCCTAAGAATAAGCCC [...]
+"""
+
+# Reference database for single chimera sequence (ref)
+uchime_single_ref_db = """>646991
+AGTTTGATCTTGGCTCAGGATGAACGCTAGCGGCAGGCCTAATACATGCAAGTCGTGGGGCATCAGCGCCTTCGGGCGGCTGGCGACCGGCGCACGGGTGCGTAACGCGTATGCAACCTGCCCACAACAGGGGGACAGCCTTCGGAAACGAGGATTAATACCCCATGATACAGGGGTACCGCATGGTGCCTTTCGTCAAAGGTTTCGGCCGGTTGTGGATGGGCATGCGTCCCATTAGCTAGTAGGCGGGGTAACGGCCCACCTAGGCTATGATGGGTAGGGGTTCTGAGAGGACGATCCCCCACACTGGTACTGAGATACGGACCAGACTCCTACGGGAGGCAGCAGTAGGGAATATTGGGCAATGGGCGGAAGCCTGACCCAGCCATGCCGCGTGCAGGACGAAGGCCCTCGGGTCGTAAACTGCTTTTATACGGGAAGAACTGCGTCCTGCGGGACGCGCTGACGGTACCGTACGAATAAGCACCGGCT [...]
+>4370324
+TCAGGATGAACGCTAGCGACAGGCCTAACACATGCAAGTCGAGGGGTAACATTGGTAGCTTGCTACCAGATGACGACCGGCGCACGGGTGAGTAACGCGTATGCAACCTTCCTTTAACAGGAGAATAGCCCCCGGAAACGGGGATTAATGCTCCATGGCACTCTAATTTCGCATGGAATAAGAGTTAAAGTTCCGACGGTTAAAGATGGGCATGCGTGACATTAGCCAGTTGGCGGGGTAACGGCCCACCAAAGCAACGATGTCTAGGGGTTCTGAGAGGAAGGTCCCCCACACTGGTACTGAGACACGGACCAGACTCCTACGGGAGGCAGCAGTGAGGAATATTGGTCAATGGACGCAAGTCTGAACCAGCCATGTCGCGTGCAGGATGACTGCCCTATGGGTTGTAAACTGCTTTTGTACGGGAAGAAATGTACTTACGAGTAAGTATTTGCCGGTACCGTACGAATAAGCATCGGCTAACTCCGTGCC [...]
+"""
+
+# 3-way alignment for single chimeric sequence against reference
+# database using UCHIME
+single_chimera_ref_aln = """
+------------------------------------------------------------------------
+Query ( 150 nt) 251;size=2;
+ParentA ( 1403 nt) 4370324
+ParentB ( 1480 nt) 646991
+
+A 1 tcaggatgaacgctagcgacaggcctaacacatgcaagtcgaggggtaacattggtagcttgctaccagatgacgaccgg 80
+Q 1 -------------------------------------------------------------------------------- 0
+B 1 agtttgatcttggctcaggatgaacgctagcggcaggcctaatacatgcaagtcgtggggcatcagcgccttcgggcggc 80
+Diffs
+Votes
+Model AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+
+A 81 cgcacgggtgagtaacgcgtatgcaaccttcctttaacaggagaatagcccccggaaacggggattaatgctccatggca 160
+Q 1 -------------------------------------------------------------------------------- 0
+B 81 tggcgaccggcgcacgggtgcgtaacgcgtatgcaacctgcccacaacagggggacagccttcggaaacgaggattaata 160
+Diffs
+Votes
+Model AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+
+A 161 ctctaatttcgcatggaataagagttaaagttccgacggttaaagatgggcatgcgtgacattagccagttggcggggta 240
+Q 1 -------------------------------------------------------------------------------- 0
+B 161 ccccatgatacaggggtaccgcatggtgcctttcgtcaaaggtttcggccggttgtggatgggcatgcgtcccattagct 240
+Diffs
+Votes
+Model AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+
+A 241 acggcccaccaaagcaacgatgtctaggggttctgagaggaaggtcccccacactggtactgagacacggaccagactcc 320
+Q 1 -------------------------------------------------------------------------------- 0
+B 241 agtaggcggggtaacggcccacctaggctatgatgggtaggggttctgagaggacgatcccccacactggtactgagata 320
+Diffs
+Votes
+Model AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+
+A 321 tacgggaggcagcagtgaggaatattggtcaatggacgcaagtctgaaccagccatgtcgcgtgcaggatgactgcccta 400
+Q 1 -------------------------------------------------------------------------------- 0
+B 321 cggaccagactcctacgggaggcagcagtagggaatattgggcaatgggcggaagcctgacccagccatgccgcgtgcag 400
+Diffs
+Votes
+Model AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+
+A 401 tgggttgtaaactgcttttgtacgggaagaaatgtacttacgagtaagtatttgccggtaccgtacgaataagcatcggc 480
+Q 1 -------------------------------------------------------------------------------- 0
+B 401 gacgaaggccctcgggtcgtaaactgcttttatacgggaagaactgcgtcctgcgggacgcgctgacggtaccgtacgaa 480
+Diffs
+Votes
+Model AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+
+A 481 taactcc-----------GTGCCAGCAGCCGCGGTAATACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGG 549
+Q 1 ------------------GTGCCAGCAGCCGCGGTAATACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGG 62
+B 481 taagcaccggctaactccGTGCCAGCAGCCGCGGTAATACGGAGGgTGCGAGCGTTgTCCGGATTTATTGGGTTTAAAGG 560
+Diffs A A
+Votes + +
+Model AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+
+A 550 GTGCGTAGGCGGAATGGTAAGTCAGTGGTGAAATCCTGCAGCTCAACTGTAGAGTTGCCATTGATACTGtcATtCTTGAG 629
+Q 63 GTGCGTAGGCGGAATGGTAAGTCAGTGGTGAAATCCTGCAGCTCAACTGTAGAGTTGCCATTGATACTGGTATACTTGAG 142
+B 561 GTaCGTAGGCGGtgTatTAAGTCAGTGGTGAAAgCCTGCgGCTCAACcGTAGgagTGCCATTGATACTGGTATACTTGAG 640
+Diffs A AA AA A A A AAA BB B
+Votes + ++ ++ + + + +++ ++ +
+Model AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAxxxxxxxxxxxxxxBBBBBBBBBBB
+
+A 630 TaTaGatgaggtaggcggaatgagtagtgtagcggtgaaatgcatagatattactcagaacaccaattgcgaaggcagct 709
+Q 143 TGTTGTAA------------------------------------------------------------------------ 150
+B 641 TGTTGTAAgggtgggcggaattccgcatgtagcggtgaaatgcatagatatgcggaggaacaccgagagcgaaggcagct 720
+Diffs B B BBB
+Votes + + ++
+Model BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
+
+A 710 tactaaactataactgacgctgaagcacgaaagcgtgggtatcaaacaggattagataccctggtagtccacgccgtaaa 789
+Q 151 -------------------------------------------------------------------------------- 150
+B 721 cactaggcacgactgacgctgaggtacgaaagcgtggggagcgaacaggattagataccctggtagtccacgccgtaaac 800
+Diffs
+Votes
+Model BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
+
+A 790 cgatgattactggttgtttgcaatacaccgcaagcgactgagcgaaagcattaagtaatccacctggggagtacgtcggc 869
+Q 151 -------------------------------------------------------------------------------- 150
+B 801 gatggtaactaggtgtgtgcgacacagagtgcgcgcccaagcgaaagcgataagttacccacctggggagtacgctcgca 880
+Diffs
+Votes
+Model BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
+
+A 870 aacgatgaaactcaaaggaattgacgggggcccgcacaagcggtggaacatgtggtttaattcgatgatacgcgaggaac 949
+Q 151 -------------------------------------------------------------------------------- 150
+B 881 agagtgaaactcaaaggaattgacgggggtccgcacaagcggtggagcatgtggtttaattcgatgatacgcgaggaacc 960
+Diffs
+Votes
+Model BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
+
+A 950 cttacctgggtttaaatgggaagtgacaggggtagaaatacctttttcttcggacacttttcaaggtgctgcatggttgt 1029
+Q 151 -------------------------------------------------------------------------------- 150
+B 961 ttacctgggctcgaatggcctatgacaggcccagagatgggcccttcctcggacataggtcaaggtgctgcatggctgtc 1040
+Diffs
+Votes
+Model BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
+
+A 1030 cgtcagctcgtgccgtgaggtgtcgggttaagtcccataacgagcgcaacccctgttgttagttaccagcatgtaaagat 1109
+Q 151 -------------------------------------------------------------------------------- 150
+B 1041 gtcagctcgtgccgtgaggtgttgggttaagtcccgcaacgagcgcaacccttgcccctagttgccatcaggtaaagctg 1120
+Diffs
+Votes
+Model BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
+
+A 1110 ggggactctaacaagactgccggtgtaaaccgcgaggaaggtggggatgacgtcaaatcagcacggcccttacatccagg 1189
+Q 151 -------------------------------------------------------------------------------- 150
+B 1121 gggactctagggggactgcctgcgcaagcagagaggaaggaggggacgatgtcaagtcatcatggcccttacgcccaggg 1200
+Diffs
+Votes
+Model BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
+
+A 1190 gctacacacgtgttacaatggcaggtacaaagggcagctacacagcgatgtgatgctaatctcgaaaacctgtcccagtt 1269
+Q 151 -------------------------------------------------------------------------------- 150
+B 1201 ctacacacgtgctacaatggcgcatacagagggtagccacctggcgacagggcgccaatctcaaaaagtgcgtctcagtt 1280
+Diffs
+Votes
+Model BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
+
+A 1270 cggattgaagtctgcaacccgacttcatgaagctggaatcgctagtaatcgcgcatcagccatggcgcggtgaatacgtt 1349
+Q 151 -------------------------------------------------------------------------------- 150
+B 1281 cggatcggggcctgcaactcggccccgtgaagtcggaatcgctagtaatcgcagatcagccatgctgcggtgaatacgtt 1360
+Diffs
+Votes
+Model BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
+
+A 1350 cccgggccttgtacactccgcccgtcaagccatggaagccgggagtacctgaag-------------------------- 1403
+Q 151 -------------------------------------------------------------------------------- 150
+B 1361 cccgggccttgtacacaccgcccgtcaagccatggaagccgggggcacctgaagtcgggggtaacaacccgcctagggtg 1440
+Diffs
+Votes
+Model BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
+
+A 1404 ---------------------------------------- 1403
+Q 151 ---------------------------------------- 150
+B 1441 aaactggtaactggggctaagtcgtaacaaggtaaccgta 1480
+Diffs
+Votes
+Model BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
+
+Ids. QA 95.3%, QB 91.2%, AB 86.5%, QModel 100.0%, Div. +5.0%
+Diffs Left 13: N 0, A 0, Y 13 (100.0%); Right 7: N 0, A 0, Y 7 (100.0%), Score 0.7254
+"""
+
+# UCHIME tabular output for single chimeric sequence
+single_chimera_ref_tab = """0.7254\t251;size=2;\t4370324\t646991\t4370324\t100.0\t95.3\t91.2\t86.5\t95.3\t13\t0\t0\t7\t0\t0\t4.7\tY
+"""
+
+
+if __name__ == '__main__':
+ main()
diff --git a/bfillings/uclust.py b/bfillings/uclust.py
new file mode 100644
index 0000000..9c89691
--- /dev/null
+++ b/bfillings/uclust.py
@@ -0,0 +1,606 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+
+"""Application controller for uclust version 1.1.579
+
+Includes application controllers for uclust and
+convenience wrappers for different functions of uclust, including
+sorting fasta files, finding clusters, converting to cd-hit format and
+searching and aligning against a database. Also contains
+a parser for the resulting .uc file.
+
+Modified from cogent.app.cd_hit.py on 1-21-10, written by Daniel McDonald.
+"""
+
+from os.path import splitext, basename, join
+from tempfile import gettempdir, mkstemp
+
+from cogent import DNA
+
+from burrito.util import (CommandLineApplication, ResultPath,
+ ApplicationError, ApplicationNotFoundError)
+from burrito.parameters import ValuedParameter, FlagParameter
+from skbio.parse.sequences import parse_fasta
+from skbio.util import remove_files
+
+
+class UclustParseError(Exception):
+ pass
+
+
+class Uclust(CommandLineApplication):
+
+ """ Uclust ApplicationController
+
+ """
+
+ _command = 'uclust'
+ _input_handler = '_input_as_parameters'
+ _parameters = {
+
+ # Fasta input file for merge-sort function
+ '--mergesort': ValuedParameter('--', Name='mergesort', Delimiter=' ',
+ IsPath=True),
+
+ # Output file, used by several different functions
+ '--output': ValuedParameter('--', Name='output', Delimiter=' ',
+ IsPath=True),
+
+ # Sets temp directory for uclust to create temp fasta file
+ '--tmpdir': ValuedParameter('--', Name='tmpdir', Delimiter=' ',
+ IsPath=True),
+
+ # input filename, fasta format
+ '--input': ValuedParameter('--', Name='input', Delimiter=' ',
+ IsPath=True),
+
+ # Output filename will be in uclust (.uc) format
+ # Output cluster file, required parameter
+ '--uc': ValuedParameter('--', Name='uc', Delimiter=' ',
+ IsPath=True),
+
+ # ID percent for OTU, by default is 97%
+ '--id': ValuedParameter('--', Name='id', Delimiter=' ', IsPath=False),
+
+ # Enable reverse strand matching; when enabled, memory usage is
+ # expected to double for uclust
+ '--rev': FlagParameter('--', Name='rev'),
+
+ # 'library' file -- a reference of sequences representing pre-existing
+ # clusters
+ '--lib': ValuedParameter('--', Name='lib', Delimiter=' ', IsPath=True),
+
+ # only compare sequences to the library file, don't add new clusters
+ # for sequences which don't hit the library
+ '--libonly': FlagParameter('--', Name='libonly'),
+
+ # Maximum hits before quitting search (default 1, 0=infinity).
+ '--maxaccepts':
+ ValuedParameter('--', Name='maxaccepts', Delimiter=' '),
+
+ # Maximum rejects before quitting search (default 8, 0=infinity).
+ '--maxrejects':
+ ValuedParameter('--', Name='maxrejects', Delimiter=' '),
+
+ # Target nr. of common words (default 8, 0=don't step)
+ '--stepwords': ValuedParameter('--', Name='stepwords', Delimiter=' '),
+
+ # Word length for windex (default 5 aa.s, 8 nuc.s).
+ '--w': ValuedParameter('--', Name='w', Delimiter=' '),
+
+ # output fp for pairwise aligned sequences
+ '--fastapairs': ValuedParameter('--', Name='fastapairs', Delimiter=' ',
+ IsPath=True),
+
+ # input filename, .uc format
+ '--uc2clstr': ValuedParameter('--', Name='uc2clstr', Delimiter=' ',
+ IsPath=True),
+
+ # Don't assume input is sorted by length (default assume sorted).
+ '--usersort': FlagParameter('--', Name='usersort'),
+
+ # Same as --maxrejects 0 --nowordcountreject.
+ # comes with a performance hit.
+ '--exact': FlagParameter('--', Name='exact'),
+
+ # Same as --maxrejects 0 --maxaccepts 0 --nowordcountreject --
+ # comes with a performance hit.
+ '--optimal': FlagParameter('--', Name='optimal'),
+
+ '--stable_sort': FlagParameter('--', Name='stable_sort'),
+
+ # From uclust help:
+ # Write all accepts to .uc file (default top hit/no match only).
+ '--allhits': FlagParameter('--', Name='allhits'),
+ }
+
+ _suppress_stdout = False
+ _suppress_stderr = False
+
+ def _input_as_parameters(self, data):
+ """ Set the input path (a fasta filepath)
+ """
+ # The list of values which can be passed on a per-run basis
+ allowed_values = ['--input', '--uc', '--fastapairs',
+ '--uc2clstr', '--output', '--mergesort']
+
+ unsupported_parameters = set(data.keys()) - set(allowed_values)
+ if unsupported_parameters:
+ raise ApplicationError(
+ "Unsupported parameter(s) passed when calling uclust: %s" %
+ ' '.join(unsupported_parameters))
+
+ for v in allowed_values:
+ # turn the parameter off so subsequent runs are not
+ # affected by parameter settings from previous runs
+ self.Parameters[v].off()
+ if v in data:
+ # turn the parameter on if specified by the user
+ self.Parameters[v].on(data[v])
+
+ return ''
+
+ def _get_result_paths(self, data):
+ """ Set the result paths """
+
+ result = {}
+
+ result['Output'] = ResultPath(
+ Path=self.Parameters['--output'].Value,
+ IsWritten=self.Parameters['--output'].isOn())
+
+ result['ClusterFile'] = ResultPath(
+ Path=self.Parameters['--uc'].Value,
+ IsWritten=self.Parameters['--uc'].isOn())
+
+ result['PairwiseAlignments'] = ResultPath(
+ Path=self.Parameters['--fastapairs'].Value,
+ IsWritten=self.Parameters['--fastapairs'].isOn())
+
+ return result
+
+ def _accept_exit_status(self, exit_status):
+ """ Test for acceptable exit status
+
+ uclust can seg fault and still generate a parsable .uc file
+ so we explicitly check the exit status
+
+ """
+ return exit_status == 0
+
+ def getHelp(self):
+ """Method that points to documentation"""
+ help_str =\
+ """
+ UCLUST is hosted at:
+ http://www.drive5.com/uclust/
+
+ The following papers should be cited if this resource is used:
+
+ Paper pending. Check with Robert Edgar who is writing the paper
+ for uclust as of March 2010. Cite the above URL for the time being.
+ """
+ return help_str
+
+# Start functions for processing uclust output files
+
+
+def get_next_record_type(lines, types):
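+ """Yield the tab-split fields of lines whose first character is in types."""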
+ for line in lines:
+ line = line.strip()
+ if line and line[0] in types:
+ yield line.split('\t')
+ return
+
+
+def get_next_two_fasta_records(lines):
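+ """Yield successive pairs of (label, sequence) records parsed from lines."""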
+ result = []
+ for record in parse_fasta(lines):
+ result.append(record)
+ if len(result) == 2:
+ yield result
+ result = []
+ return
+
+
+def process_uclust_pw_alignment_results(fasta_pairs_lines, uc_lines):
+ """ Process results of uclust search and align """
+ alignments = get_next_two_fasta_records(fasta_pairs_lines)
+ for hit in get_next_record_type(uc_lines, 'H'):
+ matching_strand = hit[4]
+ if matching_strand == '-':
+ strand_id = '-'
+ target_rev_match = True
+ elif matching_strand == '+':
+ strand_id = '+'
+ target_rev_match = False
+ elif matching_strand == '.':
+ # protein sequence, so no strand information
+ strand_id = ''
+ target_rev_match = False
+ else:
+ raise UclustParseError("Unknown strand type: %s" % matching_strand)
+ uc_query_id = hit[8]
+ uc_target_id = hit[9]
+ percent_id = float(hit[3])
+
+ fasta_pair = next(alignments)
+
+ fasta_query_id = fasta_pair[0][0]
+ aligned_query = fasta_pair[0][1]
+
+ if fasta_query_id != uc_query_id:
+ raise UclustParseError("Order of fasta and uc files do not match." +
+ " Got query %s but expected %s." %
+ (fasta_query_id, uc_query_id))
+
+ fasta_target_id = fasta_pair[1][0]
+ aligned_target = fasta_pair[1][1]
+
+ if fasta_target_id != uc_target_id + strand_id:
+ raise UclustParseError("Order of fasta and uc files do not match." +
+ " Got target %s but expected %s." %
+ (fasta_target_id, uc_target_id + strand_id))
+
+ if target_rev_match:
+ query_id = uc_query_id + ' RC'
+ aligned_query = DNA.rc(aligned_query)
+ target_id = uc_target_id
+ aligned_target = DNA.rc(aligned_target)
+ else:
+ # no reverse-complementing needed; use ids and alignments as-is
+ query_id = uc_query_id
+ target_id = uc_target_id
+
+ yield (query_id, target_id, aligned_query, aligned_target, percent_id)
+
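+# A minimal usage sketch for process_uclust_pw_alignment_results (the
+# records below are illustrative, not output from a real uclust run).
+# Note the fasta target label carries the strand suffix from the .uc hit:
+#
+# fasta_pairs = [">q1", "ACGT", ">t1+", "ACGT"]
+# uc = ["H\t0\t4\t100.0\t+\t0\t0\t4M\tq1\tt1"]
+# for rec in process_uclust_pw_alignment_results(fasta_pairs, uc):
+# print(rec) # ('q1', 't1', 'ACGT', 'ACGT', 100.0)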
+
+def clusters_from_uc_file(uc_lines,
+ error_on_multiple_hits=True):
+ """ Given an open .uc file, return lists (clusters, failures, new_seeds)
+
+ uc_lines: open .uc file, or similar object -- this is the output
+ generated by uclust's -uc parameter
+ error_on_multiple_hits: if True (default), when a single query hits
+ to multiple seeds, as can happen when --allhits is passed to uclust,
+ throw a UclustParseError. if False, when a single query hits to
+ multiple seeds, it will appear in each cluster.
+
+ This function processes all hit (H), seed (S), and no hit (N) lines
+ to return all clusters, failures, and new_seeds generated in
+ a uclust run. failures should only arise when users have passed
+ --lib and --libonly, and a sequence doesn't cluster to any existing
+ reference database sequences.
+
+ """
+ clusters = {}
+ failures = []
+ seeds = []
+ all_hits = set()
+ # the types of hit lines we're interested in here
+ # are hit (H), seed (S), library seed (L) and no hit (N)
+ hit_types = frozenset('HSNL')
+ for record in get_next_record_type(uc_lines, hit_types):
+ hit_type = record[0]
+ # sequence identifiers from the fasta header lines only
+ # (no comment data) are stored to identify a sequence in
+ # a cluster -- strip off any comments here as this value
+ # is used in several places
+ query_id = record[8].split()[0]
+ target_cluster = record[9].split()[0]
+ if hit_type == 'H':
+ if error_on_multiple_hits and query_id in all_hits:
+ raise UclustParseError("Query id " + query_id + " hit multiple seeds. "
+ "This can happen if --allhits is "
+ "enabled in the call to uclust, which isn't supported by default. "
+ "Call clusters_from_uc_file(lines, error_on_multiple_hits=False) to "
+ "allow a query to cluster to multiple seeds.")
+ else:
+ # add the hit to its existing cluster (either library
+ # or new cluster)
+ clusters[target_cluster].append(query_id)
+ all_hits.add(query_id)
+ elif hit_type == 'S':
+ # a new seed was identified -- create a cluster with this
+ # sequence as the first instance
+ if query_id in clusters:
+ raise UclustParseError("A seq id was provided as a seed, but that seq id already "
+ "represents a cluster. Are there overlapping seq ids in your "
+ "reference and input files or repeated seq ids in either? "
+ "Offending seq id is %s" % query_id)
+ clusters[query_id] = [query_id]
+ seeds.append(query_id)
+ elif hit_type == 'L':
+ # a library seed was identified -- create a cluster with this
+ # id as the index, but don't give it any instances yet bc the hit
+ # line will be specified separately. note we need to handle these
+ # lines separately from the H lines to detect overlapping seq ids
+ # between the reference and the input fasta files
+ if query_id in clusters:
+ raise UclustParseError("A seq id was provided as a seed, but that seq id already "
+ "represents a cluster. Are there overlapping seq ids in your "
+ "reference and input files or repeated seq ids in either? "
+ "Offending seq id is %s" % query_id)
+ clusters[query_id] = []
+ elif hit_type == 'N':
+ # a failure was identified -- add it to the failures list
+ failures.append(query_id)
+ else:
+ # shouldn't be possible to get here, but provided for
+ # clarity
+ raise UclustParseError(
+ "Unexpected result parsing line:\n%s" %
+ '\t'.join(record))
+
+ # will need to return the full clusters dict, I think, to support
+ # useful identifiers in reference database clustering
+ # return clusters.values(), failures, seeds
+ return clusters, failures, seeds
+
+# End functions for processing uclust output files
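+
+# A minimal usage sketch for clusters_from_uc_file (the .uc records below
+# are illustrative, not output from a real uclust run):
+#
+# uc_lines = [
+# "S\t0\t150\t*\t*\t*\t*\t*\tseq1\t*",
+# "H\t0\t150\t97.3\t+\t0\t0\t150M\tseq2\tseq1",
+# "N\t*\t150\t*\t*\t*\t*\t*\tseq3\t*",
+# ]
+# clusters, failures, seeds = clusters_from_uc_file(uc_lines)
+# clusters == {'seq1': ['seq1', 'seq2']}
+# failures == ['seq3']; seeds == ['seq1']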
+
+
+# Start uclust convenience functions
+def uclust_fasta_sort_from_filepath(
+ fasta_filepath,
+ output_filepath=None,
+ tmp_dir=gettempdir(),
+ HALT_EXEC=False):
+ """Generates sorted fasta file via uclust --mergesort."""
+ if not output_filepath:
+ _, output_filepath = mkstemp(dir=tmp_dir, prefix='uclust_fasta_sort',
+ suffix='.fasta')
+
+ app = Uclust(params={'--tmpdir': tmp_dir},
+ TmpDir=tmp_dir, HALT_EXEC=HALT_EXEC)
+
+ app_result = app(data={'--mergesort': fasta_filepath,
+ '--output': output_filepath})
+
+ return app_result
+
+
+def uclust_search_and_align_from_fasta_filepath(
+ query_fasta_filepath,
+ subject_fasta_filepath,
+ percent_ID=0.75,
+ enable_rev_strand_matching=True,
+ max_accepts=8,
+ max_rejects=32,
+ tmp_dir=gettempdir(),
+ HALT_EXEC=False):
+ """ query seqs against subject fasta using uclust,
+
+ return global pw alignment of best match
+ """
+
+ # Explanation of parameter settings
+ # id - min percent id to count a match
+ # maxaccepts = 8, searches for best match rather than first match
+ # (0 => infinite accepts, or good matches before
+ # quitting search)
+ # maxrejects = 32, max rejected candidates before giving up on a query
+ # libonly = True, does not add sequences to the library if they don't
+ # match something there already. This effectively makes
+ # uclust a search tool rather than a clustering tool
+
+ params = {'--id': percent_ID,
+ '--maxaccepts': max_accepts,
+ '--maxrejects': max_rejects,
+ '--libonly': True,
+ '--lib': subject_fasta_filepath,
+ '--tmpdir': tmp_dir}
+
+ if enable_rev_strand_matching:
+ params['--rev'] = True
+
+ # instantiate the application controller
+ app = Uclust(params,
+ TmpDir=tmp_dir, HALT_EXEC=HALT_EXEC)
+
+ # apply uclust
+ _, alignment_filepath = mkstemp(dir=tmp_dir, prefix='uclust_alignments',
+ suffix='.fasta')
+ _, uc_filepath = mkstemp(dir=tmp_dir, prefix='uclust_results',
+ suffix='.uc')
+ input_data = {'--input': query_fasta_filepath,
+ '--fastapairs': alignment_filepath,
+ '--uc': uc_filepath}
+ app_result = app(input_data)
+
+ # yield the pairwise alignments
+ for result in process_uclust_pw_alignment_results(
+ app_result['PairwiseAlignments'], app_result['ClusterFile']):
+ try:
+ yield result
+ except GeneratorExit:
+ break
+
+ # clean up the temp files that were generated
+ app_result.cleanUp()
+
+ return
+
+
+def uclust_cluster_from_sorted_fasta_filepath(
+ fasta_filepath,
+ uc_save_filepath=None,
+ percent_ID=0.97,
+ max_accepts=1,
+ max_rejects=8,
+ stepwords=8,
+ word_length=8,
+ optimal=False,
+ exact=False,
+ suppress_sort=False,
+ enable_rev_strand_matching=False,
+ subject_fasta_filepath=None,
+ suppress_new_clusters=False,
+ stable_sort=False,
+ tmp_dir=gettempdir(),
+ HALT_EXEC=False):
+ """ Returns clustered uclust file from sorted fasta"""
+ output_filepath = uc_save_filepath
+ if not output_filepath:
+ _, output_filepath = mkstemp(dir=tmp_dir, prefix='uclust_clusters',
+ suffix='.uc')
+
+ params = {'--id': percent_ID,
+ '--maxaccepts': max_accepts,
+ '--maxrejects': max_rejects,
+ '--stepwords': stepwords,
+ '--w': word_length,
+ '--tmpdir': tmp_dir}
+ app = Uclust(params,
+ TmpDir=tmp_dir, HALT_EXEC=HALT_EXEC)
+
+ # Set any additional parameters specified by the user
+ if enable_rev_strand_matching:
+ app.Parameters['--rev'].on()
+ if optimal:
+ app.Parameters['--optimal'].on()
+ if exact:
+ app.Parameters['--exact'].on()
+ if suppress_sort:
+ app.Parameters['--usersort'].on()
+ if subject_fasta_filepath:
+ app.Parameters['--lib'].on(subject_fasta_filepath)
+ if suppress_new_clusters:
+ app.Parameters['--libonly'].on()
+ if stable_sort:
+ app.Parameters['--stable_sort'].on()
+
+ app_result = app({'--input': fasta_filepath, '--uc': output_filepath})
+ return app_result
+
+
+def get_output_filepaths(output_dir, fasta_filepath):
+ """ Returns filepaths for intermediate file to be kept """
+ return join(output_dir,
+ splitext(basename(fasta_filepath))[0] + '_clusters.uc')
+
+
+def get_clusters_from_fasta_filepath(
+ fasta_filepath,
+ original_fasta_path,
+ percent_ID=0.97,
+ max_accepts=1,
+ max_rejects=8,
+ stepwords=8,
+ word_length=8,
+ optimal=False,
+ exact=False,
+ suppress_sort=False,
+ output_dir=None,
+ enable_rev_strand_matching=False,
+ subject_fasta_filepath=None,
+ suppress_new_clusters=False,
+ return_cluster_maps=False,
+ stable_sort=False,
+ tmp_dir=gettempdir(),
+ save_uc_files=True,
+ HALT_EXEC=False):
+ """ Main convenience wrapper for using uclust to generate cluster files
+
+ A source fasta file is required for the fasta_filepath. This will be
+ sorted from longest to shortest sequence. The sorted fasta file is then
+ used to generate a cluster file in the uclust (.uc) format, which is
+ parsed and returned as a list of lists, where each sublist is a cluster
+ of sequences. If an output_dir is specified, the intermediate .uc file
+ will be preserved; otherwise all files created are temporary and will be
+ deleted at the end of this function.
+
+ The percent_ID parameter specifies the percent identity for a cluster,
+ i.e., if 99% were the parameter, all sequences that were 99% identical
+ would be grouped as a cluster.
+ """
+
+ # Create readable intermediate filenames if they are to be kept
+ fasta_output_filepath = None
+
+ if output_dir and not output_dir.endswith('/'):
+ output_dir += '/'
+
+ if save_uc_files:
+ uc_save_filepath = get_output_filepaths(
+ output_dir,
+ original_fasta_path)
+ else:
+ uc_save_filepath = None
+
+ sorted_fasta_filepath = ""
+
+ # Error check in case any app controller fails
+ files_to_remove = []
+ try:
+ if not suppress_sort:
+ # Sort fasta input file from largest to smallest sequence
+ sort_fasta = uclust_fasta_sort_from_filepath(fasta_filepath,
+ output_filepath=fasta_output_filepath)
+
+ # Get sorted fasta name from application wrapper
+ sorted_fasta_filepath = sort_fasta['Output'].name
+ files_to_remove.append(sorted_fasta_filepath)
+
+ else:
+ sort_fasta = None
+ sorted_fasta_filepath = fasta_filepath
+
+ # Generate uclust cluster file (.uc format)
+ uclust_cluster = uclust_cluster_from_sorted_fasta_filepath(
+ sorted_fasta_filepath,
+ uc_save_filepath,
+ percent_ID=percent_ID,
+ max_accepts=max_accepts,
+ max_rejects=max_rejects,
+ stepwords=stepwords,
+ word_length=word_length,
+ optimal=optimal,
+ exact=exact,
+ suppress_sort=suppress_sort,
+ enable_rev_strand_matching=enable_rev_strand_matching,
+ subject_fasta_filepath=subject_fasta_filepath,
+ suppress_new_clusters=suppress_new_clusters,
+ stable_sort=stable_sort,
+ tmp_dir=tmp_dir,
+ HALT_EXEC=HALT_EXEC)
+ # Clustering succeeded; remove the sorted fasta intermediate
+ remove_files(files_to_remove)
+ except ApplicationError:
+ remove_files(files_to_remove)
+ raise ApplicationError('Error running uclust. Possible causes are '
+ 'an unsupported version (the currently supported version is '
+ 'v1.2.22) or an improperly formatted input file')
+ except ApplicationNotFoundError:
+ remove_files(files_to_remove)
+ raise ApplicationNotFoundError('uclust not found, is it properly ' +
+ 'installed?')
+
+ # Get list of lists for each cluster
+ clusters, failures, seeds = \
+ clusters_from_uc_file(uclust_cluster['ClusterFile'])
+
+ # Remove temp files unless user specifies output filepath
+ if not save_uc_files:
+ uclust_cluster.cleanUp()
+
+ if return_cluster_maps:
+ return clusters, failures, seeds
+ else:
+ return clusters.values(), failures, seeds
+
+# End uclust convenience functions
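+
+# A minimal usage sketch for the main wrapper ('seqs.fasta' is a
+# hypothetical input file, and uclust must be on the PATH):
+#
+# clusters, failures, seeds = get_clusters_from_fasta_filepath(
+# 'seqs.fasta', 'seqs.fasta', percent_ID=0.97, save_uc_files=False)
+# clusters is a list of lists of sequence ids, one sublist per cluster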
diff --git a/bfillings/usearch.py b/bfillings/usearch.py
new file mode 100644
index 0000000..fda86fc
--- /dev/null
+++ b/bfillings/usearch.py
@@ -0,0 +1,2547 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+
+"""Application controller for usearch v5.2.32
+
+Includes application controllers for usearch and
+convenience wrappers for different functions of usearch, including
+sorting fasta files, finding clusters, converting to cd-hit format and
+searching and aligning against a database. Also contains
+a parser for the resulting .clstr file.
+
+Modified from pycogent_backports/uclust.py, written by
+Greg Caporaso/William Walters
+"""
+
+from os.path import splitext, abspath, join
+from tempfile import mkstemp, gettempdir
+
+from skbio.parse.sequences import parse_fasta
+from burrito.parameters import ValuedParameter, FlagParameter
+from burrito.util import (CommandLineApplication, ResultPath,
+ ApplicationError, ApplicationNotFoundError)
+from skbio.util import remove_files
+
+
+class UsearchParseError(Exception):
+ pass
+
+
+class Usearch(CommandLineApplication):
+
+ """ Usearch ApplicationController
+
+ """
+
+ _command = 'usearch'
+ _input_handler = '_input_as_parameters'
+ _parameters = {
+
+ # Fasta input file for merge-sort function
+ '--mergesort': ValuedParameter('--', Name='mergesort', Delimiter=' ',
+ IsPath=True),
+
+        # E-value threshold for search/alignment functions
+        '--evalue': ValuedParameter('--', Name='evalue', Delimiter=' ',
+                                    IsPath=False),
+
+        # Output file, used by several different functions
+ '--output': ValuedParameter('--', Name='output', Delimiter=' ',
+ IsPath=True),
+
+ # Output filename will be in uclust (.uc) format
+ # Output cluster file, required parameter
+ '--uc': ValuedParameter('--', Name='uc', Delimiter=' ',
+ IsPath=True),
+
+ '--blast6out': ValuedParameter('--', Name='blast6out', Delimiter=' ',
+ IsPath=True),
+
+        # Percent ID for OTU clustering, 97% by default
+ '--id': ValuedParameter('--', Name='id', Delimiter=' ', IsPath=False),
+
+ '--queryalnfract':
+ ValuedParameter(
+ '--',
+ Name='queryalnfract',
+ Delimiter=' ',
+ IsPath=False),
+
+ '--targetalnfract':
+ ValuedParameter(
+ '--',
+ Name='targetalnfract',
+ Delimiter=' ',
+ IsPath=False),
+
+ # Enable reverse strand matching. Will double memory.
+ '--rev': FlagParameter('--', Name='rev'),
+
+ # Maximum hits before quitting search (default 1, 0=infinity).
+ '--maxaccepts':
+ ValuedParameter('--', Name='maxaccepts', Delimiter=' '),
+
+ # Maximum rejects before quitting search (default 8, 0=infinity).
+ '--maxrejects':
+ ValuedParameter('--', Name='maxrejects', Delimiter=' '),
+
+ # Target nr. of common words (default 8, 0=don't step)
+ '--stepwords': ValuedParameter('--', Name='stepwords', Delimiter=' '),
+
+ # Word length for windex (default 5 aa.s, 8 nuc.s).
+ '--w': ValuedParameter('--', Name='w', Delimiter=' '),
+
+ # Don't assume input is sorted by length (default assume sorted).
+ '--usersort': FlagParameter('--', Name='usersort'),
+
+ # log filepath
+ '--log': ValuedParameter('--', Name='log', Delimiter=' ', IsPath=True),
+
+ # cluster command
+ '--cluster': ValuedParameter('--', Name='cluster', Delimiter=' ',
+ IsPath=True),
+
+
+ # Size of compressed index table. Should be prime, e.g. 40000003.
+ '--slots': ValuedParameter('--', Name='slots', Delimiter=' ',
+ IsPath=False),
+
+ # Not specified in usearch helpstring...
+ '--sizein': FlagParameter('--', Name='sizein'),
+
+ # Not specified in usearch helpstring...
+ '--sizeout': FlagParameter('--', Name='sizeout'),
+
+ # Not specified in usearch helpstring...
+ '--minlen': ValuedParameter('--', Name='minlen', Delimiter=' ',
+ IsPath=False),
+
+ # output filepath for dereplicated fasta file
+ '--seedsout': ValuedParameter('--', Name='seedsout', Delimiter=' ',
+ IsPath=True),
+
+ # Dereplicate exact subsequences
+ '--derep_subseq': FlagParameter('--', Name='derep_subseq'),
+
+ # Dereplicate exact sequences
+ '--derep_fullseq': FlagParameter('--', Name='derep_fullseq'),
+
+ # Sort by abundance
+ '--sortsize': ValuedParameter('--', Name='sortsize', Delimiter=' ',
+ IsPath=True),
+
+ # usearch search plus clustering
+ '--consout': ValuedParameter('--', Name='consout', Delimiter=' ',
+ IsPath=True),
+
+ # Abundance skew setting for uchime de novo chimera detection
+ '--abskew': ValuedParameter('--', Name='abskew', Delimiter=' ',
+ IsPath=False),
+
+ # input fasta filepath for uchime chimera
+ '--uchime': ValuedParameter('--', Name='uchime', Delimiter=' ',
+ IsPath=True),
+
+ # output chimera filepath
+ '--chimeras': ValuedParameter('--', Name='chimeras', Delimiter=' ',
+ IsPath=True),
+
+ # output non-chimera filepath
+ '--nonchimeras': ValuedParameter('--', Name='nonchimeras',
+ Delimiter=' ', IsPath=True),
+
+ # reference sequence database for ref based chimera detection
+ '--db': ValuedParameter('--', Name='db', Delimiter=' ', IsPath=True),
+
+ # output clusters filepath for chimera detection
+ '--uchimeout': ValuedParameter('--', Name='uchimeout', Delimiter=' ',
+ IsPath=True),
+
+ # minimum cluster size for quality filtering
+ '--minsize': ValuedParameter('--', Name='minsize', Delimiter=' ',
+ IsPath=False),
+
+ # input fasta for blast alignments
+ '--query': ValuedParameter('--', Name='query', Delimiter=' ',
+ IsPath=True),
+
+ # global alignment flag
+ '--global': FlagParameter('--', Name='global')
+
+ }
+
+ _suppress_stdout = False
+ _suppress_stderr = False
+
+ def _input_as_parameters(self, data):
+ """ Set the input path (a fasta filepath)
+ """
+ # The list of values which can be passed on a per-run basis
+ allowed_values = ['--uc', '--output', '--mergesort', '--log',
+ '--cluster', '--seedsout', '--sortsize',
+ '--consout', '--uchime', '--chimeras',
+ '--nonchimeras', '--db', '--uchimeout',
+ '--query', '--blast6out']
+
+ unsupported_parameters = set(data.keys()) - set(allowed_values)
+ if unsupported_parameters:
+ raise ApplicationError(
+ "Unsupported parameter(s) passed when calling usearch: %s" %
+ ' '.join(unsupported_parameters))
+
+ for v in allowed_values:
+ # turn the parameter off so subsequent runs are not
+ # affected by parameter settings from previous runs
+ self.Parameters[v].off()
+ if v in data:
+ # turn the parameter on if specified by the user
+ self.Parameters[v].on(data[v])
+
+ return ''
+
+ def _get_result_paths(self, data):
+ """ Set the result paths """
+
+ result = {}
+
+ result['Output'] = ResultPath(
+ Path=self.Parameters['--output'].Value,
+ IsWritten=self.Parameters['--output'].isOn())
+
+ result['ClusterFile'] = ResultPath(
+ Path=self.Parameters['--uc'].Value,
+ IsWritten=self.Parameters['--uc'].isOn())
+
+ return result
+
+ def _accept_exit_status(self, exit_status):
+ """ Test for acceptable exit status
+
+ usearch can seg fault and still generate a parsable .uc file
+ so we explicitly check the exit status
+
+ """
+ return exit_status == 0
+
+ def getHelp(self):
+ """Method that points to documentation"""
+ help_str =\
+ """
+ USEARCH is hosted at:
+ http://www.drive5.com/usearch/
+
+ The following papers should be cited if this resource is used:
+
+ Paper pending. Check with Robert Edgar who is writing the paper
+ for usearch as of Aug. 2011
+ """
+ return help_str
+
+# Start functions for processing usearch output files
+
+
+def clusters_from_blast_uc_file(uc_lines, otu_id_field=1):
+ """ Parses out hit/miss sequences from usearch blast uc file
+
+ All lines should be 'H'it or 'N'o hit. Returns a dict of OTU ids: sequence
+ labels of the hits, and a list of all sequence labels that miss.
+
+ uc_lines = open file object of uc file
+
+ otu_id_field: uc field to use as the otu id. 1 is usearch's ClusterNr field,
+ and 9 is usearch's TargetLabel field
+
+ """
+
+ hit_miss_index = 0
+ cluster_id_index = otu_id_field
+ seq_label_index = 8
+
+ otus = {}
+ unassigned_seqs = []
+
+ for line in uc_lines:
+ # skip empty, comment lines
+ if line.startswith('#') or len(line.strip()) == 0:
+ continue
+
+ curr_line = line.split('\t')
+
+ if curr_line[hit_miss_index] == 'N':
+ # only retaining actual sequence label
+ unassigned_seqs.append(curr_line[seq_label_index].split()[0])
+
+ if curr_line[hit_miss_index] == 'H':
+
+ curr_seq_label = curr_line[seq_label_index].split()[0]
+ curr_otu_id = curr_line[cluster_id_index].split()[0]
+ # Append sequence label to dictionary, or create key
+ try:
+ otus[curr_otu_id].append(curr_seq_label)
+ except KeyError:
+ otus[curr_otu_id] = [curr_seq_label]
+
+ return otus, unassigned_seqs
+
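+# A minimal sketch of parsing usearch blast .uc output (hypothetical,
+# hand-written records; a .uc line has 10 tab-separated fields, with the
+# record type in field 0, cluster number in field 1, query label in field 8
+# and target label in field 9):
+#
+#     uc_lines = ['H\t3\t250\t99.0\t+\t0\t0\t250M\tseq1 comment\tref42\n',
+#                 'N\t*\t230\t*\t*\t*\t*\t*\tseq2\t*\n']
+#     otus, unassigned = clusters_from_blast_uc_file(uc_lines)
+#     # otus == {'3': ['seq1']}, unassigned == ['seq2']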
+
+# End functions for processing usearch output files
+# Start usearch convenience functions
+def usearch_fasta_sort_from_filepath(
+ fasta_filepath,
+ output_filepath=None,
+ log_name="sortlen.log",
+ HALT_EXEC=False,
+ save_intermediate_files=False,
+ remove_usearch_logs=False,
+ working_dir=None):
+ """Generates sorted fasta file via usearch --mergesort.
+
+ fasta_filepath: filepath to input fasta file
+ output_filepath: filepath for output sorted fasta file.
+ log_name: string to specify log filename
+ HALT_EXEC: Used for debugging app controller
+ save_intermediate_files: Preserve all intermediate files created."""
+ if not output_filepath:
+ _, output_filepath = mkstemp(prefix='usearch_fasta_sort',
+ suffix='.fasta')
+
+ log_filepath = join(working_dir, log_name)
+
+ params = {}
+
+ app = Usearch(params, WorkingDir=working_dir, HALT_EXEC=HALT_EXEC)
+
+ data = {'--mergesort': fasta_filepath,
+ '--output': output_filepath,
+ }
+
+ if not remove_usearch_logs:
+ data['--log'] = log_filepath
+
+ app_result = app(data)
+
+ return app_result, output_filepath
+
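+# A minimal usage sketch (hypothetical paths; assumes the usearch binary is
+# on the PATH):
+#
+#     app_result, sorted_fp = usearch_fasta_sort_from_filepath(
+#         '/tmp/seqs.fna', output_filepath='/tmp/seqs_len_sorted.fna',
+#         working_dir='/tmp')
+#     # sorted_fp points at the length-sorted fasta written via --mergesort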
+
+def usearch_dereplicate_exact_subseqs(
+ fasta_filepath,
+ output_filepath=None,
+ minlen=64,
+ w=64,
+ slots=16769023,
+ sizeout=True,
+ maxrejects=64,
+ log_name="derep.log",
+ usersort=False,
+ HALT_EXEC=False,
+ save_intermediate_files=False,
+ remove_usearch_logs=False,
+ working_dir=None):
+ """ Generates clusters and fasta file of dereplicated subsequences
+
+ These parameters are those specified by Robert Edgar for optimal use of
+ usearch in clustering/filtering sequences.
+
+ fasta_filepath = input filepath of fasta file to be dereplicated
+ output_filepath = output filepath of dereplicated fasta file
+ minlen = (not specified in usearch helpstring)
+ w = Word length for U-sorting
+ slots = Size of compressed index table. Should be prime, e.g. 40000003.
+ Should also specify --w, typical is --w 16 or --w 32.
+ sizeout = (not specified in usearch helpstring)
+ maxrejects = Max rejected targets, 0=ignore, default 32.
+ log_name: string to specify log filename
+    usersort = Enable if the input fasta is deliberately not sorted by
+    length; otherwise usearch will raise an error.
+ HALT_EXEC: Used for debugging app controller
+ save_intermediate_files: Preserve all intermediate files created."""
+ if not output_filepath:
+ _, output_filepath = mkstemp(prefix='usearch_fasta_dereplicated',
+ suffix='.fasta')
+
+ log_filepath = join(working_dir, log_name)
+
+ uc_filepath = join(working_dir, "derep.uc")
+
+ params = {'--derep_subseq': True,
+ '--minlen': minlen,
+ '--w': w,
+ '--slots': slots,
+ '--sizeout': sizeout,
+ '--maxrejects': maxrejects}
+
+ app = Usearch(params, WorkingDir=working_dir, HALT_EXEC=HALT_EXEC)
+
+ if usersort:
+ app.Parameters['--usersort'].on()
+
+ data = {'--cluster': fasta_filepath,
+ '--uc': uc_filepath,
+ '--seedsout': output_filepath
+ }
+
+ if not remove_usearch_logs:
+ data['--log'] = log_filepath
+
+ app_result = app(data)
+
+ if not save_intermediate_files:
+ remove_files([uc_filepath])
+
+    # Return the output filepath so the caller can remove it if desired.
+
+ return app_result, output_filepath
+
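+# A minimal usage sketch (hypothetical paths): dereplicate exact
+# subsequences of a length-sorted fasta, keeping the seed sequences with
+# abundance annotations (--sizeout):
+#
+#     app_result, derep_fp = usearch_dereplicate_exact_subseqs(
+#         '/tmp/seqs_len_sorted.fna',
+#         output_filepath='/tmp/dereplicated.fna', working_dir='/tmp')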
+
+def usearch_dereplicate_exact_seqs(
+ fasta_filepath,
+ output_filepath=None,
+ minlen=64,
+ w=64,
+ slots=16769023,
+ sizeout=True,
+ maxrejects=64,
+ log_name="derep.log",
+ usersort=False,
+ HALT_EXEC=False,
+ save_intermediate_files=False,
+ remove_usearch_logs=False,
+ working_dir=None):
+ """ Generates clusters and fasta file of dereplicated subsequences
+ for exact sequences.
+
+ These parameters are those specified by Robert Edgar for optimal use of
+ usearch in clustering/filtering sequences.
+
+ fasta_filepath = input filepath of fasta file to be dereplicated
+ output_filepath = output filepath of dereplicated fasta file
+ minlen = (not specified in usearch helpstring)
+ w = Word length for U-sorting
+ slots = Size of compressed index table. Should be prime, e.g. 40000003.
+ Should also specify --w, typical is --w 16 or --w 32.
+ sizeout = (not specified in usearch helpstring)
+ maxrejects = Max rejected targets, 0=ignore, default 32.
+ log_name: string to specify log filename
+    usersort = Enable if the input fasta is deliberately not sorted by
+    length; otherwise usearch will raise an error.
+ HALT_EXEC: Used for debugging app controller
+ save_intermediate_files: Preserve all intermediate files created."""
+ if not output_filepath:
+ _, output_filepath = mkstemp(prefix='usearch_fasta_dereplicated',
+ suffix='.fasta')
+
+ log_filepath = join(working_dir, log_name)
+
+ uc_filepath = join(working_dir, "derep.uc")
+
+ params = {'--derep_fullseq': True,
+ '--minlen': minlen,
+ '--w': w,
+ '--slots': slots,
+ '--sizeout': sizeout,
+ '--maxrejects': maxrejects}
+
+ app = Usearch(params, WorkingDir=working_dir, HALT_EXEC=HALT_EXEC)
+
+ if usersort:
+ app.Parameters['--usersort'].on()
+
+ data = {'--cluster': fasta_filepath,
+ '--uc': uc_filepath,
+ '--seedsout': output_filepath
+ }
+
+ if not remove_usearch_logs:
+ data['--log'] = log_filepath
+
+ app_result = app(data)
+
+ if not save_intermediate_files:
+ remove_files([uc_filepath])
+
+    # Return the output filepath so the caller can remove it if desired.
+
+ return app_result, output_filepath
+
+
+def usearch_sort_by_abundance(
+ fasta_filepath,
+ output_filepath=None,
+ sizein=True,
+ sizeout=True,
+ minsize=0,
+ log_name="abundance_sort.log",
+ usersort=False,
+ HALT_EXEC=False,
+ save_intermediate_files=False,
+ remove_usearch_logs=False,
+ working_dir=None):
+ """ Sorts fasta file by abundance
+
+ fasta_filepath = input fasta file, generally a dereplicated fasta
+ output_filepath = output abundance sorted fasta filepath
+ sizein = not defined in usearch helpstring
+ sizeout = not defined in usearch helpstring
+ minsize = minimum size of cluster to retain.
+ log_name = string to specify log filename
+    usersort = Enable if the input fasta is not sorted by length; otherwise
+    usearch will raise an error.
+ HALT_EXEC: Used for debugging app controller
+ save_intermediate_files: Preserve all intermediate files created.
+ """
+ if not output_filepath:
+ _, output_filepath = mkstemp(prefix='usearch_abundance_sorted',
+ suffix='.fasta')
+
+ log_filepath = join(
+ working_dir,
+ "minsize_" + str(minsize) + "_" + log_name)
+
+ params = {}
+
+ app = Usearch(params, WorkingDir=working_dir, HALT_EXEC=HALT_EXEC)
+
+ if usersort:
+ app.Parameters['--usersort'].on()
+
+ if minsize:
+ app.Parameters['--minsize'].on(minsize)
+
+ if sizein:
+ app.Parameters['--sizein'].on()
+
+ if sizeout:
+ app.Parameters['--sizeout'].on()
+
+ data = {'--sortsize': fasta_filepath,
+ '--output': output_filepath
+ }
+
+ if not remove_usearch_logs:
+ data['--log'] = log_filepath
+
+    # The filter step can leave no data, which raises an ApplicationError;
+    # catch it here to raise a more meaningful message.
+
+ try:
+ app_result = app(data)
+ except ApplicationError:
+ raise ValueError('No data following filter steps, please check ' +
+ 'parameter settings for usearch_qf.')
+
+ return app_result, output_filepath
+
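+# A minimal usage sketch (hypothetical paths): sort a dereplicated fasta by
+# abundance, discarding clusters smaller than minsize:
+#
+#     app_result, sorted_fp = usearch_sort_by_abundance(
+#         '/tmp/dereplicated.fna',
+#         output_filepath='/tmp/abundance_sorted.fna',
+#         minsize=4, working_dir='/tmp')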
+
+def usearch_cluster_error_correction(
+ fasta_filepath,
+ output_filepath=None,
+ output_uc_filepath=None,
+ percent_id_err=0.97,
+ sizein=True,
+ sizeout=True,
+ w=64,
+ slots=16769023,
+ maxrejects=64,
+ log_name="usearch_cluster_err_corrected.log",
+ usersort=False,
+ HALT_EXEC=False,
+ save_intermediate_files=False,
+ remove_usearch_logs=False,
+ working_dir=None):
+ """ Cluster for err. correction at percent_id_err, output consensus fasta
+
+ fasta_filepath = input fasta file, generally a dereplicated fasta
+ output_filepath = output error corrected fasta filepath
+ percent_id_err = minimum identity percent.
+ sizein = not defined in usearch helpstring
+ sizeout = not defined in usearch helpstring
+ w = Word length for U-sorting
+ slots = Size of compressed index table. Should be prime, e.g. 40000003.
+ Should also specify --w, typical is --w 16 or --w 32.
+ maxrejects = Max rejected targets, 0=ignore, default 32.
+ log_name = string specifying output log name
+    usersort = Enable if the input fasta is deliberately not sorted by
+    length; otherwise usearch will raise an error.
+ HALT_EXEC: Used for debugging app controller
+ save_intermediate_files: Preserve all intermediate files created.
+ """
+ if not output_filepath:
+ _, output_filepath = mkstemp(prefix='usearch_cluster_err_corrected',
+ suffix='.fasta')
+
+ log_filepath = join(working_dir, log_name)
+
+ params = {'--sizein': sizein,
+ '--sizeout': sizeout,
+ '--id': percent_id_err,
+ '--w': w,
+ '--slots': slots,
+ '--maxrejects': maxrejects}
+
+ app = Usearch(params, WorkingDir=working_dir, HALT_EXEC=HALT_EXEC)
+
+ if usersort:
+ app.Parameters['--usersort'].on()
+
+ data = {'--cluster': fasta_filepath,
+ '--consout': output_filepath
+ }
+
+ if not remove_usearch_logs:
+ data['--log'] = log_filepath
+
+ if output_uc_filepath:
+ data['--uc'] = output_uc_filepath
+
+ app_result = app(data)
+
+ return app_result, output_filepath
+
+
+def usearch_chimera_filter_de_novo(
+ fasta_filepath,
+ output_chimera_filepath=None,
+ output_non_chimera_filepath=None,
+ abundance_skew=2.0,
+ log_name="uchime_de_novo_chimera_filtering.log",
+ usersort=False,
+ HALT_EXEC=False,
+ save_intermediate_files=False,
+ remove_usearch_logs=False,
+ working_dir=None):
+ """ Chimera filter de novo, output chimeras and non-chimeras to fastas
+
+ fasta_filepath = input fasta file, generally a dereplicated fasta
+ output_chimera_filepath = output chimera filepath
+ output_non_chimera_filepath = output non chimera filepath
+ abundance_skew = abundance skew setting for de novo filtering.
+    usersort = Enable if the input fasta is deliberately not sorted by
+    length; otherwise usearch will raise an error.
+ HALT_EXEC: Used for debugging app controller
+ save_intermediate_files: Preserve all intermediate files created.
+ """
+ if not output_chimera_filepath:
+ _, output_chimera_filepath = mkstemp(prefix='uchime_chimeras_',
+ suffix='.fasta')
+
+ if not output_non_chimera_filepath:
+ _, output_non_chimera_filepath = mkstemp(prefix='uchime_non_chimeras_',
+ suffix='.fasta')
+
+ log_filepath = join(working_dir, log_name)
+
+ params = {'--abskew': abundance_skew}
+
+ app = Usearch(params, WorkingDir=working_dir, HALT_EXEC=HALT_EXEC)
+
+ if usersort:
+ app.Parameters['--usersort'].on()
+
+ data = {'--uchime': fasta_filepath,
+ '--chimeras': output_chimera_filepath,
+ '--nonchimeras': output_non_chimera_filepath
+ }
+
+ if not remove_usearch_logs:
+ data['--log'] = log_filepath
+
+ app_result = app(data)
+
+ if not save_intermediate_files:
+ remove_files([output_chimera_filepath])
+
+ return app_result, output_non_chimera_filepath
+
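+# A minimal usage sketch (hypothetical paths): de novo chimera filtering of
+# abundance-annotated consensus sequences, keeping the non-chimeras:
+#
+#     app_result, non_chimeras_fp = usearch_chimera_filter_de_novo(
+#         '/tmp/clustered_error_corrected.fna',
+#         abundance_skew=2.0, usersort=True, working_dir='/tmp')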
+
+def usearch_chimera_filter_ref_based(
+ fasta_filepath,
+ db_filepath,
+ output_chimera_filepath=None,
+ output_non_chimera_filepath=None,
+ rev=False,
+ log_name="uchime_reference_chimera_filtering.log",
+ usersort=False,
+ HALT_EXEC=False,
+ save_intermediate_files=False,
+ remove_usearch_logs=False,
+ working_dir=None):
+ """ Chimera filter against a reference database.
+
+ fasta_filepath = input fasta file, generally a dereplicated fasta
+ db_filepath = filepath to reference sequence database
+ output_chimera_filepath = output chimera filepath
+ output_non_chimera_filepath = output non chimera filepath
+ rev = search plus and minus strands of sequences
+ log_name = string specifying log filename.
+    usersort = Enable if the input fasta is deliberately not sorted by
+    length; otherwise usearch will raise an error.
+ HALT_EXEC: Used for debugging app controller
+ save_intermediate_files: Preserve all intermediate files created.
+ """
+
+ if not output_chimera_filepath:
+ _, output_chimera_filepath = mkstemp(prefix='uchime_chimeras_',
+ suffix='.fasta')
+
+ if not output_non_chimera_filepath:
+ _, output_non_chimera_filepath = mkstemp(prefix='uchime_non_chimeras_',
+ suffix='.fasta')
+
+ log_filepath = join(working_dir, log_name)
+
+ # clusters filepath created by usearch
+ cluster_filepath = join(working_dir, "refdb.uc")
+
+ params = {}
+
+ app = Usearch(params, WorkingDir=working_dir, HALT_EXEC=HALT_EXEC)
+
+ if usersort:
+ app.Parameters['--usersort'].on()
+ if rev:
+ app.Parameters['--rev'].on()
+
+ data = {'--uchime': fasta_filepath,
+ '--db': db_filepath,
+ '--chimeras': output_chimera_filepath,
+ '--nonchimeras': output_non_chimera_filepath,
+ '--uchimeout': cluster_filepath
+ }
+
+ if not remove_usearch_logs:
+ data['--log'] = log_filepath
+
+ app_result = app(data)
+
+ if not save_intermediate_files:
+ remove_files([cluster_filepath, output_chimera_filepath])
+
+ return app_result, output_non_chimera_filepath
+
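+# A minimal usage sketch (hypothetical paths): reference based chimera
+# filtering against a curated database, searching both strands:
+#
+#     app_result, non_chimeras_fp = usearch_chimera_filter_ref_based(
+#         '/tmp/clustered_error_corrected.fna', '/tmp/gold.fa',
+#         rev=True, working_dir='/tmp')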
+
+def usearch_cluster_seqs(
+ fasta_filepath,
+ output_filepath=None,
+ percent_id=0.97,
+ sizein=True,
+ sizeout=True,
+ w=64,
+ slots=16769023,
+ maxrejects=64,
+ log_name="usearch_cluster_seqs.log",
+ usersort=True,
+ HALT_EXEC=False,
+ save_intermediate_files=False,
+ remove_usearch_logs=False,
+ working_dir=None
+):
+ """ Cluster seqs at percent_id, output consensus fasta
+
+ fasta_filepath = input fasta file, generally a dereplicated fasta
+ output_filepath = output error corrected fasta filepath
+ percent_id = minimum identity percent.
+ sizein = not defined in usearch helpstring
+ sizeout = not defined in usearch helpstring
+ w = Word length for U-sorting
+ slots = Size of compressed index table. Should be prime, e.g. 40000003.
+ Should also specify --w, typical is --w 16 or --w 32.
+ maxrejects = Max rejected targets, 0=ignore, default 32.
+ log_name = string specifying output log name
+    usersort = Enable if the input fasta is deliberately not sorted by
+    length; otherwise usearch will raise an error. Post-chimera-checked
+    sequences are sorted by abundance, so this should be set to True.
+ HALT_EXEC: Used for debugging app controller
+ save_intermediate_files: Preserve all intermediate files created.
+ """
+ if not output_filepath:
+ _, output_filepath = mkstemp(prefix='usearch_cluster', suffix='.fasta')
+
+ log_filepath = join(working_dir, log_name)
+
+ uc_filepath = join(working_dir, "clustered_seqs_post_chimera.uc")
+
+ params = {'--sizein': sizein,
+ '--sizeout': sizeout,
+ '--id': percent_id,
+ '--w': w,
+ '--slots': slots,
+ '--maxrejects': maxrejects}
+
+ app = Usearch(params, WorkingDir=working_dir, HALT_EXEC=HALT_EXEC)
+
+ if usersort:
+ app.Parameters['--usersort'].on()
+
+ data = {'--cluster': fasta_filepath,
+ '--seedsout': output_filepath,
+ '--uc': uc_filepath
+ }
+
+ if not remove_usearch_logs:
+ data['--log'] = log_filepath
+
+ app_result = app(data)
+
+ if not save_intermediate_files:
+ remove_files([uc_filepath])
+
+ return app_result, output_filepath
+
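+# A minimal usage sketch (hypothetical paths): de novo clustering of
+# abundance-sorted sequences at 97% identity:
+#
+#     app_result, clustered_fp = usearch_cluster_seqs(
+#         '/tmp/abundance_sorted.fna',
+#         output_filepath='/tmp/clustered.fna',
+#         percent_id=0.97, working_dir='/tmp')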
+
+def usearch_cluster_seqs_ref(
+ fasta_filepath,
+ output_filepath=None,
+ percent_id=0.97,
+ sizein=True,
+ sizeout=True,
+ w=64,
+ slots=16769023,
+ maxrejects=64,
+ log_name="usearch_cluster_seqs.log",
+ usersort=True,
+ HALT_EXEC=False,
+ save_intermediate_files=False,
+ remove_usearch_logs=False,
+ suppress_new_clusters=False,
+ refseqs_fp=None,
+ output_dir=None,
+ working_dir=None,
+ rev=False):
+ """ Cluster seqs at percent_id, output consensus fasta
+
+ Also appends de novo clustered seqs if suppress_new_clusters is False.
+ Forced to handle reference + de novo in hackish fashion as usearch does not
+ work as listed in the helpstrings. Any failures are clustered de novo,
+ and given unique cluster IDs.
+
+ fasta_filepath = input fasta file, generally a dereplicated fasta
+ output_filepath = output reference clustered uc filepath
+ percent_id = minimum identity percent.
+ sizein = not defined in usearch helpstring
+ sizeout = not defined in usearch helpstring
+ w = Word length for U-sorting
+ slots = Size of compressed index table. Should be prime, e.g. 40000003.
+ Should also specify --w, typical is --w 16 or --w 32.
+ maxrejects = Max rejected targets, 0=ignore, default 32.
+ log_name = string specifying output log name
+    usersort = Enable if the input fasta is deliberately not sorted by
+    length; otherwise usearch will raise an error. Post-chimera-checked
+    sequences are sorted by abundance, so this should be set to True.
+ HALT_EXEC: Used for debugging app controller
+ save_intermediate_files: Preserve all intermediate files created.
+    suppress_new_clusters: Disables de novo OTUs when ref based OTU picking
+    is enabled.
+ refseqs_fp: Filepath for ref based OTU picking
+ output_dir: output directory
+ rev = search plus and minus strands of sequences
+ """
+ if not output_filepath:
+ _, output_filepath = mkstemp(prefix='usearch_cluster_ref_based',
+ suffix='.uc')
+
+ log_filepath = join(working_dir, log_name)
+
+ uc_filepath = join(working_dir, "clustered_seqs_post_chimera.uc")
+
+ params = {'--sizein': sizein,
+ '--sizeout': sizeout,
+ '--id': percent_id,
+ '--w': w,
+ '--slots': slots,
+ '--maxrejects': maxrejects}
+
+ app = Usearch(params, WorkingDir=working_dir, HALT_EXEC=HALT_EXEC)
+
+ if usersort:
+ app.Parameters['--usersort'].on()
+ if rev:
+ app.Parameters['--rev'].on()
+
+ data = {'--query': fasta_filepath,
+ '--uc': uc_filepath,
+ '--db': refseqs_fp
+ }
+
+ if not remove_usearch_logs:
+ data['--log'] = log_filepath
+
+ app_result = app(data)
+
+ files_to_remove = []
+
+ # Need to create fasta file of all hits (with reference IDs),
+ # recluster failures if new clusters allowed, and create complete fasta
+ # file, with unique fasta label IDs.
+
+ if suppress_new_clusters:
+ output_fna_filepath = join(output_dir, 'ref_clustered_seqs.fasta')
+ output_filepath, labels_hits = get_fasta_from_uc_file(fasta_filepath,
+ uc_filepath, hit_type="H", output_dir=output_dir,
+ output_fna_filepath=output_fna_filepath)
+
+ files_to_remove.append(uc_filepath)
+ else:
+ # Get fasta of successful ref based clusters
+ output_fna_clustered = join(output_dir, 'ref_clustered_seqs.fasta')
+ output_filepath_ref_clusters, labels_hits =\
+ get_fasta_from_uc_file(fasta_filepath, uc_filepath, hit_type="H",
+ output_dir=output_dir, output_fna_filepath=output_fna_clustered)
+
+ # get failures and recluster
+ output_fna_failures =\
+ join(output_dir, 'ref_clustered_seqs_failures.fasta')
+ output_filepath_failures, labels_hits =\
+ get_fasta_from_uc_file(fasta_filepath,
+ uc_filepath, hit_type="N", output_dir=output_dir,
+ output_fna_filepath=output_fna_failures)
+
+ # de novo cluster the failures
+ app_result, output_filepath_clustered_failures =\
+ usearch_cluster_seqs(output_fna_failures, output_filepath=
+ join(
+ output_dir,
+ 'clustered_seqs_reference_failures.fasta'),
+ percent_id=percent_id, sizein=sizein, sizeout=sizeout, w=w,
+ slots=slots, maxrejects=maxrejects,
+ save_intermediate_files=save_intermediate_files,
+ remove_usearch_logs=remove_usearch_logs, working_dir=working_dir)
+
+ output_filepath = concatenate_fastas(output_fna_clustered,
+ output_fna_failures, output_concat_filepath=join(
+ output_dir,
+ 'concatenated_reference_denovo_clusters.fasta'))
+
+ files_to_remove.append(output_fna_clustered)
+ files_to_remove.append(output_fna_failures)
+ files_to_remove.append(output_filepath_clustered_failures)
+
+ if not save_intermediate_files:
+ remove_files(files_to_remove)
+
+ return app_result, output_filepath
+
+
+def concatenate_fastas(output_fna_clustered,
+ output_fna_failures,
+ output_concat_filepath):
+ """ Concatenates two input fastas, writes to output_concat_filepath
+
+ output_fna_clustered: fasta of successful ref clusters
+ output_fna_failures: de novo fasta of cluster failures
+ output_concat_filepath: path to write combined fastas to
+ """
+
+ output_fp = open(output_concat_filepath, "w")
+
+ for label, seq in parse_fasta(open(output_fna_clustered, "U")):
+ output_fp.write(">%s\n%s\n" % (label, seq))
+ for label, seq in parse_fasta(open(output_fna_failures, "U")):
+ output_fp.write(">%s\n%s\n" % (label, seq))
+
+    output_fp.close()
+
+    return output_concat_filepath
+
+
+def enumerate_otus(fasta_filepath,
+ output_filepath=None,
+ label_prefix="",
+ label_suffix="",
+ retain_label_as_comment=False,
+ count_start=0):
+ """ Writes unique, sequential count to OTUs
+
+ fasta_filepath = input fasta filepath
+ output_filepath = output fasta filepath
+ label_prefix = string to place before enumeration
+ label_suffix = string to place after enumeration
+ retain_label_as_comment = if True, will place existing label in sequence
+ comment, after a tab
+ count_start = number to start enumerating OTUs with
+
+ """
+
+ fasta_i = open(fasta_filepath, "U")
+
+ if not output_filepath:
+ _, output_filepath = mkstemp(prefix='enumerated_seqs_',
+ suffix='.fasta')
+
+ fasta_o = open(output_filepath, "w")
+
+ for label, seq in parse_fasta(fasta_i):
+ curr_label = ">" + label_prefix + str(count_start) + label_suffix
+ if retain_label_as_comment:
+ curr_label += '\t' + label
+ fasta_o.write(curr_label.strip() + '\n')
+ fasta_o.write(seq.strip() + '\n')
+ count_start += 1
+
+    fasta_o.close()
+
+    return output_filepath
+
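+# A minimal sketch of the relabeling performed above (hypothetical paths):
+#
+#     out_fp = enumerate_otus('/tmp/clustered.fna',
+#                             output_filepath='/tmp/enumerated.fna',
+#                             label_prefix='denovo',
+#                             retain_label_as_comment=True)
+#     # ">seq1 comment" becomes ">denovo0\tseq1 comment", the next record
+#     # ">denovo1\t...", and so on.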
+
+def get_fasta_from_uc_file(fasta_filepath,
+ uc_filepath,
+ hit_type="H",
+ output_fna_filepath=None,
+ label_prefix="",
+ output_dir=None):
+ """ writes fasta of sequences from uc file of type hit_type
+
+ fasta_filepath: Filepath of original query fasta file
+ uc_filepath: Filepath of .uc file created by usearch post error filtering
+ hit_type: type to read from first field of .uc file, "H" for hits, "N" for
+ no hits.
+ output_fna_filepath = fasta output filepath
+ label_prefix = Added before each fasta label, important when doing ref
+ based OTU picking plus de novo clustering to preserve label matching.
+ output_dir: output directory
+ """
+
+ hit_type_index = 0
+ seq_label_index = 8
+ target_label_index = 9
+
+ labels_hits = {}
+ labels_to_keep = []
+
+ for line in open(uc_filepath, "U"):
+ if line.startswith("#") or len(line.strip()) == 0:
+ continue
+ curr_line = line.split('\t')
+ if curr_line[0] == hit_type:
+ labels_hits[curr_line[seq_label_index]] =\
+ curr_line[target_label_index].strip()
+ labels_to_keep.append(curr_line[seq_label_index])
+
+ labels_to_keep = set(labels_to_keep)
+
+ out_fna = open(output_fna_filepath, "w")
+
+ for label, seq in parse_fasta(open(fasta_filepath, "U")):
+ if label in labels_to_keep:
+ if hit_type == "H":
+ out_fna.write(">" + labels_hits[label] + "\n%s\n" % seq)
+ if hit_type == "N":
+ out_fna.write(">" + label + "\n%s\n" % seq)
+
+    out_fna.close()
+
+    return output_fna_filepath, labels_hits
+
+
+def get_retained_chimeras(output_fp_de_novo_nonchimeras,
+ output_fp_ref_nonchimeras,
+ output_combined_fp,
+ chimeras_retention='union'):
+ """ Gets union or intersection of two supplied fasta files
+
+ output_fp_de_novo_nonchimeras: filepath of nonchimeras from de novo
+ usearch detection.
+ output_fp_ref_nonchimeras: filepath of nonchimeras from reference based
+ usearch detection.
+ output_combined_fp: filepath to write retained sequences to.
+ chimeras_retention: accepts either 'intersection' or 'union'. Will test
+ for chimeras against the full input error clustered sequence set, and
+ retain sequences flagged as non-chimeras by either (union) or
+ only those flagged as non-chimeras by both (intersection)."""
+
+ de_novo_non_chimeras = []
+ reference_non_chimeras = []
+
+ de_novo_nonchimeras_f = open(output_fp_de_novo_nonchimeras, "U")
+ reference_nonchimeras_f = open(output_fp_ref_nonchimeras, "U")
+
+ output_combined_f = open(output_combined_fp, "w")
+
+ for label, seq in parse_fasta(de_novo_nonchimeras_f):
+ de_novo_non_chimeras.append(label)
+ de_novo_nonchimeras_f.close()
+ for label, seq in parse_fasta(reference_nonchimeras_f):
+ reference_non_chimeras.append(label)
+ reference_nonchimeras_f.close()
+
+ de_novo_non_chimeras = set(de_novo_non_chimeras)
+ reference_non_chimeras = set(reference_non_chimeras)
+
+ if chimeras_retention == 'union':
+ all_non_chimeras = de_novo_non_chimeras.union(reference_non_chimeras)
+ elif chimeras_retention == 'intersection':
+ all_non_chimeras =\
+ de_novo_non_chimeras.intersection(reference_non_chimeras)
+
+ de_novo_nonchimeras_f = open(output_fp_de_novo_nonchimeras, "U")
+ reference_nonchimeras_f = open(output_fp_ref_nonchimeras, "U")
+
+ # Save a list of already-written labels
+ labels_written = []
+
+ for label, seq in parse_fasta(de_novo_nonchimeras_f):
+ if label in all_non_chimeras:
+ if label not in labels_written:
+ output_combined_f.write('>%s\n%s\n' % (label, seq))
+ labels_written.append(label)
+ de_novo_nonchimeras_f.close()
+ for label, seq in parse_fasta(reference_nonchimeras_f):
+ if label in all_non_chimeras:
+ if label not in labels_written:
+ output_combined_f.write('>%s\n%s\n' % (label, seq))
+ labels_written.append(label)
+ reference_nonchimeras_f.close()
+
+ output_combined_f.close()
+
+ return output_combined_fp
+
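+# A minimal usage sketch (hypothetical paths): keep sequences flagged as
+# non-chimeric by either detector (union) or by both (intersection):
+#
+#     combined_fp = get_retained_chimeras(
+#         '/tmp/de_novo_non_chimeras.fna',
+#         '/tmp/reference_non_chimeras.fna',
+#         '/tmp/combined_non_chimeras.fna',
+#         chimeras_retention='intersection')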
+
+def assign_reads_to_otus(original_fasta,
+ filtered_fasta,
+ output_filepath=None,
+ log_name="assign_reads_to_otus.log",
+ perc_id_blast=0.97,
+ global_alignment=True,
+ HALT_EXEC=False,
+ save_intermediate_files=False,
+ remove_usearch_logs=False,
+ working_dir=None):
+ """ Uses original fasta file, blasts to assign reads to filtered fasta
+
+ original_fasta = filepath to original query fasta
+ filtered_fasta = filepath to enumerated, filtered fasta
+ output_filepath = output path to clusters (uc) file
+ log_name = string specifying output log name
+ perc_id_blast = percent ID for blasting original seqs against filtered set
+ HALT_EXEC: Used for debugging app controller
+ save_intermediate_files: Preserve all intermediate files created.
+ """
+
+    # Not sure if I feel comfortable using blast as a way to recapitulate
+ # original read ids....
+ if not output_filepath:
+ _, output_filepath = mkstemp(prefix='assign_reads_to_otus',
+ suffix='.uc')
+
+ log_filepath = join(working_dir, log_name)
+
+ params = {'--id': perc_id_blast,
+ '--global': global_alignment}
+
+ app = Usearch(params, WorkingDir=working_dir, HALT_EXEC=HALT_EXEC)
+
+ data = {'--query': original_fasta,
+ '--db': filtered_fasta,
+ '--uc': output_filepath
+ }
+
+ if not remove_usearch_logs:
+ data['--log'] = log_filepath
+
+ app_result = app(data)
+
+ return app_result, output_filepath
+
+
+def usearch_qf(
+ fasta_filepath,
+ refseqs_fp=None,
+ output_dir=None,
+ percent_id=0.97,
+ percent_id_err=0.97,
+ minsize=4,
+ abundance_skew=2.0,
+ db_filepath=None,
+ rev=False,
+ label_prefix="",
+ label_suffix="",
+ retain_label_as_comment=False,
+ count_start=0,
+ perc_id_blast=0.97,
+ save_intermediate_files=False,
+ HALT_EXEC=False,
+ global_alignment=True,
+ sizein=True,
+ sizeout=True,
+ w=64,
+ slots=16769023,
+ maxrejects=64,
+ minlen=64,
+ de_novo_chimera_detection=True,
+ derep_fullseq=False,
+ reference_chimera_detection=True,
+ cluster_size_filtering=True,
+ remove_usearch_logs=False,
+ usersort=True,
+ suppress_new_clusters=False,
+ chimeras_retention="union",
+ verbose=False
+):
+ """ Main convenience wrapper for using usearch to filter/cluster seqs
+
+ The complete 'usearch_qf' process is a multistep process with many calls
+ to usearch with various parameters. It is likely to change from the
+ original implementation. A lot.
+
+    fasta_filepath = fasta filepath to be filtered/clustered (e.g., the
+    output seqs.fna file from split_libraries.py)
+ refseqs_fp = fasta filepath for ref-based otu picking.
+ output_dir = directory to store the otu mapping file, as well logs and
+ the intermediate files created if save_intermediate_files is True.
+    percent_id = percent ID for clustering sequences.
+    percent_id_err = percent ID for filtering out chimeras
+ minsize = Minimum size of cluster for retention after chimera removal.
+ abundance_skew = threshold setting for chimera removal with de novo
+ chimera detection.
+ db_filepath = filepath of reference fasta sequence set for ref based
+ chimera detection.
+ rev = search plus and minus strands of sequences, used in ref based chimera
+ detection.
+ label_prefix = optional prefix added to filtered fasta file.
+ label_suffix = optional suffix added to filtered fasta file.
+ retain_label_as_comment = option to add usearch generated label to
+ enumerated fasta labels.
+ count_start = integer to begin counting at for sequence enumeration.
+ perc_id_blast = percent identity setting for using blast algorithm to
+ assign original sequence labels to filtered fasta.
+ global_alignment = Setting for assignment of original seq labels to filtered
+ seqs.
+ sizein = not defined in usearch helpstring
+ sizeout = not defined in usearch helpstring
+ w = Word length for U-sorting
+ slots = Size of compressed index table. Should be prime, e.g. 40000003.
+ Should also specify --w, typical is --w 16 or --w 32.
+ maxrejects = Max rejected targets, 0=ignore, default 32.
+ save_intermediate_files = retain all the intermediate files created during
+ this process.
+ minlen = (not specified in usearch helpstring), but seems like a good bet
+ that this refers to the minimum length of the sequences for dereplication.
+ HALT_EXEC = used to debug app controller problems.
+ de_novo_chimera_detection = If True, will detect chimeras de novo
+ reference_chimera_detection = If True, will detect chimeras ref based
+ cluster_size_filtering = If True, will filter OTUs according to seq counts.
+ remove_usearch_logs = If True, will not call the --log function for each
+ usearch call.
+ usersort = Used for specifying custom sorting (i.e., non-length based
+ sorting) with usearch/uclust.
+    suppress_new_clusters = with reference based OTU picking, if enabled,
+    prevents sequences that do not match the reference from forming new
+    clusters.
+ chimeras_retention = accepts either 'intersection' or 'union'. Will test
+ for chimeras against the full input error clustered sequence set, and
+ retain sequences flagged as non-chimeras by either (union) or
+ only those flagged as non-chimeras by both (intersection).
+ """
+
+ # Save a list of intermediate filepaths in case they are to be removed.
+ intermediate_files = []
+
+ # Need absolute paths to avoid problems with app controller
+ if output_dir:
+ output_dir = abspath(output_dir) + '/'
+
+ fasta_filepath = abspath(fasta_filepath)
+
+ try:
+
+ if verbose:
+ print "Sorting sequences by length..."
+ # Sort seqs by length
+ app_result, output_filepath_len_sorted =\
+ usearch_fasta_sort_from_filepath(fasta_filepath, output_filepath=
+ join(
+ output_dir,
+ 'len_sorted.fasta'),
+ save_intermediate_files=save_intermediate_files,
+ remove_usearch_logs=remove_usearch_logs,
+ working_dir=output_dir, HALT_EXEC=HALT_EXEC)
+
+ intermediate_files.append(output_filepath_len_sorted)
+
+ if verbose:
+ print "Dereplicating sequences..."
+ # Dereplicate sequences
+ app_result, output_filepath_dereplicated =\
+ usearch_dereplicate_exact_subseqs(output_filepath_len_sorted,
+ output_filepath=join(
+ output_dir,
+ 'dereplicated_seqs.fasta'),
+ minlen=minlen, w=w, slots=slots, sizeout=sizeout,
+ maxrejects=maxrejects, save_intermediate_files=save_intermediate_files,
+ remove_usearch_logs=remove_usearch_logs,
+ working_dir=output_dir, HALT_EXEC=HALT_EXEC)
+
+ intermediate_files.append(output_filepath_dereplicated)
+
+ if verbose:
+ print "Sorting by abundance..."
+ # Sort by abundance, initially no filter based on seqs/otu
+ app_result, output_fp =\
+ usearch_sort_by_abundance(output_filepath_dereplicated,
+ output_filepath=join(
+ output_dir,
+ 'abundance_sorted.fasta'),
+ usersort=True, sizein=sizein, sizeout=sizeout, minsize=0,
+ remove_usearch_logs=remove_usearch_logs, working_dir=output_dir,
+ HALT_EXEC=HALT_EXEC)
+
+ intermediate_files.append(output_fp)
+
+ if verbose:
+ print "Clustering sequences for error correction..."
+
+ # Create .uc file of clusters file, to identify original sequences
+ # later
+ output_uc_filepath = output_dir + 'err_corrected_clusters.uc'
+
+ app_result, error_clustered_output_fp =\
+ usearch_cluster_error_correction(output_fp,
+ output_filepath=join(output_dir,
+ 'clustered_error_corrected.fasta'),
+ output_uc_filepath=output_uc_filepath,
+ usersort=True, percent_id_err=percent_id_err, sizein=sizein,
+ sizeout=sizeout, w=w, slots=slots, maxrejects=maxrejects,
+ remove_usearch_logs=remove_usearch_logs,
+ save_intermediate_files=save_intermediate_files,
+ working_dir=output_dir, HALT_EXEC=HALT_EXEC)
+
+ intermediate_files.append(error_clustered_output_fp)
+ intermediate_files.append(output_uc_filepath)
+
+        # Series of conditional filtering steps; the generic 'output_fp'
+        # name is reused so it does not matter which filters, if any,
+        # are selected.
+ if de_novo_chimera_detection:
+
+ if verbose:
+ print "Performing de novo chimera detection..."
+ app_result, output_fp_de_novo_nonchimeras =\
+ usearch_chimera_filter_de_novo(error_clustered_output_fp,
+ abundance_skew=abundance_skew, output_chimera_filepath=
+ join(
+ output_dir,
+ 'de_novo_chimeras.fasta'),
+ output_non_chimera_filepath=join(
+ output_dir,
+ 'de_novo_non_chimeras.fasta'), usersort=True,
+ save_intermediate_files=save_intermediate_files,
+ remove_usearch_logs=remove_usearch_logs, working_dir=output_dir,
+ HALT_EXEC=HALT_EXEC)
+
+ intermediate_files.append(output_fp_de_novo_nonchimeras)
+
+ output_fp = output_fp_de_novo_nonchimeras
+
+ if reference_chimera_detection:
+ if verbose:
+ print "Performing reference based chimera detection..."
+
+ app_result, output_fp_ref_nonchimeras =\
+ usearch_chimera_filter_ref_based(error_clustered_output_fp,
+ db_filepath=db_filepath, output_chimera_filepath=
+ join(
+ output_dir,
+ 'reference_chimeras.fasta'),
+ output_non_chimera_filepath=
+ join(output_dir, 'reference_non_chimeras.fasta'), usersort=True,
+ save_intermediate_files=save_intermediate_files, rev=rev,
+ remove_usearch_logs=remove_usearch_logs, working_dir=output_dir,
+ HALT_EXEC=HALT_EXEC)
+
+ intermediate_files.append(output_fp_ref_nonchimeras)
+
+ output_fp = output_fp_ref_nonchimeras
+
+ # get intersection or union if both ref and de novo chimera detection
+ if de_novo_chimera_detection and reference_chimera_detection:
+ if verbose:
+ print "Finding %s of non-chimeras..." % chimeras_retention
+ output_fp = get_retained_chimeras(
+ output_fp_de_novo_nonchimeras, output_fp_ref_nonchimeras,
+ output_combined_fp=
+ join(output_dir, 'combined_non_chimeras.fasta'),
+ chimeras_retention=chimeras_retention)
+
+ intermediate_files.append(output_fp)
+
+ if cluster_size_filtering:
+ # Test for empty filepath following filters, raise error if all seqs
+ # have been removed
+ if verbose:
+ print "Filtering by cluster size..."
+            # If both chimera detection steps were not performed, use the
+            # error-corrected clustering output as input to filtering by
+            # cluster size
+ if not (reference_chimera_detection and de_novo_chimera_detection):
+ output_fp = error_clustered_output_fp
+ app_result, output_fp =\
+ usearch_sort_by_abundance(output_fp, output_filepath=
+ join(output_dir, 'abundance_sorted_minsize_' + str(minsize) +
+ '.fasta'),
+ minsize=minsize, sizein=sizein, sizeout=sizeout,
+ remove_usearch_logs=remove_usearch_logs, working_dir=output_dir,
+ HALT_EXEC=HALT_EXEC)
+
+ intermediate_files.append(output_fp)
+
+ # cluster seqs
+ # Should we add in option to use alternative OTU picking here?
+ # Seems like it will be a bit of a mess...maybe after we determine
+ # if usearch_qf should become standard.
+ if refseqs_fp:
+ if verbose:
+ print "Clustering against reference sequences..."
+ app_result, output_filepath =\
+ usearch_cluster_seqs_ref(output_fp, output_filepath=
+ join(
+ output_dir,
+ 'ref_clustered_seqs.uc'),
+ percent_id=percent_id, sizein=sizein,
+ sizeout=sizeout, w=w, slots=slots, maxrejects=maxrejects,
+ save_intermediate_files=save_intermediate_files,
+ remove_usearch_logs=remove_usearch_logs,
+ suppress_new_clusters=suppress_new_clusters, refseqs_fp=refseqs_fp,
+ output_dir=output_dir, working_dir=output_dir, rev=rev,
+ HALT_EXEC=HALT_EXEC
+ )
+
+ else:
+ if verbose:
+ print "De novo clustering sequences..."
+ app_result, output_filepath =\
+ usearch_cluster_seqs(output_fp, output_filepath=
+ join(output_dir, 'clustered_seqs.fasta'),
+ percent_id=percent_id, sizein=sizein,
+ sizeout=sizeout, w=w, slots=slots, maxrejects=maxrejects,
+ save_intermediate_files=save_intermediate_files,
+ remove_usearch_logs=remove_usearch_logs, working_dir=output_dir,
+ HALT_EXEC=HALT_EXEC)
+
+ intermediate_files.append(output_filepath)
+
+ # Enumerate the OTUs in the clusters
+ if not suppress_new_clusters:
+ if verbose:
+ print "Enumerating OTUs..."
+ output_filepath =\
+ enumerate_otus(output_filepath, output_filepath=
+ join(output_dir, 'enumerated_otus.fasta'),
+ label_prefix=label_prefix,
+ label_suffix=label_suffix, count_start=count_start,
+ retain_label_as_comment=retain_label_as_comment)
+
+ intermediate_files.append(output_filepath)
+
+ # Get original sequence label identities
+ if verbose:
+ print "Assigning sequences to clusters..."
+ app_result, clusters_file = assign_reads_to_otus(fasta_filepath,
+ filtered_fasta=output_filepath, output_filepath=join(
+ output_dir,
+ 'assign_reads_to_otus.uc'), perc_id_blast=percent_id,
+ global_alignment=global_alignment,
+ remove_usearch_logs=remove_usearch_logs, working_dir=output_dir,
+ HALT_EXEC=HALT_EXEC)
+
+ intermediate_files.append(clusters_file)
+
+ except ApplicationError:
+        raise ApplicationError('Error running usearch. Possible causes are '
+                               'an unsupported version (current supported '
+                               'version is usearch v5.2.236) or an '
+                               'improperly formatted input file')
+ except ApplicationNotFoundError:
+        remove_files(intermediate_files)
+ raise ApplicationNotFoundError('usearch not found, is it properly ' +
+ 'installed?')
+
+ # Get dict of clusters, list of failures
+ # Set OTU ID field to 9 for the case of closed reference OTU picking
+ if suppress_new_clusters:
+ otu_id_field = 9
+ else:
+ otu_id_field = 1
+ clusters, failures = clusters_from_blast_uc_file(open(clusters_file, "U"),
+ otu_id_field)
+
+ # Remove temp files unless user specifies output filepath
+ if not save_intermediate_files:
+ remove_files(intermediate_files)
+
+ return clusters, failures
+
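+# A minimal usage sketch of the full quality-filtering pipeline above
+# (hypothetical paths; db_filepath is only required when reference chimera
+# detection is enabled):
+#
+#     clusters, failures = usearch_qf(
+#         '/tmp/seqs.fna', output_dir='/tmp/usearch_qf_out/',
+#         db_filepath='/tmp/gold.fa', percent_id=0.97, minsize=4)
+#     # clusters: dict of OTU ID -> list of member sequence labels
+#     # failures: labels of sequences that could not be assigned to an OTU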
+
+def assign_dna_reads_to_database(query_fasta_fp,
+ database_fasta_fp,
+ output_fp,
+ temp_dir=gettempdir(),
+ params={},
+ blast6_fp=None,
+ HALT_EXEC=False):
+ _params = {'--id': 0.97}
+ _params.update(params)
+
+ if blast6_fp is None:
+ blast6_fp = splitext(output_fp)[0] + '.bl6'
+ data = {'--query': query_fasta_fp,
+ '--uc': output_fp,
+ '--db': database_fasta_fp,
+ '--blast6out': blast6_fp,
+ }
+ app = Usearch(_params,
+ WorkingDir=temp_dir,
+                  HALT_EXEC=HALT_EXEC)
+ app_result = app(data)
+
+assign_dna_reads_to_protein_database =\
+ assign_dna_reads_to_dna_database =\
+ assign_dna_reads_to_database
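+
+# A minimal usage sketch (hypothetical paths): assign reads to a reference
+# database at 97% identity, writing a .uc file plus a BLAST6 tabular file
+# (derived from output_fp unless blast6_fp is given):
+#
+#     assign_dna_reads_to_database('/tmp/reads.fna', '/tmp/refs.fna',
+#                                  '/tmp/assignments.uc')
+#     # also writes /tmp/assignments.bl6 via --blast6out
+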
+# End uclust convenience functions
+
+# Start usearch61 application controller
+
+
+class Usearch61(CommandLineApplication):
+
+ """ Usearch61 ApplicationController
+
+ """
+
+ _command = 'usearch61'
+ _input_handler = '_input_as_parameters'
+ _parameters = {
+
+ # IO filepaths specified by these values
+
+ # Output file, used by several difference functions
+ '--output': ValuedParameter('--', Name='output', Delimiter=' ',
+ IsPath=True),
+
+ # Output filename in uclust (.uc) format
+ '--uc': ValuedParameter('--', Name='uc', Delimiter=' ', IsPath=True),
+
+ # log filepath
+ '--log': ValuedParameter('--', Name='log', Delimiter=' ', IsPath=True),
+
+        # Used to specify input file for reference based clustering
+ '--usearch_global': ValuedParameter('--', Name='usearch_global',
+ Delimiter=' ', IsPath=True),
+
+ # Used to specify reference sequences to act as seeds
+ '--db': ValuedParameter('--', Name='db', Delimiter=' ', IsPath=True),
+
+ # Default de novo clustering input fasta filepath, memory efficient
+ '--cluster_smallmem': ValuedParameter('--', Name='cluster_smallmem',
+ Delimiter=' ', IsPath=True),
+
+ # Fast de novo clustering input fasta filepath
+ '--cluster_fast': ValuedParameter('--', Name='cluster_fast',
+ Delimiter=' ', IsPath=True),
+
+ # Specifies consensus fasta file output for a cluster
+ '--consout': ValuedParameter('--', Name='consout',
+ Delimiter=' ', IsPath=True),
+
+ # Specifies input consensus/abundance file for de novo chimeras
+ '--uchime_denovo': ValuedParameter('--', Name='uchime_denovo',
+ Delimiter=' ', IsPath=True),
+
+ # Specifies input consensus/abundance file for ref chimera detection
+ '--uchime_ref': ValuedParameter('--', Name='uchime_ref',
+ Delimiter=' ', IsPath=True),
+
+ # Specifies output uchime file for chimera results
+ '--uchimeout': ValuedParameter('--', Name='uchimeout',
+ Delimiter=' ', IsPath=True),
+
+ # Parameters for sorting raw fasta files
+ # specifies fasta filepath to sort by length
+ '--sortbylength': ValuedParameter('--', Name='sortbylength',
+ Delimiter=' ', IsPath=True),
+
+ # specifies fasta filepath to dereplicate, sort by abundance
+ '--derep_fulllength': ValuedParameter('--', Name='derep_fulllength',
+ Delimiter=' ', IsPath=True),
+
+ # Adds label showing abundance of dereplicated sequences
+ '--sizeout': FlagParameter('--', Name='sizeout'),
+
+ # Other parameters for clustering/sorting
+
+ # Needed to use data sorted by abundance and use sizeorder option
+ '--usersort': FlagParameter('--', Name='usersort'),
+
+ # specifies percent identity for clustering
+ '--id': ValuedParameter('--', Name='id', Delimiter=' ', IsPath=False),
+
+ # specifies minimum sequence length allowed
+ '--minseqlength': ValuedParameter('--', Name='minseqlength',
+ Delimiter=' ', IsPath=False),
+
+ # if set as --strand both will enable reverse strand matching
+ '--strand': ValuedParameter('--', Name='strand', Delimiter=' ',
+ IsPath=False),
+
+ # Word length to use, in base pairs
+ '--wordlength': ValuedParameter('--', Name='wordlength',
+ Delimiter=' ', IsPath=False),
+
+ # Max rejects, lower = more speed, higher=higher accuracy
+ '--maxrejects': ValuedParameter('--', Name='maxrejects',
+ Delimiter=' ', IsPath=False),
+
+ # Max accepts, should be greater than 1 for sizeorder option
+ '--maxaccepts': ValuedParameter('--', Name='maxaccepts',
+ Delimiter=' ', IsPath=False),
+
+ # Option to cluster to most abundant seed
+ '--sizeorder': FlagParameter('--', Name='sizeorder'),
+
+ # Chimera-specific parameters
+ # abundance skew for comparing parent/child putative clusters
+ '--abskew': ValuedParameter('--', Name='abskew', Delimiter=' ',
+ IsPath=False),
+
+ # min score to be classified as chimeric
+ '--minh': ValuedParameter('--', Name='minh', Delimiter=' ',
+ IsPath=False),
+
+ # weight of no vote
+ '--xn': ValuedParameter('--', Name='xn', Delimiter=' ',
+ IsPath=False),
+
+ # pseudo count prior for no votes
+ '--dn': ValuedParameter('--', Name='dn', Delimiter=' ',
+ IsPath=False),
+
+ # Minimum number of diffs in a segment
+ '--mindiffs': ValuedParameter('--', Name='mindiffs', Delimiter=' ',
+ IsPath=False),
+
+ # Minimum divergence between query and ref sequence
+ '--mindiv': ValuedParameter('--', Name='mindiv', Delimiter=' ',
+ IsPath=False),
+
+ # Threads allocated for multithreading calls.
+ '--threads': ValuedParameter('--', Name='threads',
+ Delimiter=' ', IsPath=False)
+ }
+
+ _suppress_stdout = False
+ _suppress_stderr = False
+
+ def _input_as_parameters(self, data):
+ """ Set the input path (a fasta filepath)
+ """
+ # The list of values which can be passed on a per-run basis
+ allowed_values = ['--uc', '--output', '--log',
+ '--sortbylength', '--derep_fulllength', '--sizeout',
+ '--minseqlength', '--strand', '--wordlength',
+ '--maxrejects', '--usearch_global', '--db',
+ '--cluster_smallmem', '--cluster_fast', '--id',
+ '--maxaccepts', '--sizeorder', '--usersort',
+ '--abskew', '--minh', '--xn', '--dn', '--mindiffs',
+ '--mindiv', '--uchime_denovo', '--uchimeout',
+ '--uchime_ref', '--threads'
+ ]
+
+ unsupported_parameters = set(data.keys()) - set(allowed_values)
+ if unsupported_parameters:
+ raise ApplicationError(
+ "Unsupported parameter(s) passed when calling %s: %s" %
+ (self._command, ' '.join(unsupported_parameters)))
+
+ for v in allowed_values:
+ # turn the parameter off so subsequent runs are not
+ # affected by parameter settings from previous runs
+ self.Parameters[v].off()
+ if v in data:
+ # turn the parameter on if specified by the user
+ self.Parameters[v].on(data[v])
+
+ return ''
+
+ def _get_result_paths(self, data):
+ """ Set the result paths """
+
+ result = {}
+
+ result['Output'] = ResultPath(
+ Path=self.Parameters['--output'].Value,
+ IsWritten=self.Parameters['--output'].isOn())
+
+ result['ClusterFile'] = ResultPath(
+ Path=self.Parameters['--uc'].Value,
+ IsWritten=self.Parameters['--uc'].isOn())
+
+ return result
+
+ def _accept_exit_status(self, exit_status):
+ """ Test for acceptable exit status
+
+ usearch can seg fault and still generate a parsable .uc file
+ so we explicitly check the exit status
+
+ """
+ return exit_status == 0
+
+ def getHelp(self):
+ """Method that points to documentation"""
+ help_str =\
+ """
+ USEARCH is hosted at:
+ http://www.drive5.com/usearch/
+
+ The following papers should be cited if this resource is used:
+
+ Edgar,RC, Haas,BJ, Clemente,JC, Quince,C, Knight,R (2011) UCHIME
+ improves sensitivity and speed of chimera detection, Bioinformatics
+ """
+ return help_str
+
+# Start Usearch61 convenience functions
+
+
+def usearch61_ref_cluster(seq_path,
+ refseqs_fp,
+ percent_id=0.97,
+ rev=False,
+ save_intermediate_files=True,
+ minlen=64,
+ output_dir='.',
+ remove_usearch_logs=False,
+ verbose=False,
+ wordlength=8,
+ usearch_fast_cluster=False,
+ usearch61_sort_method='abundance',
+ otu_prefix="denovo",
+ usearch61_maxrejects=32,
+ usearch61_maxaccepts=1,
+ sizeorder=False,
+ suppress_new_clusters=False,
+ threads=1.0,
+ HALT_EXEC=False
+ ):
+ """ Returns dictionary of cluster IDs:seq IDs
+
+ Overall function for reference-based clustering with usearch61
+
+ seq_path: fasta filepath to be clustered with usearch61
+ refseqs_fp: reference fasta filepath, used to cluster sequences against.
+ percent_id: percentage id to cluster at
+ rev: enable reverse strand matching for clustering
+ save_intermediate_files: Saves intermediate files created during clustering
+ minlen: minimum sequence length
+ output_dir: directory to output log, OTU mapping, and intermediate files
+    remove_usearch_logs: If True, suppresses creation of usearch log files
+ verbose: print current processing step to stdout
+ wordlength: word length to use for clustering
+ usearch_fast_cluster: Use usearch61 fast cluster option, not as memory
+ efficient as the default cluster_smallmem option, requires sorting by
+ length, and does not allow reverse strand matching.
+    usearch61_sort_method: sort sequences by 'abundance' or 'length' using
+    usearch61, or skip sorting by passing None.
+ otu_prefix: label to place in front of OTU IDs, used to prevent duplicate
+ IDs from appearing with reference based OTU picking.
+ usearch61_maxrejects: Number of rejects allowed by usearch61
+ usearch61_maxaccepts: Number of accepts allowed by usearch61
+ sizeorder: used for clustering based upon abundance of seeds (only applies
+ when doing open reference de novo clustering)
+    suppress_new_clusters: If True, prevents de novo clustering of sequences
+    that fail to hit the reference (closed-reference behavior).
+    threads: number of threads to allocate to usearch61 calls
+ HALT_EXEC: application controller option to halt execution.
+
+ Description of analysis workflows
+ ---------------------------------
+ closed-reference approach:
+ dereplicate sequences first, do reference based clustering,
+ merge clusters/failures and dereplicated data,
+ write OTU mapping and failures file.
+
+ open-reference approach:
+ dereplicate sequences first, do reference based clustering, parse failures,
+ sort failures fasta according to chosen method, cluster failures, merge
+ reference clustering results/de novo results/dereplicated data, write
+ OTU mapping file.
+
+ Dereplication should save processing time for large datasets.
+
+ """
+
+ files_to_remove = []
+
+ # Need absolute paths to avoid potential problems with app controller
+ if output_dir:
+ output_dir = join(abspath(output_dir), '')
+
+ seq_path = abspath(seq_path)
+
+ try:
+
+ if verbose:
+ print "Presorting sequences according to abundance..."
+ intermediate_fasta, dereplicated_uc, app_result =\
+ sort_by_abundance_usearch61(seq_path, output_dir, rev,
+ minlen, remove_usearch_logs, HALT_EXEC,
+ output_fna_filepath=join(
+ output_dir,
+ 'abundance_sorted.fna'),
+ output_uc_filepath=join(
+ output_dir,
+ 'abundance_sorted.uc'),
+ threads=threads)
+ if not save_intermediate_files:
+ files_to_remove.append(intermediate_fasta)
+ files_to_remove.append(dereplicated_uc)
+
+ if verbose:
+ print "Performing reference based clustering..."
+ clusters_fp, app_result = usearch61_cluster_ref(intermediate_fasta,
+ refseqs_fp, percent_id, rev, minlen, output_dir,
+ remove_usearch_logs, wordlength, usearch61_maxrejects,
+ usearch61_maxaccepts, HALT_EXEC,
+ output_uc_filepath=join(
+ output_dir,
+ 'ref_clustered.uc'),
+ threads=threads)
+ if not save_intermediate_files:
+ files_to_remove.append(clusters_fp)
+
+ clusters, failures =\
+ parse_usearch61_clusters(open(clusters_fp, "U"), otu_prefix="",
+ ref_clustered=True)
+ dereplicated_clusters =\
+ parse_dereplicated_uc(open(dereplicated_uc, "U"))
+ clusters = merge_clusters_dereplicated_seqs(clusters,
+ dereplicated_clusters)
+ failures = merge_failures_dereplicated_seqs(failures,
+ dereplicated_clusters)
+
+ if not suppress_new_clusters and failures:
+ if verbose:
+ print "Parsing out sequences that failed to cluster..."
+ failures_fasta = parse_usearch61_failures(seq_path, set(failures),
+ output_fasta_fp=join(output_dir, "failures_parsed.fna"))
+ if not save_intermediate_files:
+ files_to_remove.append(failures_fasta)
+ denovo_clusters = usearch61_denovo_cluster(failures_fasta,
+ percent_id, rev, save_intermediate_files, minlen, output_dir,
+ remove_usearch_logs, verbose, wordlength, usearch_fast_cluster,
+ usearch61_sort_method, otu_prefix, usearch61_maxrejects,
+ usearch61_maxaccepts, sizeorder, threads, HALT_EXEC)
+ failures = []
+
+ # Merge ref and denovo clusters
+ clusters.update(denovo_clusters)
+
+ except ApplicationError:
+ raise ApplicationError('Error running usearch61. Possible causes are '
+ 'an unsupported version (the currently supported version is '
+ 'usearch v6.1.544) or an improperly formatted input file')
+
+ except ApplicationNotFoundError:
+ remove_files(files_to_remove)
+ raise ApplicationNotFoundError('usearch61 not found, is it properly '
+ 'installed?')
+
+ if not save_intermediate_files:
+ remove_files(files_to_remove)
+
+ return clusters, failures
+
+
+def usearch61_denovo_cluster(seq_path,
+ percent_id=0.97,
+ rev=False,
+ save_intermediate_files=True,
+ minlen=64,
+ output_dir='.',
+ remove_usearch_logs=False,
+ verbose=False,
+ wordlength=8,
+ usearch_fast_cluster=False,
+ usearch61_sort_method='abundance',
+ otu_prefix="denovo",
+ usearch61_maxrejects=32,
+ usearch61_maxaccepts=1,
+ sizeorder=False,
+ threads=1.0,
+ HALT_EXEC=False,
+ file_prefix="denovo_"
+ ):
+ """ Returns dictionary of cluster IDs:seq IDs
+
+ Overall function for de novo clustering with usearch61
+
+ seq_path: fasta filepath to be clustered with usearch61
+ percent_id: percentage id to cluster at
+ rev: enable reverse strand matching for clustering
+ save_intermediate_files: Saves intermediate files created during clustering
+ minlen: minimum sequence length
+ output_dir: directory to output log, OTU mapping, and intermediate files
+ remove_usearch_logs: if True, suppresses creation of usearch log files
+ verbose: print current processing step to stdout
+ wordlength: word length to use for clustering
+ usearch_fast_cluster: Use the usearch61 cluster_fast option; it is less
+ memory efficient than the default cluster_smallmem option, sorts by
+ length automatically, and does not allow reverse strand matching.
+ usearch61_sort_method: Sort sequences by 'abundance' or 'length' using
+ usearch61, or skip sorting by passing None.
+ otu_prefix: label to place in front of OTU IDs, used to prevent duplicate
+ IDs from appearing with reference based OTU picking.
+ usearch61_maxrejects: Number of rejects allowed by usearch61
+ usearch61_maxaccepts: Number of accepts allowed by usearch61
+ sizeorder: used for clustering based upon abundance of seeds
+ threads: number of threads to use
+ HALT_EXEC: application controller option to halt execution.
+ file_prefix: prefix for intermediate filenames written by this function.
+ """
+
+ files_to_remove = []
+
+ # Need absolute paths to avoid potential problems with app controller
+ if output_dir:
+ output_dir = abspath(output_dir) + '/'
+ seq_path = abspath(seq_path)
+
+ try:
+ if verbose and usearch61_sort_method is not None and\
+ not usearch_fast_cluster:
+ print "Sorting sequences according to %s..." % usearch61_sort_method
+
+ # fast sorting option automatically performs length sorting
+ if usearch61_sort_method == 'abundance' and not usearch_fast_cluster:
+ intermediate_fasta, dereplicated_uc, app_result =\
+ sort_by_abundance_usearch61(seq_path, output_dir, rev,
+ minlen, remove_usearch_logs, HALT_EXEC,
+ output_fna_filepath=join(
+ output_dir,
+ file_prefix + 'abundance_sorted.fna'),
+ output_uc_filepath=join(output_dir,
+ file_prefix + 'abundance_sorted.uc'), threads=threads)
+ if not save_intermediate_files:
+ files_to_remove.append(intermediate_fasta)
+ files_to_remove.append(dereplicated_uc)
+ elif usearch61_sort_method == 'length' and not usearch_fast_cluster:
+ intermediate_fasta, app_result =\
+ sort_by_length_usearch61(seq_path, output_dir, minlen,
+ remove_usearch_logs, HALT_EXEC,
+ output_fna_filepath=join(output_dir,
+ file_prefix + 'length_sorted.fna'))
+ if not save_intermediate_files:
+ files_to_remove.append(intermediate_fasta)
+ else:
+ intermediate_fasta = seq_path
+
+ if verbose:
+ print "Clustering sequences de novo..."
+
+ if usearch_fast_cluster:
+ clusters_fp, app_result = usearch61_fast_cluster(
+ intermediate_fasta,
+ percent_id, minlen, output_dir, remove_usearch_logs, wordlength,
+ usearch61_maxrejects, usearch61_maxaccepts, HALT_EXEC,
+ output_uc_filepath=join(
+ output_dir,
+ file_prefix + 'fast_clustered.uc'), threads=threads)
+ if not save_intermediate_files:
+ files_to_remove.append(clusters_fp)
+ else:
+ clusters_fp, app_result =\
+ usearch61_smallmem_cluster(intermediate_fasta, percent_id,
+ minlen, rev, output_dir, remove_usearch_logs, wordlength,
+ usearch61_maxrejects, usearch61_maxaccepts, sizeorder, HALT_EXEC,
+ output_uc_filepath=join(output_dir,
+ file_prefix + 'smallmem_clustered.uc'))
+ if not save_intermediate_files:
+ files_to_remove.append(clusters_fp)
+
+ except ApplicationError:
+ raise ApplicationError('Error running usearch61. Possible causes are '
+ 'an unsupported version (the currently supported version is '
+ 'usearch v6.1.544) or an improperly formatted input file')
+
+ except ApplicationNotFoundError:
+ remove_files(files_to_remove)
+ raise ApplicationNotFoundError('usearch61 not found, is it properly ' +
+ 'installed?')
+
+ if usearch61_sort_method == 'abundance' and not usearch_fast_cluster:
+ de_novo_clusters, failures =\
+ parse_usearch61_clusters(open(clusters_fp, "U"), otu_prefix)
+ dereplicated_clusters =\
+ parse_dereplicated_uc(open(dereplicated_uc, "U"))
+ clusters = merge_clusters_dereplicated_seqs(de_novo_clusters,
+ dereplicated_clusters)
+
+ else:
+ clusters, failures =\
+ parse_usearch61_clusters(open(clusters_fp, "U"), otu_prefix)
+
+ if not save_intermediate_files:
+ remove_files(files_to_remove)
+
+ return clusters
+
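+# A minimal usage sketch for usearch61_denovo_cluster (hypothetical
+# filenames; assumes a usearch v6.1.544 binary named usearch61 on the PATH):
+#
+#   clusters = usearch61_denovo_cluster('seqs.fna', percent_id=0.97,
+#                                       output_dir='denovo_out/',
+#                                       save_intermediate_files=False)
+#   # clusters maps OTU IDs (e.g. 'denovo0') to lists of member seq IDs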
+
+# Start fasta sorting functions
+def sort_by_abundance_usearch61(seq_path,
+ output_dir='.',
+ rev=False,
+ minlen=64,
+ remove_usearch_logs=False,
+ HALT_EXEC=False,
+ output_fna_filepath=None,
+ output_uc_filepath=None,
+ log_name="abundance_sorted.log",
+ threads=1.0):
+ """ usearch61 application call to sort fasta file by abundance.
+
+ seq_path: fasta filepath to be clustered with usearch61
+ output_dir: directory to output log, OTU mapping, and intermediate files
+ rev: enable reverse strand matching for clustering/sorting
+ minlen: minimum sequence length
+ remove_usearch_logs: if True, suppresses creation of usearch log files
+ HALT_EXEC: application controller option to halt execution
+ output_fna_filepath: path to write the sorted fasta file
+ output_uc_filepath: path to write usearch61 generated .uc file
+ log_name: filename for the usearch61 generated log file (written to
+ output_dir)
+ threads: number of threads to use
+ """
+
+ if not output_fna_filepath:
+ _, output_fna_filepath = mkstemp(prefix='abundance_sorted',
+ suffix='.fna')
+
+ if not output_uc_filepath:
+ _, output_uc_filepath = mkstemp(prefix='abundance_sorted',
+ suffix='.uc')
+
+ log_filepath = join(output_dir, log_name)
+
+ params = {'--minseqlength': minlen,
+ '--sizeout': True,
+ '--derep_fulllength': seq_path,
+ '--output': output_fna_filepath,
+ '--uc': output_uc_filepath,
+ '--threads': threads
+ }
+
+ if rev:
+ params['--strand'] = 'both'
+ if not remove_usearch_logs:
+ params['--log'] = log_filepath
+
+ app = Usearch61(params, WorkingDir=output_dir, HALT_EXEC=HALT_EXEC)
+
+ app_result = app()
+
+ return output_fna_filepath, output_uc_filepath, app_result
+
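+# Sketch of a direct call (hypothetical filenames): this wraps usearch61's
+# --derep_fulllength command, so the returned triple is the dereplicated,
+# abundance-annotated fasta, the .uc file mapping replicates to seeds,
+# and the burrito result object:
+#
+#   fna, uc, res = sort_by_abundance_usearch61(
+#       'seqs.fna', output_dir='out/',
+#       output_fna_filepath='out/sorted.fna',
+#       output_uc_filepath='out/sorted.uc')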
+
+def sort_by_length_usearch61(seq_path,
+ output_dir=".",
+ minlen=64,
+ remove_usearch_logs=False,
+ HALT_EXEC=False,
+ output_fna_filepath=None,
+ log_name="length_sorted.log"):
+ """ usearch61 application call to sort fasta file by length.
+
+ seq_path: fasta filepath to be clustered with usearch61
+ output_dir: directory to output log, OTU mapping, and intermediate files
+ minlen: minimum sequence length
+ remove_usearch_logs: if True, suppresses creation of usearch log files
+ HALT_EXEC: application controller option to halt execution
+ output_fna_filepath: path to write the sorted fasta file
+ log_name: filename for the usearch61 generated log file (written to
+ output_dir)
+ """
+
+ if not output_fna_filepath:
+ _, output_fna_filepath = mkstemp(prefix='length_sorted', suffix='.fna')
+
+ log_filepath = join(output_dir, log_name)
+
+ params = {'--minseqlength': minlen,
+ '--sortbylength': seq_path,
+ '--output': output_fna_filepath
+ }
+ if not remove_usearch_logs:
+ params['--log'] = log_filepath
+
+ app = Usearch61(params, WorkingDir=output_dir, HALT_EXEC=HALT_EXEC)
+
+ app_result = app()
+
+ return output_fna_filepath, app_result
+
+# End fasta sorting functions
+
+# Start reference clustering functions
+
+
+def usearch61_cluster_ref(intermediate_fasta,
+ refseqs_fp,
+ percent_id=0.97,
+ rev=False,
+ minlen=64,
+ output_dir=".",
+ remove_usearch_logs=False,
+ wordlength=8,
+ usearch61_maxrejects=32,
+ usearch61_maxaccepts=1,
+ HALT_EXEC=False,
+ output_uc_filepath=None,
+ log_filepath="ref_clustered.log",
+ threads=1.0
+ ):
+ """ Cluster input fasta seqs against reference database
+
+ intermediate_fasta: fasta filepath to be clustered with usearch61
+ refseqs_fp: reference fasta filepath, used to cluster sequences against.
+ percent_id: percentage id to cluster at
+ rev: enable reverse strand matching for clustering
+ minlen: minimum sequence length
+ output_dir: directory to output log, OTU mapping, and intermediate files
+ remove_usearch_logs: if True, suppresses creation of usearch log files
+ wordlength: word length to use for clustering
+ usearch61_maxrejects: Number of rejects allowed by usearch61
+ usearch61_maxaccepts: Number of accepts allowed by usearch61
+ output_uc_filepath: path to write usearch61 generated .uc file
+ log_filepath: filename for the usearch61 generated log file (written to
+ output_dir)
+ threads: number of threads to use
+ HALT_EXEC: application controller option to halt execution.
+ """
+
+ log_filepath = join(output_dir, log_filepath)
+
+ params = {
+ '--usearch_global': intermediate_fasta,
+ '--db': refseqs_fp,
+ '--minseqlength': minlen,
+ '--id': percent_id,
+ '--uc': output_uc_filepath,
+ '--wordlength': wordlength,
+ '--maxrejects': usearch61_maxrejects,
+ '--maxaccepts': usearch61_maxaccepts,
+ '--threads': threads
+ }
+
+ if not remove_usearch_logs:
+ params['--log'] = log_filepath
+ if rev:
+ params['--strand'] = 'both'
+ else:
+ params['--strand'] = 'plus'
+
+ clusters_fp = output_uc_filepath
+
+ app = Usearch61(params, WorkingDir=output_dir, HALT_EXEC=HALT_EXEC)
+
+ app_result = app()
+
+ return clusters_fp, app_result
+
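+# Usage sketch (hypothetical paths): cluster dereplicated reads against a
+# reference database at 97% identity; hits and misses are recorded in the
+# .uc file, which parse_usearch61_clusters below can consume:
+#
+#   uc_fp, res = usearch61_cluster_ref(
+#       'out/sorted.fna', 'refs.fna', percent_id=0.97, output_dir='out/',
+#       output_uc_filepath='out/ref_clustered.uc')
+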
+# End reference clustering functions
+
+# Start de novo clustering functions
+
+
+def usearch61_fast_cluster(intermediate_fasta,
+ percent_id=0.97,
+ minlen=64,
+ output_dir=".",
+ remove_usearch_logs=False,
+ wordlength=8,
+ usearch61_maxrejects=8,
+ usearch61_maxaccepts=1,
+ HALT_EXEC=False,
+ output_uc_filepath=None,
+ log_name="fast_clustered.log",
+ threads=1.0):
+ """ Performs usearch61 de novo fast clustering via cluster_fast option
+
+ Sorts input by length automatically (no pre-sorting required) and does
+ not support reverse strand matching
+
+ intermediate_fasta: fasta filepath to be clustered with usearch61
+ percent_id: percentage id to cluster at
+ minlen: minimum sequence length
+ output_dir: directory to output log, OTU mapping, and intermediate files
+ remove_usearch_logs: if True, suppresses creation of usearch log files
+ wordlength: word length to use for initial high probability sequence matches
+ usearch61_maxrejects: Set to 'default' or an int value specifying max
+ rejects
+ usearch61_maxaccepts: Number of accepts allowed by usearch61
+ HALT_EXEC: application controller option to halt execution
+ output_uc_filepath: Path to write clusters (.uc) file.
+ log_name: filename for the usearch61 generated log file (written to
+ output_dir)
+ threads: number of threads to use
+ """
+
+ log_filepath = join(output_dir, log_name)
+
+ params = {'--minseqlength': minlen,
+ '--cluster_fast': intermediate_fasta,
+ '--id': percent_id,
+ '--uc': output_uc_filepath,
+ '--wordlength': wordlength,
+ '--maxrejects': usearch61_maxrejects,
+ '--maxaccepts': usearch61_maxaccepts,
+ '--usersort': True,
+ '--threads': threads
+ }
+
+ if not remove_usearch_logs:
+ params['--log'] = log_filepath
+
+ clusters_fp = output_uc_filepath
+
+ app = Usearch61(params, WorkingDir=output_dir, HALT_EXEC=HALT_EXEC)
+
+ app_result = app()
+
+ return clusters_fp, app_result
+
+
+def usearch61_smallmem_cluster(intermediate_fasta,
+ percent_id=0.97,
+ minlen=64,
+ rev=False,
+ output_dir=".",
+ remove_usearch_logs=False,
+ wordlength=8,
+ usearch61_maxrejects=32,
+ usearch61_maxaccepts=1,
+ sizeorder=False,
+ HALT_EXEC=False,
+ output_uc_filepath=None,
+ log_name="smallmem_clustered.log",
+ sizeout=False,
+ consout_filepath=None):
+ """ Performs usearch61 de novo clustering via cluster_smallmem option
+
+ Expects pre-sorted input (the --usersort flag is passed, so abundance-
+ or length-sorted data is accepted) and supports reverse strand matching
+ via the rev option
+
+ intermediate_fasta: fasta filepath to be clustered with usearch61
+ percent_id: percentage id to cluster at
+ minlen: minimum sequence length
+ rev: will enable reverse strand matching if True
+ output_dir: directory to output log, OTU mapping, and intermediate files
+ remove_usearch_logs: if True, suppresses creation of usearch log files
+ wordlength: word length to use for initial high probability sequence matches
+ usearch61_maxrejects: Set to 'default' or an int value specifying max
+ rejects
+ usearch61_maxaccepts: Number of accepts allowed by usearch61
+ HALT_EXEC: application controller option to halt execution
+ output_uc_filepath: Path to write clusters (.uc) file.
+ log_name: filename for the usearch61 generated log file (written to
+ output_dir)
+ sizeout: If True, will save abundance data in output fasta labels.
+ consout_filepath: Needs to be set to save clustered consensus fasta
+ filepath used for chimera checking.
+ """
+
+ log_filepath = join(output_dir, log_name)
+
+ params = {'--minseqlength': minlen,
+ '--cluster_smallmem': intermediate_fasta,
+ '--id': percent_id,
+ '--uc': output_uc_filepath,
+ '--wordlength': wordlength,
+ '--maxrejects': usearch61_maxrejects,
+ '--maxaccepts': usearch61_maxaccepts,
+ '--usersort': True
+ }
+
+ if sizeorder:
+ params['--sizeorder'] = True
+ if not remove_usearch_logs:
+ params['--log'] = log_filepath
+ if rev:
+ params['--strand'] = 'both'
+ else:
+ params['--strand'] = 'plus'
+ if sizeout:
+ params['--sizeout'] = True
+ if consout_filepath:
+ params['--consout'] = consout_filepath
+
+ clusters_fp = output_uc_filepath
+
+ app = Usearch61(params, WorkingDir=output_dir, HALT_EXEC=HALT_EXEC)
+
+ app_result = app()
+
+ return clusters_fp, app_result
+
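+# Sketch of the chimera-checking preparation path (hypothetical filenames):
+# passing consout_filepath makes usearch61 write per-cluster consensus
+# sequences, which usearch61_chimera_check_denovo below can consume:
+#
+#   uc_fp, res = usearch61_smallmem_cluster(
+#       'sorted.fna', sizeout=True,
+#       output_uc_filepath='clustered.uc',
+#       consout_filepath='consensus.fna')
+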
+# End de novo clustering functions
+
+# Start Chimera checking functions
+
+
+def usearch61_chimera_check_denovo(abundance_fp,
+ uchime_denovo_fp,
+ minlen=64,
+ output_dir=".",
+ remove_usearch_logs=False,
+ uchime_denovo_log_fp="uchime_denovo.log",
+ usearch61_minh=0.28,
+ usearch61_xn=8.0,
+ usearch61_dn=1.4,
+ usearch61_mindiffs=3,
+ usearch61_mindiv=0.8,
+ usearch61_abundance_skew=2.0,
+ HALT_EXEC=False):
+ """ Does de novo, abundance based chimera checking with usearch61
+
+ abundance_fp: input consensus fasta file with abundance information for
+ each cluster.
+ uchime_denovo_fp: output uchime file for chimera results.
+ minlen: minimum sequence length for usearch input fasta seqs.
+ output_dir: output directory
+ remove_usearch_logs: if True, suppresses creation of the log file.
+ uchime_denovo_log_fp: output filepath for log file.
+ usearch61_minh: Minimum score (h) to be classified as chimera.
+ Increasing this value tends to reduce the number of false positives
+ (and also sensitivity).
+ usearch61_xn: Weight of "no" vote. Increasing this value tends to
+ reduce the number of false positives (and also sensitivity).
+ usearch61_dn: Pseudo-count prior for "no" votes (n). Increasing this
+ value tends to reduce the number of false positives (and also
+ sensitivity).
+ usearch61_mindiffs: Minimum number of diffs in a segment. Increasing this
+ value tends to reduce the number of false positives while reducing
+ sensitivity to very low-divergence chimeras.
+ usearch61_mindiv: Minimum divergence, i.e. 100% - identity between the
+ query and closest reference database sequence. Expressed as a percentage,
+ so the default is 0.8%, which allows chimeras that are up to 99.2% similar
+ to a reference sequence.
+ usearch61_abundance_skew: abundance skew for de novo chimera comparisons.
+ HALT_EXEC: halt execution and return the command used by the app controller.
+ """
+
+ params = {'--minseqlength': minlen,
+ '--uchime_denovo': abundance_fp,
+ '--uchimeout': uchime_denovo_fp,
+ '--minh': usearch61_minh,
+ '--xn': usearch61_xn,
+ '--dn': usearch61_dn,
+ '--mindiffs': usearch61_mindiffs,
+ '--mindiv': usearch61_mindiv,
+ '--abskew': usearch61_abundance_skew
+ }
+
+ if not remove_usearch_logs:
+ params['--log'] = uchime_denovo_log_fp
+
+ app = Usearch61(params, WorkingDir=output_dir, HALT_EXEC=HALT_EXEC)
+
+ app_result = app()
+
+ return uchime_denovo_fp, app_result
+
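+# Usage sketch (hypothetical filenames; expects a consensus fasta carrying
+# ;size=N; annotations, e.g. from usearch61_smallmem_cluster with
+# sizeout=True and consout_filepath set):
+#
+#   uchime_fp, res = usearch61_chimera_check_denovo(
+#       'consensus.fna', 'chimeras_denovo.uchime', output_dir='out/')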
+
+def usearch61_chimera_check_ref(abundance_fp,
+ uchime_ref_fp,
+ reference_seqs_fp,
+ minlen=64,
+ output_dir=".",
+ remove_usearch_logs=False,
+ uchime_ref_log_fp="uchime_ref.log",
+ usearch61_minh=0.28,
+ usearch61_xn=8.0,
+ usearch61_dn=1.4,
+ usearch61_mindiffs=3,
+ usearch61_mindiv=0.8,
+ threads=1.0,
+ HALT_EXEC=False):
+ """ Does reference based chimera checking with usearch61
+
+ abundance_fp: input consensus fasta file with abundance information for
+ each cluster.
+ uchime_ref_fp: output uchime filepath for reference results
+ reference_seqs_fp: reference fasta database for chimera checking.
+ minlen: minimum sequence length for usearch input fasta seqs.
+ output_dir: output directory
+ remove_usearch_logs: if True, suppresses creation of the log file.
+ uchime_ref_log_fp: output filepath for log file.
+ usearch61_minh: Minimum score (h) to be classified as chimera.
+ Increasing this value tends to reduce the number of false positives
+ (and also sensitivity).
+ usearch61_xn: Weight of "no" vote. Increasing this value tends to
+ reduce the number of false positives (and also sensitivity).
+ usearch61_dn: Pseudo-count prior for "no" votes (n). Increasing this
+ value tends to reduce the number of false positives (and also
+ sensitivity).
+ usearch61_mindiffs: Minimum number of diffs in a segment. Increasing this
+ value tends to reduce the number of false positives while reducing
+ sensitivity to very low-divergence chimeras.
+ usearch61_mindiv: Minimum divergence, i.e. 100% - identity between the
+ query and closest reference database sequence. Expressed as a percentage,
+ so the default is 0.8%, which allows chimeras that are up to 99.2% similar
+ to a reference sequence.
+ threads: number of threads to use
+ HALT_EXEC: halt execution and return the command used by the app controller.
+ """
+
+ params = {'--minseqlength': minlen,
+ '--uchime_ref': abundance_fp,
+ '--uchimeout': uchime_ref_fp,
+ '--db': reference_seqs_fp,
+ '--minh': usearch61_minh,
+ '--xn': usearch61_xn,
+ '--dn': usearch61_dn,
+ '--mindiffs': usearch61_mindiffs,
+ '--mindiv': usearch61_mindiv,
+ # uchime_ref only works on the plus strand, per the usearch docs
+ '--strand': 'plus',
+ '--threads': threads
+ }
+
+ if not remove_usearch_logs:
+ params['--log'] = uchime_ref_log_fp
+
+ app = Usearch61(params, WorkingDir=output_dir, HALT_EXEC=HALT_EXEC)
+
+ app_result = app()
+
+ return uchime_ref_fp, app_result
+
+# End chimera checking functions
+
+# Start parsing functions
+
+
+def parse_dereplicated_uc(dereplicated_uc_lines):
+ """ Return dict of seq ID:dereplicated seq IDs from dereplicated .uc lines
+
+ dereplicated_uc_lines: list of lines of .uc file from dereplicated seqs from
+ usearch61 (i.e. open file of abundance sorted .uc data)
+ """
+
+ dereplicated_clusters = {}
+
+ seed_hit_ix = 0
+ seq_id_ix = 8
+ seed_id_ix = 9
+
+ for line in dereplicated_uc_lines:
+ if line.startswith("#") or len(line.strip()) == 0:
+ continue
+ curr_line = line.strip().split('\t')
+ if curr_line[seed_hit_ix] == "S":
+ dereplicated_clusters[curr_line[seq_id_ix]] = []
+ if curr_line[seed_hit_ix] == "H":
+ curr_seq_id = curr_line[seq_id_ix]
+ dereplicated_clusters[curr_line[seed_id_ix]].append(curr_seq_id)
+
+ return dereplicated_clusters
+
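+# Worked example: only the record type (column 0), query label (column 8)
+# and target label (column 9) of the tab-separated .uc records are used.
+# For a seed record S labelled 'seq1' followed by a hit record H with
+# query 'seq2' and target 'seq1', this function returns
+# {'seq1': ['seq2']}.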
+
+def parse_usearch61_clusters(clustered_uc_lines,
+ otu_prefix='denovo',
+ ref_clustered=False):
+ """ Returns dict of cluster ID:seq IDs
+
+ clustered_uc_lines: lines from .uc file resulting from de novo clustering
+ otu_prefix: string added to beginning of OTU ID.
+ ref_clustered: If True, will attempt to create dict keys for clusters as
+ they are read from the .uc file, rather than from seed lines.
+ """
+
+ clusters = {}
+ failures = []
+
+ seed_hit_ix = 0
+ otu_id_ix = 1
+ seq_id_ix = 8
+ ref_id_ix = 9
+
+ for line in clustered_uc_lines:
+ if line.startswith("#") or len(line.strip()) == 0:
+ continue
+ curr_line = line.strip().split('\t')
+ if curr_line[seed_hit_ix] == "S":
+ # Need to split on semicolons for sequence IDs to handle case of
+ # abundance sorted data
+ clusters[otu_prefix + curr_line[otu_id_ix]] =\
+ [curr_line[seq_id_ix].split(';')[0].split()[0]]
+ if curr_line[seed_hit_ix] == "H":
+ curr_id = curr_line[seq_id_ix].split(';')[0].split()[0]
+ if ref_clustered:
+ try:
+ clusters[otu_prefix + curr_line[ref_id_ix]].append(curr_id)
+ except KeyError:
+ clusters[otu_prefix + curr_line[ref_id_ix]] = [curr_id]
+ else:
+ clusters[otu_prefix +
+ curr_line[otu_id_ix]].append(curr_id)
+ if curr_line[seed_hit_ix] == "N":
+ failures.append(curr_line[seq_id_ix].split(';')[0])
+
+ return clusters, failures
+
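+# Worked example: labels are truncated at the first ';' so abundance
+# annotations are dropped. Given a seed record S (cluster 0, label
+# 'seq1;size=5;'), a hit record H (cluster 0, query 'seq3;size=2;') and
+# a no-hit record N (query 'seq9;size=1;'), calling this function with
+# otu_prefix='denovo' returns ({'denovo0': ['seq1', 'seq3']}, ['seq9']).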
+
+def merge_clusters_dereplicated_seqs(de_novo_clusters,
+ dereplicated_clusters):
+ """ combines de novo clusters and dereplicated seqs to OTU id:seqs dict
+
+ de_novo_clusters: dict of OTU ID:clustered sequences
+ dereplicated_clusters: dict of seq IDs: dereplicated seq IDs
+ """
+
+ clusters = {}
+
+ for curr_denovo_key in de_novo_clusters.keys():
+ clusters[curr_denovo_key] = de_novo_clusters[curr_denovo_key]
+ curr_clusters = []
+ for curr_denovo_id in de_novo_clusters[curr_denovo_key]:
+ curr_clusters += dereplicated_clusters[curr_denovo_id]
+ clusters[curr_denovo_key] += curr_clusters
+
+ return clusters
+
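+# Worked example: each clustered seed is expanded with its dereplicated
+# replicates, e.g.
+#
+#   merge_clusters_dereplicated_seqs({'denovo0': ['seq1']},
+#                                    {'seq1': ['seq2', 'seq4']})
+#   # -> {'denovo0': ['seq1', 'seq2', 'seq4']}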
+
+def merge_failures_dereplicated_seqs(failures,
+ dereplicated_clusters):
+ """ Appends failures from dereplicated seqs to failures list
+
+ failures: list of failures
+ dereplicated_clusters: dict of seq IDs: dereplicated seq IDs
+ """
+
+ curr_failures = set(failures)
+ dereplicated_ids = set(dereplicated_clusters)
+
+ for curr_failure in curr_failures:
+ if curr_failure in dereplicated_ids:
+ failures += dereplicated_clusters[curr_failure]
+
+ return failures
+
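+# Worked example: failures that were dereplication seeds drag in their
+# replicates, e.g.
+#
+#   merge_failures_dereplicated_seqs(['seq9'], {'seq9': ['seq12']})
+#   # -> ['seq9', 'seq12']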
+
+def parse_usearch61_failures(seq_path,
+ failures,
+ output_fasta_fp):
+ """ Parses seq IDs from failures list, writes to output_fasta_fp
+
+ seq_path: filepath of original input fasta file.
+ failures: list/set of failure seq IDs
+ output_fasta_fp: path to write parsed sequences
+ """
+
+ parsed_out = open(output_fasta_fp, "w")
+
+ for label, seq in parse_fasta(open(seq_path, "U")):
+ curr_label = label.split()[0]
+ if curr_label in failures:
+ parsed_out.write(">%s\n%s\n" % (label, seq))
+ parsed_out.close()
+ return output_fasta_fp
+
+# End parsing functions
diff --git a/bfillings/vsearch.py b/bfillings/vsearch.py
new file mode 100644
index 0000000..5b1969b
--- /dev/null
+++ b/bfillings/vsearch.py
@@ -0,0 +1,575 @@
+# -----------------------------------------------------------------------------
+# Copyright (c) 2015--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+# -----------------------------------------------------------------------------
+
+""" Application controller for vsearch v1.1.1 """
+
+from os.path import abspath, join, dirname
+
+from burrito.parameters import ValuedParameter, FlagParameter
+from burrito.util import (CommandLineApplication, ResultPath,
+ ApplicationError)
+
+
+class Vsearch(CommandLineApplication):
+
+ """ Vsearch ApplicationController """
+
+ _command = 'vsearch'
+ _input_handler = '_input_as_parameters'
+ _parameters = {
+ # Output to specified FASTA file
+ '--output': ValuedParameter('--', Name='output', Delimiter=' ',
+ IsPath=True),
+
+ # Filename for UCLUST-like output
+ '--uc': ValuedParameter('--', Name='uc', Delimiter=' ',
+ IsPath=True),
+
+ # Filename for BLAST-like tab-separated output
+ '--blast6out': ValuedParameter('--', Name='blast6out', Delimiter=' ',
+ IsPath=True),
+
+ # Identity threshold for clustering (typically 0.97, i.e. 97%)
+ '--id': ValuedParameter('--', Name='id', Delimiter=' ',
+ IsPath=False, Value=None),
+
+ # ID definition, 0-4=CD-HIT,all,int,MBL,BLAST (default vsearch: 2)
+ '--iddef': ValuedParameter('--', Name='iddef',
+ Delimiter=' ', IsPath=False,
+ Value=None),
+
+ # Number of hits to accept and show per strand (default vsearch: 1)
+ '--maxaccepts':
+ ValuedParameter('--', Name='maxaccepts', Delimiter=' ', Value=None),
+
+ # Number of non-matching hits to consider (default vsearch: 32)
+ '--maxrejects':
+ ValuedParameter('--', Name='maxrejects', Delimiter=' ', Value=None),
+
+ # Indicate that input sequences are presorted
+ '--usersort': FlagParameter('--', Name='usersort'),
+
+ # Take into account the abundance annotations present
+ # in the input fasta file
+ '--sizein': FlagParameter('--', Name='sizein'),
+
+ # Add abundance annotations to the output fasta files
+ '--sizeout': FlagParameter('--', Name='sizeout'),
+
+ # Dereplicate exact sequences in the given FASTA file
+ '--derep_fulllength': ValuedParameter('--', Name='derep_fulllength',
+ Delimiter=' ', IsPath=True),
+
+ # Dereplicate plus or both strands (default vsearch: plus)
+ '--strand': ValuedParameter('--', Name='strand', Delimiter=' ',
+ IsPath=False),
+
+ # Discard sequences with an abundance value greater than integer
+ '--maxuniquesize': ValuedParameter('--', Name='maxuniquesize',
+ Delimiter=' ', IsPath=False),
+
+ # Discard sequences with an abundance value smaller than integer
+ '--minuniquesize': ValuedParameter('--', Name='minuniquesize',
+ Delimiter=' ',
+ IsPath=False),
+
+ # Abundance sort sequences in given FASTA file
+ '--sortbysize': ValuedParameter('--', Name='sortbysize', Delimiter=' ',
+ IsPath=True),
+
+ # When using --sortbysize, discard sequences
+ # with an abundance value greater than maxsize
+ '--maxsize': ValuedParameter('--', Name='maxsize', Delimiter=' ',
+ IsPath=False),
+
+ # When using --sortbysize, discard sequences
+ # with an abundance value smaller than minsize
+ '--minsize': ValuedParameter('--', Name='minsize', Delimiter=' ',
+ IsPath=False),
+
+ # Output cluster consensus sequences to FASTA file
+ '--consout': ValuedParameter('--', Name='consout', Delimiter=' ',
+ IsPath=True),
+
+ # Chimera detection: min abundance ratio of parent vs chimera
+ # (default vsearch: 2.0)
+ '--abskew': ValuedParameter('--', Name='abskew', Delimiter=' ',
+ IsPath=False, Value=None),
+ # Detect chimeras de novo
+ '--uchime_denovo': ValuedParameter('--', Name='uchime_denovo',
+ Delimiter=' ', IsPath=True),
+
+ # Detect chimeras using a reference database
+ '--uchime_ref': ValuedParameter('--', Name='uchime_ref',
+ Delimiter=' ', IsPath=True),
+
+ # Output chimera alignments to 3-way alignment file (filepath)
+ '--uchimealns': ValuedParameter('--', Name='uchimealns', Delimiter=' ',
+ IsPath=True),
+
+ # Output chimeric sequences to file (filepath)
+ '--chimeras': ValuedParameter('--', Name='chimeras',
+ Delimiter=' ', IsPath=True),
+
+ # Output non-chimera filepath
+ '--nonchimeras': ValuedParameter('--', Name='nonchimeras',
+ Delimiter=' ', IsPath=True),
+
+ # Reference database for --uchime_ref
+ '--db': ValuedParameter('--', Name='db', Delimiter=' ', IsPath=True),
+
+ # Output chimera info to a tab-separated file
+ '--uchimeout': ValuedParameter('--', Name='uchimeout', Delimiter=' ',
+ IsPath=True),
+
+ # Number of computation threads to use (1 to 256)
+ # note: by default, keep the value set to 1 for all commands
+ # since otherwise (if no other value is given) VSEARCH will use
+ # all available cores
+ '--threads': ValuedParameter('--', Name='threads', Delimiter=' ',
+ IsPath=False, Value="1"),
+
+ # Write messages, timing and memory info to file
+ '--log': ValuedParameter('--', Name='log', Delimiter=' ',
+ IsPath=True)
+ }
+
+ _suppress_stdout = False
+ _suppress_stderr = False
+
+ def _input_as_parameters(self, data):
+ """ Set the input path (a fasta filepath)
+ """
+ # The list of values which can be passed on a per-run basis
+ allowed_values = ['--uc', '--output', '--sortbysize',
+ '--consout', '--uchime_denovo',
+ '--derep_fulllength', '--maxuniquesize',
+ '--minuniquesize', '--sizein',
+ '--sizeout', '--strand', '--threads',
+ '--uchime_ref', '--chimeras',
+ '--nonchimeras', '--db', '--uchimeout',
+ '--blast6out', '--abskew',
+ '--maxsize', '--minsize']
+
+ unsupported_parameters = set(data.keys()) - set(allowed_values)
+ if unsupported_parameters:
+ raise ApplicationError(
+ "Unsupported parameter(s) passed when calling vsearch: %s" %
+ ' '.join(unsupported_parameters))
+
+ for v in allowed_values:
+ # turn the parameter off so subsequent runs are not
+ # affected by parameter settings from previous runs
+ self.Parameters[v].off()
+ if v in data:
+ # turn the parameter on if specified by the user
+ self.Parameters[v].on(data[v])
+
+ return ''
+
+ def _get_result_paths(self, data):
+ """ Set the result paths """
+
+ result = {}
+
+ result['Output'] = ResultPath(
+ Path=self.Parameters['--output'].Value,
+ IsWritten=self.Parameters['--output'].isOn())
+
+ result['ClusterFile'] = ResultPath(
+ Path=self.Parameters['--uc'].Value,
+ IsWritten=self.Parameters['--uc'].isOn())
+
+ # uchime 3-way global alignments
+ result['Output_aln'] = ResultPath(
+ Path=self.Parameters['--uchimealns'].Value,
+ IsWritten=self.Parameters['--uchimealns'].isOn())
+
+ # uchime tab-separated format
+ result['Output_tabular'] = ResultPath(
+ Path=self.Parameters['--uchimeout'].Value,
+ IsWritten=self.Parameters['--uchimeout'].isOn())
+
+ # chimeras fasta file output
+ result['Output_chimeras'] = ResultPath(
+ Path=self.Parameters['--chimeras'].Value,
+ IsWritten=self.Parameters['--chimeras'].isOn())
+
+ # nonchimeras fasta file output
+ result['Output_nonchimeras'] = ResultPath(
+ Path=self.Parameters['--nonchimeras'].Value,
+ IsWritten=self.Parameters['--nonchimeras'].isOn())
+
+ # log file
+ result['LogFile'] = ResultPath(
+ Path=self.Parameters['--log'].Value,
+ IsWritten=self.Parameters['--log'].isOn())
+
+ return result
+
+ def getHelp(self):
+ """Method that points to documentation"""
+ help_str = """
+ VSEARCH is hosted at:
+ https://github.com/torognes/vsearch
+ Please cite the above URL if this wrapper is used in published work.
+ """
+ return help_str
+
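+# A minimal sketch of driving the controller directly (hypothetical paths;
+# the convenience functions below are the intended entry points):
+#
+#   app = Vsearch(WorkingDir='/tmp')
+#   app.Parameters['--derep_fulllength'].on('seqs.fna')
+#   app.Parameters['--output'].on('derep.fna')
+#   result = app()  # invokes the vsearch binary with the enabled parameters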
+
+def vsearch_dereplicate_exact_seqs(
+ fasta_filepath,
+ output_filepath,
+ output_uc=False,
+ working_dir=None,
+ strand="both",
+ maxuniquesize=None,
+ minuniquesize=None,
+ sizein=False,
+ sizeout=True,
+ log_name="derep.log",
+ HALT_EXEC=False):
+ """ Generates clusters and fasta file of
+ dereplicated subsequences
+
+ Parameters
+ ----------
+
+ fasta_filepath : string
+ input filepath of fasta file to be dereplicated
+ output_filepath : string
+ write the dereplicated sequences to output_filepath
+ output_uc : boolean, optional
+ output dereplication results in a file using a
+ uclust-like format
+ working_dir : string, optional
+ directory path for storing intermediate output
+ strand : string, optional
+ when searching for strictly identical sequences,
+ check both strands (default) or the plus
+ strand only
+ maxuniquesize : integer, optional
+ discard sequences with an abundance value greater
+ than maxuniquesize
+ minuniquesize : integer, optional
+ discard sequences with an abundance value smaller
+ than minuniquesize
+ sizein : boolean, optional
+ take into account the abundance annotations present in
+ the input fasta file, (search for the pattern
+ "[>;]size=integer[;]" in sequence headers)
+ sizeout : boolean, optional
+ add abundance annotations to the output fasta file
+ (add the pattern ";size=integer;" to sequence headers)
+ log_name : string, optional
+ specifies log filename
+ HALT_EXEC : boolean, optional
+ used for debugging app controller
+
+ Returns
+ -------
+
+ output_filepath : string
+ filepath to dereplicated fasta file
+ uc_filepath : string
+ filepath to dereplication results in uclust-like format
+ log_filepath : string
+ filepath to log file
+ """
+
+ # write all vsearch output files to same directory
+ # as output_filepath if working_dir is not specified
+ if not working_dir:
+ working_dir = dirname(abspath(output_filepath))
+
+ app = Vsearch(WorkingDir=working_dir, HALT_EXEC=HALT_EXEC)
+
+ log_filepath = join(working_dir, log_name)
+ uc_filepath = None
+ if output_uc:
+ uc_filepath = join(working_dir, 'vsearch_uc_dereplicated.uc')
+ app.Parameters['--uc'].on(uc_filepath)
+
+ if maxuniquesize:
+ app.Parameters['--maxuniquesize'].on(maxuniquesize)
+ if minuniquesize:
+ app.Parameters['--minuniquesize'].on(minuniquesize)
+ if sizein:
+ app.Parameters['--sizein'].on()
+ if sizeout:
+ app.Parameters['--sizeout'].on()
+ if (strand == "both" or strand == "plus"):
+ app.Parameters['--strand'].on(strand)
+ else:
+ raise ValueError("Option --strand accepts only 'both'"
+ "or 'plus' values")
+ app.Parameters['--derep_fulllength'].on(fasta_filepath)
+ app.Parameters['--output'].on(output_filepath)
+ app.Parameters['--log'].on(log_filepath)
+
+ app_result = app()
+
+ return output_filepath, uc_filepath, log_filepath
+
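+# Usage sketch (hypothetical filenames; assumes a vsearch >= 1.1.1 binary
+# on the PATH):
+#
+#   out_fp, uc_fp, log_fp = vsearch_dereplicate_exact_seqs(
+#       'seqs.fna', 'derep.fna', output_uc=True, minuniquesize=2)
+#   # derep.fna carries ";size=N;" annotations since sizeout is on by default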
+
+def vsearch_sort_by_abundance(
+ fasta_filepath,
+ output_filepath,
+ working_dir=None,
+ minsize=None,
+ maxsize=None,
+ log_name="abundance_sort.log",
+ HALT_EXEC=False):
+ """ Fasta entries are sorted by decreasing abundance
+ (Fasta entries are assumed to be dereplicated with
+ the pattern "[>;]size=integer[;]" present in the
+ read label, ex. use function vsearch_dereplicate_exact_seqs
+ prior to calling this function)
+
+ Parameters
+ ----------
+
+ fasta_filepath : string
+ input fasta file (dereplicated fasta)
+ output_filepath : string
+ output filepath for the sorted sequences in fasta format
+ working_dir : string, optional
+ working directory to store intermediate files
+ minsize : integer, optional
+ discard sequences with an abundance value smaller than
+ minsize
+ maxsize : integer, optional
+ discard sequences with an abundance value greater than
+ maxsize
+ log_name : string, optional
+ log filename
+ HALT_EXEC : boolean, optional
+ used for debugging app controller
+
+ Returns
+ -------
+
+ output_filepath : string
+ filepath to sorted fasta file
+ log_filepath : string
+ filepath to log file
+ """
+
+ # set working dir to same directory as the output
+ # file (if not provided)
+ if not working_dir:
+ working_dir = dirname(abspath(output_filepath))
+
+ app = Vsearch(WorkingDir=working_dir, HALT_EXEC=HALT_EXEC)
+
+ log_filepath = join(working_dir, log_name)
+
+ if minsize:
+ app.Parameters['--minsize'].on(minsize)
+
+ if maxsize:
+ app.Parameters['--maxsize'].on(maxsize)
+
+ app.Parameters['--sortbysize'].on(fasta_filepath)
+ app.Parameters['--output'].on(output_filepath)
+ app.Parameters['--log'].on(log_filepath)
+
+ app_result = app()
+
+ return output_filepath, log_filepath
+
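+# Usage sketch (hypothetical filenames): sort dereplicated reads by
+# decreasing abundance, discarding singletons:
+#
+#   sorted_fp, log_fp = vsearch_sort_by_abundance('derep.fna', 'sorted.fna',
+#                                                 minsize=2)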
+
+def vsearch_chimera_filter_de_novo(
+ fasta_filepath,
+ working_dir,
+ output_chimeras=True,
+ output_nonchimeras=True,
+ output_alns=False,
+ output_tabular=False,
+ log_name="vsearch_uchime_de_novo_chimera_filtering.log",
+ HALT_EXEC=False):
+ """ Detect chimeras present in the fasta-formatted filename,
+ without external references (i.e. de novo). Automatically
+ sort the sequences in filename by decreasing abundance
+ beforehand. Output chimeras and non-chimeras to FASTA files
+ and/or 3-way global alignments and/or tabular output.
+
+ Parameters
+ ----------
+
+ fasta_filepath : string
+ input fasta file (dereplicated fasta with pattern
+ [>;]size=integer[;] in the fasta header)
+ working_dir : string
+ directory path for all output files
+ output_chimeras : boolean, optional
+ output chimeric sequences to file, in fasta format
+ output_nonchimeras : boolean, optional
+ output nonchimeric sequences to file, in fasta format
+ output_alns : boolean, optional
+ output 3-way global alignments (parentA, parentB, chimera)
+ in human readable format to file
+ output_tabular : boolean, optional
+ output results using the uchime tab-separated format of
+ 18 fields (see Vsearch user manual)
+ HALT_EXEC : boolean, optional
+ used for debugging app controller
+
+ Returns
+ -------
+
+ output_chimera_filepath : string
+ filepath to chimeric fasta sequences
+ output_non_chimera_filepath : string
+ filepath to nonchimeric fasta sequences
+ output_alns_filepath : string
+ filepath to chimeric sequences alignment
+ file
+ output_tabular_filepath : string
+ filepath to chimeric sequences tabular
+ output file
+ log_filepath : string
+ filepath to log file
+ """
+
+ app = Vsearch(WorkingDir=working_dir, HALT_EXEC=HALT_EXEC)
+
+ if not (output_chimeras or
+ output_nonchimeras or
+ output_alns or
+ output_tabular):
+ raise ValueError("At least one output format (output_chimeras,"
+ "output_nonchimeras, output_alns, output_tabular)"
+ "must be selected")
+
+ output_chimera_filepath = None
+ output_non_chimera_filepath = None
+ output_alns_filepath = None
+ output_tabular_filepath = None
+
+ # set output filepaths
+ if output_chimeras:
+ output_chimera_filepath = join(working_dir, 'uchime_chimeras.fasta')
+ app.Parameters['--chimeras'].on(output_chimera_filepath)
+ if output_nonchimeras:
+ output_non_chimera_filepath = join(working_dir,
+ 'uchime_non_chimeras.fasta')
+ app.Parameters['--nonchimeras'].on(output_non_chimera_filepath)
+ if output_alns:
+ output_alns_filepath = join(working_dir, 'uchime_alignments.txt')
+ app.Parameters['--uchimealns'].on(output_alns_filepath)
+ if output_tabular:
+ output_tabular_filepath = join(working_dir, 'uchime_tabular.txt')
+ app.Parameters['--uchimeout'].on(output_tabular_filepath)
+ log_filepath = join(working_dir, log_name)
+
+ app.Parameters['--uchime_denovo'].on(fasta_filepath)
+ app.Parameters['--log'].on(log_filepath)
+
+ app_result = app()
+
+ return output_chimera_filepath, output_non_chimera_filepath,\
+ output_alns_filepath, output_tabular_filepath, log_filepath
+
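+# Usage sketch (hypothetical paths): screen dereplicated, size-annotated
+# reads for chimeras, keeping only the fasta outputs:
+#
+#   chim_fp, nonchim_fp, alns_fp, tab_fp, log_fp = \
+#       vsearch_chimera_filter_de_novo('derep.fna', '/tmp/out')
+#   # alns_fp and tab_fp are None unless output_alns/output_tabular are True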
+
+def vsearch_chimera_filter_ref(
+ fasta_filepath,
+ working_dir,
+ db_filepath,
+ output_chimeras=True,
+ output_nonchimeras=True,
+ output_alns=False,
+ output_tabular=False,
+ log_name="vsearch_uchime_ref_chimera_filtering.log",
+ threads=1,
+ HALT_EXEC=False):
+ """ Detect chimeras present in the fasta-formatted filename,
+ with an external reference (i.e. database). Output
+ chimeras and non-chimeras to FASTA files and/or 3-way
+ global alignments and/or tabular output.
+
+ Parameters
+ ----------
+
+ fasta_filepath : string
+ input fasta file (dereplicated fasta)
+ working_dir : string
+ directory path for all output files
+ db_filepath : string
+ filepath to reference database
+ output_chimeras : boolean, optional
+ output chimeric sequences to file, in fasta format
+ output_nonchimeras : boolean, optional
+ output nonchimeric sequences to file, in fasta format
+ output_alns : boolean, optional
+ output 3-way global alignments (parentA, parentB, chimera)
+ in human readable format to file
+ output_tabular : boolean, optional
+ output results using the uchime tab-separated format of
+ 18 fields (see Vsearch user manual)
+ threads : integer, optional
+ number of computation threads to use (1 to 256)
+ HALT_EXEC : boolean, optional
+ used for debugging app controller
+
+ Returns
+ -------
+
+ output_chimera_filepath : string
+ filepath to chimeric fasta sequences
+ output_non_chimera_filepath : string
+ filepath to nonchimeric fasta sequences
+ output_alns_filepath : string
+ filepath to chimeric sequences alignment
+ file
+ output_tabular_filepath : string
+ filepath to chimeric sequences tabular
+ output file
+ log_filepath : string
+ filepath to log file
+ """
+
+ app = Vsearch(WorkingDir=working_dir, HALT_EXEC=HALT_EXEC)
+
+ if not (output_chimeras or
+ output_nonchimeras or
+ output_alns or
+ output_tabular):
+ raise ValueError("At least one output format (output_chimeras,"
+ "output_nonchimeras, output_alns, output_tabular)"
+ "must be selected")
+
+ output_chimera_filepath = None
+ output_non_chimera_filepath = None
+ output_alns_filepath = None
+ output_tabular_filepath = None
+
+ # set output filepaths
+ if output_chimeras:
+ output_chimera_filepath = join(working_dir, 'uchime_chimeras.fasta')
+ app.Parameters['--chimeras'].on(output_chimera_filepath)
+ if output_nonchimeras:
+ output_non_chimera_filepath = join(working_dir,
+ 'uchime_non_chimeras.fasta')
+ app.Parameters['--nonchimeras'].on(output_non_chimera_filepath)
+ if output_alns:
+ output_alns_filepath = join(working_dir, 'uchime_alignments.txt')
+ app.Parameters['--uchimealns'].on(output_alns_filepath)
+ if output_tabular:
+ output_tabular_filepath = join(working_dir, 'uchime_tabular.txt')
+ app.Parameters['--uchimeout'].on(output_tabular_filepath)
+ log_filepath = join(working_dir, log_name)
+
+ app.Parameters['--db'].on(db_filepath)
+ app.Parameters['--uchime_ref'].on(fasta_filepath)
+ app.Parameters['--log'].on(log_filepath)
+
+ app_result = app()
+
+ return output_chimera_filepath, output_non_chimera_filepath,\
+ output_alns_filepath, output_tabular_filepath, log_filepath
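+
+# Usage sketch (hypothetical paths): reference-based screening mirrors the
+# de novo variant but additionally requires db_filepath:
+#
+#   results = vsearch_chimera_filter_ref('derep.fna', '/tmp/out',
+#                                        'gold.fasta', output_tabular=True)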
diff --git a/debian/changelog b/debian/changelog
deleted file mode 100644
index e3bc501..0000000
--- a/debian/changelog
+++ /dev/null
@@ -1,35 +0,0 @@
-python-burrito-fillings (0.1.1-1) UNRELEASED; urgency=medium
-
- * Initial upload to Debian (Closes: #800739)
- * Enhance long description
- * cme fix dpkg-control (found need to s/clustal-w/clustalw/ !)
-
- -- Andreas Tille <tille at debian.org> Sat, 03 Oct 2015 07:21:52 +0200
-
-python-burrito-fillings (0.1.1-0biolinux1) trusty; urgency=medium
-
- * Bugfix release to go with burrito 0.9.1
- * Refreshed patches
- * Patched RDP Classifier runner - see notes in the patch
- * Patched out 3 internal tests which now fail and we don't need
- them
-
- -- Tim Booth <tbooth at ceh.ac.uk> Wed, 29 Jul 2015 14:35:48 +0100
-
-python-burrito-fillings (0.1.0-0biolinux4) trusty; urgency=medium
-
- * Limit to 64-bit architectures. Some deps are 64-bit only.
- * Insist on newer rdp-classifier, and modify tests to match
- * Disable BLAST tests as they fail on launchpad and I have no
- way to debug the failure
-
- -- Tim Booth <tbooth at ceh.ac.uk> Thu, 05 Mar 2015 16:25:18 +0000
-
-python-burrito-fillings (0.1.0-0biolinux1) trusty; urgency=medium
-
- * Initial release for QIIME 1.9
- * Remove many tests - see comments in rules
- * Lots of patches
- * Build for Python2, as QIIME is Python2 only.
-
- -- Tim Booth <tbooth at ceh.ac.uk> Thu, 05 Mar 2015 18:10:56 +0000
diff --git a/debian/compat b/debian/compat
deleted file mode 100644
index ec63514..0000000
--- a/debian/compat
+++ /dev/null
@@ -1 +0,0 @@
-9
diff --git a/debian/control b/debian/control
deleted file mode 100644
index 435bfc6..0000000
--- a/debian/control
+++ /dev/null
@@ -1,67 +0,0 @@
-Source: python-burrito-fillings
-Maintainer: Debian Med Packaging Team <debian-med-packaging at lists.alioth.debian.org>
-Uploaders: Tim Booth <tbooth at ceh.ac.uk>,
- Andreas Tille <tille at debian.org>
-Section: python
-Priority: optional
-Build-Depends: debhelper (>= 9),
- python-all (>= 2.7),
- dh-python,
- python-burrito,
- python-skbio,
- python-cogent,
- python-lockfile,
- python-setuptools,
- python-tk,
- blast2,
- bwa,
- clearcut,
- muscle,
- parsinsert,
- raxml,
- rdp-classifier,
- sortmerna,
- sumatra,
- sumaclust,
- swarm,
- vsearch (>= 1.1.3)
-Standards-Version: 3.9.6
-Vcs-Browser: http://anonscm.debian.org/viewvc/debian-med/trunk/packages/python-burrito-fillings/trunk/
-Vcs-Svn: svn://anonscm.debian.org/debian-med/trunk/packages/python-burrito-fillings/trunk/
-Homepage: https://github.com/biocore/burrito-fillings
-
-Package: python-burrito-fillings
-Architecture: amd64 kfreebsd-amd64
-Depends: ${shlibs:Depends},
- ${misc:Depends},
- ${python:Depends}
-Recommends: blast2,
- bwa,
- cd-hit,
- clearcut,
- clustalw,
- ea-utils,
- fasttree,
- infernal,
- mafft,
- mothur,
- muscle,
- parsinsert,
- raxml,
- rdp-classifier,
-# rtax,
- seqprep,
- sortmerna,
- sumatra,
- swarm,
- vsearch
-Description: burrito application controllers for bioinformatics
- The burrito-fillings project provides wrappers for bioinformatics tools
- using the burrito framework.
- .
- burrito-fillings (canonically pronounced boar-ee-toe phil-ings; python
- package name bfillings) contains burrito CommandLineApplication
- subclasses (i.e., application controllers) for bioinformatics
- applications. This is intended to be a temporary package for the
- application controllers that are used in QIIME as we figure out which of
- these we will continue to support.
diff --git a/debian/copyright b/debian/copyright
deleted file mode 100644
index b5507e5..0000000
--- a/debian/copyright
+++ /dev/null
@@ -1,35 +0,0 @@
-Format: http://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
-Upstream-Name: burrito-fillings
-Upstream-Contact: gregcaporaso at gmail.com
-Source: https://github.com/biocore/burrito/
-
-Files: *
-Copyright: © burrito development team <gregcaporaso at gmail.com>
-License:
- Copyright (c) 2014, burrito development team.
- All rights reserved.
- .
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
- .
- * Redistributions of source code must retain the above copyright notice, this
- list of conditions and the following disclaimer.
- .
- * Redistributions in binary form must reproduce the above copyright notice,
- this list of conditions and the following disclaimer in the documentation
- and/or other materials provided with the distribution.
- .
- * Neither the names burrito or biocore nor the names of its
- contributors may be used to endorse or promote products derived from
- this software without specific prior written permission.
- .
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/debian/patches/cd_hit_leaves_no_bak_file b/debian/patches/cd_hit_leaves_no_bak_file
deleted file mode 100644
index c55c563..0000000
--- a/debian/patches/cd_hit_leaves_no_bak_file
+++ /dev/null
@@ -1,32 +0,0 @@
-This is a port of the accept_newer_cdhit patch from python-cogent.
-The code in question seems to be copied from Cogent in the first place.
-
---- a/bfillings/cd_hit.py
-+++ b/bfillings/cd_hit.py
-@@ -269,7 +269,12 @@
- # perform cleanup
- res.cleanUp()
- shutil.rmtree(working_dir)
-- remove(params['-o'] + '.bak.clstr')
-+ try:
-+ remove(params['-o'] + '.bak.clstr')
-+ except:
-+ #No file to clean up from later CD-HIT
-+ pass
-+
-
- return remapped_clusters
-
-@@ -311,7 +316,11 @@
- # perform cleanup
- res.cleanUp()
- shutil.rmtree(working_dir)
-- remove(params['-o'] + '.bak.clstr')
-+ try:
-+ remove(params['-o'] + '.bak.clstr')
-+ except:
-+ #No file to clean up from later CD-HIT
-+ pass
-
- return SequenceCollection(new_seqs, MolType=moltype)
-
diff --git a/debian/patches/handle_renamed_binaries b/debian/patches/handle_renamed_binaries
deleted file mode 100644
index a7ea520..0000000
--- a/debian/patches/handle_renamed_binaries
+++ /dev/null
@@ -1,168 +0,0 @@
-# Some binaries are renamed when packaging, mostly to satisfy the guideline
-# that there should be no capitalization.
-# Also explicit call to /usr/lib/mafft/bin/mafft-profile
-# I was also going to use the rdp_classifier wrapper in Debian but it's
-# too much faff. Instead look for /usr/share/java/rdp_classifier.jar
-
---- a/bfillings/parsinsert.py
-+++ b/bfillings/parsinsert.py
-@@ -29,7 +29,7 @@
- class ParsInsert(CommandLineApplication):
- """ParsInsert application Controller"""
-
-- _command = 'ParsInsert'
-+ _command = 'parsinsert'
- _input_handler = '_input_as_multiline_string'
- _parameters = {
- # read mask from this file
---- a/bfillings/seqprep.py
-+++ b/bfillings/seqprep.py
-@@ -30,7 +30,7 @@
- class SeqPrep(CommandLineApplication):
-
- """SeqPrep application controller for joining paired-end reads"""
-- _command = 'SeqPrep'
-+ _command = 'seqprep'
- _parameters = {
- # Required Arguments
- # -f <first read input fastq filename>
-@@ -232,7 +232,7 @@
- """seqprep help"""
- help_str = """
- For basic help, type the following at the command line:
-- 'SeqPrep -h'
-+ 'seqprep -h'
-
- Website:
- https://github.com/jstjohn/SeqPrep
---- a/bfillings/fasttree.py
-+++ b/bfillings/fasttree.py
-@@ -27,7 +27,7 @@
- class FastTree(CommandLineApplication):
- """FastTree application Controller"""
-
-- _command = 'FastTree'
-+ _command = 'fasttree'
- _input_handler = '_input_as_multiline_string'
- _parameters = {
- '-quiet':FlagParameter('-',Name='quiet'),
---- a/bfillings/mafft.py
-+++ b/bfillings/mafft.py
-@@ -439,7 +439,7 @@
- app = Mafft(InputHandler='_input_as_paths',\
- params=params,
- SuppressStderr=False)
-- app._command = 'mafft-profile'
-+ app._command = '/usr/lib/mafft/bin/mafft-profile'
-
- aln1_path = app._tempfile_as_multiline_string(aln1_int_map.toFasta())
- aln2_path = app._tempfile_as_multiline_string(aln2_int_map.toFasta())
---- a/bfillings/tests/test_parsinsert.py
-+++ b/bfillings/tests/test_parsinsert.py
-@@ -64,7 +64,7 @@
-
- app = ParsInsert()
- self.assertEqual(app.BaseCommand, \
-- ''.join(['cd "',getcwd(),'/"; ','ParsInsert']))
-+ ''.join(['cd "',getcwd(),'/"; ','parsinsert']))
-
- def test_change_working_dir(self):
- """Change working dir"""
-@@ -72,7 +72,7 @@
- app = ParsInsert(WorkingDir='/tmp/ParsInsertTest')
- self.assertEqual(app.BaseCommand, \
- ''.join(['cd "','/tmp/ParsInsertTest',\
-- '/"; ','ParsInsert']))
-+ '/"; ','parsinsert']))
-
- rmtree('/tmp/ParsInsertTest')
-
---- a/bfillings/rdp_classifier.py
-+++ b/bfillings/rdp_classifier.py
-@@ -32,12 +32,10 @@
- """RDP Classifier application controller
-
- The RDP Classifier program is distributed as a java archive (.jar)
-- file. If the file 'rdp_classifier-2.2.jar' is not found in the
-- current directory, the app controller uses the JAR file specified
-+ file. If set, the app controller uses the JAR file specified
- by the environment variable RDP_JAR_PATH. If this variable is not
-- set, and 'rdp_classifier-2.2.jar' is not found in the current
-- directory, the application controller raises an
-- ApplicationNotFoundError.
-+ set, and '/usr/share/java/rdp_classifier.jar' is not found,
-+ the application controller raises an ApplicationNotFoundError.
-
- The RDP Classifier often requires memory in excess of Java's
- default 64M. To correct this situation, the authors recommend
-@@ -51,7 +49,7 @@
- '-training-data'.
- """
- _input_handler = '_input_as_lines'
-- _command = "rdp_classifier-2.2.jar"
-+ _command = "rdp_classifier.jar"
- _options = {
- # output file name for classification assignment
- '-o': ValuedParameter('-', Name='o', Delimiter=' ', IsPath=True),
-@@ -140,7 +138,7 @@
- jar_fp = self._get_jar_fp()
- if jar_fp is None:
- raise ApplicationNotFoundError(
-- "JAR file not found in current directory and the RDP_JAR_PATH "
-+ "JAR file not found in /usr/share/java and the RDP_JAR_PATH "
- "environment variable is not set. Please set RDP_JAR_PATH to "
- "the full pathname of the JAR file.")
- if not os.path.exists(jar_fp):
-@@ -150,19 +148,9 @@
- def _get_jar_fp(self):
- """Returns the full path to the JAR file.
-
-- If the JAR file cannot be found in the current directory and
-- the environment variable RDP_JAR_PATH is not set, returns
-- None.
-+ If the RDP_JAR_PATH is not set, returns /usr/share/java/rdp_classifier.jar
- """
-- # handles case where the jar file is in the current working directory
-- if os.path.exists(self._command):
-- return self._command
-- # handles the case where the user has specified the location via
-- # an environment variable
-- elif 'RDP_JAR_PATH' in environ:
-- return getenv('RDP_JAR_PATH')
-- else:
-- return None
-+ return getenv('RDP_JAR_PATH', '/usr/share/java/rdp_classifier.jar')
-
- # Overridden to pull out JVM-specific command-line arguments.
- def _get_base_command(self):
---- a/bfillings/tests/test_rdp_classifier.py
-+++ b/bfillings/tests/test_rdp_classifier.py
-@@ -27,7 +27,7 @@
- if 'RDP_JAR_PATH' in environ:
- self.user_rdp_jar_path = environ['RDP_JAR_PATH']
- else:
-- self.user_rdp_jar_path = 'rdp_classifier-2.2.jar'
-+ self.user_rdp_jar_path = '/usr/share/java/rdp_classifier.jar'
- self.output_file = tempfile.NamedTemporaryFile()
-
- def test_default_java_vm_parameters(self):
---- a/bfillings/swarm_v127.py
-+++ b/bfillings/swarm_v127.py
-@@ -106,7 +106,7 @@
-
- Return: clusters, a list of lists
- """
-- swarm_breaker_command = ["swarm_breaker.py",
-+ swarm_breaker_command = ["/usr/share/swarm/scripts/swarm_breaker.py",
- "-f",
- seq_path,
- "-s",
-@@ -140,7 +140,7 @@
- clusters.append(seq_ids)
- except OSError:
- raise ApplicationNotFoundError("Cannot find swarm_breaker.py "
-- "in the $PATH directories.")
-+ "in the expected location /usr/share/swarm/scripts.")
-
- return clusters
-
diff --git a/debian/patches/mothur_skip_list_header b/debian/patches/mothur_skip_list_header
deleted file mode 100644
index 3efe1e8..0000000
--- a/debian/patches/mothur_skip_list_header
+++ /dev/null
@@ -1,63 +0,0 @@
-This fixes the main error revealed by the tests, but they still fail as the output is
-not byte identical.
---- a/bfillings/mothur.py
-+++ b/bfillings/mothur.py
-@@ -52,7 +52,10 @@
- tokens = line.strip().split('\t')
-
- distance_str = tokens.pop(0)
-- if distance_str.lstrip().lower().startswith('u'):
-+ if distance_str.lstrip().lower().startswith('l'):
-+ #This is the header line
-+ continue
-+ elif distance_str.lstrip().lower().startswith('u'):
- distance = 0.0
- elif distance_str == '0.0':
- distance = float(precision)
---- a/bfillings/tests/test_mothur.py
-+++ b/bfillings/tests/test_mothur.py
-@@ -121,7 +121,7 @@
- """Mothur.__call__() should return correct otu's for input as single string"""
- app = Mothur()
- result = app(self.small_fasta)
-- observed_otus = result['otu list'].read()
-+ observed_otus = result['otu list'].read().split('\n',1)[1]
- self.assertEquals(observed_otus, self.small_otus)
- result.cleanUp()
-
-@@ -130,7 +130,7 @@
- lines = self.small_fasta.split('\n')
- app = Mothur(InputHandler='_input_as_lines')
- result = app(lines)
-- observed_otus = result['otu list'].read()
-+ observed_otus = result['otu list'].read().split('\n',1)[1]
- self.assertEquals(observed_otus, self.small_otus)
- result.cleanUp()
-
-@@ -142,7 +142,7 @@
- f.write(self.small_fasta)
- app = Mothur(InputHandler='_input_as_path', WorkingDir=working_dir)
- result = app(filename)
-- observed_otus = result['otu list'].read()
-+ observed_otus = result['otu list'].read().split('\n',1)[1]
- self.assertEquals(observed_otus, self.small_otus)
- remove(filename)
- result.cleanUp()
-@@ -153,7 +153,7 @@
- working_dir = mkdtemp()
- app = Mothur(WorkingDir=working_dir)
- result = app(self.small_fasta)
-- observed_otus = result['otu list'].read()
-+ observed_otus = result['otu list'].read().split('\n',1)[1]
- self.assertEquals(observed_otus, self.small_otus)
- result.cleanUp()
- rmdir(working_dir)
-@@ -162,7 +162,7 @@
- """Mothur.__call__() should return correct otu's for input sequences which are reverse complements"""
- app = Mothur()
- result = app(self.complement_fasta)
-- observed_otus = result['otu list'].read()
-+ observed_otus = result['otu list'].read().split('\n',1)[1]
- self.assertEquals(observed_otus, self.complement_otus)
- result.cleanUp()
-
diff --git a/debian/patches/no_set_blastmat b/debian/patches/no_set_blastmat
deleted file mode 100644
index 3a48d58..0000000
--- a/debian/patches/no_set_blastmat
+++ /dev/null
@@ -1,12 +0,0 @@
-BLAST on Debian does not need this variable set, so suppress the error.
---- a/bfillings/blast.py
-+++ b/bfillings/blast.py
-@@ -168,7 +168,7 @@
- access(path.expanduser("~/.ncbirc"), F_OK) or \
- access(".ncbirc", F_OK)):
- ## SHOULD THIS BE CHANGED TO RAISE AN ApplicationError?
-- raise RuntimeError, blastmat_error_message
-+ pass
- self._command = command
-
- super(Blast, self).__init__(params=params,
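
For context, the check being neutered looks for a BLASTMAT environment
variable or an .ncbirc file before legacy BLAST is run; Debian's blast
packages locate their scoring matrices without either, hence the pass. A
rough reconstruction of the surrounding check (the message text is
illustrative):

    from os import environ, path, access, F_OK

    blastmat_error_message = (
        "BLASTMAT is not set and no .ncbirc file was found; legacy BLAST "
        "may be unable to locate its scoring matrices.")  # illustrative

    def check_blast_matrices():
        if not ('BLASTMAT' in environ or
                access(path.expanduser("~/.ncbirc"), F_OK) or
                access(".ncbirc", F_OK)):
            # Upstream raised RuntimeError here (old Python 2 raise syntax);
            # the Debian patch turned the branch into a no-op.
            pass
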
diff --git a/debian/patches/rdp_classifier_2.10 b/debian/patches/rdp_classifier_2.10
deleted file mode 100644
index fc5fe82..0000000
--- a/debian/patches/rdp_classifier_2.10
+++ /dev/null
@@ -1,106 +0,0 @@
-The newer RDP classifier takes the same params but needs parameter names
-to precede them. The whole way Burrito handles this is broken, so
-this is a crude patch-up.
-
-Also, the new RDP Classifier JAR uses the entry point:
-edu.msu.cme.rdp.classifier.cli.ClassifierMain
-and not:
-edu.msu.cme.rdp.classifier.ClassifierCmd
-
-On cursory inspection, the default behaviour of the new entry point
-looks the same as that of the old one, but for Burrito it isn't.
-This change was made in RDP Classifier ages ago but I only just
-fixed the entry point in the DEB and thus triggered the bug. The patch
-calls the entry point explicitly.
-
---- a/bfillings/rdp_classifier.py
-+++ b/bfillings/rdp_classifier.py
-@@ -162,7 +162,7 @@
- jvm_command = "java"
- jvm_arguments = self._commandline_join(
- [self.Parameters[k] for k in self._jvm_parameters])
-- jar_arguments = '-jar "%s"' % self._get_jar_fp()
-+ jar_arguments = '-cp "%s" edu.msu.cme.rdp.classifier.ClassifierCmd' % self._get_jar_fp()
- rdp_arguments = self._commandline_join(
- [self.Parameters[k] for k in self._options])
-
-@@ -197,11 +197,11 @@
- PropertiesFile = 'RdpClassifier.properties'
-
- _parameters = {
-- 'taxonomy_file': ValuedParameter(None, None, IsPath=True),
-- 'model_output_dir': ValuedParameter(None, None, IsPath=True),
-- 'training_set_id': ValuedParameter(None, None, Value='1'),
-- 'taxonomy_version': ValuedParameter(None, None, Value='version1'),
-- 'modification_info': ValuedParameter(None, None, Value='cogent'),
-+ 'taxonomy_file': ValuedParameter('-', Name='t', IsPath=True),
-+ 'model_output_dir': ValuedParameter('-', Name='o', IsPath=True),
-+ 'training_set_id': ValuedParameter('-', Name='n', Value='1'),
-+ 'taxonomy_version': ValuedParameter('-', Name='v', Value='version1'),
-+ 'modification_info': ValuedParameter('-', Name='m', Value='cogent'),
- }
- _jvm_parameters = {
- # Maximum heap size for JVM.
-@@ -253,11 +253,11 @@
- input_handler = getattr(self, self.__InputHandler)
- input_parts = [
- self.Parameters['taxonomy_file'],
-- input_handler(data),
-+ '-s ' + input_handler(data),
- self.Parameters['training_set_id'],
- self.Parameters['taxonomy_version'],
- self.Parameters['modification_info'],
-- self.ModelDir,
-+ '-o ' + self.ModelDir,
- ]
- return self._commandline_join(input_parts)
-
---- a/bfillings/tests/test_rdp_classifier.py
-+++ b/bfillings/tests/test_rdp_classifier.py
-@@ -13,7 +13,7 @@
- from os import getcwd, environ, remove, listdir
- from shutil import rmtree
- import tempfile
--from unittest import TestCase, main
-+from unittest import TestCase, main, expectedFailure
-
- from bfillings.rdp_classifier import (RdpClassifier, RdpTrainer, assign_taxonomy,
- train_rdp_classifier,
-@@ -42,6 +42,7 @@
- parameters.sort()
- self.assertEqual(parameters, ['-Xmx', '-f', '-o', '-t'])
-
-+ @expectedFailure
- def test_assign_jvm_parameters(self):
- """RdpCalssifier should pass alternate parameters to Java VM."""
- app = RdpClassifier()
-@@ -56,6 +57,7 @@
- app = RdpClassifier()
- self.assertEqual(app.BaseCommand, app._get_base_command())
-
-+ @expectedFailure
- def test_base_command(self):
- """RdpClassifier should return expected shell command."""
- app = RdpClassifier()
-@@ -64,6 +66,7 @@
- self.user_rdp_jar_path, '" -q'])
- self.assertEqual(app.BaseCommand, exp)
-
-+ @expectedFailure
- def test_change_working_dir(self):
- """RdpClassifier should run program in expected working directory."""
- test_dir = '/tmp/RdpTest'
-@@ -387,10 +390,10 @@
- rdp_expected_out = {
- 'AY800210 description field': 'Archaea;Euryarchaeota',
- 'EU883771': 'Archaea;Euryarchaeota;Methanomicrobia;Methanomicrobiales;Methanomicrobiaceae;Methanomicrobium',
-- 'EF503699': 'Archaea;Crenarchaeota;Thermoprotei',
-+ 'EF503699': 'Archaea;Thaumarchaeota;Nitrososphaerales;Nitrososphaerales;Nitrososphaeraceae;Nitrososphaera',
- 'random_seq': 'Bacteria',
- 'DQ260310': 'Archaea;Euryarchaeota;Methanobacteria;Methanobacteriales;Methanobacteriaceae;Methanosphaera',
-- 'EF503697': 'Archaea;Crenarchaeota;Thermoprotei',
-+ 'EF503697': 'Archaea;Thaumarchaeota;Nitrososphaerales;Nitrososphaerales;Nitrososphaeraceae;Nitrososphaera',
- 'short_seq': 'Unassignable',
- }
-
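
The central trick is swapping "java -jar", which uses the jar manifest's
Main-Class (ClassifierMain in 2.10), for "java -cp" plus the legacy
ClassifierCmd entry point whose argument handling burrito expects.
Schematically, with an illustrative jar path and heap setting:

    jar_fp = "/usr/share/java/rdp_classifier.jar"  # illustrative path

    # Old invocation: the jar manifest chooses the entry point.
    old_cmd = 'java -Xmx1000m -jar "%s"' % jar_fp

    # Patched invocation: name the legacy entry point explicitly.
    new_cmd = ('java -Xmx1000m -cp "%s" '
               'edu.msu.cme.rdp.classifier.ClassifierCmd') % jar_fp
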
diff --git a/debian/patches/series b/debian/patches/series
deleted file mode 100644
index 80fd631..0000000
--- a/debian/patches/series
+++ /dev/null
@@ -1,7 +0,0 @@
-handle_renamed_binaries
-no_set_blastmat
-cd_hit_leaves_no_bak_file
-mothur_skip_list_header
-test_raxml_accept_new_version
-rdp_classifier_2.10
-test_usearch_known_failures
diff --git a/debian/patches/test_raxml_accept_new_version b/debian/patches/test_raxml_accept_new_version
deleted file mode 100644
index b949cdc..0000000
--- a/debian/patches/test_raxml_accept_new_version
+++ /dev/null
@@ -1,32 +0,0 @@
-Modify the tests to pass with the newer RAxML.
---- a/bfillings/tests/test_raxml_v730.py
-+++ b/bfillings/tests/test_raxml_v730.py
-@@ -38,7 +38,9 @@
- version_string = stdout.strip().split(' ')[4].strip()
- try:
- version = tuple(map(int,version_string.split('.')))
-- pass_test = version == acceptable_version
-+ # This is a stupid thing to do and a stupid place to do it.
-+ # Bypassed check for DEB build.
-+ pass_test = True
- except ValueError:
- pass_test = False
- version_string = stdout
-@@ -199,7 +201,7 @@
- node.Name = align_map[new_node_name]
-
- self.assertTrue(isinstance(tree, PhyloNode))
-- self.assertEqual(tree.getNewick(with_distances=True),RESULT_TREE)
-+ self.assertTrue(re.match(RESULT_TREE, tree.getNewick(with_distances=True)))
- self.assertEqual(len(tree.tips()), 7)
- self.assertRaises(NotImplementedError, build_tree_from_alignment, \
- self.align1, RNA, True)
-@@ -230,7 +232,7 @@
- REF_TREE="""((seq0000004:0.08408,seq0000005:0.13713)0.609:0.00215,seq0000003:0.02032,(seq0000001:0.00014,seq0000002:0.00014)0.766:0.00015);
- """
-
--RESULT_TREE="""(Species003:0.0194919169324,(Species001:4.34281710439e-07,Species002:4.34281710439e-07):4.34281710439e-07,(((Species006:0.0,Species007:0.0):0.0,Species004:0.0438017433031):0.0438017433031,Species005:0.171345128781):0.00331197405878);"""
-+RESULT_TREE=r"""\(Species003:0\.019[0-9]*,\(Species001:4\.34[0-9]*e-07,Species002:4\.34[0-9]*e-07\):4\.34[0-9]*e-07,\(\(\(Species006:0\.0,Species007:0\.0\):0\.0,Species004:0\.043[0-9]*\):0\.043[0-9]*,Species005:0\.171[0-9]*\):0.00331[0-9]*\);$"""
-
- if __name__ == '__main__':
- main()
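
Turning the expected tree into a regex pins the topology and the leading
digits of each branch length while tolerating the low-order digits that
differ between RAxML builds. A toy version of the same comparison, with
shortened, illustrative tree strings:

    import re

    expected = r"\(A:0\.019[0-9]*,B:4\.34[0-9]*e-07\);$"
    observed = "(A:0.0194919169324,B:4.34281710439e-07);"

    assert re.match(expected, observed)  # passes for any trailing digits
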
diff --git a/debian/patches/test_usearch_known_failures b/debian/patches/test_usearch_known_failures
deleted file mode 100644
index 0eb019c..0000000
--- a/debian/patches/test_usearch_known_failures
+++ /dev/null
@@ -1,39 +0,0 @@
-I've replaced uSearch with vSearch. It mostly seems to work as expected.
-The main oddity is the ref_open_ref test, which seems to return a very
-different number of clusters. Possibly a vSearch bug?
---- a/bfillings/tests/test_usearch.py
-+++ b/bfillings/tests/test_usearch.py
-@@ -16,7 +16,7 @@
- from os.path import basename, join, exists
- from shutil import rmtree
- from glob import glob
--from unittest import TestCase, main
-+from unittest import TestCase, main, expectedFailure, skip
- from tempfile import mkstemp, mkdtemp
-
- from skbio.util import remove_files
-@@ -202,6 +202,7 @@
- self.assertEqual(clusters, expected_clusters)
- self.assertEqual(failures, expected_failures)
-
-+ @expectedFailure
- def test_usearch61_ref_open_ref(self):
- """ usearch61 does open reference OTU picking """
-
-@@ -540,6 +541,7 @@
-
- self._files_to_remove.append(uchime_fp)
-
-+ @expectedFailure
- def test_usearch61_ref_chimera_detection(self):
- """ usearch61 ref chimera detection correctly flags chimeras """
-
-@@ -562,7 +564,7 @@
-
- self._files_to_remove.append(uchime_fp)
-
--
-+@skip("no usearch in Debian")
- class UsearchTests(TestCase):
-
- def setUp(self):
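
The decorators used here are stock unittest machinery: @expectedFailure
records a known-bad test without failing the run, and @skip disables a
whole class when the tool is simply unavailable. In miniature:

    from unittest import TestCase, main, expectedFailure, skip

    class VsearchQuirkTests(TestCase):
        @expectedFailure
        def test_open_ref_cluster_count(self):
            # Known usearch/vsearch difference: reported as an expected
            # failure rather than a hard error.
            self.assertEqual(1, 2)

    @skip("no usearch in Debian")
    class UsearchTests(TestCase):
        def test_never_runs(self):
            pass

    if __name__ == '__main__':
        main()
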
diff --git a/debian/rules b/debian/rules
deleted file mode 100755
index bc5dcfc..0000000
--- a/debian/rules
+++ /dev/null
@@ -1,47 +0,0 @@
-#!/usr/bin/make -f
-# -*- makefile -*-
-
-# Uncomment this to turn on verbose mode.
-#export DH_VERBOSE=1
-
-PKG := $(shell dpkg-parsechangelog | sed -n 's/^Source: //p')
-
-# At the moment only build for Py2 because QIIME only works with Py2
-
-%:
- dh $@ --with python2 --buildsystem=pybuild
-
-
-override_dh_clean:
- dh_clean
- rm -f *.log
- #rm -rf *.egg-info/*
-
-override_dh_auto_test:
- # Eliminate expected failures. We have no blat
- rm -f .pybuild/*/build/bfillings/tests/test_blat.*
- # BLAST/formatdb works fine in pbuilder, but on launchpad.net the later
- # formatdb tests fail if the blast tests run first. No idea why. Cleanup? Disk space?
- #rm -f .pybuild/*/build/bfillings/tests/test_blast*
- # cd_hit no longer returns byte-identical results to the ancient 3.1.1
- rm -f .pybuild/*/build/bfillings/tests/test_cd_hit.*
- # clustalw is broken but not normally used by QIIME in any case
- rm -f .pybuild/*/build/bfillings/tests/test_clustalw.*
- # fasttree tests are borked and test nothing useful anyway
- rm -f .pybuild/*/build/bfillings/tests/test_fasttree*
- # Infernal tests are for old 1.0, we have 1.1
- rm -f .pybuild/*/build/bfillings/tests/test_infernal*
- # Mafft tests are for some ancient version
- rm -f .pybuild/*/build/bfillings/tests/test_mafft*
- # Mothur produces equivalent but not byte-identical output following
- # my little patch.
- rm -f .pybuild/*/build/bfillings/tests/test_mothur*
- # pplacer is a big job to package! For now users can grab the binary
- # if they want it.
- rm -f .pybuild/*/build/bfillings/tests/test_pplacer*
- # rtax is awful and seems totally wedded to uSearch - as in, it relies on specific
- # I/O buffering behaviour, not just the parameters and output formats.
- rm -f .pybuild/*/build/bfillings/tests/test_rtax*
- # We don't have uClust, though it is in the bio-linux-qiime package
- rm -f .pybuild/*/build/bfillings/tests/test_uclust*
- dh_auto_test
diff --git a/debian/source/format b/debian/source/format
deleted file mode 100644
index 163aaf8..0000000
--- a/debian/source/format
+++ /dev/null
@@ -1 +0,0 @@
-3.0 (quilt)
diff --git a/debian/watch b/debian/watch
deleted file mode 100644
index 665b356..0000000
--- a/debian/watch
+++ /dev/null
@@ -1,3 +0,0 @@
-version=4
-
-https://github.com/biocore/burrito-fillings/releases .*/archive/@ANY_VERSION@@ARCHIVE_EXT@
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..1de6f57
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,46 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+
+__version__ = '0.1.1'
+
+from setuptools import find_packages, setup
+from distutils.command.build_py import build_py
+
+classes = """
+ Development Status :: 1 - Planning
+ License :: OSI Approved :: BSD License
+ Topic :: Software Development :: Libraries
+ Topic :: Scientific/Engineering
+ Topic :: Scientific/Engineering :: Bio-Informatics
+ Programming Language :: Python
+ Programming Language :: Python :: 2.7
+ Operating System :: Unix
+ Operating System :: POSIX
+ Operating System :: MacOS :: MacOS X
+"""
+classifiers = [s.strip() for s in classes.split('\n') if s]
+
+long_description = """The burrito-fillings project"""
+
+setup(name='burrito-fillings',
+ cmdclass={'build_py': build_py},
+ version=__version__,
+ license='BSD',
+ description=\
+ 'burrito-fillings: burrito application controllers for bioinformatics',
+ long_description=long_description,
+ author="biocore",
+ author_email="gregcaporaso@gmail.com",
+ maintainer="biocore",
+ maintainer_email="gregcaporaso@gmail.com",
+ url='https://github.com/biocore/burrito-fillings',
+ packages=find_packages(),
+ install_requires=['scikit-bio >= 0.2.1, < 0.3.0', 'burrito < 1.0.0'],
+ classifiers=classifiers)
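
The install_requires pins above are enforced by pip/setuptools at install
time; they can also be checked at runtime with pkg_resources (shipped with
setuptools), for example:

    import pkg_resources

    for req in ['scikit-bio >= 0.2.1, < 0.3.0', 'burrito < 1.0.0']:
        try:
            pkg_resources.require(req)
            print('%s: satisfied' % req)
        except pkg_resources.ResolutionError as err:
            print('%s: NOT satisfied (%s)' % (req, err))
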
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/python-burrito-fillings.git