[med-svn] [python-burrito-fillings] 10/13: New upstream version 0.1.1
Andreas Tille
tille at debian.org
Tue Dec 26 22:22:15 UTC 2017
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository python-burrito-fillings.
commit 119ac5c7f010d061d771d83971619e9d2040b213
Author: Andreas Tille <tille at debian.org>
Date: Tue Dec 26 23:15:53 2017 +0100
New upstream version 0.1.1
---
.gitignore | 39 +
CHANGELOG.md | 11 +
COPYING.txt | 27 +
README.md | 11 +
bfillings/__init__.py | 11 +
bfillings/align.py | 40 +
bfillings/blast.py | 1243 +++++++++++++
bfillings/blat.py | 422 +++++
bfillings/bwa.py | 762 ++++++++
bfillings/cd_hit.py | 343 ++++
bfillings/clearcut.py | 401 ++++
bfillings/clustalw.py | 724 ++++++++
bfillings/denoiser.py | 25 +
bfillings/fastq_join.py | 229 +++
bfillings/fasttree.py | 162 ++
bfillings/fasttree_v1.py | 145 ++
bfillings/formatdb.py | 239 +++
bfillings/infernal.py | 1571 ++++++++++++++++
bfillings/mafft.py | 470 +++++
bfillings/mothur.py | 589 ++++++
bfillings/muscle_v38.py | 777 ++++++++
bfillings/parsinsert.py | 92 +
bfillings/pplacer.py | 201 ++
bfillings/raxml_v730.py | 875 +++++++++
bfillings/rdp_classifier.py | 589 ++++++
bfillings/rtax.py | 293 +++
bfillings/seqprep.py | 351 ++++
bfillings/sortmerna_v2.py | 544 ++++++
bfillings/sumaclust_v1.py | 173 ++
bfillings/swarm_v127.py | 299 +++
bfillings/tests/__init__.py | 9 +
bfillings/tests/test_blast.py | 256 +++
bfillings/tests/test_blat.py | 346 ++++
bfillings/tests/test_bwa.py | 319 ++++
bfillings/tests/test_cd_hit.py | 214 +++
bfillings/tests/test_clearcut.py | 255 +++
bfillings/tests/test_clustalw.py | 627 +++++++
bfillings/tests/test_fasttree.py | 182 ++
bfillings/tests/test_fasttree_v1.py | 174 ++
bfillings/tests/test_formatdb.py | 233 +++
bfillings/tests/test_infernal.py | 620 +++++++
bfillings/tests/test_mafft.py | 132 ++
bfillings/tests/test_mothur.py | 315 ++++
bfillings/tests/test_muscle_v38.py | 286 +++
bfillings/tests/test_parsinsert.py | 138 ++
bfillings/tests/test_pplacer.py | 254 +++
bfillings/tests/test_raxml_v730.py | 236 +++
bfillings/tests/test_rdp_classifier.py | 398 ++++
bfillings/tests/test_rtax.py | 228 +++
bfillings/tests/test_sortmerna_v2.py | 855 +++++++++
bfillings/tests/test_sumaclust_v1.py | 259 +++
bfillings/tests/test_swarm_v127.py | 190 ++
bfillings/tests/test_uclust.py | 758 ++++++++
bfillings/tests/test_usearch.py | 2000 ++++++++++++++++++++
bfillings/tests/test_vsearch.py | 1686 +++++++++++++++++
bfillings/uclust.py | 606 ++++++
bfillings/usearch.py | 2547 ++++++++++++++++++++++++++
bfillings/vsearch.py | 575 ++++++
debian/changelog | 35 -
debian/compat | 1 -
debian/control | 67 -
debian/copyright | 35 -
debian/patches/cd_hit_leaves_no_bak_file | 32 -
debian/patches/handle_renamed_binaries | 168 --
debian/patches/mothur_skip_list_header | 63 -
debian/patches/no_set_blastmat | 12 -
debian/patches/rdp_classifier_2.10 | 106 --
debian/patches/series | 7 -
debian/patches/test_raxml_accept_new_version | 32 -
debian/patches/test_usearch_known_failures | 39 -
debian/rules | 47 -
debian/source/format | 1 -
debian/watch | 3 -
setup.py | 46 +
74 files changed, 26402 insertions(+), 648 deletions(-)
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..d7ef01f
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,39 @@
+*.py[cod]
+
+# C extensions
+*.so
+
+# Packages
+*.egg
+*.egg-info
+dist
+build
+eggs
+parts
+bin
+var
+sdist
+develop-eggs
+.installed.cfg
+lib
+lib64
+__pycache__
+
+# Installer logs
+pip-log.txt
+
+# Unit test / coverage reports
+.coverage
+.tox
+nosetests.xml
+
+# Translations
+*.mo
+
+# Mr Developer
+.mr.developer.cfg
+.project
+.pydevproject
+
+# vi
+.*.swp
diff --git a/CHANGELOG.md b/CHANGELOG.md
new file mode 100644
index 0000000..8203cf1
--- /dev/null
+++ b/CHANGELOG.md
@@ -0,0 +1,11 @@
+# burrito-fillings changelog
+
+## Version 0.1.1 (2015-05-22)
+
+* Updated handling of temporary files to make better use of python ``tempfile.gettempdir()`` for some of the most widely used burrito fillings ([#61](https://github.com/biocore/burrito-fillings/pull/61), [#64](https://github.com/biocore/burrito-fillings/pull/64)).
+* Fixed bug where swarm wrapper would silently ignore ``swarm`` failures ([#67](https://github.com/biocore/burrito-fillings/pull/67), [biocore/qiime#2014](https://github.com/biocore/qiime/issues/2014)).
+* Added ``__version__`` to ``bfillings/__init__.py`` so that other python packages have access to the version number ([#54](https://github.com/biocore/burrito-fillings/issues/54)).
+
+## Version 0.1.0 (2014-11-12)
+
+Initial release.
diff --git a/COPYING.txt b/COPYING.txt
new file mode 100644
index 0000000..b6785a9
--- /dev/null
+++ b/COPYING.txt
@@ -0,0 +1,27 @@
+Copyright (c) 2013--, biocore development team.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice, this
+ list of conditions and the following disclaimer in the documentation and/or
+ other materials provided with the distribution.
+
+* Neither the name of the {organization} nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..ffdd95f
--- /dev/null
+++ b/README.md
@@ -0,0 +1,11 @@
+burrito-fillings
+================
+
+burrito-fillings (canonically pronounced *boar-ee-toe phil-ings*; python package name ``bfillings``) contains [burrito](https://github.com/biocore/burrito) [CommandLineApplication](https://github.com/biocore/burrito/blob/master/burrito/util.py#L161) subclasses (i.e., *application controllers*) for bioinformatics applications. This is intended to be a temporary package for the application controllers that are used in QIIME as we figure out which of these we will continue to support.
+
+**Note:** burrito fillings is currently under active development and its API is not stable. Major compatibility-breaking API changes will likely happen as development progresses.
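+
+For example, once the package is installed, the version is exposed at the top
+level (a minimal sketch):
+
+    >>> import bfillings
+    >>> bfillings.__version__
+    '0.1.1'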
+
+The pre-history of burrito-fillings
+-----------------------------------
+
+burrito-fillings derives from code in [PyCogent](http://www.pycogent.org) and [QIIME](http://www.qiime.org), and the contributors and/or copyright holders have agreed to make the code they wrote for PyCogent and/or QIIME available under the BSD license. The contributors to PyCogent and/or QIIME modules that have been ported to bfillings are: Rob Knight (@rob-knight), Gavin Huttley (@gavin-huttley), Daniel McDonald (@wasade), Micah Hamady, Antonio Gonzalez (@antgonza), Sandra Smit, Greg C [...]
diff --git a/bfillings/__init__.py b/bfillings/__init__.py
new file mode 100644
index 0000000..0510013
--- /dev/null
+++ b/bfillings/__init__.py
@@ -0,0 +1,11 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+
+__version__ = '0.1.1'
diff --git a/bfillings/align.py b/bfillings/align.py
new file mode 100644
index 0000000..3a1d448
--- /dev/null
+++ b/bfillings/align.py
@@ -0,0 +1,40 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2014--, biocore development team
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+# ----------------------------------------------------------------------------
+
+from cogent import DNA as DNA_cogent, LoadSeqs
+from cogent.align.align import make_dna_scoring_dict, local_pairwise
+
+def pair_hmm_align_unaligned_seqs(seqs, moltype=DNA_cogent, params={}):
+ """
+ Checks parameters for pairwise alignment, returns alignment.
+
+ Code from Greg Caporaso.
+ """
+
+ seqs = LoadSeqs(data=seqs, moltype=moltype, aligned=False)
+ try:
+ s1, s2 = seqs.values()
+ except ValueError:
+ raise ValueError(
+ "Pairwise aligning of seqs requires exactly two seqs.")
+
+ try:
+ gap_open = params['gap_open']
+ except KeyError:
+ gap_open = 5
+ try:
+ gap_extend = params['gap_extend']
+ except KeyError:
+ gap_extend = 2
+ try:
+ score_matrix = params['score_matrix']
+ except KeyError:
+ score_matrix = make_dna_scoring_dict(
+ match=1, transition=-1, transversion=-1)
+
+ return local_pairwise(s1, s2, score_matrix, gap_open, gap_extend)
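+
+# A minimal usage sketch (hypothetical sequences; PyCogent must be installed):
+#
+#     seqs = {'seq1': 'ATGCTAGGAC', 'seq2': 'ATGCAAGGAC'}
+#     aln = pair_hmm_align_unaligned_seqs(seqs)
+#
+# gap_open, gap_extend, and score_matrix may be overridden through params,
+# e.g. params={'gap_open': 10}.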
diff --git a/bfillings/blast.py b/bfillings/blast.py
new file mode 100644
index 0000000..3030a40
--- /dev/null
+++ b/bfillings/blast.py
@@ -0,0 +1,1243 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+
+from string import strip
+from os import remove, access, F_OK, environ, path
+from random import choice
+from copy import copy
+import tempfile
+
+from burrito.parameters import FlagParameter, ValuedParameter, MixedParameter
+from burrito.util import (CommandLineApplication, ResultPath,
+ get_tmp_filename, guess_input_handler,
+ ApplicationNotFoundError)
+from skbio.parse.sequences.fasta import FastaFinder, is_fasta_label
+from skbio.parse.record_finder import LabeledRecordFinder
+
+from cogent.parse.blast import (LastProteinIds9, QMEBlast9, QMEPsiBlast9,
+ BlastResult)
+from cogent.util.misc import app_path
+
+
+class Blast(CommandLineApplication):
+ """BLAST generic application controller"""
+
+ _common_options ={
+ # defaults to non-redundant database
+ #WARNING: This will only work if BLASTDB environment variable is set
+ '-d':ValuedParameter('-',Name='d',Delimiter=' ', Value="nr"),
+
+ # query file
+ '-i':ValuedParameter('-',Name='i',Delimiter=' '),
+
+ # Multiple Hits window size [Integer]
+ '-A':ValuedParameter('-',Name='A',Delimiter=' '),
+
+ # Threshold for extending hits [Integer]
+ '-f':ValuedParameter('-',Name='f',Delimiter=' '),
+
+ # Expectation value (E) [Real]
+ '-e':ValuedParameter('-',Name='e',Delimiter=' ', Value="10.0"),
+
+ # alignment view options:
+ # 0 = pairwise,
+ # 1 = query-anchored showing identities,
+ # 2 = query-anchored no identities,
+ # 3 = flat query-anchored, show identities,
+ # 4 = flat query-anchored, no identities,
+ # 5 = query-anchored no identities and blunt ends,
+ # 6 = flat query-anchored, no identities and blunt ends,
+ # 7 = XML Blast output,
+ # 8 = Tabular output,
+ # 9 = Tabular output with comments
+ # 10 = ASN, text
+ # 11 = ASN, binary [Integer]
+ '-m':ValuedParameter('-',Name='m',Delimiter=' ', Value="9"),
+
+ # Output File for Alignment [File Out] Optional
+ '-o':ValuedParameter('-',Name='o',Delimiter=' '),
+
+ # Filter query sequence with SEG [String]
+ '-F':ValuedParameter('-',Name='F',Delimiter=' '),
+
+ # Cost to open a gap [Integer]
+ '-G':ValuedParameter('-',Name='G',Delimiter=' '),
+
+ # Cost to extend a gap [Integer]
+ '-E':ValuedParameter('-',Name='E',Delimiter=' '),
+
+ # X dropoff value for gapped alignment (in bits) [Integer]
+ # blastn 30, megablast 20, tblastx 0, all others 15 [Integer]
+ '-X':ValuedParameter('-',Name='X',Delimiter=' '),
+
+ # Show GI's in deflines [T/F]
+ '-I':ValuedParameter('-',Name='I',Delimiter=' '),
+
+    # Number of database seqs to show one-line descriptions for [Integer]
+ '-v':ValuedParameter('-',Name='v',Delimiter=' '),
+
+    # Number of database sequences to show alignments for (B) [Integer]
+ '-b':ValuedParameter('-',Name='b',Delimiter=' '),
+
+ # Perform gapped alignment (not available with tblastx) [T/F]
+ '-g':ValuedParameter('-',Name='g',Delimiter=' '),
+
+ # Number of processors to use [Integer]
+ '-a':ValuedParameter('-',Name='a',Delimiter=' ', Value="1"),
+
+ # Believe the query defline [T/F]
+ '-J':ValuedParameter('-',Name='J',Delimiter=' '),
+
+ # SeqAlign file ('Believe the query defline' must be TRUE) [File Out]
+ # Optional
+ '-O':ValuedParameter('-',Name='O',Delimiter=' '),
+
+ # Matrix [String]
+ '-M':ValuedParameter('-',Name='M',Delimiter=' ', Value="BLOSUM62"),
+
+ # Word size [Integer] (blastn 11, megablast 28, all others 3)
+ '-W':ValuedParameter('-',Name='W',Delimiter=' '),
+
+ # Effective length of the database (use zero for the real size) [Real]
+ '-z':ValuedParameter('-',Name='z',Delimiter=' '),
+
+ # Number of best hits from a region to keep [Integer]
+ '-K':ValuedParameter('-',Name='K',Delimiter=' '),
+
+ # 0 for multiple hit, 1 for single hit [Integer]
+ '-P':ValuedParameter('-',Name='P',Delimiter=' '),
+
+ # Effective length of the search space (use zero for real size) [Real]
+ '-Y':ValuedParameter('-',Name='Y',Delimiter=' '),
+
+ # Produce HTML output [T/F]
+ '-T':ValuedParameter('-',Name='T',Delimiter=' ', Value="F"),
+
+ # Restrict search of database to list of GI's [String] Optional
+ '-l':ValuedParameter('-',Name='l',Delimiter=' '),
+
+ # Use lower case filtering of FASTA sequence [T/F] Optional
+ '-U':ValuedParameter('-',Name='U',Delimiter=' '),
+
+ # Dropoff (X) for blast extensions in bits (default if zero) [Real]
+ # blastn 20, megablast 10, all others 7
+ '-y':ValuedParameter('-',Name='y',Delimiter=' '),
+
+ # X dropoff value for final gapped alignment (in bits) [Integer]
+ # blastn/megablast 50, tblastx 0, all others 25
+ '-Z':ValuedParameter('-',Name='Z',Delimiter=' '),
+
+ # Input File for PSI-BLAST Restart [File In] Optional
+ '-R':ValuedParameter('-',Name='R',Delimiter=' '),
+
+ }
+
+ _executable = 'blastall'
+
+ _parameters = {}
+ _parameters.update(_common_options)
+
+ def __init__(self, cur_options, command, blast_mat_root=None,
+ extra_env="",
+ params=None,InputHandler=None,
+ SuppressStderr=None, SuppressStdout=None,WorkingDir=None,\
+ HALT_EXEC=False):
+ """ Initialize blast """
+ # update options
+ self._parameters.update(cur_options)
+
+ # check if need to set env variable (for cgi calls)
+ if blast_mat_root:
+ self._command = "export BLASTMAT=%s;%s%s" % (blast_mat_root,
+ extra_env, command)
+ else:
+ # Determine if blast is installed and raise an ApplicationError
+ # if not -- this is done here so the user will get the most
+ # informative error message available.
+ self._error_on_missing_application(params)
+
+ # Otherwise raise error about $BLASTMAT not being set
+ if not ('BLASTMAT' in environ or \
+ access(path.expanduser("~/.ncbirc"), F_OK) or \
+ access(".ncbirc", F_OK)):
+ ## SHOULD THIS BE CHANGED TO RAISE AN ApplicationError?
+ raise RuntimeError, blastmat_error_message
+ self._command = command
+
+ super(Blast, self).__init__(params=params,
+ InputHandler=InputHandler,SuppressStderr=SuppressStderr,
+ SuppressStdout=SuppressStdout,WorkingDir=WorkingDir,\
+ HALT_EXEC=HALT_EXEC)
+
+ def _error_on_missing_application(self,params):
+ """ Raise an ApplicationNotFoundError if the app is not accessible
+ """
+ if not app_path('blastall'):
+ raise ApplicationNotFoundError,\
+ "Cannot find blastall. Is it installed? Is it in your path?"
+
+ def _input_as_seqs(self,data):
+ lines = []
+ for i,s in enumerate(data):
+ #will number the sequences 1,2,3,etc.
+ lines.append(''.join(['>',str(i+1)]))
+ lines.append(s)
+ return self._input_as_lines(lines)
+
+ def _input_as_seq_id_seq_pairs(self,data):
+ lines = []
+ for seq_id,seq in data:
+ lines.append(''.join(['>',str(seq_id)]))
+ lines.append(seq)
+ return self._input_as_lines(lines)
+
+ def _input_as_lines(self,data):
+ if data:
+ self.Parameters['-i']\
+ .on(super(Blast,self)._input_as_lines(data))
+
+ return ''
+
+ def _input_as_string(self,data):
+ """Makes data the value of a specific parameter
+
+ This method returns the empty string. The parameter will be printed
+ automatically once set.
+ """
+ if data:
+ self.Parameters['-i'].on(str(data))
+ return ''
+
+ def _input_as_multiline_string(self, data):
+ if data:
+ self.Parameters['-i']\
+ .on(super(Blast,self)._input_as_multiline_string(data))
+ return ''
+
+ def _align_out_filename(self):
+
+ if self.Parameters['-o'].isOn():
+ aln_filename = self._absolute(str(self.Parameters['-o'].Value))
+ else:
+ raise ValueError, "No output file specified."
+ return aln_filename
+
+ def _get_result_paths(self,data):
+
+ result = {}
+ if self.Parameters['-o'].isOn():
+ out_name = self._align_out_filename()
+ result['BlastOut'] = ResultPath(Path=out_name,IsWritten=True)
+ return result
+
+blastmat_error_message =\
+"""BLAST cannot run if the BLASTMAT environment variable is not set.
+
+Usually, the BLASTMAT environment variable points to the NCBI data directory,
+which contains matrices like PAM30 and PAM70, etc.
+
+Alternatively, you may create a .ncbirc file to define these variables.
+
+From help file:
+
+2) Create a .ncbirc file. In order for Standalone BLAST to operate, you
+will need to have a .ncbirc file that contains the following lines:
+
+[NCBI]
+Data="path/data/"
+
+Where "path/data/" is the path to the location of the Standalone BLAST
+"data" subdirectory. For Example:
+
+Data=/root/blast/data
+
+The data subdirectory should automatically appear in the directory where
+the downloaded file was extracted. Please note that in many cases it may
+be necessary to delimit the entire path, including the machine name and/or
+the network you are located on. Your systems administrator can help
+you if you do not know the entire path to the data subdirectory.
+
+Make sure that your .ncbirc file is either in the directory that you
+call the Standalone BLAST program from or in your root directory.
+"""
+
+class PsiBlast(Blast):
+ """PSI-BLAST application controller - Prototype"""
+ _options ={
+
+ # ASN.1 Scoremat input of checkpoint data:
+ # 0: no scoremat input
+ # 1: Restart is from ASCII scoremat checkpoint file,
+ # 2: Restart is from binary scoremat checkpoint file [Integer] Optional
+ '-q':ValuedParameter('-',Name='q',Delimiter=' '),
+
+ # Output File for PSI-BLAST Matrix in ASCII [File Out] Optional
+ '-Q':ValuedParameter('-',Name='Q',Delimiter=' '),
+
+ # Start of required region in query [Integer]
+ '-S':ValuedParameter('-',Name='S',Delimiter=' ', Value="1"),
+
+ # ASN.1 Scoremat output of checkpoint data:
+ # 0: no scoremat output
+ # 1: Output is ASCII scoremat checkpoint file (requires -J),
+ # 2: Output is binary scoremat checkpoint file (requires -J) Optional
+ '-u':ValuedParameter('-',Name='u',Delimiter=' '),
+
+ # Cost to decline alignment (disabled when 0) [Integer]
+ '-L':ValuedParameter('-',Name='L',Delimiter=' ', Value="0"),
+
+ # program option for PHI-BLAST [String]
+ '-p':ValuedParameter('-',Name='p',Delimiter=' ', Value="blastpgp"),
+
+ # Use composition based statistics [T/F]
+ '-t':ValuedParameter('-',Name='t',Delimiter=' ', Value="T"),
+
+ # Input Alignment File for PSI-BLAST Restart [File In] Optional
+ '-B':ValuedParameter('-',Name='B',Delimiter=' '),
+
+ # Number of bits to trigger gapping [Real]
+ '-N':ValuedParameter('-',Name='N',Delimiter=' ', Value="22.0"),
+
+ # End of required region in query (-1 indicates end of query) [Integer]
+ '-H':ValuedParameter('-',Name='H',Delimiter=' ', Value="-1"),
+
+ # e-value threshold for inclusion in multipass model [Real]
+ '-h':ValuedParameter('-',Name='h',Delimiter=' ', Value="0.001"),
+
+ # Constant in pseudocounts for multipass version [Integer]
+ '-c':ValuedParameter('-',Name='c',Delimiter=' ', Value="9"),
+
+ # Maximum number of passes to use in multipass version [Integer]
+ '-j':ValuedParameter('-',Name='j',Delimiter=' ', Value="1"),
+
+ # Output File for PSI-BLAST Checkpointing [File Out] Optional
+ '-C':ValuedParameter('-',Name='C',Delimiter=' '),
+
+ # Compute locally optimal Smith-Waterman alignments [T/F]
+ '-s':ValuedParameter('-',Name='s',Delimiter=' ', Value="F"),
+
+ # Hit File for PHI-BLAST [File In]
+ '-k':ValuedParameter('-',Name='k',Delimiter=' '),
+
+ }
+
+ def __init__(self, blast_mat_root=None, params=None,
+ extra_env="",
+ InputHandler=None,SuppressStderr=None,
+ SuppressStdout=None,WorkingDir=None,
+ HALT_EXEC=False):
+ """ Initialize the Psi-Blast"""
+ super(PsiBlast, self).__init__(self._options,
+ "blastpgp",
+ extra_env=extra_env,
+ blast_mat_root=blast_mat_root,
+ params=params,
+ InputHandler=InputHandler,SuppressStderr=SuppressStderr,
+ SuppressStdout=SuppressStdout,WorkingDir=WorkingDir,
+ HALT_EXEC=HALT_EXEC)
+
+
+# should probably go into blastall superclass. it's late, works for now
+BLASTALL_OPTIONS ={
+ # Use lower case filtering of FASTA sequence [T/F] Optional
+ '-U':ValuedParameter('-',Name='U',Delimiter=' '),
+
+ # Penalty for a nucleotide mismatch (blastn only) [Integer]
+ # default = -3
+ '-q':ValuedParameter('-',Name='q',Delimiter=' '),
+
+ # Reward for a nucleotide match (blastn only) [Integer]
+ '-r':ValuedParameter('-',Name='r',Delimiter=' '),
+
+ # Query Genetic code to use [Integer] default = 1
+ '-Q':ValuedParameter('-',Name='Q',Delimiter=' '),
+
+ # DB Genetic code (for tblast[nx] only) [Integer]
+ '-D':ValuedParameter('-',Name='D',Delimiter=' '),
+
+ # Query strands to search against database (for blast[nx], and tblastx)
+ # 3 is both, 1 is top, 2 is bottom [Integer]
+ '-S':ValuedParameter('-',Name='S',Delimiter=' '),
+
+ # Program Name
+ '-p':ValuedParameter('-',Name='p',Delimiter=' '),
+
+ # MegaBlast search [T/F]
+ '-n':ValuedParameter('-',Name='n',Delimiter=' '),
+
+    # Location on query sequence [String] Optional
+ '-L':ValuedParameter('-',Name='L',Delimiter=' '),
+
+ # Frame shift penalty (OOF algorithm for blastx) [Integer]
+ '-w':ValuedParameter('-',Name='w',Delimiter=' '),
+
+ # Length of the largest intron allowed in tblastn for linking HSPs
+ #(0 disables linking) [Integer]
+ '-t':ValuedParameter('-',Name='t',Delimiter=' '),
+
+ # Number of concatenated queries, for blastn and tblastn [Integer]
+ '-B':ValuedParameter('-',Name='B',Delimiter=' '),
+ }
+
+
+class Blastall(Blast):
+ """blastall application controller - Prototype """
+
+ def __init__(self, blast_mat_root=None, params=None,
+ extra_env="",
+ InputHandler=None,SuppressStderr=None,
+ SuppressStdout=None,WorkingDir=None,
+ HALT_EXEC=False):
+ """ Initialize the blastall"""
+ super(Blastall, self).__init__(BLASTALL_OPTIONS,
+ "blastall",
+ blast_mat_root=blast_mat_root,
+ extra_env=extra_env,
+ params=params,
+ InputHandler=InputHandler,SuppressStderr=SuppressStderr,
+ SuppressStdout=SuppressStdout,WorkingDir=WorkingDir,
+ HALT_EXEC=HALT_EXEC)
+class MpiBlast(Blast):
+ """mpblast application controller - Prototype """
+
+ _mpi_options ={
+ # Produces verbose debugging output for each node, optionally logs the
+ # output to a file
+ '--debug':ValuedParameter('-',Name='--debug',Delimiter='='),
+
+ # Set the scheduler process' MPI Rank (default is 1). Because the
+ # scheduler uses very little CPU it can be useful to force the
+ # scheduler to run on the same physical machine as the writer (rank 0).
+ '--scheduler-rank':ValuedParameter('-',Name='--scheduler-rank',
+ Delimiter='='),
+
+    # Print the Altschul et al. 1997 paper reference instead of the
+ # mpiBLAST paper reference. With this option mpiblast output is nearly
+ # identical to NCBI-BLAST output.
+ '--altschul-reference':FlagParameter(Prefix='--',
+ Name='altschul-reference'),
+
+ #Removes the local copy of the database from each node before
+ # terminating execution
+ '--removedb':FlagParameter(Prefix='--', Name='removedb'),
+
+ # Sets the method of copying files that each worker will use.
+ # Default = "cp"
+ # * cp : use standard file system "cp" command.
+ # Additional option is --concurrent.
+    # * rcp : use rsh "rcp" command. Additional option is --concurrent.
+ # * scp : use ssh "scp" command. Additional option is --concurrent.
+ # * mpi : use MPI_Send/MPI_Recv to copy files.
+ # Additional option is --mpi-size.
+    # * none : do not copy files, instead use shared storage as local storage
+ '--copy-via':ValuedParameter('-',Name='--copy-via', Delimiter='='),
+
+
+ # set the number of concurrent accesses to shared storage. Default = 1
+ '--concurrent':ValuedParameter('-',Name='--concurrent', Delimiter='='),
+
+
+ # in bytes, set the maximum buffer size that MPI will use to send data
+ # when transferring files. Default = 65536
+ '--mpi-size':ValuedParameter('-',Name='--mpi-size', Delimiter='='),
+
+
+ # set whether file locking should be used to manage local fragment
+ # lists. Defaults to off. When --concurrency > 1 defaults to on
+ # [on|off]
+ '--lock':ValuedParameter('-',Name='--lock', Delimiter='='),
+
+ # When set, the writer will use the database on shared storage for
+ # sequence lookup. Can drastically reduce overhead for some blastn
+ # searches.
+ '--disable-mpi-db':FlagParameter(Prefix='--', Name='disable-mpi-db'),
+
+ # Under unix, sets the nice value for each mpiblast process.
+ '--nice':ValuedParameter('-',Name='--nice', Delimiter='='),
+
+    # Path of the mpiblast configuration file to use.
+ '--config-file':ValuedParameter('--',Name='config-file', Delimiter='='),
+
+
+ # Experimental. When set, mpiblast will read the output file and
+ # attempt to continue a previously aborted run where it left off
+ '--resume-run':FlagParameter(Prefix='--', Name='resume-run'),
+
+ # print the mpiBLAST version
+ '--version':FlagParameter(Prefix='--', Name='version'),
+ }
+
+ _mpi_options.update(BLASTALL_OPTIONS)
+
+ def __init__(self, blast_mat_root=None, params=None,
+ mpiblast_root="/usr/local/bin/",
+ local_root="/var/scratch/mpiblastdata/",
+ shared_root="/quicksand/hamady/data/blast/mpidb/",
+ config_file="/quicksand2/downloads2/mpiblast/mpiblast.conf",
+ num_db_frags=40,
+ InputHandler=None,SuppressStderr=None,
+ SuppressStdout=None,WorkingDir=None,
+ HALT_EXEC=False):
+ """ Initialize mpiblast"""
+ if config_file:
+ params["--config-file"] = config_file
+ super(MpiBlast, self).__init__(self._mpi_options,
+ "mpirun -np %d %smpiblast" % ((num_db_frags + 2),
+ mpiblast_root),
+ blast_mat_root=blast_mat_root,
+ extra_env="export Local=%s; export Shared=%s;" %(local_root,
+ shared_root),
+ params=params,
+ InputHandler=InputHandler,SuppressStderr=SuppressStderr,
+ SuppressStdout=SuppressStdout,WorkingDir=WorkingDir,
+ HALT_EXEC=HALT_EXEC)
+
+class FastaCmd(CommandLineApplication):
+ """FastaCmd application controller - Prototype"""
+
+ _options ={
+ # Database [String] Optional
+ '-d':ValuedParameter('-',Name='d',Delimiter=' '),
+
+ # Type of file
+ # G - guess mode (look for protein, then nucleotide)
+ # T - protein
+ # F - nucleotide [String] Optional
+ '-p':ValuedParameter('-',Name='p',Delimiter=' ', Value="G"),
+
+    # Search string: GIs, accessions, and loci may be used, comma-delimited
+ '-s':ValuedParameter('-',Name='s',Delimiter=' '),
+
+    # Input file with GIs/accessions/loci for batch retrieval Optional
+ '-i':ValuedParameter('-',Name='i',Delimiter=' '),
+
+ # Retrieve duplicate accessions [T/F] Optional
+ '-a':ValuedParameter('-',Name='a',Delimiter=' ', Value='F'),
+
+ # Line length for sequence [Integer] Optional
+ '-l':ValuedParameter('-',Name='l',Delimiter=' '),
+
+ # Definition line should contain target gi only [T/F] Optional
+ '-t':ValuedParameter('-',Name='t',Delimiter=' '),
+
+ # Output file [File Out] Optional
+ '-o':ValuedParameter('-',Name='o',Delimiter=' '),
+
+ # Use Ctrl-A's as non-redundant defline separator [T/F] Optional
+ '-c':ValuedParameter('-',Name='c',Delimiter=' '),
+
+ # Dump the entire database in fasta format [T/F] Optional
+ '-D':ValuedParameter('-',Name='D',Delimiter=' '),
+
+ # Range of sequence to extract (Format: start,stop)
+ # 0 in 'start' refers to the beginning of the sequence
+ # 0 in 'stop' refers to the end of the sequence [String] Optional
+ '-L':ValuedParameter('-',Name='L',Delimiter=' '),
+
+ # Strand on subsequence (nucleotide only): 1 is top, 2 is bottom [Int]
+ '-S':ValuedParameter('-',Name='S',Delimiter=' '),
+
+ # Print taxonomic information for requested sequence(s) [T/F]
+ '-T':ValuedParameter('-',Name='T',Delimiter=' '),
+
+ # Print database information only (overrides all other options) [T/F]
+ '-I':ValuedParameter('-',Name='I',Delimiter=' '),
+
+ # Retrieve sequences with this PIG [Integer] Optional
+ '-P':ValuedParameter('-',Name='P',Delimiter=' '),
+
+ }
+ _parameters = {}
+ _parameters.update(_options)
+ _command = 'fastacmd'
+
+ def _input_as_lines(self,data):
+ if data:
+ self.Parameters['-i']\
+ .on(super(FastaCmd,self)._input_as_lines(data))
+ return ''
+
+ def _input_as_seqs(self,data):
+ lines = []
+ for i,s in enumerate(data):
+ #will number the sequences 1,2,3,etc.
+ lines.append(''.join(['>',str(i+1)]))
+ lines.append(s)
+ return self._input_as_lines(lines)
+
+ def _input_as_string(self,data):
+ """Makes data the value of a specific parameter
+
+ This method returns the empty string. The parameter will be printed
+ automatically once set.
+ """
+ if data:
+ self.Parameters['-s'].on(data)
+ return ''
+
+ def _out_filename(self):
+
+ if self.Parameters['-o'].isOn():
+ aln_filename = self._absolute(str(self.Parameters['-o'].Value))
+ else:
+ raise ValueError, "No output file specified."
+ return aln_filename
+
+ def _get_result_paths(self,data):
+
+ result = {}
+ if self.Parameters['-o'].isOn():
+ out_name = self._out_filename()
+ result['FastaOut'] = ResultPath(Path=out_name,IsWritten=True)
+ return result
+
+def seqs_to_stream(seqs, ih):
+ """Converts seqs into stream of FASTA records, depending on input handler.
+
+ Each FASTA record will be a list of lines.
+ """
+ if ih == '_input_as_multiline_string':
+ recs = FastaFinder(seqs.split('\n'))
+ elif ih == '_input_as_string':
+ recs = FastaFinder(open(seqs))
+ elif ih == '_input_as_seqs':
+ recs = [['>'+str(i), s] for i, s in enumerate(seqs)]
+ elif ih == '_input_as_lines':
+ recs = FastaFinder(seqs)
+ else:
+ raise TypeError, "Unknown input handler %s" % ih
+ return recs
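+
+# For example (a sketch), with ih='_input_as_seqs' and seqs=['ACGT', 'GGCC'],
+# the resulting stream is [['>0', 'ACGT'], ['>1', 'GGCC']].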
+
+#SOME FUNCTIONS TO EXECUTE THE MOST COMMON TASKS
+def blast_seqs(seqs,
+ blast_constructor,
+ blast_db=None,
+ blast_mat_root=None,
+ params={},
+ add_seq_names=True,
+ out_filename=None,
+ WorkingDir=None,
+ SuppressStderr=None,
+ SuppressStdout=None,
+ input_handler=None,
+ HALT_EXEC=False
+ ):
+ """Blast list of sequences.
+
+ seqs: either file name or list of sequence objects or list of strings or
+ single multiline string containing sequences.
+
+ WARNING: DECISION RULES FOR INPUT HANDLING HAVE CHANGED. Decision rules
+    for data are as follows. If it's a list, treat as lines, unless
+ add_seq_names is true (in which case treat as list of seqs). If it's a
+ string, test whether it has newlines. If it doesn't have newlines, assume
+ it's a filename. If it does have newlines, it can't be a filename, so
+ assume it's a multiline string containing sequences.
+
+ If you want to skip the detection and force a specific type of input
+ handler, use input_handler='your_favorite_handler'.
+
+ add_seq_names: boolean. if True, sequence names are inserted in the list
+ of sequences. if False, it assumes seqs is a list of lines of some
+ proper format that the program can handle
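+
+    A usage sketch (hypothetical file and database names; blastall must be
+    installed and the database formatted with formatdb):
+
+        res = blast_seqs('queries.fasta', Blastall, blast_db='eco',
+                         params={'-p': 'blastp', '-m': '9'})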
+ """
+
+ # set num keep
+
+ if blast_db:
+ params["-d"] = blast_db
+
+ if out_filename:
+ params["-o"] = out_filename
+
+ ih = input_handler or guess_input_handler(seqs, add_seq_names)
+
+ blast_app = blast_constructor(
+ params=params,
+ blast_mat_root=blast_mat_root,
+ InputHandler=ih,
+ WorkingDir=WorkingDir,
+ SuppressStderr=SuppressStderr,
+ SuppressStdout=SuppressStdout,
+ HALT_EXEC=HALT_EXEC)
+
+ return blast_app(seqs)
+
+
+def fasta_cmd_get_seqs(acc_list,
+ blast_db=None,
+ is_protein=None,
+ out_filename=None,
+ params={},
+ WorkingDir=tempfile.gettempdir(),
+ SuppressStderr=None,
+ SuppressStdout=None):
+ """Retrieve sequences for list of accessions """
+
+ if is_protein is None:
+ params["-p"] = 'G'
+ elif is_protein:
+ params["-p"] = 'T'
+ else:
+ params["-p"] = 'F'
+
+ if blast_db:
+ params["-d"] = blast_db
+
+ if out_filename:
+ params["-o"] = out_filename
+
+ # turn off duplicate accessions
+ params["-a"] = "F"
+
+    # create FastaCmd application controller
+ fasta_cmd = FastaCmd(params=params,
+ InputHandler='_input_as_string',
+ WorkingDir=WorkingDir,
+ SuppressStderr=SuppressStderr,
+ SuppressStdout=SuppressStdout)
+
+ # return results
+ return fasta_cmd("\"%s\"" % ','.join(acc_list))
+
+def fastacmd_is_crap(line):
+ """Handles missing ids..."""
+ return (not line) or line.isspace() or line.startswith('[')
+
+FastaCmdFinder = LabeledRecordFinder(is_fasta_label, ignore=fastacmd_is_crap)
+
+def seqs_from_fastacmd(acc_list, blast_db,is_protein=True):
+ """Get dict of description:seq from fastacmd."""
+ fasta_cmd_res = fasta_cmd_get_seqs(acc_list, blast_db=blast_db, \
+ is_protein=is_protein)
+ recs = FastaCmdFinder(fasta_cmd_res['StdOut'])
+ result = {}
+ for rec in recs:
+ try:
+ result[rec[0][1:].strip()] = ''.join(map(strip, rec[1:]))
+ except IndexError: #maybe we didn't get a sequence?
+ pass
+ fasta_cmd_res.cleanUp()
+ return result
+
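+# A usage sketch mirroring the debug example at the bottom of this module
+# (the accession and database are placeholders; fastacmd must be installed):
+#
+#     seqs = seqs_from_fastacmd(['16766313'], 'nr', is_protein=True)
+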
+def psiblast_n_neighbors(seqs,
+ n=100,
+ blast_db=None,
+ core_threshold=1e-50,
+ extra_threshold=1e-10,
+ lower_threshold=1e-6,
+ step=100,
+ method="two-step",
+ blast_mat_root=None,
+ params={},
+ add_seq_names=False,
+ WorkingDir=None,
+ SuppressStderr=None,
+ SuppressStdout=None,
+ input_handler=None,
+ scorer=3, #shotgun with 3 hits needed to keep
+ second_db=None
+ ):
+ """PsiBlasts sequences, stopping when n neighbors are reached.
+
+ core_threshold: threshold for the core profile (default: 1e-50)
+ extra_threshold: threshold for pulling in additional seqs (default:1e-10)
+ lower_threshold: threshold for seqs in final round (default:1e-6)
+
+ seqs: either file name or list of sequence objects or list of strings or
+ single multiline string containing sequences.
+ If you want to skip the detection and force a specific type of input
+ handler, use input_handler='your_favorite_handler'.
+
+ add_seq_names: boolean. if True, sequence names are inserted in the list
+ of sequences. if False, it assumes seqs is a list of lines of some
+ proper format that the program can handle
+ """
+ if blast_db:
+ params["-d"] = blast_db
+
+ ih = input_handler or guess_input_handler(seqs, add_seq_names)
+ recs = seqs_to_stream(seqs, ih) #checkpointing can only handle one seq...
+
+ #set up the parameters for the core and additional runs
+ max_iterations = params['-j']
+ params['-j'] = 2 #won't checkpoint with single iteration
+
+ app = PsiBlast(params=params,
+ blast_mat_root=blast_mat_root,
+ InputHandler='_input_as_lines',
+ WorkingDir=WorkingDir,
+ SuppressStderr=SuppressStderr,
+ SuppressStdout=SuppressStdout,
+ )
+ result = {}
+ for seq in recs:
+ query_id = seq[0][1:].split(None,1)[0]
+ if method == "two-step":
+ result[query_id] = ids_from_seq_two_step(seq, n, max_iterations, \
+ app, core_threshold, extra_threshold, lower_threshold, second_db)
+ elif method == "lower_threshold":
+ result[query_id] = ids_from_seq_lower_threshold(seq, n, \
+ max_iterations, app, core_threshold, lower_threshold, step)
+ elif method == "iterative":
+ result[query_id] = ids_from_seqs_iterative(seq, app, \
+ QMEPsiBlast9, scorer, params['-j'], n)
+ else:
+ raise TypeError, "Got unknown method %s" % method
+
+ params['-j'] = max_iterations
+ return result
+
+def ids_from_seq_two_step(seq, n, max_iterations, app, core_threshold, \
+ extra_threshold, lower_threshold, second_db=None):
+ """Returns ids that match a seq, using a 2-tiered strategy.
+
+ Optionally uses a second database for the second search.
+ """
+ #first time through: reset 'h' and 'e' to core
+ #-h is the e-value threshold for including seqs in the score matrix model
+ app.Parameters['-h'].on(core_threshold)
+ #-e is the e-value threshold for the final blast
+ app.Parameters['-e'].on(core_threshold)
+ checkpoints = []
+ ids = []
+ last_num_ids = None
+ for i in range(max_iterations):
+ if checkpoints:
+ app.Parameters['-R'].on(checkpoints[-1])
+ curr_check = 'checkpoint_%s.chk' % i
+ app.Parameters['-C'].on(curr_check)
+
+ output = app(seq)
+ #if we didn't write a checkpoint, bail out
+ if not access(curr_check, F_OK):
+ break
+ #if we got here, we wrote a checkpoint file
+ checkpoints.append(curr_check)
+ result = list(output.get('BlastOut', output['StdOut']))
+ output.cleanUp()
+ if result:
+ ids = LastProteinIds9(result,keep_values=True,filter_identity=False)
+ num_ids = len(ids)
+ if num_ids >= n:
+ break
+ if num_ids == last_num_ids:
+ break
+ last_num_ids = num_ids
+
+ #if we didn't write any checkpoints, second run won't work, so return ids
+ if not checkpoints:
+ return ids
+
+    #if we already got enough ids and don't have a second database, return them
+ if (not second_db) and num_ids >= n:
+ return ids
+
+ #second time through: reset 'h' and 'e' to get extra hits, and switch the
+ #database if appropriate
+ app.Parameters['-h'].on(extra_threshold)
+ app.Parameters['-e'].on(lower_threshold)
+ if second_db:
+ app.Parameters['-d'].on(second_db)
+ for i in range(max_iterations): #will always have last_check if we get here
+ app.Parameters['-R'].on(checkpoints[-1])
+ curr_check = 'checkpoint_b_%s.chk' % i
+ app.Parameters['-C'].on(curr_check)
+ output = app(seq)
+ #bail out if we couldn't write a checkpoint
+ if not access(curr_check, F_OK):
+ break
+ #if we got here, the checkpoint worked
+ checkpoints.append(curr_check)
+ result = list(output.get('BlastOut', output['StdOut']))
+ if result:
+ ids = LastProteinIds9(result,keep_values=True,filter_identity=False)
+ num_ids = len(ids)
+ if num_ids >= n:
+ break
+ if num_ids == last_num_ids:
+ break
+ last_num_ids = num_ids
+ #return the ids we got. may not be as many as we wanted.
+ for c in checkpoints:
+ remove(c)
+ return ids
+
+class ThresholdFound(Exception): pass
+
+def ids_from_seq_lower_threshold(seq, n, max_iterations, app, core_threshold, \
+ lower_threshold, step=100):
+ """Returns ids that match a seq, decreasing the sensitivity."""
+ last_num_ids = None
+ checkpoints = []
+ cp_name_base = make_unique_str()
+
+    # cache ids for each iteration
+ # store { iteration_num:(core_threshold, [list of matching ids]) }
+ all_ids = {}
+ try:
+ i=0
+ while 1:
+ #-h is the e-value threshold for inclusion in the score matrix model
+ app.Parameters['-h'].on(core_threshold)
+ app.Parameters['-e'].on(core_threshold)
+ if core_threshold > lower_threshold:
+ raise ThresholdFound
+ if checkpoints:
+ #-R restarts from a previously stored file
+ app.Parameters['-R'].on(checkpoints[-1])
+ #store the score model from this iteration
+ curr_check = 'checkpoint_' + cp_name_base + '_' + str(i) + \
+ '.chk'
+ app.Parameters['-C'].on(curr_check)
+ output = app(seq)
+ result = list(output.get('BlastOut', output['StdOut']))
+ #sometimes fails on first try -- don't know why, but this seems
+ #to fix problem
+ while not result:
+ output = app(seq)
+ result = list(output.get('BlastOut', output['StdOut']))
+
+ ids = LastProteinIds9(result,keep_values=True,filter_identity=False)
+ output.cleanUp()
+ all_ids[i + 1] = (core_threshold, copy(ids))
+ if not access(curr_check, F_OK):
+ raise ThresholdFound
+ checkpoints.append(curr_check)
+ num_ids = len(ids)
+ if num_ids >= n:
+ raise ThresholdFound
+ last_num_ids = num_ids
+ core_threshold *= step
+ if i >= max_iterations - 1: #because max_iterations is 1-based
+ raise ThresholdFound
+ i += 1
+ except ThresholdFound:
+ for c in checkpoints:
+ remove(c)
+ #turn app.Parameters['-R'] off so that for the next file it does not
+ #try and read in a checkpoint file that is not there
+ app.Parameters['-R'].off()
+ return ids, i + 1, all_ids
+
+def make_unique_str(num_chars=20):
+ """make a random string of characters for a temp filename"""
+    chars = 'abcdefghijklmnopqrstuvwxyz'
+    all_chars = chars + chars.upper() + '0123456789'
+ picks = list(all_chars)
+ return ''.join([choice(picks) for i in range(num_chars)])
+
+def make_subject_match_scorer(count):
+ def subject_match_scorer(checked_ids):
+ """From {subject:{query:score}} returns subject ids w/ >= count hits.
+
+        Useful for eliminating subjects with few homologs.
+ """
+ return [key for key, val in checked_ids.items() if len(val) >= count]
+ return subject_match_scorer
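+
+# For example (hypothetical data): with
+# checked_ids = {'s1': {'q1': 1e-5, 'q2': 1e-4}, 's2': {'q1': 1e-3}},
+# make_subject_match_scorer(2)(checked_ids) returns ['s1'].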
+
+def make_shotgun_scorer(count):
+ def shotgun_scorer(checked_ids):
+ """From {subject:{query:score}} returns any ids w/ >= count hits.
+
+ A hit counts towards a sequence's score if it was either the subject
+ or the query, but we don't double-count (subject, query) pairs, i.e.
+ if A hits B and B hits A, only one (A,B) hit will be counted, although
+ it will be counted as both (A,B) and (B,A) (i.e. it will help preserve
+ both A and B).
+ """
+ result = {}
+ for subject, val in checked_ids.items():
+ for query in val.keys():
+ if subject not in result:
+ result[subject] = {}
+ result[subject][query] = True
+ if query not in result:
+ result[query] = {}
+ result[query][subject] = True
+ return [key for key, val in result.items() if len(val) >= count]
+ return shotgun_scorer
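+
+# For example (hypothetical data): with checked_ids = {'A': {'B': 1e-5}},
+# make_shotgun_scorer(1)(checked_ids) returns both 'A' and 'B', because the
+# single A->B hit is credited to the subject and the query alike.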
+
+def keep_everything_scorer(checked_ids):
+ """Returns every query and every match in checked_ids, with best score."""
+ result = checked_ids.keys()
+ for i in checked_ids.values():
+ result.extend(i.keys())
+ return dict.fromkeys(result).keys()
+
+def ids_from_seqs_iterative(seqs, app, query_parser, \
+ scorer=keep_everything_scorer, max_iterations=None, blast_db=None,\
+ max_seqs=None, ):
+ """Gets the ids from each seq, then does each additional id until all done.
+
+ If scorer is passed in as an int, uses shotgun scorer with that # hits.
+ """
+ if isinstance(scorer, int):
+ scorer = make_shotgun_scorer(scorer)
+ seqs_to_check = list(seqs)
+ checked_ids = {}
+ curr_iteration = 0
+ while seqs_to_check:
+ unchecked_ids = {}
+ #pass seqs to command
+ all_output = app(seqs_to_check)
+ output = all_output.get('BlastOut', all_output['StdOut'])
+
+ for query_id, match_id, match_score in query_parser(output):
+ if query_id not in checked_ids:
+ checked_ids[query_id] = {}
+ checked_ids[query_id][match_id] = match_score
+ if match_id not in checked_ids:
+ unchecked_ids[match_id] = True
+ all_output.cleanUp()
+        if unchecked_ids:
+            #fetch the new ids once and stream records from the result
+            seq_file = fasta_cmd_get_seqs(unchecked_ids.keys(),
+                app.Parameters['-d'].Value)['StdOut']
+            seqs_to_check = []
+            for s in FastaCmdFinder(seq_file):
+                seqs_to_check.extend(s)
+ else:
+ seqs_to_check = []
+ #bail out if max iterations or max seqs was defined and we've reached it
+ curr_iteration += 1
+ if max_iterations and (curr_iteration >= max_iterations):
+ break
+ if max_seqs:
+ curr = scorer(checked_ids)
+ if len(curr) >= max_seqs:
+ return curr
+ return scorer(checked_ids) #scorer should return list of good ids
+
+
+def blastp(seqs, blast_db="nr", e_value="1e-20", max_hits=200,
+ working_dir=tempfile.gettempdir(), blast_mat_root=None,
+ extra_params={}):
+ """
+ Returns BlastResult from input seqs, using blastp.
+
+ Need to add doc string
+ """
+
+ # set up params to use with blastp
+ params = {
+ # matrix
+ "-M":"BLOSUM62",
+
+ # max procs
+ "-a":"1",
+
+ # expectation
+ "-e":e_value,
+
+ # max seqs to show
+ "-b":max_hits,
+
+ # max one line descriptions
+ "-v":max_hits,
+
+ # program
+ "-p":"blastp"
+ }
+ params.update(extra_params)
+
+ # blast
+ blast_res = blast_seqs(seqs,
+ Blastall,
+ blast_mat_root=blast_mat_root,
+ blast_db=blast_db,
+ params=params,
+ add_seq_names=False,
+ WorkingDir=working_dir
+ )
+
+    # parse tabular blast output into a BlastResult
+ if blast_res['StdOut']:
+ lines = [x for x in blast_res['StdOut']]
+ return BlastResult(lines)
+
+ return None
+
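+# A usage sketch (hypothetical file; requires blastall and a formatted
+# protein database, e.g. reachable via the BLASTDB environment variable):
+#
+#     res = blastp('queries.fasta', blast_db='nr', e_value='1e-30')
+#     # res is a cogent BlastResult, or None if there was no output
+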
+def blastn(seqs, blast_db="nt", e_value="1e-20", max_hits=200,
+ working_dir=tempfile.gettempdir(), blast_mat_root=None,
+ extra_params={}):
+ """
+ Returns BlastResult from input seqs, using blastn.
+
+ Need to add doc string
+ """
+
+    # set up params to use with blastn
+ params = {
+ # matrix
+ "-M":"BLOSUM62",
+
+ # max procs
+ "-a":"1",
+
+ # expectation
+ "-e":e_value,
+
+ # max seqs to show
+ "-b":max_hits,
+
+ # max one line descriptions
+ "-v":max_hits,
+
+ # program
+ "-p":"blastn"
+ }
+ params.update(extra_params)
+
+ # blast
+ blast_res = blast_seqs(seqs,
+ Blastall,
+ blast_mat_root=blast_mat_root,
+ blast_db=blast_db,
+ params=params,
+ add_seq_names=False,
+ WorkingDir=working_dir
+ )
+
+    # parse tabular blast output into a BlastResult
+ if blast_res['StdOut']:
+ lines = [x for x in blast_res['StdOut']]
+ return BlastResult(lines)
+
+ return None
+
+
+
+def blastx(seqs, params=None):
+ """Returns BlastResults from input seqs, using blastx."""
+ raise NotImplementedError
+
+def tblastx(seqs, params=None):
+ """Returns BlastResults from input seqs, using tblastx."""
+ raise NotImplementedError
+
+def psiblast(seqs, params=None):
+ """Returns BlastResults from input seqs, using psiblast."""
+ raise NotImplementedError
+
+def reciprocal_best_blast_hit(query_id, db_1, db_2, exclude_self_hits=True,\
+ params=None):
+ """Returns best hit in db_2 that maps back to query_id in db_1, or None.
+
+ exclude_self_hits: if True (the default), returns the best hit that
+ doesn't have the same id. Otherwise, will return the same id if it is in
+ both databases (assuming it's the same sequence in both).
+ """
+ raise NotImplementedError
+
+ #make with factory functions for the blast hits
+
+
+if __name__ == "__main__":
+
+ print "Debug. examples of how i've been using."
+
+ print "Example of straightforward BLAST"
+
+# WARNING: I changed a bunch of stuff to make testing easier, since nr doesn't
+# fit in memory on my laptop. I created a database 'eco' using formatdb on the
+# E. coli K12 fasta file from this URL:
+# ftp://ftp.ncbi.nlm.nih.gov/genomes/Bacteria/Escherichia_coli_K12/NC_000913.faa
+# Because we're blasting an archaeal sequence against one bacterial genome, I
+# relaxed the inclusion thresholds substantially. DO NOT USE THESE AGAINST NR!
+
+ in_filename = "test_seq.fasta"
+ out_filename = "test.out"
+ # if blast env variable set, can just say 'nr'
+ #BLAST_DB = "/home/hamady/quicksand/data/blast/db/nr"
+ BLAST_DB = 'nr' #'nr'
+ BLAST_MAT_ROOT="/home/hamady/apps/blast-2.2.9/data"
+ #BLAST_MAT_ROOT='/Users/rob/ncbi/data'
+ # set up params to use with iterative
+
+ #print seqs_from_fastacmd(['16766313'], 'nr', True)
+ #raise ValueError, "dbug"
+ params = {
+
+ # matrix
+ "-M":"PAM70",
+ # max procs
+ "-a":2,
+ # expect
+ "-e":1e-15,
+
+# blastall
+# # program
+# "-p":"blastp",
+
+# psi-blast
+ # max iterations
+ "-j":2,
+
+ # max seqs to show
+ "-b":50,
+ # inclusion
+ "-h":1e-2,
+ }
+
+ in_seqs = """>stm:STMabcdef thrA; aspartokinase I
+ MRVLKFGGTSVANAERFLRVADILESNARQGQVATVLSAPAKITNHLVAMIEKTIGGQDA
+ LPNISDAERIFSDLLAGLASAQPGFPLARLKMVVEQEFAQIKHVLHGISLLGQCPDSINA
+ ALICRGEKMSIAIMAGLLEARGHRVTVIDPVEKLLAVGHYLESTVDIAESTRRIAASQIP
+ ADHMILMAGFTAGNEKGELVVLGRNGSDYSAAVLAACLRADCCEIWTDVDGVYTCDPRQV
+ PDARLLKSMSYQEAMELSYFGAKVLHPRTITPIAQFQIPCLIKNTGNPQAPGTLIGASDS
+ DDNLPVKGISNLNNMAMFSVSGPGMKGMIGMAARVFAAMSRAGISVVLITQSSSEYSISF
+ CVPQSDCARARRAMQDEFYLELKEGLLEPLAVTERLAIISVVGDGMRTLRGISAKFFAAL
+ ARANINIVAIAQGSSERSISVVVNNDDATTGVRVTHQMLFNTDQVIEVFVIGVGGVGGAL"""
+
+# The following should now give the same output:
+#
+# in_seqs = 'tiny.faa' #tiny.faa in cwd contains the sequence above
+#
+# in_seqs = """>gi|2501594|sp|Q57997|Y577_METJA PROTEIN MJ0577
+#MSVMYKKILYPTDFSETAEIALKHVKAFKTLKAEEVILLHVIDEREIKKRDIFSLLLGVAGLNKSVEEFE
+#NELKNKLTEEAKNKMENIKKELEDVGFKVKDIIVVGIPHEEIVKIAEDEGVDIIIMGSHGKTNLKEILLG
+#SVTENVIKKSNKPVLVVKRKNS""".split() #lines instead of multiline string
+#
+ blast_res = blast_seqs(in_seqs, Blastall,
+ blast_mat_root=BLAST_MAT_ROOT,
+ add_seq_names=False,
+ blast_db=BLAST_DB,
+ params={'-p': 'blastp','-e': '1','-m': 9},
+ out_filename=out_filename)
+
+ print [x for x in blast_res['StdOut']]
+ print [x for x in blast_res['StdErr']]
+ print blast_res
+ #for x in blast_res['BlastOut']:
+ # print x.rstrip()
+ blast_res.cleanUp()
+ #print '\n\n'
+ #print "Example of psiblast_n_neighbors"
+ #print "Method 1: two-step with high- and low-confidence matches"
+ #print psiblast_n_neighbors(in_seqs, n=10, blast_db=BLAST_DB, \
+ # method="two-step", blast_mat_root=BLAST_MAT_ROOT,params=params,\
+ # core_threshold=1e-5, extra_threshold=1e-2, lower_threshold=1e-1)
+ #print
+ #print "Method 2: keep lowering threshold"
+ #print psiblast_n_neighbors(in_seqs, n=10, blast_db=BLAST_DB, \
+ # method="lower_threshold", blast_mat_root=BLAST_MAT_ROOT,params=params,
+ # core_threshold=1e-6, lower_threshold=1e-2)
+ #print
+ #print "Method 3: psi-blast shotgun"
+ #print psiblast_n_neighbors(in_seqs, n=10, blast_db=BLAST_DB, \
+ # method="iterative", blast_mat_root=BLAST_MAT_ROOT,params=params,
+ # core_threshold=1e-5, lower_threshold=1e-2)
+ #print
+ #print "Method 4: two-step with high- and low-confidence matches, diff dbs"
+ #print psiblast_n_neighbors(in_seqs, n=10, blast_db=BLAST_DB, \
+ # method="two-step", blast_mat_root=BLAST_MAT_ROOT,params=params,\
+ # core_threshold=1e-5, extra_threshold=1e-2, lower_threshold=1e-1, second_db='stm')
+ #print
diff --git a/bfillings/blat.py b/bfillings/blat.py
new file mode 100644
index 0000000..424a091
--- /dev/null
+++ b/bfillings/blat.py
@@ -0,0 +1,422 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+
+"""Application controller for BLAT v34"""
+
+from os import remove
+from os.path import isabs
+from tempfile import mkstemp
+
+from cogent import DNA
+from cogent.core.genetic_code import GeneticCodes
+from cogent.parse.blast import MinimalBlatParser9
+
+from skbio.parse.sequences import parse_fasta
+from burrito.util import (CommandLineApplication, ResultPath,
+ ApplicationError)
+from burrito.parameters import FlagParameter, ValuedParameter, FilePath
+
+
+class Blat(CommandLineApplication):
+
+ """BLAT generic application controller"""
+
+ _command = 'blat'
+ _input_handler = "_input_as_list"
+
+ _database_types = ['dna', 'prot', 'dnax']
+ _query_types = ['dna', 'rna', 'prot', 'dnax', 'rnax']
+ _mask_types = ['lower', 'upper', 'out', 'file.out']
+ _out_types = ['psl', 'pslx', 'axt', 'maf', 'sim4', 'wublast', 'blast',
+ 'blast8', 'blast9']
+ _valid_combinations = [('dna', 'dna'), ('dna', 'rna'), ('prot', 'prot'),
+ ('dnax', 'prot'), ('dnax', 'dnax'),
+ ('dnax', 'rnax')]
+ _database = None
+ _query = None
+ _output = None
+
+ _parameters = {
+ # database type (dna, prot, or dnax, where dnax is DNA sequence
+        # translated in six frames to protein)
+ '-t': ValuedParameter('-', Delimiter='=', Name='t'),
+
+ # query type (dna, rna, prot, dnax, rnax, where rnax is DNA sequence
+        # translated in three frames to protein)
+ '-q': ValuedParameter('-', Delimiter='=', Name='q'),
+
+ # Use overused tile file N.ooc, and N should correspond to the tileSize
+ '-ooc': ValuedParameter('-', Delimiter='=', Name='ooc', IsPath=True),
+
+        # Sets the size of a match that triggers an alignment
+ '-tileSize': ValuedParameter('-', Delimiter='=', Name='tileSize'),
+
+ # Spacing between tiles.
+ '-stepSize': ValuedParameter('-', Delimiter='=', Name='stepSize'),
+
+ # If set to 1, allows one mismatch in the tile and still triggers
+ # an alignment.
+ '-oneOff': ValuedParameter('-', Delimiter='=', Name='oneOff'),
+
+ # sets the number of tile matches
+ '-minMatch': ValuedParameter('-', Delimiter='=', Name='minMatch'),
+
+ # sets the minimum score
+ '-minScore': ValuedParameter('-', Delimiter='=', Name='minScore'),
+
+ # sets the minimum sequence identity in percent
+ '-minIdentity':
+ ValuedParameter('-', Delimiter='=', Name='minIdentity'),
+
+        # sets the size of the maximum gap between tiles in a clump
+ '-maxGap': ValuedParameter('-', Delimiter='=', Name='maxGap'),
+
+ # make an overused tile file. Target needs to be complete genome.
+ '-makeOoc': ValuedParameter('-', Delimiter='=', Name='makeOoc',
+ IsPath=True),
+
+ # sets the number of repetitions of a tile allowed before it is marked
+ # as overused
+ '-repMatch': ValuedParameter('-', Delimiter='=', Name='repMatch'),
+
+ # mask out repeats. Alignments won't be started in masked region but
+ # may extend through it in nucleotide searches. Masked areas are
+ # ignored entirely in protein or translated searches. Types are:
+ # lower, upper, out, file.out (file.out - mask database according to
+        # RepeatMasker file.out)
+ '-mask': ValuedParameter('-', Delimiter='=', Name='mask'),
+
+ # Mask out repeats in query sequence. similar to -mask but for query
+ # rather than target sequence
+ '-qMask': ValuedParameter('-', Delimiter='=', Name='qMask'),
+
+ # repeat bases will not be masked in any way, but matches in
+ # repeat areas will be reported separately from matches in other
+        # areas in the psl output
+ '-repeats': ValuedParameter('-', Delimiter='=', Name='repeats'),
+
+ # minimum percent divergence of repeats to allow them to be unmasked
+ '-minRepDivergence': ValuedParameter('-', Delimiter='=',
+ Name='minRepDivergence'),
+
+ # output dot every N sequences to show program's progress
+ '-dots': ValuedParameter('-', Delimiter='=', Name='dots'),
+
+ # controls output file format. One of:
+ # psl - Default. Tab separated format, no sequence
+ # pslx - Tab separated format with sequence
+ # axt - blastz-associated axt format
+ # maf - multiz-associated maf format
+ # sim4 - similar to sim4 format
+ # wublast - similar to wublast format
+ # blast - similar to NCBI blast format
+        # blast8 - NCBI blast tabular format
+ # blast9 - NCBI blast tabular format with comments
+ '-out': ValuedParameter('-', Delimiter='=', Name='out'),
+
+ # sets maximum intron size
+ '-maxIntron': ValuedParameter('-', Delimiter='=', Name='maxIntron'),
+
+ # suppress column headers in psl output
+ '-noHead': FlagParameter('-', Name='noHead'),
+
+ # trim leading poly-T
+ '-trimT': FlagParameter('-', Name='trimT'),
+
+ # do not trim trailing poly-A
+ '-noTrimA': FlagParameter('-', Name='noTrimA'),
+
+ # Remove poly-A tail from qSize as well as alignments in psl output
+ '-trimHardA': FlagParameter('-', Name='trimHardA'),
+
+ # run for fast DNA/DNA remapping - not allowing introns,
+ # requiring high %ID
+ '-fastMap': FlagParameter('-', Name='fastMap'),
+
+ # for high quality mRNAs, look harder for small initial and terminal
+ # exons
+ '-fine': FlagParameter('-', Name='fine'),
+
+ # Allows extension of alignment through large blocks of N's
+ '-extendThroughN': FlagParameter('-', Name='extendThroughN')
+ }
+
+ def _get_result_paths(self, data):
+ """Returns the file location for result output
+ """
+
+ return {'output': ResultPath(data[2], IsWritten=True)}
+
+ def _get_base_command(self):
+ """Gets the command that will be run when the app controller is
+ called.
+ """
+ command_parts = []
+ cd_command = ''.join(['cd ', str(self.WorkingDir), ';'])
+ if self._command is None:
+ raise ApplicationError('_command has not been set.')
+ command = self._command
+ parameters = sorted([str(x) for x in self.Parameters.values()
+ if str(x)])
+
+ synonyms = self._synonyms
+
+ command_parts.append(cd_command)
+ command_parts.append(command)
+ command_parts.append(self._database) # Positional argument
+ command_parts.append(self._query) # Positional argument
+ command_parts += parameters
+ if self._output:
+ command_parts.append(self._output.Path) # Positional
+
+ return (
+ self._command_delimiter.join(filter(None, command_parts)).strip()
+ )
+
+ BaseCommand = property(_get_base_command)
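+    # For data=['/tmp/q.fna', '/tmp/db.fna', '/tmp/out.psl'] with -out=blast9
+    # set, BaseCommand resembles (a sketch; note the database precedes the
+    # query on the blat command line):
+    #   cd /tmp/; blat /tmp/db.fna /tmp/q.fna -out=blast9 /tmp/out.psl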
+
+ def _input_as_list(self, data):
+ '''Takes the positional arguments as input in a list.
+
+ The list input here should be [query_file_path, database_file_path,
+ output_file_path]'''
+ query, database, output = data
+ if (not isabs(database)) \
+ or (not isabs(query)) \
+ or (not isabs(output)):
+ raise ApplicationError("Only absolute paths allowed.\n%s" %
+ ', '.join(data))
+
+ self._database = FilePath(database)
+ self._query = FilePath(query)
+ self._output = ResultPath(output, IsWritten=True)
+
+ # check parameters that can only take a particular set of values
+        # check combination of database and query types
+ if self.Parameters['-t'].isOn() and self.Parameters['-q'].isOn() and \
+ (self.Parameters['-t'].Value, self.Parameters['-q'].Value) not in \
+ self._valid_combinations:
+ error_message = "Invalid combination of database and query " + \
+ "types ('%s', '%s').\n" % \
+                            (self.Parameters['-t'].Value,
+                             self.Parameters['-q'].Value)
+
+ error_message += "Must be one of: %s\n" % \
+ repr(self._valid_combinations)
+
+ raise ApplicationError(error_message)
+
+ # check database type
+ if self.Parameters['-t'].isOn() and \
+ self.Parameters['-t'].Value not in self._database_types:
+ error_message = "Invalid database type %s\n" % \
+ self.Parameters['-t'].Value
+
+ error_message += "Allowed values: %s\n" % \
+ ', '.join(self._database_types)
+
+ raise ApplicationError(error_message)
+
+ # check query type
+ if self.Parameters['-q'].isOn() and \
+ self.Parameters['-q'].Value not in self._query_types:
+ error_message = "Invalid query type %s\n" % \
+ self.Parameters['-q'].Value
+
+ error_message += "Allowed values: %s\n" % \
+ ', '.join(self._query_types)
+
+ raise ApplicationError(error_message)
+
+ # check mask type
+ if self.Parameters['-mask'].isOn() and \
+ self.Parameters['-mask'].Value not in self._mask_types:
+ error_message = "Invalid mask type %s\n" % \
+ self.Parameters['-mask']
+
+ error_message += "Allowed Values: %s\n" % \
+ ', '.join(self._mask_types)
+
+ raise ApplicationError(error_message)
+
+ # check qmask type
+ if self.Parameters['-qMask'].isOn() and \
+ self.Parameters['-qMask'].Value not in self._mask_types:
+ error_message = "Invalid qMask type %s\n" % \
+ self.Parameters['-qMask'].Value
+
+ error_message += "Allowed values: %s\n" % \
+ ', '.join(self._mask_types)
+
+ raise ApplicationError(error_message)
+
+ # check repeat type
+ if self.Parameters['-repeats'].isOn() and \
+ self.Parameters['-repeats'].Value not in self._mask_types:
+ error_message = "Invalid repeat type %s\n" % \
+                self.Parameters['-repeats'].Value
+
+ error_message += "Allowed values: %s\n" % \
+ ', '.join(self._mask_types)
+
+ raise ApplicationError(error_message)
+
+ # check output format
+ if self.Parameters['-out'].isOn() and \
+ self.Parameters['-out'].Value not in self._out_types:
+ error_message = "Invalid output type %s\n" % \
+                self.Parameters['-out'].Value
+
+ error_message += "Allowed values: %s\n" % \
+ ', '.join(self._out_types)
+
+ raise ApplicationError(error_message)
+
+ return ''
+
+
+def assign_reads_to_database(query_fasta_fp, database_fasta_fp, output_fp,
+ params=None):
+ """Assign a set of query sequences to a reference database
+
+ query_fasta_fp : absolute file path to query sequences
+ database_fasta_fp : absolute file path to the reference database
+ output_fp : absolute file path of the output file to write
+ params : dict of BLAT specific parameters.
+
+ This method returns an open file object. The output format
+ defaults to blast9 and should be parsable by the PyCogent BLAST parsers.
+ """
+ if params is None:
+ params = {}
+ if '-out' not in params:
+ params['-out'] = 'blast9'
+ blat = Blat(params=params)
+
+ result = blat([query_fasta_fp, database_fasta_fp, output_fp])
+ return result['output']
+
+
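+# Example usage of assign_reads_to_database (a minimal sketch; the file paths
+# are hypothetical and the blat executable must be on the PATH):
+#
+#     out = assign_reads_to_database('/tmp/reads.fna', '/tmp/db.fna',
+#                                    '/tmp/hits.blast9')
+#     hits = out.readlines()  # blast9-format lines
+#     out.close()
+
+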
+def assign_dna_reads_to_dna_database(query_fasta_fp, database_fasta_fp,
+ output_fp, params=None):
+ """Assign DNA reads to a database fasta of DNA sequences.
+
+ Wraps assign_reads_to_database, setting database and query types. All
+ parameters are set to default unless params is passed.
+
+ query_fasta_fp: absolute path to the query fasta file containing DNA
+ sequences.
+ database_fasta_fp: absolute path to the database fasta file containing
+ DNA sequences.
+ output_fp: absolute path where the output file will be generated.
+ params: optional. dict containing parameter settings to be used
+ instead of default values. Cannot change database or query
+ file types from dna and dna, respectively.
+
+ This method returns an open file object. The output format
+ defaults to blast9 and should be parsable by the PyCogent BLAST parsers.
+ """
+ if params is None:
+ params = {}
+
+ my_params = {'-t': 'dna',
+ '-q': 'dna'
+ }
+
+ # if the user specified parameters other than default, then use them.
+ # However, if they try to change the database or query types, raise an
+    # application error.
+ if '-t' in params or '-q' in params:
+ raise ApplicationError("Cannot change database or query types when " +
+ "using assign_dna_reads_to_dna_database. " +
+ "Use assign_reads_to_database instead.\n")
+
+ my_params.update(params)
+
+ result = assign_reads_to_database(query_fasta_fp, database_fasta_fp,
+ output_fp, my_params)
+
+ return result
+
+
+def assign_dna_reads_to_protein_database(query_fasta_fp, database_fasta_fp,
+ output_fp, temp_dir="/tmp", params=None):
+ """Assign DNA reads to a database fasta of protein sequences.
+
+ Wraps assign_reads_to_database, setting database and query types. All
+ parameters are set to default unless params is passed. A temporary
+ file must be written containing the translated sequences from the input
+ query fasta file because BLAT cannot do this automatically.
+
+ query_fasta_fp: absolute path to the query fasta file containing DNA
+ sequences.
+ database_fasta_fp: absolute path to the database fasta file containing
+ protein sequences.
+ output_fp: absolute path where the output file will be generated.
+ temp_dir: optional. Change the location where the translated sequences
+ will be written before being used as the query. Defaults to
+ /tmp.
+ params: optional. dict containing parameter settings to be used
+ instead of default values. Cannot change database or query
+ file types from protein and dna, respectively.
+
+ This method returns an open file object. The output format
+ defaults to blast9 and should be parsable by the PyCogent BLAST parsers.
+ """
+ if params is None:
+ params = {}
+
+ my_params = {'-t': 'prot', '-q': 'prot'}
+
+ # make sure temp_dir specifies an absolute path
+ if not isabs(temp_dir):
+ raise ApplicationError("temp_dir must be an absolute path.")
+
+ # if the user specified parameters other than default, then use them.
+ # However, if they try to change the database or query types, raise an
+    # application error.
+ if '-t' in params or '-q' in params:
+ raise ApplicationError("Cannot change database or query types "
+ "when using assign_dna_reads_to_dna_database. Use "
+ "assign_reads_to_database instead.")
+
+ if 'genetic_code' in params:
+ my_genetic_code = GeneticCodes[params['genetic_code']]
+ del params['genetic_code']
+ else:
+ my_genetic_code = GeneticCodes[1]
+
+ my_params.update(params)
+
+ # get six-frame translation of the input DNA sequences and write them to
+ # temporary file.
+ _, tmp = mkstemp(dir=temp_dir)
+ tmp_out = open(tmp, 'w')
+
+ for label, sequence in parse_fasta(open(query_fasta_fp)):
+ seq_id = label.split()[0]
+
+ s = DNA.makeSequence(sequence)
+ translations = my_genetic_code.sixframes(s)
+ frames = [1, 2, 3, -1, -2, -3]
+ translations = dict(zip(frames, translations))
+
+ for frame, translation in sorted(translations.iteritems()):
+ entry = '>{seq_id}_frame_{frame}\n{trans}\n'
+ entry = entry.format(seq_id=seq_id, frame=frame, trans=translation)
+ tmp_out.write(entry)
+
+ tmp_out.close()
+ result = assign_reads_to_database(tmp, database_fasta_fp, output_fp,
+ params=my_params)
+
+ remove(tmp)
+
+ return result
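+
+
+# Example usage of assign_dna_reads_to_protein_database (a minimal sketch; the
+# paths are hypothetical and blat must be on the PATH). The reads are
+# six-frame translated before searching, so hits are reported against query
+# ids of the form <seq_id>_frame_<frame>:
+#
+#     out = assign_dna_reads_to_protein_database('/tmp/reads.fna',
+#                                                '/tmp/prot_db.fna',
+#                                                '/tmp/hits.blast9',
+#                                                params={'genetic_code': 11})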
diff --git a/bfillings/bwa.py b/bfillings/bwa.py
new file mode 100644
index 0000000..7fd36e2
--- /dev/null
+++ b/bfillings/bwa.py
@@ -0,0 +1,762 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+
+"""Application controller for BWA 0.6.2 (release 19 June 2012)"""
+
+from os.path import isabs
+from tempfile import mkstemp
+
+from burrito.parameters import FlagParameter, ValuedParameter
+from burrito.util import (CommandLineApplication, ResultPath,
+ ApplicationError)
+
+__author__ = "Adam Robbins-Pianka"
+__copyright__ = "Copyright 2007-2012, The Cogent Project"
+__credits__ = ["Adam Robbins-Pianka", "Jai Ram Rideout"]
+__license__ = "GPL"
+__version__ = "1.5.3-dev"
+__maintainer__ = "Adam Robbins-Pianka"
+__email__ = "adam.robbinspianka at colorado.edu"
+__status__ = "Production"
+
+# helper functions for argument checking
+
+
+def is_int(x):
+ # return true if it's an int
+ return ((isinstance(x, int)) or
+ # or it's a string that is all digits
+ (isinstance(x, str) and x.isdigit()) or
+ # otherwise return False
+ False)
+
+
+def is_float(x):
+ return (is_int(x) or
+ # or if it's a float
+ (isinstance(x, float)) or
+ # or it's a string with exactly one decimal and all digits on both sides of
+ # the decimal
+ (isinstance(x, str)
+ and '.' in x and all(map(str.isdigit, x.split('.', 1))))
+ # otherwise return False
+ or False)
+
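+# For example (illustrative values): is_int(3) and is_int('42') are True,
+# while is_int('4.2') is False. is_float accepts ints plus floats and
+# decimal strings, so is_float('4.2') is True but is_float('abc') is False.
+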
+# Exceptions
+
+
+class InvalidArgumentApplicationError(Exception):
+ pass
+
+
+class MissingRequiredArgumentApplicationError(Exception):
+ pass
+
+# Base class
+
+
+class BWA(CommandLineApplication):
+
+ """BWA generic application controller. Do not instantiate directly.
+
+ Instead of instantiating this class, instantiate a subclass for each
+ subcommand. Available subclasses are:
+ BWA_index
+ BWA_aln
+ BWA_samse
+ BWA_sampe
+ BWA_bwasw
+ """
+
+ # all subclasses will accept dictionaries as input that specify input
+ # and output files. The required (and optional) types of input and output
+ # files differ by subcommand.
+ _input_handler = "_input_as_dict"
+
+ # the main command. The program bwa should be in the PATH
+ _command = "bwa"
+
+ # holds the values of the dict handled by the input handler
+ _input = {}
+
+    # Each subclass can have a dictionary (keys = option names, e.g., -a,
+    # and values = boolean functions) called _valid_arguments
+    # that specifies checks to be made on the parameters.
+ def check_arguments(self):
+ """Sanity check the arguments passed in.
+
+ Uses the boolean functions specified in the subclasses in the
+ _valid_arguments dictionary to determine if an argument is valid
+ or invalid.
+ """
+ for k, v in self.Parameters.iteritems():
+ if self.Parameters[k].isOn():
+ if k in self._valid_arguments:
+ if not self._valid_arguments[k](v.Value):
+ error_message = 'Invalid argument (%s) ' % v.Value
+ error_message += 'for parameter %s\n' % k
+ raise InvalidArgumentApplicationError(error_message)
+
+ def _get_base_command(self):
+ """ Returns the full command string
+
+ Overridden here because there are positional arguments (specifically
+ the input and output files).
+ """
+ command_parts = []
+ # Append a change directory to the beginning of the command to change
+ # to self.WorkingDir before running the command
+ # WorkingDir should be in quotes -- filenames might contain spaces
+ cd_command = ''.join(['cd ', str(self.WorkingDir), ';'])
+ if self._command is None:
+ raise ApplicationError('_command has not been set.')
+ command = self._command
+ # also make sure there's a subcommand!
+ if self._subcommand is None:
+ raise ApplicationError('_subcommand has not been set.')
+ subcommand = self._subcommand
+ # sorting makes testing easier, since the options will be written out
+ # in alphabetical order. Could of course use option parsing scripts
+ # in cogent for this, but this works as well.
+ parameters = sorted([str(x) for x in self.Parameters.values()
+ if str(x)])
+
+ command_parts.append(cd_command)
+ command_parts.append(command)
+ # add in subcommand
+ command_parts.append(subcommand)
+ command_parts += parameters
+ # add in the positional arguments in the correct order
+ for k in self._input_order:
+ # this check is necessary to account for optional positional
+ # arguments, such as the mate file for bwa bwasw
+ # Note that the input handler will ensure that all required
+ # parameters have valid values
+ if k in self._input:
+ command_parts.append(self._input[k])
+
+ return self._command_delimiter.join(command_parts).strip()
+
+ BaseCommand = property(_get_base_command)
+
+ def _input_as_dict(self, data):
+ """Takes dictionary that sets input and output files.
+
+ Valid keys for the dictionary are specified in the subclasses. File
+ paths must be absolute.
+ """
+ # clear self._input; ready to receive new input and output files
+ self._input = {}
+ # Check that the arguments to the
+ # subcommand-specific parameters are valid
+ self.check_arguments()
+
+ # Ensure that we have all required input (file I/O)
+ for k in self._input_order:
+ # N.B.: optional positional arguments begin with underscore (_)!
+ # (e.g., see _mate_in for bwa bwasw)
+ if k[0] != '_' and k not in data:
+ raise MissingRequiredArgumentApplicationError("Missing "
+ "required "
+ "input %s" % k)
+
+ # Set values for input and output files
+ for k in data:
+ # check for unexpected keys in the dict
+ if k not in self._input_order:
+ error_message = "Invalid input arguments (%s)\n" % k
+ error_message += "Valid keys are: %s" % repr(self._input_order)
+ raise InvalidArgumentApplicationError(error_message + '\n')
+
+ # check for absolute paths
+            if not isabs(data[k]):
+ raise InvalidArgumentApplicationError("Only absolute paths "
+ "allowed.\n%s" %
+ repr(data))
+ self._input[k] = data[k]
+
+ # if there is a -f option to specify an output file, force the user to
+        # use it (otherwise output goes to stdout)
+ if '-f' in self.Parameters and not self.Parameters['-f'].isOn():
+ raise InvalidArgumentApplicationError("Please specify an output "
+ "file with -f")
+
+ return ''
+
+
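+# Example of the dict-style input taken by the BWA subclasses (a minimal
+# sketch; the paths are hypothetical and bwa must be on the PATH). Keys name
+# the positional file arguments of the subcommand and values are absolute
+# paths:
+#
+#     index = BWA_index()
+#     index({'fasta_in': '/tmp/db.fna'})
+#
+#     aln = BWA_aln(params={'-f': '/tmp/reads.sai'})
+#     aln({'prefix': '/tmp/db.fna', 'fastq_in': '/tmp/reads.fastq'})
+
+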
+class BWA_index(BWA):
+
+ """Controls the "index" subcommand of the bwa application.
+
+ Valid input keys are: fasta_in
+ """
+
+ # the subcommand for bwa index
+ _subcommand = "index"
+
+ _parameters = {
+ # which algorithm to use.
+ # is
+ # IS linear-time algorithm for constructing suffix array. It requires
+ # 5.37N memory where N is the size of the database. IS is moderately
+        # fast, but does not work with databases larger than 2GB. IS is the
+ # default algorithm due to its simplicity. The current codes for IS
+ # algorithm are reimplemented by Yuta Mori.
+ #
+ # bwtsw
+ # Algorithm implemented in BWT-SW. This method works with the whole
+        # human genome, but it does not work with databases smaller than 10MB
+ # and it is usually slower than IS.
+ #
+ # DEFAULTs to auto-select (based on input fasta file size)
+ '-a': ValuedParameter('-', Delimiter=' ', Name='a'),
+
+ # prefix for the output index.
+ # DEFAULTs to the base name of the input fasta file
+ '-p': ValuedParameter('-', Delimiter=' ', Name='p'),
+
+ # index files named as <in.fasta>.64.* instead of <in.fasta>.*
+ '-6': FlagParameter('-', Name='6')
+ }
+
+    # The -a command can take one of only two possible values
+    # the -p command allows the user to specify a prefix; for our purposes,
+    # this prefix should be an absolute path
+ _valid_arguments = {
+ '-a': lambda x: x in ['is', 'bwtsw'],
+ '-p': isabs
+ }
+
+ # For the position specific arguments, this is the order that they will
+ # be written in the base command
+ # input file keys beginning with _ are optional inputs
+ _input_order = ['fasta_in']
+
+ def _get_result_paths(self, data):
+ """Gets the results for a run of bwa index.
+
+ bwa index outputs 5 files when the index is created. The filename
+ prefix will be the same as the input fasta, unless overridden with
+ the -p option, and the 5 extensions are listed below:
+
+ .amb
+ .ann
+ .bwt
+ .pac
+ .sa
+
+        and these extensions (including the period) are the keys to the
+ dictionary that is returned.
+ """
+
+ # determine the names of the files. The name will be the same as the
+ # input fasta file unless overridden with the -p option
+ if self.Parameters['-p'].isOn():
+ prefix = self.Parameters['-p'].Value
+ else:
+ prefix = data['fasta_in']
+
+ # the 5 output file suffixes
+ suffixes = ['.amb', '.ann', '.bwt', '.pac', '.sa']
+ out_files = {}
+ for suffix in suffixes:
+ out_files[suffix] = ResultPath(prefix + suffix, IsWritten=True)
+
+ return out_files
+
+
+class BWA_aln(BWA):
+
+ """Controls the "aln" subcommand of the bwa application.
+
+ Valid input keys are: prefix, fastq_in
+ """
+ _parameters = {
+ # max #diff (int) or missing prob under 0.02 err rate (float) [0.04]
+ '-n': ValuedParameter('-', Delimiter=' ', Name='n'),
+ # maximum number or fraction of gap opens [1]
+ '-o': ValuedParameter('-', Delimiter=' ', Name='o'),
+
+ # maximum number of gap extensions, -1 for disabling long gaps
+ # [-1]
+ '-e': ValuedParameter('-', Delimiter=' ', Name='e'),
+
+ # do not put an indel within bp towards the ends [5]
+ '-i': ValuedParameter('-', Delimiter=' ', Name='i'),
+
+ # maximum occurrences for extending a long deletion [10]
+ '-d': ValuedParameter('-', Delimiter=' ', Name='d'),
+
+ # seed length [32]
+ '-l': ValuedParameter('-', Delimiter=' ', Name='l'),
+
+ # maximum differences in the seed [2]
+ '-k': ValuedParameter('-', Delimiter=' ', Name='k'),
+
+ # maximum entries in the queue [2000000]
+ '-m': ValuedParameter('-', Delimiter=' ', Name='m'),
+
+ # number of threads [1]
+ '-t': ValuedParameter('-', Delimiter=' ', Name='t'),
+
+ # mismatch penalty [3]
+ '-M': ValuedParameter('-', Delimiter=' ', Name='M'),
+
+ # gap open penalty [11]
+ '-O': ValuedParameter('-', Delimiter=' ', Name='O'),
+
+ # gap extension penalty [4]
+ '-E': ValuedParameter('-', Delimiter=' ', Name='E'),
+
+ # stop searching when there are > equally best hits [30]
+ '-R': ValuedParameter('-', Delimiter=' ', Name='R'),
+
+ # quality threshold for read trimming down to 35bp [0]
+ '-q': ValuedParameter('-', Delimiter=' ', Name='q'),
+
+ # file to write output to instead of stdout
+ '-f': ValuedParameter('-', Delimiter=' ', Name='f'),
+
+ # length of barcode
+ '-B': ValuedParameter('-', Delimiter=' ', Name='B'),
+
+ # log-scaled gap penalty for long deletions
+ '-L': FlagParameter('-', Name='L'),
+
+ # non-iterative mode: search for all n-difference hits (slooow)
+ '-N': FlagParameter('-', Name='N'),
+
+ # the input is in the Illumina 1.3+ FASTQ-like format
+ '-I': FlagParameter('-', Name='I'),
+
+ # the input read file is in the BAM format
+ '-b': FlagParameter('-', Name='b'),
+
+ # use single-end reads only (effective with -b)
+ '-0': FlagParameter('-', Name='0'),
+
+ # use the 1st read in a pair (effective with -b)
+ '-1': FlagParameter('-', Name='1'),
+
+ # use the 2nd read in a pair (effective with -b)
+ '-2': FlagParameter('-', Name='2'),
+
+ # filter Casava-filtered sequences
+ '-Y': FlagParameter('-', Name='Y')
+ }
+
+ # the subcommand for bwa aln
+ _subcommand = 'aln'
+
+ _valid_arguments = {
+ # check to see if this is decimal numbers
+ '-n': is_float,
+
+ # check to see if these are integers
+ '-o': is_int,
+ '-e': is_int,
+ '-i': is_int,
+ '-d': is_int,
+ '-l': is_int,
+ '-k': is_int,
+ '-m': is_int,
+ '-t': is_int,
+ '-M': is_int,
+ '-O': is_int,
+ '-E': is_int,
+ '-R': is_int,
+ '-q': is_int,
+ '-B': is_int,
+
+ # check to see if this is an absolute file path
+ '-f': isabs
+ }
+
+ # input file keys beginning with _ are optional inputs
+ _input_order = ['prefix', 'fastq_in']
+
+ def _get_result_paths(self, data):
+ """Gets the result file for a bwa aln run.
+
+ There is only one output file of a bwa aln run, a .sai file
+ and it can be retrieved with the key 'output'.
+ """
+ return {'output': ResultPath(self.Parameters['-f'].Value,
+ IsWritten=True)}
+
+
+class BWA_samse(BWA):
+
+ """Controls the "samse" subcommand of the bwa application.
+
+ Valid input keys are: prefix, sai_in, fastq_in
+ """
+ _parameters = {
+ # Maximum number of alignments to output in the XA tag for reads
+ # paired properly. If a read has more than this number of hits, the
+ # XA tag will not be written
+ '-n': ValuedParameter('-', Delimiter=' ', Name='n'),
+
+ # file to write output to instead of stdout
+ '-f': ValuedParameter('-', Delimiter=' ', Name='f'),
+
+ # Specify the read group in a format like '@RG\tID:foo\tSM:bar'
+ '-r': ValuedParameter('-', Delimiter=' ', Name='r')
+ }
+
+ # the subcommand for samse
+ _subcommand = 'samse'
+
+ _valid_arguments = {
+ # make sure that this is an int
+ '-n': is_int,
+
+ # check to see if this is an absolute file path
+ '-f': isabs
+ }
+
+ # input file keys beginning with _ are optional inputs
+ _input_order = ['prefix', 'sai_in', 'fastq_in']
+
+ def _get_result_paths(self, data):
+ """Gets the result file for a bwa samse run.
+
+ There is only one output file of a bwa samse run, a .sam file
+ and it can be retrieved with the key 'output'.
+ """
+ return {'output': ResultPath(self.Parameters['-f'].Value,
+ IsWritten=True)}
+
+
+class BWA_sampe(BWA):
+
+ """Controls the "sampe" subcommand of the bwa application.
+
+ Valid input keys are: prefix, sai1_in, sai2_in, fastq1_in,
+ fastq2_in
+ """
+ _parameters = {
+ # Maximum insert size for a read pair to be considered being mapped
+ # properly
+ '-a': ValuedParameter('-', Delimiter=' ', Name='a'),
+
+ # Maximum occurrences of a read for pairing
+ '-o': ValuedParameter('-', Delimiter=' ', Name='o'),
+
+ # Load the entire FM-index into memory to reduce disk operations
+ '-P': FlagParameter('-', Name='P'),
+
+ # maximum hits to output for paired reads [3]
+ '-n': ValuedParameter('-', Delimiter=' ', Name='n'),
+
+ # maximum hits to output for discordant pairs [10]
+ '-N': ValuedParameter('-', Delimiter=' ', Name='N'),
+
+ # file to write output to instead of stdout
+ '-f': ValuedParameter('-', Delimiter=' ', Name='f'),
+
+ # Specify the read group in a format like '@RG\tID:foo\tSM:bar'
+ '-r': ValuedParameter('-', Delimiter=' ', Name='r'),
+
+ # disable Smith-Waterman for the unmapped mate
+ '-s': FlagParameter('-', Name='s'),
+
+ # prior of chimeric rate (lower bound) [1.0e-05]
+ '-c': ValuedParameter('-', Delimiter=' ', Name='c'),
+
+ # disable insert size estimate (force -s)
+ '-A': FlagParameter('-', Name='A')
+ }
+
+ # the subcommand for sampe
+ _subcommand = 'sampe'
+
+ _valid_arguments = {
+ # make sure this is a float
+ '-c': is_float,
+
+ # make sure these are all ints
+ '-a': is_int,
+ '-o': is_int,
+ '-n': is_int,
+ '-N': is_int,
+
+ # check to see if this is an absolute file path
+ '-f': isabs
+ }
+
+ # input file keys beginning with _ are optional inputs
+ _input_order = ['prefix', 'sai1_in', 'sai2_in',
+ 'fastq1_in', 'fastq2_in']
+
+ def _get_result_paths(self, data):
+ """Gets the result file for a bwa sampe run.
+
+ There is only one output file of a bwa sampe run, a .sam file,
+ and it can be retrieved with the key 'output'.
+ """
+ return {'output': ResultPath(self.Parameters['-f'].Value,
+ IsWritten=True)}
+
+
+class BWA_bwasw(BWA):
+
+ """Controls the "bwasw" subcommand of the bwa application.
+
+    Valid input keys are: prefix, query_fasta, _query_fasta_2
+ input keys beginning with an underscore are optional.
+ """
+ _parameters = {
+ # Score of a match [1]
+ '-a': ValuedParameter('-', Delimiter=' ', Name='a'),
+
+ # Mismatch penalty [3]
+ '-b': ValuedParameter('-', Delimiter=' ', Name='b'),
+
+ # Gap open penalty [5]
+ '-q': ValuedParameter('-', Delimiter=' ', Name='q'),
+
+ # Gap extension penalty.
+ '-r': ValuedParameter('-', Delimiter=' ', Name='r'),
+
+ # mask level [0.50]
+ '-m': ValuedParameter('-', Delimiter=' ', Name='m'),
+
+ # Number of threads in the multi-threading mode [1]
+ '-t': ValuedParameter('-', Delimiter=' ', Name='t'),
+
+ # file to output results to instead of stdout
+ '-f': ValuedParameter('-', Delimiter=' ', Name='f'),
+
+ # Band width in the banded alignment [33]
+ '-w': ValuedParameter('-', Delimiter=' ', Name='w'),
+
+ # Minimum score threshold divided by a [30]
+ '-T': ValuedParameter('-', Delimiter=' ', Name='T'),
+
+ # Coefficient for threshold adjustment according to query length.
+ # Given an l-long query, the threshold for a hit to be retained is
+ # a*max{T,c*log(l)}. [5.5]
+ '-c': ValuedParameter('-', Delimiter=' ', Name='c'),
+
+ # Z-best heuristics. Higher -z increases accuracy at the cost
+ # of speed. [1]
+ '-z': ValuedParameter('-', Delimiter=' ', Name='z'),
+
+ # Maximum SA interval size for initiating a seed. Higher -s increases
+ # accuracy at the cost of speed. [3]
+ '-s': ValuedParameter('-', Delimiter=' ', Name='s'),
+
+ # Minimum number of seeds supporting the resultant alignment to
+ # trigger reverse alignment. [5]
+ '-N': ValuedParameter('-', Delimiter=' ', Name='N'),
+
+ # in SAM output, use hard clipping instead of soft clipping
+ '-H': FlagParameter('-', Name='H'),
+
+ # mark multi-part alignments as secondary
+ '-M': FlagParameter('-', Name='M'),
+
+        # skip Smith-Waterman read pairing
+ '-S': FlagParameter('-', Name='S'),
+
+        # ignore pairs with insert >= INT for inferring the size
+        # distribution [20000]
+ '-I': ValuedParameter('-', Delimiter=' ', Name='I')
+ }
+
+    # the subcommand for bwasw
+ _subcommand = 'bwasw'
+
+ # input file keys beginning with _ are optional inputs
+ _input_order = ['prefix', 'query_fasta', '_query_fasta_2']
+
+ _valid_arguments = {
+ # Make sure this is a float
+ '-c': is_float,
+ '-m': is_float,
+
+ # Make sure these are ints
+ '-a': is_int,
+ '-b': is_int,
+ '-q': is_int,
+ '-r': is_int,
+ '-t': is_int,
+ '-w': is_int,
+ '-T': is_int,
+ '-z': is_int,
+ '-s': is_int,
+ '-N': is_int,
+ '-I': is_int,
+
+ # make sure this is an absolute path
+ '-f': isabs
+ }
+
+ def _get_result_paths(self, data):
+ """Gets the result file for a bwa bwasw run.
+
+ There is only one output file of a bwa bwasw run, a .sam file,
+ and it can be retrieved with the key 'output'.
+ """
+ return {'output': ResultPath(self.Parameters['-f'].Value,
+ IsWritten=True)}
+
+
+def create_bwa_index_from_fasta_file(fasta_in, params=None):
+ """Create a BWA index from an input fasta file.
+
+ fasta_in: the input fasta file from which to create the index
+    params: dict of bwa index specific parameters
+
+ This method returns a dictionary where the keys are the various
+ output suffixes (.amb, .ann, .bwt, .pac, .sa) and the values
+ are open file objects.
+
+ The index prefix will be the same as fasta_in, unless the -p parameter
+ is passed in params.
+ """
+ if params is None:
+ params = {}
+
+ # Instantiate the app controller
+ index = BWA_index(params)
+
+ # call the application, passing the fasta file in
+ results = index({'fasta_in': fasta_in})
+ return results
+
+
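+# Example usage of create_bwa_index_from_fasta_file (a minimal sketch; the
+# path is hypothetical and bwa must be on the PATH):
+#
+#     index_files = create_bwa_index_from_fasta_file('/tmp/db.fna')
+#     bwt_fp = index_files['.bwt'].name  # path to the .bwt index file
+
+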
+def assign_reads_to_database(query, database_fasta, out_path, params=None):
+ """Assign a set of query sequences to a reference database
+
+    query: absolute file path to query sequences
+    database_fasta: absolute file path to the reference database
+    out_path: absolute file path of the file to be output
+ params: dict of BWA specific parameters.
+ * Specify which algorithm to use (bwa-short or bwasw) using the
+ dict key "algorithm"
+ * if algorithm is bwasw, specify params for the bwa bwasw
+ subcommand
+ * if algorithm is bwa-short, specify params for the bwa samse
+ subcommand
+ * if algorithm is bwa-short, must also specify params to use with
+ bwa aln, which is used to get the sai file necessary to run samse.
+ bwa aln params should be passed in using dict key "aln_params" and
+ the associated value should be a dict of params for the bwa aln
+ subcommand
+ * if a temporary directory is not specified in params using dict
+ key "temp_dir", it will be assumed to be /tmp
+
+ This method returns an open file object (SAM format).
+ """
+ if params is None:
+ params = {}
+
+ # set the output path
+ params['-f'] = out_path
+
+ # if the algorithm is not specified in the params dict, or the algorithm
+ # is not recognized, raise an exception
+ if 'algorithm' not in params:
+ raise InvalidArgumentApplicationError("Must specify which algorithm to"
+ " use ('bwa-short' or 'bwasw')")
+ elif params['algorithm'] not in ('bwa-short', 'bwasw'):
+ raise InvalidArgumentApplicationError("Unknown algorithm '%s' Please "
+ "enter either 'bwa-short' or "
+ "'bwasw'." % params['algorithm'])
+
+ # if the temp directory is not specified, assume /tmp
+ if 'temp_dir' not in params:
+ params['temp_dir'] = '/tmp'
+
+    # if the algorithm is bwa-short, we must first run bwa aln to get a .sai
+ # file before calling bwa samse on that sai file, so we need to know how
+ # to run bwa aln. Therefore, we must ensure there's an entry containing
+ # those parameters
+ if params['algorithm'] == 'bwa-short':
+ if 'aln_params' not in params:
+ raise InvalidArgumentApplicationError("With bwa-short, need to "
+ "specify a key 'aln_params' "
+ "and its value, a dictionary"
+ " to pass to bwa aln, since"
+ " bwa aln is an intermediate"
+ " step when doing "
+ "bwa-short.")
+
+    # the params dict contains keys such as "algorithm" and "temp_dir" that
+    # are not meant for any of the subcommands, so make a new params dict
+    # that is the same as the original minus these extra keys
+ subcommand_params = {}
+ for k, v in params.iteritems():
+ if k not in ('algorithm', 'temp_dir', 'aln_params'):
+ subcommand_params[k] = v
+
+ # build index from database_fasta
+ # get a temporary file name that is not in use
+ _, index_prefix = mkstemp(dir=params['temp_dir'], suffix='')
+
+ create_bwa_index_from_fasta_file(database_fasta, {'-p': index_prefix})
+
+ # if the algorithm is bwasw, things are pretty simple. Just instantiate
+ # the proper controller and set the files
+ if params['algorithm'] == 'bwasw':
+ bwa = BWA_bwasw(params=subcommand_params)
+ files = {'prefix': index_prefix, 'query_fasta': query}
+
+ # if the algorithm is bwa-short, it's not so simple
+ elif params['algorithm'] == 'bwa-short':
+ # we have to call bwa_aln to get the sai file needed for samse
+ # use the aln_params we ensured we had above
+ bwa_aln = BWA_aln(params=params['aln_params'])
+ aln_files = {'prefix': index_prefix, 'fastq_in': query}
+ # get the path to the sai file
+ sai_file_path = bwa_aln(aln_files)['output'].name
+
+ # we will use that sai file to run samse
+ bwa = BWA_samse(params=subcommand_params)
+ files = {'prefix': index_prefix, 'sai_in': sai_file_path,
+ 'fastq_in': query}
+
+    # run whichever app controller we decided was correct on the files
+ # we set up
+ result = bwa(files)
+
+ # they both return a SAM file, so return that
+ return result['output']
+
+
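+# Example usage of assign_reads_to_database (a minimal sketch; the paths are
+# hypothetical and bwa must be on the PATH). The bwa-short algorithm needs a
+# nested 'aln_params' dict for the intermediate bwa aln step:
+#
+#     sam = assign_reads_to_database(
+#         '/tmp/reads.fastq', '/tmp/db.fna', '/tmp/out.sam',
+#         params={'algorithm': 'bwa-short',
+#                 'aln_params': {'-f': '/tmp/reads.sai'}})
+
+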
+def assign_dna_reads_to_dna_database(query_fasta_fp, database_fasta_fp, out_fp,
+                                     params=None):
+ """Wraps assign_reads_to_database, setting various parameters.
+
+ The default settings are below, but may be overwritten and/or added to
+ using the params dict:
+
+ algorithm: bwasw
+ """
+    if params is None:
+        params = {}
+    my_params = {'algorithm': 'bwasw'}
+    my_params.update(params)
+
+ result = assign_reads_to_database(query_fasta_fp, database_fasta_fp,
+ out_fp, my_params)
+
+ return result
+
+
+def assign_dna_reads_to_protein_database(query_fasta_fp, database_fasta_fp,
+                                         out_fp, temp_dir='/tmp',
+                                         params=None):
+ """Wraps assign_reads_to_database, setting various parameters.
+
+ Not yet implemented, as BWA can only align DNA reads to DNA databases.
+ """
+ raise NotImplementedError("BWA cannot at this point align DNA to protein")
diff --git a/bfillings/cd_hit.py b/bfillings/cd_hit.py
new file mode 100644
index 0000000..943f511
--- /dev/null
+++ b/bfillings/cd_hit.py
@@ -0,0 +1,343 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+
+"""Application controller for CD-HIT v3.1.1"""
+
+import shutil
+from os import remove
+from tempfile import mkstemp, mkdtemp
+
+from skbio.parse.sequences import parse_fasta
+from burrito.parameters import ValuedParameter
+from burrito.util import CommandLineApplication, ResultPath
+
+from cogent.core.moltype import RNA, DNA, PROTEIN
+from cogent.core.alignment import SequenceCollection
+
+__author__ = "Daniel McDonald"
+__copyright__ = "Copyright 2007-2012, The Cogent Project"
+__credits__ = ["Daniel McDonald"]
+__license__ = "GPL"
+__version__ = "1.5.3-dev"
+__maintainer__ = "Daniel McDonald"
+__email__ = "mcdonadt at colorado.edu"
+__status__ = "Development"
+
+
+class CD_HIT(CommandLineApplication):
+ """cd-hit Application Controller
+
+ Use this version of CD-HIT if your MolType is PROTEIN
+ """
+
+ _command = 'cd-hit'
+ _input_handler = '_input_as_multiline_string'
+ _parameters = {
+ # input input filename in fasta format, required
+ '-i':ValuedParameter('-',Name='i',Delimiter=' ',IsPath=True),
+
+ # output filename, required
+ '-o':ValuedParameter('-',Name='o',Delimiter=' ',IsPath=True),
+
+ # sequence identity threshold, default 0.9
+        # this is cd-hit's default "global sequence identity", calculated as:
+ # number of identical amino acids in alignment
+ # divided by the full length of the shorter sequence
+ '-c':ValuedParameter('-',Name='c',Delimiter=' '),
+
+        # use global sequence identity, default 1
+        # if set to 0, then use local sequence identity, calculated as:
+        # number of identical amino acids in alignment
+        # divided by the length of the alignment
+        # NOTE!!! don't use -G 0 unless you use alignment coverage controls
+        # see options -aL, -AL, -aS, -AS
+        '-G':ValuedParameter('-',Name='G',Delimiter=' '),
+
+ # band_width of alignment, default 20
+ '-b':ValuedParameter('-',Name='b',Delimiter=' '),
+
+ # max available memory (Mbyte), default 400
+ '-M':ValuedParameter('-',Name='M',Delimiter=' '),
+
+ # word_length, default 8, see user's guide for choosing it
+ '-n':ValuedParameter('-',Name='n',Delimiter=' '),
+
+ # length of throw_away_sequences, default 10
+ '-l':ValuedParameter('-',Name='l',Delimiter=' '),
+
+ # tolerance for redundance, default 2
+ '-t':ValuedParameter('-',Name='t',Delimiter=' '),
+
+ # length of description in .clstr file, default 20
+ # if set to 0, it takes the fasta defline and stops at first space
+ '-d':ValuedParameter('-',Name='d',Delimiter=' '),
+
+ # length difference cutoff, default 0.0
+ # if set to 0.9, the shorter sequences need to be
+ # at least 90% length of the representative of the cluster
+ '-s':ValuedParameter('-',Name='s',Delimiter=' '),
+
+ # length difference cutoff in amino acid, default 999999
+        # if set to 60, the length difference between the shorter sequences
+        # and the representative of the cluster cannot be bigger than 60
+ '-S':ValuedParameter('-',Name='S',Delimiter=' '),
+
+ # alignment coverage for the longer sequence, default 0.0
+        # if set to 0.9, the alignment must cover 90% of the sequence
+ '-aL':ValuedParameter('-',Name='aL',Delimiter=' '),
+
+ # alignment coverage control for the longer sequence, default 99999999
+ # if set to 60, and the length of the sequence is 400,
+ # then the alignment must be >= 340 (400-60) residues
+ '-AL':ValuedParameter('-',Name='AL',Delimiter=' '),
+
+ # alignment coverage for the shorter sequence, default 0.0
+        # if set to 0.9, the alignment must cover 90% of the sequence
+ '-aS':ValuedParameter('-',Name='aS',Delimiter=' '),
+
+ # alignment coverage control for the shorter sequence, default 99999999
+ # if set to 60, and the length of the sequence is 400,
+ # then the alignment must be >= 340 (400-60) residues
+ '-AS':ValuedParameter('-',Name='AS',Delimiter=' '),
+
+ # 1 or 0, default 0, by default, sequences are stored in RAM
+ # if set to 1, sequence are stored on hard drive
+ # it is recommended to use -B 1 for huge databases
+ '-B':ValuedParameter('-',Name='B',Delimiter=' '),
+
+ # 1 or 0, default 0
+ # if set to 1, print alignment overlap in .clstr file
+ '-p':ValuedParameter('-',Name='p',Delimiter=' '),
+
+ # 1 or 0, default 0
+ # by cd-hit's default algorithm, a sequence is clustered to the first
+        # cluster that meets the threshold (fast mode). If set to 1, the
+        # program will cluster it into the most similar cluster that meets
+        # the threshold (accurate but slow mode)
+ # but either 1 or 0 won't change the representatives of final clusters
+ '-g':ValuedParameter('-',Name='g',Delimiter=' '),
+
+ # print this help
+ '-h':ValuedParameter('-',Name='h',Delimiter=' ')
+ }
+ _synonyms = {'Similarity':'-c'}
+
+ def getHelp(self):
+ """Method that points to documentation"""
+ help_str =\
+ """
+ CD-HIT is hosted as an open source project at:
+ http://www.bioinformatics.org/cd-hit/
+
+ The following papers should be cited if this resource is used:
+
+        "Clustering of highly homologous sequences to reduce the size of large
+        protein database", Weizhong Li, Lukasz Jaroszewski & Adam Godzik
+        Bioinformatics, (2001) 17:282-283
+
+        "Tolerating some redundancy significantly speeds up clustering of large
+        protein databases", Weizhong Li, Lukasz Jaroszewski & Adam Godzik
+ Bioinformatics, (2002) 18:77-82
+ """
+ return help_str
+
+ def _input_as_multiline_string(self, data):
+ """Writes data to tempfile and sets -i parameter
+
+ data -- list of lines
+ """
+ if data:
+ self.Parameters['-i']\
+ .on(super(CD_HIT,self)._input_as_multiline_string(data))
+ return ''
+
+ def _input_as_lines(self, data):
+ """Writes data to tempfile and sets -i parameter
+
+ data -- list of lines, ready to be written to file
+ """
+ if data:
+ self.Parameters['-i']\
+ .on(super(CD_HIT,self)._input_as_lines(data))
+ return ''
+
+ def _input_as_seqs(self, data):
+ """Creates a list of seqs to pass to _input_as_lines
+
+ data -- list like object of sequences
+ """
+ lines = []
+ for i,s in enumerate(data):
+ # will number the sequences 1,2,3, etc...
+ lines.append(''.join(['>',str(i+1)]))
+ lines.append(s)
+ return self._input_as_lines(lines)
+
+ def _input_as_string(self, data):
+ """Makes data the value of a specific parameter"""
+ if data:
+ self.Parameters['-i'].on(str(data))
+ return ''
+
+ def _get_seqs_outfile(self):
+ """Returns the absolute path to the seqs outfile"""
+ if self.Parameters['-o'].isOn():
+ return self.Parameters['-o'].Value
+ else:
+ raise ValueError, "No output file specified"
+
+ def _get_clstr_outfile(self):
+ """Returns the absolute path to the clstr outfile"""
+ if self.Parameters['-o'].isOn():
+ return ''.join([self.Parameters['-o'].Value, '.clstr'])
+ else:
+ raise ValueError, "No output file specified"
+
+ def _get_result_paths(self, data):
+ """Return dict of {key: ResultPath}"""
+ result = {}
+ result['FASTA'] = ResultPath(Path=self._get_seqs_outfile())
+ result['CLSTR'] = ResultPath(Path=self._get_clstr_outfile())
+ return result
+
+class CD_HIT_EST(CD_HIT):
+ """cd-hit Application Controller
+
+ Use this version of CD-HIT if your MolType is PROTEIN
+ """
+
+ _command = 'cd-hit-est'
+ _input_handler = '_input_as_multiline_string'
+    # copy so that updating here does not mutate CD_HIT._parameters
+    _parameters = CD_HIT._parameters.copy()
+ _parameters.update({\
+ # 1 or 0, default 0, by default only +/+ strand alignment
+ # if set to 1, do both +/+ & +/- alignments
+ '-r':ValuedParameter('-',Name='r',Delimiter=' ')
+ })
+
+def cdhit_clusters_from_seqs(seqs, moltype=DNA, params=None):
+ """Returns the CD-HIT clusters given seqs
+
+ seqs : dict like collection of sequences
+ moltype : cogent.core.moltype object
+ params : cd-hit parameters
+
+    NOTE: This method will call CD_HIT if moltype is PROTEIN,
+ CD_HIT_EST if moltype is RNA/DNA, and raise if any other
+ moltype is passed.
+ """
+    # ids are remapped to short integer-based ids for cd-hit and restored
+    # from int_keys after clustering
+ seqs = SequenceCollection(seqs, MolType=moltype)
+ #Create mapping between abbreviated IDs and full IDs
+ int_map, int_keys = seqs.getIntMap()
+ #Create SequenceCollection from int_map.
+ int_map = SequenceCollection(int_map,MolType=moltype)
+
+ # setup params and make sure the output argument is set
+ if params is None:
+ params = {}
+ if '-o' not in params:
+ _, params['-o'] = mkstemp()
+
+    # call the correct version of cd-hit based on moltype
+ working_dir = mkdtemp()
+ if moltype is PROTEIN:
+ app = CD_HIT(WorkingDir=working_dir, params=params)
+ elif moltype is RNA:
+ app = CD_HIT_EST(WorkingDir=working_dir, params=params)
+ elif moltype is DNA:
+ app = CD_HIT_EST(WorkingDir=working_dir, params=params)
+ else:
+ raise ValueError, "Moltype must be either PROTEIN, RNA, or DNA"
+
+ # grab result
+ res = app(int_map.toFasta())
+ clusters = parse_cdhit_clstr_file(res['CLSTR'])
+
+ remapped_clusters = []
+ for c in clusters:
+ curr = [int_keys[i] for i in c]
+ remapped_clusters.append(curr)
+
+ # perform cleanup
+ res.cleanUp()
+ shutil.rmtree(working_dir)
+ remove(params['-o'] + '.bak.clstr')
+
+ return remapped_clusters
+
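+# Example usage of cdhit_clusters_from_seqs (a minimal sketch; the sequences
+# are hypothetical and cd-hit must be on the PATH):
+#
+#     seqs = {'s1': 'ACGGCTAGCTAGGCTAGCTACGTA',
+#             's2': 'ACGGCTAGCTAGGCTAGCTACGTT',
+#             's3': 'TTGGACCAGTTGACCATTGGACCA'}
+#     clusters = cdhit_clusters_from_seqs(seqs, moltype=DNA)
+#     # clusters is a list of lists of the original sequence ids
+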
+def cdhit_from_seqs(seqs, moltype, params=None):
+ """Returns the CD-HIT results given seqs
+
+ seqs : dict like collection of sequences
+ moltype : cogent.core.moltype object
+ params : cd-hit parameters
+
+    NOTE: This method will call CD_HIT if moltype is PROTEIN,
+ CD_HIT_EST if moltype is RNA/DNA, and raise if any other
+ moltype is passed.
+ """
+ # keys are not remapped. Tested against seq_ids of 100char length
+ seqs = SequenceCollection(seqs, MolType=moltype)
+
+ # setup params and make sure the output argument is set
+ if params is None:
+ params = {}
+ if '-o' not in params:
+ _, params['-o'] = mkstemp()
+
+    # call the correct version of cd-hit based on moltype
+ working_dir = mkdtemp()
+ if moltype is PROTEIN:
+ app = CD_HIT(WorkingDir=working_dir, params=params)
+ elif moltype is RNA:
+ app = CD_HIT_EST(WorkingDir=working_dir, params=params)
+ elif moltype is DNA:
+ app = CD_HIT_EST(WorkingDir=working_dir, params=params)
+ else:
+ raise ValueError, "Moltype must be either PROTEIN, RNA, or DNA"
+
+ # grab result
+ res = app(seqs.toFasta())
+ new_seqs = dict(parse_fasta(res['FASTA']))
+
+ # perform cleanup
+ res.cleanUp()
+ shutil.rmtree(working_dir)
+ remove(params['-o'] + '.bak.clstr')
+
+ return SequenceCollection(new_seqs, MolType=moltype)
+
+def clean_cluster_seq_id(id):
+ """Returns a cleaned cd-hit sequence id
+
+ The cluster file has sequence ids in the form of:
+ >some_id...
+ """
+ return id[1:-3]
+
+def parse_cdhit_clstr_file(lines):
+ """Returns a list of list of sequence ids representing clusters"""
+ clusters = []
+ curr_cluster = []
+
+ for l in lines:
+ if l.startswith('>Cluster'):
+ if not curr_cluster:
+ continue
+ clusters.append(curr_cluster)
+ curr_cluster = []
+ else:
+ curr_cluster.append(clean_cluster_seq_id(l.split()[2]))
+
+ if curr_cluster:
+ clusters.append(curr_cluster)
+
+ return clusters
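+
+
+# Example of the .clstr format consumed by parse_cdhit_clstr_file (a minimal
+# sketch; the ids are hypothetical):
+#
+#     >Cluster 0
+#     0   24nt, >s1... *
+#     1   24nt, >s2... at +/95.83%
+#     >Cluster 1
+#     0   24nt, >s3... *
+#
+# Parsing these lines yields [['s1', 's2'], ['s3']].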
diff --git a/bfillings/clearcut.py b/bfillings/clearcut.py
new file mode 100644
index 0000000..5f53a90
--- /dev/null
+++ b/bfillings/clearcut.py
@@ -0,0 +1,401 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+
+"""Provides an application controller for the commandline version of:
+Clearcut v1.0.8
+"""
+from burrito.parameters import (FlagParameter, ValuedParameter,
+ MixedParameter)
+from burrito.util import (CommandLineApplication, ResultPath,
+ get_tmp_filename)
+
+from cogent.core.alignment import SequenceCollection, Alignment
+from cogent.core.moltype import DNA, RNA, PROTEIN
+from cogent.parse.tree import DndParser
+from cogent.core.tree import PhyloNode
+from cogent.util.dict2d import Dict2D
+from cogent.format.table import phylipMatrix
+
+
+MOLTYPE_MAP = {'DNA':'-D',
+ 'RNA':'-D',
+ 'PROTEIN':'-P',
+ }
+
+
+class Clearcut(CommandLineApplication):
+ """ clearcut application controller
+
+ The parameters are organized by function to give some idea of how the
+ program works. However, no restrictions are put on any combinations
+ of parameters. Misuse of parameters can lead to errors or otherwise
+ strange results.
+ """
+ #General options.
+ _general = {\
+ # --verbose. More Output. (Default:OFF)
+ '-v':FlagParameter('-',Name='v'),
+ # --quiet. Silent operation. (Default: ON)
+ '-q':FlagParameter('-',Name='q',Value=True),
+ # --seed=<seed>. Explicitly set the PRNG seed to a specific value.
+ '-s':ValuedParameter('-',Name='s',Delimiter='='),
+ # --norandom. Attempt joins deterministically. (Default: OFF)
+ '-r':FlagParameter('-',Name='r'),
+ # --shuffle. Randomly shuffle the distance matrix. (Default: OFF)
+ '-S':FlagParameter('-',Name='S'),
+ #--neighbor. Use traditional Neighbor-Joining algorithm. (Default: OFF)
+ '-N':FlagParameter('-',Name='N'),
+
+ }
+
+
+ # Input file is distance matrix or alignment. Default expects distance
+ # matrix. Output file is tree created by clearcut.
+ _input = {\
+ # --in=<infilename>. Input file
+ '--in':ValuedParameter('--',Name='in',Delimiter='=',IsPath=True),
+ # --stdin. Read input from STDIN.
+ '-I':FlagParameter('-',Name='I'),
+ # --distance. Input file is a distance matrix. (Default: ON)
+ '-d':FlagParameter('-',Name='d',Value=True),
+ # --alignment. Input file is a set of aligned sequences.
+ # (Default: OFF)
+ '-a':FlagParameter('-',Name='a'),
+ # --DNA. Input alignment are DNA sequences.
+ '-D':FlagParameter('-',Name='D'),
+ # --protein. Input alignment are protein sequences.
+ '-P':FlagParameter('-',Name='P'),
+ }
+
+
+ #Correction model for computing distance matrix (Default: NO Correction):
+ _correction={\
+ # --jukes. Use Jukes-Cantor correction for computing distance matrix.
+ '-j':FlagParameter('-',Name='j'),
+ # --kimura. Use Kimura correction for distance matrix.
+ '-k':FlagParameter('-',Name='k'),
+
+ }
+
+ _output={\
+ # --out=<outfilename>. Output file
+ '--out':ValuedParameter('--',Name='out',Delimiter='=',IsPath=True),
+ # --stdout. Output tree to STDOUT.
+ '-O':FlagParameter('-',Name='O'),
+ # --matrixout=<file> Output distance matrix to specified file.
+ '-m':ValuedParameter('-',Name='m',Delimiter='='),
+ # --ntrees=<n>. Output n trees. (Default: 1)
+ '-n':ValuedParameter('-',Name='n',Delimiter='='),
+ # --expblen. Exponential notation for branch lengths. (Default: OFF)
+ '-e':FlagParameter('-',Name='e'),
+ # --expdist. Exponential notation in distance output. (Default: OFF)
+ '-E':FlagParameter('-',Name='E'),
+
+ }
+
+
+ #NOT SUPPORTED
+ #'-h':FlagParameter('-','h'), #Help
+ #'-V':FlagParameter('-','V'), #Version
+
+
+ _parameters = {}
+ _parameters.update(_general)
+ _parameters.update(_input)
+ _parameters.update(_correction)
+ _parameters.update(_output)
+
+ _command = 'clearcut'
+
+ def getHelp(self):
+ """Method that points to the Clearcut documentation."""
+ help_str =\
+ """
+ See Clearcut homepage at:
+ http://bioinformatics.hungry.com/clearcut/
+ """
+ return help_str
+
+ def _input_as_multiline_string(self, data):
+ """Writes data to tempfile and sets -infile parameter
+
+ data -- list of lines
+ """
+ if data:
+ self.Parameters['--in']\
+ .on(super(Clearcut,self)._input_as_multiline_string(data))
+ return ''
+
+ def _input_as_lines(self,data):
+ """Writes data to tempfile and sets -infile parameter
+
+ data -- list of lines, ready to be written to file
+ """
+ if data:
+ self.Parameters['--in']\
+ .on(super(Clearcut,self)._input_as_lines(data))
+ return ''
+
+ def _input_as_seqs(self,data):
+ """writes sequences to tempfile and sets -infile parameter
+
+ data -- list of sequences
+
+ Adds numbering to the sequences: >1, >2, etc.
+ """
+ lines = []
+ for i,s in enumerate(data):
+ #will number the sequences 1,2,3,etc.
+ lines.append(''.join(['>',str(i+1)]))
+ lines.append(s)
+ return self._input_as_lines(lines)
+
+ def _input_as_string(self,data):
+ """Makes data the value of a specific parameter
+
+ This method returns the empty string. The parameter will be printed
+ automatically once set.
+ """
+ if data:
+ self.Parameters['--in'].on(data)
+ return ''
+
+    def _tree_filename(self):
+        """Return the name of the tree output file
+        """
+        if self.Parameters['--out'].isOn():
+            tree_filename = self._absolute(self.Parameters['--out'].Value)
+        else:
+            raise ValueError("No tree output file specified.")
+        return tree_filename
+
+ def _get_result_paths(self,data):
+ """Return dict of {key: ResultPath}
+ """
+ result = {}
+ if self.Parameters['--out'].isOn():
+ out_name = self._tree_filename()
+ result['Tree'] = ResultPath(Path=out_name,IsWritten=True)
+ return result
+
+
+
+#SOME FUNCTIONS TO EXECUTE THE MOST COMMON TASKS
+
+
+def align_unaligned_seqs(seqs, moltype=DNA, params=None):
+ """Returns an Alignment object from seqs.
+
+ seqs: SequenceCollection object, or data that can be used to build one.
+
+ moltype: a MolType object. DNA, RNA, or PROTEIN.
+
+ params: dict of parameters to pass in to the Clearcut app controller.
+
+ Result will be an Alignment object.
+ """
+ #Clearcut does not support alignment
+    raise NotImplementedError("Clearcut does not support alignment.")
+
+def align_and_build_tree(seqs, moltype, best_tree=False, params=None):
+ """Returns an alignment and a tree from Sequences object seqs.
+
+ seqs: SequenceCollection object, or data that can be used to build one.
+
+ best_tree: if True (default:False), uses a slower but more accurate
+ algorithm to build the tree.
+
+ params: dict of parameters to pass in to the Clearcut app controller.
+
+ The result will be a tuple containing an Alignment object and a
+ cogent.core.tree.PhyloNode object (or None for the alignment and/or tree
+ if either fails).
+ """
+ #Clearcut does not support alignment
+    raise NotImplementedError("Clearcut does not support alignment.")
+
+def build_tree_from_alignment(aln, moltype=DNA, best_tree=False, params=None,
+                              working_dir='/tmp'):
+ """Returns a tree from Alignment object aln.
+
+    aln: a cogent.core.alignment.Alignment object, or data that can be used
+ to build one.
+ - Clearcut only accepts aligned sequences. Alignment object used to
+ handle unaligned sequences.
+
+ moltype: a cogent.core.moltype object.
+ - NOTE: If moltype = RNA, we must convert to DNA since Clearcut v1.0.8
+ gives incorrect results if RNA is passed in. 'U' is treated as an
+ incorrect character and is excluded from distance calculations.
+
+ best_tree: if True (default:False), uses a slower but more accurate
+ algorithm to build the tree.
+
+ params: dict of parameters to pass in to the Clearcut app controller.
+
+    The result will be a cogent.core.tree.PhyloNode object, or None if tree
+ fails.
+ """
+    if params is None:
+        params = {}
+    params['--out'] = get_tmp_filename(working_dir)
+
+ # Create instance of app controller, enable tree, disable alignment
+ app = Clearcut(InputHandler='_input_as_multiline_string', params=params, \
+ WorkingDir=working_dir, SuppressStdout=True,\
+ SuppressStderr=True)
+ #Input is an alignment
+ app.Parameters['-a'].on()
+ #Turn off input as distance matrix
+ app.Parameters['-d'].off()
+
+ #If moltype = RNA, we must convert to DNA.
+ if moltype == RNA:
+ moltype = DNA
+
+ if best_tree:
+ app.Parameters['-N'].on()
+
+ #Turn on correct moltype
+ moltype_string = moltype.label.upper()
+ app.Parameters[MOLTYPE_MAP[moltype_string]].on()
+
+ # Setup mapping. Clearcut clips identifiers. We will need to remap them.
+ # Clearcut only accepts aligned sequences. Let Alignment object handle
+ # unaligned sequences.
+ seq_aln = Alignment(aln,MolType=moltype)
+ #get int mapping
+ int_map, int_keys = seq_aln.getIntMap()
+ #create new Alignment object with int_map
+ int_map = Alignment(int_map)
+
+ # Collect result
+ result = app(int_map.toFasta())
+
+ # Build tree
+ tree = DndParser(result['Tree'].read(), constructor=PhyloNode)
+ for node in tree.tips():
+ node.Name = int_keys[node.Name]
+
+ # Clean up
+ result.cleanUp()
+ del(seq_aln, app, result, int_map, int_keys, params)
+
+ return tree
+
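+# Example usage of build_tree_from_alignment (a minimal sketch; the alignment
+# is hypothetical and clearcut must be on the PATH):
+#
+#     aln = {'a': 'ACGT-ACGT', 'b': 'ACGT-ACGA', 'c': 'ACGTTACGA'}
+#     tree = build_tree_from_alignment(aln, moltype=DNA)
+#     newick = tree.getNewick(with_distances=True)
+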
+def add_seqs_to_alignment(seqs, aln, params=None):
+ """Returns an Alignment object from seqs and existing Alignment.
+
+    seqs: a cogent.core.sequence.Sequence object, or data that can be used
+ to build one.
+
+    aln: a cogent.core.alignment.Alignment object, or data that can be used
+ to build one
+
+ params: dict of parameters to pass in to the Clearcut app controller.
+ """
+ #Clearcut does not support alignment
+    raise NotImplementedError("Clearcut does not support alignment.")
+
+def align_two_alignments(aln1, aln2, params=None):
+ """Returns an Alignment object from two existing Alignments.
+
+ aln1, aln2: cogent.core.alignment.Alignment objects, or data that can be
+ used to build them.
+
+ params: dict of parameters to pass in to the Clearcut app controller.
+ """
+ #Clearcut does not support alignment
+    raise NotImplementedError("Clearcut does not support alignment.")
+
+
+def build_tree_from_distance_matrix(matrix, best_tree=False, params=None,
+ working_dir='/tmp'):
+ """Returns a tree from a distance matrix.
+
+ matrix: a square Dict2D object (cogent.util.dict2d)
+
+ best_tree: if True (default:False), uses a slower but more accurate
+ algorithm to build the tree.
+
+ params: dict of parameters to pass in to the Clearcut app controller.
+
+    The result will be a cogent.core.tree.PhyloNode object, or None if tree
+ fails.
+ """
+    if params is None:
+        params = {}
+    params['--out'] = get_tmp_filename(working_dir)
+
+ # Create instance of app controller, enable tree, disable alignment
+ app = Clearcut(InputHandler='_input_as_multiline_string', params=params, \
+ WorkingDir=working_dir, SuppressStdout=True,\
+ SuppressStderr=True)
+ #Turn off input as alignment
+ app.Parameters['-a'].off()
+ #Input is a distance matrix
+ app.Parameters['-d'].on()
+
+ if best_tree:
+ app.Parameters['-N'].on()
+
+ # Turn the dict2d object into the expected input format
+ matrix_input, int_keys = _matrix_input_from_dict2d(matrix)
+
+ # Collect result
+ result = app(matrix_input)
+
+ # Build tree
+ tree = DndParser(result['Tree'].read(), constructor=PhyloNode)
+
+ # reassign to original names
+ for node in tree.tips():
+ node.Name = int_keys[node.Name]
+
+ # Clean up
+ result.cleanUp()
+ del(app, result, params)
+
+ return tree
+
+def _matrix_input_from_dict2d(matrix):
+ """makes input for running clearcut on a matrix from a dict2D object"""
+ #clearcut truncates names to 10 char- need to rename before and
+ #reassign after
+
+ #make a dict of env_index:full name
+ int_keys = dict([('env_' + str(i), k) for i,k in \
+ enumerate(sorted(matrix.keys()))])
+ #invert the dict
+ int_map = {}
+ for i in int_keys:
+ int_map[int_keys[i]] = i
+
+ #make a new dict2D object with the integer keys mapped to values instead of
+ #the original names
+ new_dists = []
+ for env1 in matrix:
+ for env2 in matrix[env1]:
+ new_dists.append((int_map[env1], int_map[env2], matrix[env1][env2]))
+ int_map_dists = Dict2D(new_dists)
+
+    #names will be fed into the phylipMatrix function - it is the int map names
+ names = sorted(int_map_dists.keys())
+ rows = []
+    #populate rows with values based on the order of names
+    #the following code will work for a square matrix only
+    for key1 in names:
+ row = []
+ for key2 in names:
+ row.append(str(int_map_dists[key1][key2]))
+ rows.append(row)
+ input_matrix = phylipMatrix(rows, names)
+    #input needs a trailing newline or it will fail!
+ input_matrix += '\n'
+
+ return input_matrix, int_keys
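+
+
+# Example of the renaming performed by _matrix_input_from_dict2d (a minimal
+# sketch; the matrix is hypothetical):
+#
+#     matrix = Dict2D({'seqA': {'seqA': 0.0, 'seqB': 0.1},
+#                      'seqB': {'seqA': 0.1, 'seqB': 0.0}})
+#     phylip_str, int_keys = _matrix_input_from_dict2d(matrix)
+#     # int_keys maps 'env_0' -> 'seqA' and 'env_1' -> 'seqB'; phylip_str is
+#     # a PHYLIP-format distance matrix using the env_* names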
diff --git a/bfillings/clustalw.py b/bfillings/clustalw.py
new file mode 100644
index 0000000..b195e15
--- /dev/null
+++ b/bfillings/clustalw.py
@@ -0,0 +1,724 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+
+"""Provides an application controller for the commandline version of:
+CLUSTALW v1.83
+"""
+from numpy.random import randint
+from burrito.parameters import (FlagParameter, ValuedParameter,
+ MixedParameter, FilePath)
+from burrito.util import CommandLineApplication, ResultPath, remove
+
+from cogent.core.alignment import SequenceCollection, Alignment
+from cogent.parse.tree import DndParser
+from cogent.parse.clustal import ClustalParser
+from cogent.core.tree import PhyloNode
+from cogent.core.moltype import RNA, DNA, PROTEIN
+
+
+class Clustalw(CommandLineApplication):
+ """ clustalw application controller
+
+ The parameters are organized by function to give some idea of how the
+ program works. However, no restrictions are put on any combinations
+ of parameters. Misuse of parameters can lead to errors or otherwise
+ strange results.
+
+    You are supposed to choose one action for the program to perform (align,
+    profile, sequences, tree, or bootstrap). If you choose multiple, only the
+    dominant action (see order above) will be executed. By DEFAULT, the -align
+    parameter is turned on. If you decide to turn another one on, you should
+    also explicitly turn '-align' off.
+
+ Some references to help pages are available in the 'getHelp' method.
+ Some might be useful to you.
+ """
+ _actions = {\
+ '-align':FlagParameter('-','align',Value=True),
+ '-profile':FlagParameter('-','profile'),
+ '-sequences':FlagParameter('-','sequences'),
+ '-tree':FlagParameter('-','tree'),
+ '-bootstrap':MixedParameter('-','bootstrap',Delimiter='=')}
+
+ #sequence file for alignment, or alignment file for bootstrap and tree
+ #actions
+ _input = {'-infile':ValuedParameter('-','infile',Delimiter='=',IsPath=True)}
+
+ # matrix and dnamatrix can be filenames as well, but not always.
+ # They won't be treated as filenames and thus not quoted.
+ # Therefore filepaths containing spaces might result in errors.
+ _multiple_alignment={\
+ '-quicktree':FlagParameter('-','quicktree'),
+ '-type':ValuedParameter('-','type',Delimiter='='),
+ '-matrix':ValuedParameter('-','matrix',Delimiter='='),
+ '-dnamatrix':ValuedParameter('-','dnamatrix',Delimiter='='),
+ '-gapopen':ValuedParameter('-','gapopen',Delimiter='='),
+ '-gapext':ValuedParameter('-','gapext',Delimiter='='),
+ '-endgaps':FlagParameter('-','endgaps'),
+ '-gapdist':ValuedParameter('-',Name='gapdist',Delimiter='='),
+ '-nopgap':FlagParameter('-','nopgap'),
+ '-nohgap':FlagParameter('-','nohgap'),
+ '-hgapresidues':ValuedParameter('-','hgapresidues',Delimiter='='),
+ '-maxdiv':ValuedParameter('-',Name='maxdiv',Delimiter='='),
+ '-negative':FlagParameter('-','negative'),
+ '-transweight':ValuedParameter('-',Name='transweight',Delimiter='='),
+ '-newtree':ValuedParameter('-','newtree',Delimiter='=',IsPath=True),
+ '-usetree':ValuedParameter('-','usetree',Delimiter='=',IsPath=True)}
+
+ _fast_pairwise={\
+ '-ktuple':ValuedParameter('-',Name='ktuple',Delimiter='='),
+ '-topdiags':ValuedParameter('-',Name='topdiags',Delimiter='='),
+ '-window':ValuedParameter('-',Name='window',Delimiter='='),
+ '-pairgap':ValuedParameter('-',Name='pairgap',Delimiter='='),
+ '-score':ValuedParameter('-',Name='score',Delimiter='=')}
+
+ # pwmatrix and pwdnamatrix can be filenames as well, but not always.
+ # They won't be treated as filenames and thus not quoted.
+ # Therefore filepaths containing spaces might result in errors.
+ _slow_pairwise={\
+ '-pwmatrix':ValuedParameter('-',Name='pwmatrix',Delimiter='='),
+ '-pwdnamatrix':ValuedParameter('-',Name='pwdnamatrix',Delimiter='='),
+ '-pwgapopen':ValuedParameter('-',Name='pwgapopen',Delimiter='='),
+ '-pwgapext':ValuedParameter('-',Name='pwgapext',Delimiter='=')}
+
+ #plus -bootstrap
+ _tree={\
+ '-kimura':FlagParameter('-',Name='kimura'),
+ '-tossgaps':FlagParameter('-',Name='tossgaps'),
+ '-bootlabels':ValuedParameter('-',Name='bootlabels',Delimiter='='),
+ '-seed':ValuedParameter('-',Name='seed',Delimiter='='),
+ '-outputtree':ValuedParameter('-',Name='outputtree',Delimiter='=')}
+
+ _output={\
+ '-outfile':ValuedParameter('-',Name='outfile',Delimiter='=',\
+ IsPath=True),
+ '-output':ValuedParameter('-',Name='output',Delimiter='='),
+ '-case':ValuedParameter('-',Name='case',Delimiter='='),
+ '-outorder':ValuedParameter('-',Name='outorder',Delimiter='='),
+ '-seqnos':ValuedParameter('-',Name='seqnos',Delimiter='=')}
+
+ _profile_alignment={\
+ '-profile1':ValuedParameter('-','profile1',Delimiter='=',IsPath=True),
+ '-profile2':ValuedParameter('-','profile2',Delimiter='=',IsPath=True),
+ '-usetree1':ValuedParameter('-','usetree1',Delimiter='=',IsPath=True),
+ '-usetree2':ValuedParameter('-','usetree2',Delimiter='=',IsPath=True),
+ '-newtree1':ValuedParameter('-','newtree1',Delimiter='=',IsPath=True),
+ '-newtree2':ValuedParameter('-','newtree2',Delimiter='=',IsPath=True)}
+
+ _structure_alignment={\
+ '-nosecstr1':FlagParameter('-',Name='nosecstr1'),
+ '-nosecstr2':FlagParameter('-',Name='nosecstr2'),
+ '-helixgap':ValuedParameter('-',Name='helixgap',Delimiter='='),
+ '-strandgap':ValuedParameter('-',Name='strandgap',Delimiter='='),
+ '-loopgap':ValuedParameter('-',Name='loopgap',Delimiter='='),
+ '-terminalgap':ValuedParameter('-',Name='terminalgap',Delimiter='='),
+ '-helixendin':ValuedParameter('-',Name='helixendin',Delimiter='='),
+ '-helixendout':ValuedParameter('-',Name='helixendout',Delimiter='='),
+ '-strandendin':ValuedParameter('-',Name='strandendin',Delimiter='='),
+ '-strandendout':ValuedParameter('-',Name='strandendout',Delimiter='='),
+ '-secstrout':ValuedParameter('-',Name='secstrout',Delimiter='=')}
+
+ #NOT SUPPORTED
+ #'-help':FlagParameter('-','help'),
+ #'-check':FlagParameter('-','check'),
+ #'-options':FlagParameter('-','options'),
+ #'-convert':FlagParameter('-','convert'),
+ #'-batch':FlagParameter('-','batch'),
+ #'-noweights':FlagParameter('-','noweights'),
+ #'-novgap':FlagParameter('-','novgap'),
+ #'-debug':ValuedParameter('-',Name='debug',Delimiter='='),
+
+ _parameters = {}
+ _parameters.update(_actions)
+ _parameters.update(_input)
+ _parameters.update(_multiple_alignment)
+ _parameters.update(_fast_pairwise)
+ _parameters.update(_slow_pairwise)
+ _parameters.update(_tree)
+ _parameters.update(_output)
+ _parameters.update(_profile_alignment)
+ _parameters.update(_structure_alignment)
+
+ _command = 'clustalw'
+
+ def getHelp(self):
+ """Method that points to the documentation"""
+ help_str =\
+ """
+ There are several help pages available online. For example:
+ http://searchlauncher.bcm.tmc.edu/multi-align/Help/
+ clustalw_help_1.8.html
+ http://hypernig.nig.ac.jp/homology/clustalw-e_help.html
+ http://www.genebee.msu.su/clustal/help.html
+
+ A page that gives reasonable insight into the use of the parameters:
+ http://bioweb.pasteur.fr/seqanal/interfaces/clustalw.html
+ """
+ return help_str
+
+ def _input_as_multiline_string(self, data):
+ """Writes data to tempfile and sets -infile parameter
+
+ data -- a multiline string
+ """
+ if data:
+ self.Parameters['-infile']\
+ .on(super(Clustalw,self)._input_as_multiline_string(data))
+ return ''
+
+ def _input_as_lines(self,data):
+ """Writes data to tempfile and sets -infile parameter
+
+ data -- list of lines, ready to be written to file
+ """
+ if data:
+ self.Parameters['-infile']\
+ .on(super(Clustalw,self)._input_as_lines(data))
+ return ''
+
+ def _input_as_seqs(self,data):
+ """writes sequences to tempfile and sets -infile parameter
+
+ data -- list of sequences
+
+ Adds numbering to the sequences: >1, >2, etc.
+ """
+ lines = []
+ for i,s in enumerate(data):
+ #will number the sequences 1,2,3,etc.
+ lines.append(''.join(['>',str(i+1)]))
+ lines.append(s)
+ return self._input_as_lines(lines)
+
+ def _input_as_string(self,data):
+ """Makes data the value of a specific parameter
+
+ This method returns the empty string. The parameter will be printed
+ automatically once set.
+ """
+ if data:
+ self.Parameters['-infile'].on(data)
+ return ''
+
+ def _suffix(self):
+ """Return appropriate suffix for alignment file"""
+ _output_formats={'GCG':'.msf',
+ 'GDE':'.gde',
+ 'PHYLIP':'.phy',
+ 'PIR':'.pir',
+ 'NEXUS':'.nxs'}
+
+ if self.Parameters['-output'].isOn():
+ return _output_formats[self.Parameters['-output'].Value]
+ else:
+ return '.aln'
+
+ def _aln_filename(self,prefix):
+ """Return name of file containing the alignment
+
+ prefix -- str, prefix of alignment file.
+ """
+ if self.Parameters['-outfile'].isOn():
+ aln_filename = self._absolute(self.Parameters['-outfile'].Value)
+ else:
+ aln_filename = prefix + self._suffix()
+ return aln_filename
+
+ def _tempfile_as_multiline_string(self, data):
+ """Write a multiline string to a temp file and return the filename.
+
+ data: a multiline string to be written to a file.
+
+ * Note: the result will be the filename as a FilePath object
+ (which is a string subclass).
+
+ """
+ filename = FilePath(self.getTmpFilename(self.TmpDir))
+ data_file = open(filename,'w')
+ data_file.write(data)
+ data_file.close()
+ return filename
+
+ def _get_result_paths(self,data):
+ """Return dict of {key: ResultPath}
+ """
+
+ #clustalw uses .aln when no or an unknown output type is specified
+ _treeinfo_formats = {'nj':'.nj',
+ 'dist':'.dst',
+ 'nexus':'.tre'}
+
+ result = {}
+ par = self.Parameters
+ abs = self._absolute
+
+ if par['-align'].isOn():
+ prefix = par['-infile'].Value.rsplit('.', 1)[0]
+ #prefix = par['-infile'].Value.split('.')[0]
+ aln_filename = self._aln_filename(prefix)
+ if par['-newtree'].isOn():
+ dnd_filename = abs(par['-newtree'].Value)
+ elif par['-usetree'].isOn():
+ dnd_filename = abs(par['-usetree'].Value)
+ else:
+ dnd_filename = abs(prefix + '.dnd')
+ result['Align'] = ResultPath(Path=aln_filename,IsWritten=True)
+ result['Dendro'] = ResultPath(Path=dnd_filename,IsWritten=True)
+ elif par['-profile'].isOn():
+ prefix1 = par['-profile1'].Value.rsplit('.', 1)[0]
+ prefix2 = par['-profile2'].Value.rsplit('.', 1)[0]
+ #prefix1 = par['-profile1'].Value.split('.')[0]
+ #prefix2 = par['-profile2'].Value.split('.')[0]
+ aln_filename = ''; aln_written = True
+ dnd1_filename = ''; tree1_written = True
+ dnd2_filename = ''; tree2_written = True
+ aln_filename = self._aln_filename(prefix1)
+ #usetree1
+ if par['-usetree1'].isOn():
+ tree1_written = False
+ #usetree2
+ if par['-usetree2'].isOn():
+ tree2_written = False
+ if par['-newtree1'].isOn():
+ dnd1_filename = abs(par['-newtree1'].Value)
+ aln_written=False
+ else:
+ dnd1_filename = abs(prefix1 + '.dnd')
+ if par['-newtree2'].isOn():
+ dnd2_filename = abs(par['-newtree2'].Value)
+ aln_written=False
+ else:
+ dnd2_filename = abs(prefix2 + '.dnd')
+ result['Align'] = ResultPath(Path=aln_filename,
+ IsWritten=aln_written)
+ result['Dendro1'] = ResultPath(Path=dnd1_filename,
+ IsWritten=tree1_written)
+ result['Dendro2'] = ResultPath(Path=dnd2_filename,
+ IsWritten=tree2_written)
+ elif par['-sequences'].isOn():
+ prefix1 = par['-profile1'].Value.rsplit('.', 1)[0]
+ prefix2 = par['-profile2'].Value.rsplit('.', 1)[0]
+ #prefix1 = par['-profile1'].Value.split('.')[0] #alignment
+ #prefix2 = par['-profile2'].Value.split('.')[0] #sequences
+ aln_filename = ''; aln_written = True
+ dnd_filename = ''; dnd_written = True
+
+ aln_filename = self._aln_filename(prefix2)
+ if par['-usetree'].isOn():
+ dnd_written = False
+ elif par['-newtree'].isOn():
+ aln_written = False
+ dnd_filename = abs(par['-newtree'].Value)
+ else:
+ dnd_filename = prefix2 + '.dnd'
+ result['Align'] = ResultPath(Path=aln_filename,\
+ IsWritten=aln_written)
+ result['Dendro'] = ResultPath(Path=dnd_filename,\
+ IsWritten=dnd_written)
+ elif par['-tree'].isOn():
+ prefix = par['-infile'].Value.rsplit('.', 1)[0]
+ #prefix = par['-infile'].Value.split('.')[0]
+ tree_filename = ''; tree_written = True
+ treeinfo_filename = ''; treeinfo_written = False
+ tree_filename = prefix + '.ph'
+ if par['-outputtree'].isOn() and\
+ par['-outputtree'].Value != 'phylip':
+ treeinfo_filename = prefix +\
+ _treeinfo_formats[par['-outputtree'].Value]
+ treeinfo_written = True
+ result['Tree'] = ResultPath(Path=tree_filename,\
+ IsWritten=tree_written)
+ result['TreeInfo'] = ResultPath(Path=treeinfo_filename,\
+ IsWritten=treeinfo_written)
+
+ elif par['-bootstrap'].isOn():
+ prefix = par['-infile'].Value.rsplit('.', 1)[0]
+ #prefix = par['-infile'].Value.split('.')[0]
+ boottree_filename = prefix + '.phb'
+ result['Tree'] = ResultPath(Path=boottree_filename,IsWritten=True)
+
+ return result
+
+
+#SOME FUNCTIONS TO EXECUTE THE MOST COMMON TASKS
+def alignUnalignedSeqs(seqs,add_seq_names=True,WorkingDir=None,\
+ SuppressStderr=None,SuppressStdout=None):
+ """Aligns unaligned sequences
+
+ seqs: either list of sequence objects or list of strings
+ add_seq_names: boolean. if True, sequence names are inserted in the list
+ of sequences. if False, it assumes seqs is a list of lines of some
+ proper format that the program can handle
+ """
+ if add_seq_names:
+ app = Clustalw(InputHandler='_input_as_seqs',\
+ WorkingDir=WorkingDir,SuppressStderr=SuppressStderr,\
+ SuppressStdout=SuppressStdout)
+ else:
+ app = Clustalw(InputHandler='_input_as_lines',\
+ WorkingDir=WorkingDir,SuppressStderr=SuppressStderr,\
+ SuppressStdout=SuppressStdout)
+ return app(seqs)
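+
+# Minimal usage sketch (for illustration only; the sequences are
+# hypothetical). With add_seq_names=True a plain list of sequence strings
+# is accepted; names >1, >2, ... are generated by _input_as_seqs.
+def _example_alignUnalignedSeqs():
+    seqs = ['ACUGCUAGCUAGUAGCGUACGUA', 'GCUACGUAGCUACGUAGCGUACG']
+    return alignUnalignedSeqs(seqs, WorkingDir='/tmp')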
+
+def alignUnalignedSeqsFromFile(filename,WorkingDir=None,SuppressStderr=None,\
+ SuppressStdout=None):
+ """Aligns unaligned sequences from some file (file should be in the right format)
+
+ filename: string, the filename of the file containing the sequences
+ to be aligned in a valid format.
+ """
+ app = Clustalw(WorkingDir=WorkingDir,SuppressStderr=SuppressStderr,\
+ SuppressStdout=SuppressStdout)
+ return app(filename)
+
+def alignTwoAlignments(aln1,aln2,outfile,WorkingDir=None,SuppressStderr=None,\
+ SuppressStdout=None):
+ """Aligns two alignments. Individual sequences are not realigned
+
+ aln1: string, name of file containing the first alignment
+ aln2: string, name of file containing the second alignment
+ outfile: you're forced to specify an outfile name, because if you don't
+ aln1 will be overwritten. So, if you want aln1 to be overwritten, you
+ should specify the same filename.
+ WARNING: a .dnd file is created with the same prefix as aln1. So an
+ existing dendrogram might get overwritten.
+ """
+ app = Clustalw({'-profile':None,'-profile1':aln1,\
+ '-profile2':aln2,'-outfile':outfile},SuppressStderr=\
+ SuppressStderr,WorkingDir=WorkingDir,SuppressStdout=SuppressStdout)
+ app.Parameters['-align'].off()
+ return app()
+
+def addSeqsToAlignment(aln1,seqs,outfile,WorkingDir=None,SuppressStderr=None,\
+ SuppressStdout=None):
+ """Aligns sequences from second profile against first profile
+
+ aln1: string, name of file containing the alignment
+ seqs: string, name of file containing the sequences that should be added
+ to the alignment.
+ outfile: string, name of the output file (the new alignment)
+ """
+ app = Clustalw({'-sequences':None,'-profile1':aln1,\
+ '-profile2':seqs,'-outfile':outfile},SuppressStderr=\
+ SuppressStderr,WorkingDir=WorkingDir, SuppressStdout=SuppressStdout)
+
+ app.Parameters['-align'].off()
+ return app()
+
+def buildTreeFromAlignment(filename,WorkingDir=None,SuppressStderr=None):
+ """Builds a new tree from an existing alignment
+
+ filename: string, name of file containing the seqs or alignment
+ """
+ app = Clustalw({'-tree':None,'-infile':filename},SuppressStderr=\
+ SuppressStderr,WorkingDir=WorkingDir)
+ app.Parameters['-align'].off()
+ return app()
+
+def align_and_build_tree(seqs, moltype, best_tree=False, params=None):
+ """Returns an alignment and a tree from Sequences object seqs.
+
+ seqs: a cogent.core.alignment.SequenceCollection object, or data that can
+ be used to build one.
+
+ moltype: cogent.core.moltype.MolType object
+
+ best_tree: if True (default:False), uses a slower but more accurate
+ algorithm to build the tree.
+
+ params: dict of parameters to pass in to the Clustal app controller.
+
+ The result will be a dict with keys 'Align' (a
+ cogent.core.alignment.Alignment) and 'Tree' (a cogent.core.tree.PhyloNode);
+ either value may be None if that step fails.
+ """
+ aln = align_unaligned_seqs(seqs, moltype=moltype, params=params)
+ tree = build_tree_from_alignment(aln, moltype, best_tree, params)
+ return {'Align':aln,'Tree':tree}
+
+def build_tree_from_alignment(aln, moltype=DNA, best_tree=False, params=None):
+ """Returns a tree from Alignment object aln.
+
+ aln: a cogent.core.alignment.Alignment object, or data that can be used
+ to build one.
+
+ moltype: cogent.core.moltype.MolType object
+
+ best_tree: if True (default:False), uses a slower but more accurate
+ algorithm to build the tree.
+
+ params: dict of parameters to pass in to the Clustal app controller.
+
+ The result will be a cogent.core.tree.PhyloNode object, or None if tree
+ building fails.
+ """
+ #Set params to empty dict if None, and pick the sequence type *before*
+ #building the app controller; changes made to params afterwards would
+ #never reach the application.
+ if params is None:
+ params={}
+
+ if moltype == DNA or moltype == RNA:
+ params['-type'] = 'd'
+ elif moltype == PROTEIN:
+ params['-type'] = 'p'
+ else:
+ raise ValueError, "moltype must be DNA, RNA, or PROTEIN"
+
+ # Create instance of app controller, enable tree, disable alignment
+ app = Clustalw(InputHandler='_input_as_multiline_string', params=params, \
+ WorkingDir='/tmp')
+ app.Parameters['-align'].off()
+
+ # best_tree -> bootstrap
+ if best_tree:
+ if '-bootstrap' not in params:
+ app.Parameters['-bootstrap'].on(1000)
+ if '-seed' not in params:
+ app.Parameters['-seed'].on(randint(0,1000))
+ if '-bootlabels' not in params:
+ app.Parameters['-bootlabels'].on('nodes')
+ else:
+ app.Parameters['-tree'].on()
+
+ # Setup mapping. Clustalw clips identifiers. We will need to remap them.
+ seq_collection = SequenceCollection(aln)
+ int_map, int_keys = seq_collection.getIntMap()
+ int_map = SequenceCollection(int_map)
+
+ # Collect result
+ result = app(int_map.toFasta())
+
+ # Build tree
+ tree = DndParser(result['Tree'].read(), constructor=PhyloNode)
+ for node in tree.tips():
+ node.Name = int_keys[node.Name]
+
+ # Clean up
+ result.cleanUp()
+ del(seq_collection, app, result, int_map, int_keys)
+
+ return tree
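+
+# Minimal usage sketch (for illustration only; the two-sequence alignment
+# is hypothetical). best_tree=True would instead produce a 1000-replicate
+# bootstrapped tree, as implemented above.
+def _example_build_tree_from_alignment():
+    aln = {'1': 'ACUGCUAGCUAGUAGCGUACGUA',
+           '2': 'ACUGCUAGCUAGUAGCGAACGUA'}
+    return build_tree_from_alignment(aln, moltype=RNA, best_tree=False)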
+
+def bootstrap_tree_from_alignment(aln, seed=None, num_trees=None, params=None):
+ """Returns a tree from Alignment object aln with bootstrap support values.
+
+ aln: a cogent.core.alignment.Alignment object, or data that can be used
+ to build one.
+
+ seed: an integer, seed value to use
+
+ num_trees: an integer, number of trees to bootstrap against
+
+ params: dict of parameters to pass in to the Clustal app controller.
+
+ The result will be a cogent.core.tree.PhyloNode object, or None if tree
+ building fails.
+
+ If seed is not specified in params, a random integer between 0 and 1000 is used.
+ """
+ # Create instance of app controller, enable bootstrap, disable alignment and tree
+ app = Clustalw(InputHandler='_input_as_multiline_string', params=params, \
+ WorkingDir='/tmp')
+ app.Parameters['-align'].off()
+ app.Parameters['-tree'].off()
+
+ if app.Parameters['-bootstrap'].isOff():
+ if num_trees is None:
+ num_trees = 1000
+
+ app.Parameters['-bootstrap'].on(num_trees)
+
+ if app.Parameters['-seed'].isOff():
+ if seed is None:
+ seed = randint(0,1000)
+
+ app.Parameters['-seed'].on(seed)
+
+ if app.Parameters['-bootlabels'].isOff():
+ app.Parameters['-bootlabels'].on("node")
+
+ # Setup mapping. Clustalw clips identifiers. We will need to remap them.
+ seq_collection = SequenceCollection(aln)
+ int_map, int_keys = seq_collection.getIntMap()
+ int_map = SequenceCollection(int_map)
+
+ # Collect result
+ result = app(int_map.toFasta())
+
+ # Build tree
+ tree = DndParser(result['Tree'].read(), constructor=PhyloNode)
+ for node in tree.tips():
+ node.Name = int_keys[node.Name]
+
+ # Clean up
+ result.cleanUp()
+ del(seq_collection, app, result, int_map, int_keys)
+
+ return tree
+
+def align_unaligned_seqs(seqs, moltype=DNA, params=None):
+ """Returns an Alignment object from seqs.
+
+ seqs: cogent.core.alignment.SequenceCollection object, or data that can be
+ used to build one.
+
+ moltype: a MolType object. DNA, RNA, or PROTEIN.
+
+ params: dict of parameters to pass in to the Clustal app controller.
+
+ Result will be a cogent.core.alignment.Alignment object.
+ """
+ #create SequenceCollection object from seqs
+ seq_collection = SequenceCollection(seqs,MolType=moltype)
+ #Create mapping between abbreviated IDs and full IDs
+ int_map, int_keys = seq_collection.getIntMap()
+ #Create SequenceCollection from int_map.
+ int_map = SequenceCollection(int_map,MolType=moltype)
+ #Create Clustalw app.
+ app = Clustalw(InputHandler='_input_as_multiline_string',params=params)
+ #Get results using int_map as input to app
+ res = app(int_map.toFasta())
+ #Get alignment as dict out of results
+ alignment = dict(ClustalParser(res['Align'].readlines()))
+ #Make new dict mapping original IDs
+ new_alignment = {}
+ for k,v in alignment.items():
+ new_alignment[int_keys[k]]=v
+ #Create an Alignment object from alignment dict
+ new_alignment = Alignment(new_alignment,MolType=moltype)
+ #Clean up
+ res.cleanUp()
+ del(seq_collection,int_map,int_keys,app,res,alignment)
+
+ return new_alignment
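+
+# Minimal usage sketch (for illustration only; the sequences are
+# hypothetical). The int-map renaming above is invisible to the caller:
+# the returned Alignment is keyed by the original sequence names.
+def _example_align_unaligned_seqs():
+    seqs = {'seq_1': 'ACUGCUAGCUAGUAGCGUACGUA',
+            'seq_2': 'GCUACGUAGCUAC'}
+    return align_unaligned_seqs(seqs, moltype=RNA)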
+
+def add_seqs_to_alignment(seqs, aln, moltype, params=None):
+ """Returns an Alignment object from seqs and existing Alignment.
+
+ seqs: a cogent.core.alignment.SequenceCollection object, or data that can
+ be used to build one.
+
+ aln: a cogent.core.alignment.Alignment object, or data that can be used to
+ build one
+
+ params: dict of parameters to pass in to the Clustal app controller.
+ """
+ #create SequenceCollection object from seqs
+ seq_collection = SequenceCollection(seqs,MolType=moltype)
+ #Create mapping between abbreviated IDs and full IDs
+ seq_int_map, seq_int_keys = seq_collection.getIntMap()
+ #Create SequenceCollection from int_map.
+ seq_int_map = SequenceCollection(seq_int_map,MolType=moltype)
+
+ #create Alignment object from aln
+ aln = Alignment(aln,MolType=moltype)
+ #Create mapping between abbreviated IDs and full IDs
+ aln_int_map, aln_int_keys = aln.getIntMap(prefix='seqn_')
+ #Create Alignment from aln_int_map.
+ aln_int_map = Alignment(aln_int_map,MolType=moltype)
+
+ #Update seq_int_keys with aln_int_keys
+ seq_int_keys.update(aln_int_keys)
+
+ #Create Clustalw app.
+ app = Clustalw(InputHandler='_input_as_multiline_string',\
+ params=params,
+ SuppressStderr=True)
+ app.Parameters['-align'].off()
+ app.Parameters['-infile'].off()
+ app.Parameters['-sequences'].on()
+
+ #Add aln_int_map as profile1
+ app.Parameters['-profile1'].on(\
+ app._tempfile_as_multiline_string(aln_int_map.toFasta()))
+
+ #Add seq_int_map as profile2
+ app.Parameters['-profile2'].on(\
+ app._tempfile_as_multiline_string(seq_int_map.toFasta()))
+ #Get results using int_map as input to app
+ res = app()
+
+ #Get alignment as dict out of results
+ alignment = dict(ClustalParser(res['Align'].readlines()))
+
+ #Make new dict mapping original IDs
+ new_alignment = {}
+ for k,v in alignment.items():
+ new_alignment[seq_int_keys[k]]=v
+ #Create an Alignment object from alignment dict
+ new_alignment = Alignment(new_alignment,MolType=moltype)
+ #Clean up
+ res.cleanUp()
+ remove(app.Parameters['-profile1'].Value)
+ remove(app.Parameters['-profile2'].Value)
+ del(seq_collection,seq_int_map,seq_int_keys,\
+ aln,aln_int_map,aln_int_keys,app,res,alignment)
+
+ return new_alignment
+
+def align_two_alignments(aln1, aln2, moltype, params=None):
+ """Returns an Alignment object from two existing Alignments.
+
+ aln1, aln2: cogent.core.alignment.Alignment objects, or data that can be
+ used to build them.
+
+ params: dict of parameters to pass in to the Clustal app controller.
+ """
+ #create Alignment object from aln1
+ aln1 = Alignment(aln1,MolType=moltype)
+ #Create mapping between abbreviated IDs and full IDs
+ aln1_int_map, aln1_int_keys = aln1.getIntMap()
+ #Create Alignment from aln1_int_map.
+ aln1_int_map = Alignment(aln1_int_map,MolType=moltype)
+
+ #create Alignment object from aln2
+ aln2 = Alignment(aln2,MolType=moltype)
+ #Create mapping between abbreviated IDs and full IDs
+ aln2_int_map, aln2_int_keys = aln2.getIntMap(prefix='seqn_')
+ #Create Alignment from aln2_int_map.
+ aln2_int_map = Alignment(aln2_int_map,MolType=moltype)
+
+ #Update aln1_int_keys with aln2_int_keys
+ aln1_int_keys.update(aln2_int_keys)
+
+ #Create Clustalw app.
+ app = Clustalw(InputHandler='_input_as_multiline_string',\
+ params=params,
+ SuppressStderr=True)
+ app.Parameters['-align'].off()
+ app.Parameters['-infile'].off()
+ app.Parameters['-profile'].on()
+
+ #Add aln1_int_map as profile1
+ app.Parameters['-profile1'].on(\
+ app._tempfile_as_multiline_string(aln1_int_map.toFasta()))
+
+ #Add aln2_int_map as profile2
+ app.Parameters['-profile2'].on(\
+ app._tempfile_as_multiline_string(aln2_int_map.toFasta()))
+ #Get results using int_map as input to app
+ res = app()
+
+ #Get alignment as dict out of results
+ alignment = dict(ClustalParser(res['Align'].readlines()))
+
+ #Make new dict mapping original IDs
+ new_alignment = {}
+ for k,v in alignment.items():
+ new_alignment[aln1_int_keys[k]]=v
+ #Create an Alignment object from alignment dict
+ new_alignment = Alignment(new_alignment,MolType=moltype)
+ #Clean up
+ res.cleanUp()
+ remove(app.Parameters['-profile1'].Value)
+ remove(app.Parameters['-profile2'].Value)
+ del(aln1,aln1_int_map,aln1_int_keys,\
+ aln2,aln2_int_map,aln2_int_keys,app,res,alignment)
+
+ return new_alignment
diff --git a/bfillings/denoiser.py b/bfillings/denoiser.py
new file mode 100644
index 0000000..98336da
--- /dev/null
+++ b/bfillings/denoiser.py
@@ -0,0 +1,25 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+
+"""
+This module provides pass-through access to PyCogent's denoiser code. It's a
+bit of a hack, but it allows us to remove the direct dependency on PyCogent by
+centralizing the denoiser code with all of the other PyCogent code that is
+targeted either for complete re-write or removal pending benchmarks. The basic
+idea is that it's not worth porting this code anywhere now because its days
+are numbered, but we still need to be able to access it for the time being.
+
+"""
+
+from cogent.parse.flowgram import (Flowgram, build_averaged_flowgram,
+ seq_to_flow)
+from cogent.parse.flowgram_parser import lazy_parse_sff_handle, get_header_info
+from cogent.parse.flowgram_collection import (FlowgramCollection, parse_sff)
+from cogent.util.trie import build_prefix_map
diff --git a/bfillings/fastq_join.py b/bfillings/fastq_join.py
new file mode 100644
index 0000000..d296aa2
--- /dev/null
+++ b/bfillings/fastq_join.py
@@ -0,0 +1,229 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+
+# Application controller for ea-utils v1.1.2-537
+# fastq processing utilities
+# http://code.google.com/p/ea-utils/
+#
+
+import os
+import tempfile
+import shutil
+
+from burrito.parameters import ValuedParameter
+from burrito.util import CommandLineApplication, ResultPath
+
+
+class FastqJoin(CommandLineApplication):
+
+ """fastq-join (v1.1.2) application controller for joining paired-end reads."""
+
+ _command = 'fastq-join'
+
+ _parameters = {
+ # Description copied from 'fastq-join'
+ # Usage: fastq-join [options] <read1.fq> <read2.fq> [mate.fq] -o
+ # <read.%.fq>
+
+ # Output:
+ # You can supply 3 -o arguments, for un1, un2, join files, or one
+ # argument as a file name template. The suffix 'un1, un2, or join' is
+ # appended to the file, or they replace a %-character if present.
+ # If a 'mate' input file is present (barcode read), then the files
+ # 'un3' and 'join2' are also created.
+
+ # we'll only handle one output base path / file name
+ # -o FIL: See 'Output' above
+ '-o': ValuedParameter(Prefix='-', Delimiter=' ', Name='o'),
+
+ # -v C: Verifies that the 2 files probe id's match up to char C
+ # use ' ' (space) for Illumina reads
+ '-v': ValuedParameter(Prefix='-', Delimiter=' ', Name='v'),
+
+ # -p N: N-percent maximum difference (8)
+ '-p': ValuedParameter(Prefix='-', Delimiter=' ', Name='p'),
+
+ # -m N: N-minimum overlap (6)
+ '-m': ValuedParameter(Prefix='-', Delimiter=' ', Name='m'),
+
+ # -r FIL: Verbose stitch length report
+ '-r': ValuedParameter(Prefix='-', Delimiter=' ', Name='r')}
+
+ _input_handler = '_input_as_paths'
+
+ def _get_output_path(self):
+ """Checks if a base file label / path is set. Returns absolute path."""
+ if self.Parameters['-o'].isOn():
+ output_path = self._absolute(str(self.Parameters['-o'].Value))
+ else:
+ raise ValueError("No output path specified.")
+ return output_path
+
+ def _get_stitch_report_path(self):
+ """Checks if stitch report label / path is set. Returns absolute path."""
+ if self.Parameters['-r'].isOn():
+ stitch_path = self._absolute(str(self.Parameters['-r'].Value))
+ return stitch_path
+ elif self.Parameters['-r'].isOff():
+ return None
+
+ def _get_result_paths(self, data):
+ """Capture fastq-join output.
+
+ Three output files are produced, in the form of
+ outputjoin : assembled paired reads
+ outputun1 : unassembled reads_1
+ outputun2 : unassembled reads_2
+
+ If a barcode / mate-pairs file is also provided then the following
+ additional files are output:
+ outputjoin2
+ outputun3
+
+ If a verbose stitch length report (-r) is requested by the user, the
+ user-specified filename is used.
+ """
+ output_path = self._get_output_path()
+
+ result = {}
+
+ # always output:
+ result['Assembled'] = ResultPath(Path=output_path + 'join',
+ IsWritten=True)
+ result['UnassembledReads1'] = ResultPath(Path=output_path + 'un1',
+ IsWritten=True)
+ result['UnassembledReads2'] = ResultPath(Path=output_path + 'un2',
+ IsWritten=True)
+
+ # check if stitch report is requested:
+ stitch_path = self._get_stitch_report_path()
+ if stitch_path:
+ result['Report'] = ResultPath(Path=stitch_path,
+ IsWritten=True)
+
+ # Check if mate file / barcode file is present.
+ # If not, return result
+ # We need to check this way because there are no infile parameters.
+ mate_path_string = output_path + 'join2'
+ mate_unassembled_path_string = output_path + 'un3'
+ if os.path.exists(mate_path_string) and \
+ os.path.exists(mate_unassembled_path_string):
+ result['Mate'] = ResultPath(Path=mate_path_string,
+ IsWritten=True)
+ result['MateUnassembled'] = ResultPath(Path=
+ mate_unassembled_path_string,
+ IsWritten=True)
+ else:
+ pass
+ return result
+
+ def getHelp(self):
+ """fastq-join (v1.1.2) help"""
+ help_str = """
+ For issues with the actual program 'fastq-join', see the following:
+
+ For basic help, type the following at the command line:
+ 'fastq-join'
+
+ Website:
+ http://code.google.com/p/ea-utils/
+
+ For questions / comments submit an issue to:
+ http://code.google.com/p/ea-utils/issues/list
+ """
+ return help_str
+
+
+def join_paired_end_reads_fastqjoin(
+ reads1_infile_path,
+ reads2_infile_path,
+ perc_max_diff=None, # typical default is 8
+ min_overlap=None, # typical default is 6
+ outfile_label='fastqjoin',
+ params={},
+ working_dir=tempfile.gettempdir(),
+ SuppressStderr=True,
+ SuppressStdout=True,
+ HALT_EXEC=False):
+ """ Runs fastq-join to assemble paired-end reads.
+ Returns a dict of output file path strings.
+
+ -reads1_infile_path : reads1.fastq infile path
+ -reads2_infile_path : reads2.fastq infile path
+ -perc_max_diff : maximum percent difference allowed within the overlap
+ -min_overlap : minimum allowed overlap required to assemble reads
+ -outfile_label : base name for output files.
+ -params : dictionary of application controller parameters
+
+ """
+ abs_r1_path = os.path.abspath(reads1_infile_path)
+ abs_r2_path = os.path.abspath(reads2_infile_path)
+
+ infile_paths = [abs_r1_path, abs_r2_path]
+
+ # check / make absolute infile paths
+ for p in infile_paths:
+ if not os.path.exists(p):
+ raise IOError('File not found at: %s' % p)
+
+ fastq_join_app = FastqJoin(params=params,
+ WorkingDir=working_dir,
+ SuppressStderr=SuppressStderr,
+ SuppressStdout=SuppressStdout,
+ HALT_EXEC=HALT_EXEC)
+
+ # Set parameters. Defaulting these values to None helps with QIIME
+ # integration: we do not have to worry about changes in the default
+ # behaviour of the wrapped application.
+ if perc_max_diff is not None:
+ if isinstance(perc_max_diff, int) and 0 <= perc_max_diff <= 100:
+ fastq_join_app.Parameters['-p'].on(perc_max_diff)
+ else:
+ raise ValueError("perc_max_diff must be an int between 0 and 100!")
+
+ if min_overlap is not None:
+ if isinstance(min_overlap, int) and 0 < min_overlap:
+ fastq_join_app.Parameters['-m'].on(min_overlap)
+ else:
+ raise ValueError("min_overlap must be an int > 0!")
+
+ if outfile_label is not None:
+ if isinstance(outfile_label, str):
+ fastq_join_app.Parameters['-o'].on(outfile_label + '.')
+ else:
+ raise ValueError("outfile_label must be a string!")
+ else:
+ pass
+
+ # run assembler
+ result = fastq_join_app(infile_paths)
+
+ # Store output file path data to dict
+ path_dict = {}
+ path_dict['Assembled'] = result['Assembled'].name
+ path_dict['UnassembledReads1'] = result['UnassembledReads1'].name
+ path_dict['UnassembledReads2'] = result['UnassembledReads2'].name
+
+ # sanity check that files actually exist at the listed paths
+ for path in path_dict.values():
+ if not os.path.exists(path):
+ raise IOError('Output file not found at: %s' % path)
+
+ # fastq-join automatically appends: 'join', 'un1', or 'un2'
+ # to the end of the file names. But we want to rename them so
+ # they end in '.fastq'. So, we iterate through path_dict to
+ # rename the files and overwrite the dict values.
+ for key, file_path in path_dict.items():
+ new_file_path = file_path + '.fastq'
+ shutil.move(file_path, new_file_path)
+ path_dict[key] = new_file_path
+
+ return path_dict
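+
+# Minimal usage sketch (for illustration only; the read file paths are
+# hypothetical). The returned dict maps 'Assembled', 'UnassembledReads1'
+# and 'UnassembledReads2' to the renamed *.fastq output paths.
+def _example_join_paired_end_reads_fastqjoin():
+    return join_paired_end_reads_fastqjoin('reads1.fastq', 'reads2.fastq',
+                                           perc_max_diff=8, min_overlap=6,
+                                           outfile_label='fastqjoin')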
diff --git a/bfillings/fasttree.py b/bfillings/fasttree.py
new file mode 100644
index 0000000..4f752ca
--- /dev/null
+++ b/bfillings/fasttree.py
@@ -0,0 +1,162 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+"""Application controller for FastTree
+
+Designed for FastTree v1.1.0. Also works with v2.0.1, v2.1.0, and v2.1.3,
+though only with basic functionality."""
+
+from burrito.parameters import (ValuedParameter, FlagParameter,
+ MixedParameter)
+from burrito.util import (CommandLineApplication, FilePath, system,
+ CommandLineAppResult, ResultPath, remove,
+ ApplicationError)
+
+from cogent.core.tree import PhyloNode
+from cogent.parse.tree import DndParser
+from cogent.core.moltype import DNA, RNA, PROTEIN
+from cogent.core.alignment import SequenceCollection
+
+
+class FastTree(CommandLineApplication):
+ """FastTree application Controller"""
+
+ _command = 'FastTree'
+ _input_handler = '_input_as_multiline_string'
+ _parameters = {
+ '-quiet':FlagParameter('-',Name='quiet'),
+ '-boot':ValuedParameter('-',Delimiter=' ',Name='boot'),
+ '-seed':ValuedParameter('-',Delimiter=' ',Name='seed'),
+ '-nni':ValuedParameter('-',Delimiter=' ',Name='nni'),
+ '-slow':FlagParameter('-',Name='slow'),
+ '-fastest':FlagParameter('-',Name='fastest'),
+ '-top':FlagParameter('-',Name='top'),
+ '-notop':FlagParameter('-',Name='notop'),
+ '-topm':ValuedParameter('-',Delimiter=' ',Name='topm'),
+ '-close':ValuedParameter('-',Delimiter=' ',Name='close'),
+ '-refresh':ValuedParameter('-',Delimiter=' ',Name='refresh'),
+ '-matrix':ValuedParameter('-',Delimiter=' ',Name='matrix'),
+ '-nomatrix':FlagParameter('-',Name='nomatrix'),
+ '-nj':FlagParameter('-',Name='nj'),
+ '-bionj':FlagParameter('-',Name='bionj'),
+ '-nt':FlagParameter('-',Name='nt'),
+ '-n':ValuedParameter('-',Delimiter=' ',Name='n'),
+ '-pseudo':MixedParameter('-',Delimiter=' ', Name='pseudo'),
+ '-intree':ValuedParameter('-',Delimiter=' ',Name='intree'),
+ '-spr':ValuedParameter('-',Delimiter=' ',Name='spr'),
+ '-constraints':ValuedParameter('-',Delimiter=' ',\
+ Name='constraints'),
+ '-constraintWeight':ValuedParameter('-',Delimiter=' ',\
+ Name='constraintWeight'),\
+ '-makematrix':ValuedParameter('-',Delimiter=' ',Name='makematrix')}
+
+ def __call__(self,data=None, remove_tmp=True):
+ """Run the application with the specified kwargs on data
+
+ data: anything that can be cast into a string or written out to
+ a file. Usually either a list of things or a single string or
+ number. input_handler will be called on this data before it
+ is passed as part of the command-line argument, so by creating
+ your own input handlers you can customize what kind of data
+ you want your application to accept
+
+ remove_tmp: if True, removes tmp files
+
+ NOTE: Override of the base class to handle redirected output
+ """
+ input_handler = self.InputHandler
+ suppress_stderr = self.SuppressStderr
+
+ outfile = self.getTmpFilename(self.TmpDir)
+ self._outfile = outfile
+
+ if suppress_stderr:
+ errfile = FilePath('/dev/null')
+ else:
+ errfile = FilePath(self.getTmpFilename(self.TmpDir))
+ if data is None:
+ input_arg = ''
+ else:
+ input_arg = getattr(self,input_handler)(data)
+
+ # Build up the command, consisting of a BaseCommand followed by
+ # input and output (file) specifications
+ command = self._command_delimiter.join(filter(None,\
+ [self.BaseCommand,str(input_arg),'>',str(outfile),'2>',\
+ str(errfile)]))
+ if self.HaltExec:
+ raise AssertionError, "Halted exec with command:\n" + command
+ # The return value of system is a 16-bit number containing the signal
+ # number that killed the process, and then the exit status.
+ # We only want to keep the exit status so do a right bitwise shift to
+ # get rid of the signal number byte
+ exit_status = system(command) >> 8
+
+ # Determine if an error should be raised due to the exit status of
+ # the application
+ if not self._accept_exit_status(exit_status):
+ raise ApplicationError, \
+ 'Unacceptable application exit status: %s, command: %s'\
+ % (str(exit_status),command)
+
+ out = open(outfile,"r")
+
+ err = None
+ if not suppress_stderr:
+ err = open(errfile,"r")
+
+ result = CommandLineAppResult(out,err,exit_status,\
+ result_paths=self._get_result_paths(data))
+
+ # Clean up the input file if one was created
+ if remove_tmp:
+ if self._input_filename:
+ remove(self._input_filename)
+ self._input_filename = None
+
+ return result
+
+ def _get_result_paths(self, data):
+ result = {}
+ result['Tree'] = ResultPath(Path=self._outfile)
+ return result
+
+def build_tree_from_alignment(aln, moltype=DNA, best_tree=False, params=None):
+ """Returns a tree from alignment
+
+ Will check MolType of aln object
+ """
+ if params is None:
+ params = {}
+
+ if moltype == DNA or moltype == RNA:
+ params['-nt'] = True
+ elif moltype == PROTEIN:
+ params['-nt'] = False
+ else:
+ raise ValueError, \
+ "FastTree does not support moltype: %s" % moltype.label
+
+ if best_tree:
+ params['-slow'] = True
+
+ #Create mapping between abbreviated IDs and full IDs
+ int_map, int_keys = aln.getIntMap()
+ #Create SequenceCollection from int_map.
+ int_map = SequenceCollection(int_map,MolType=moltype)
+
+ app = FastTree(params=params)
+
+ result = app(int_map.toFasta())
+ tree = DndParser(result['Tree'].read(), constructor=PhyloNode)
+ #remap tip names
+ for tip in tree.tips():
+ tip.Name = int_keys[tip.Name]
+
+ return tree
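+
+# Minimal usage sketch (for illustration only; the alignment is
+# hypothetical). The aln object must provide getIntMap() and toFasta(),
+# e.g. a cogent Alignment.
+def _example_build_tree_from_alignment():
+    from cogent.core.alignment import Alignment
+    aln = Alignment({'a': 'ACGTACGTACGT', 'b': 'ACGTACGAACGT'}, MolType=DNA)
+    return build_tree_from_alignment(aln, moltype=DNA)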
diff --git a/bfillings/fasttree_v1.py b/bfillings/fasttree_v1.py
new file mode 100644
index 0000000..a887b6d
--- /dev/null
+++ b/bfillings/fasttree_v1.py
@@ -0,0 +1,145 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+
+"""Application controller for FastTree v1.0"""
+
+from burrito.parameters import ValuedParameter, FlagParameter
+from burrito.util import (CommandLineApplication, FilePath, system,
+ CommandLineAppResult, ResultPath, remove,
+ ApplicationError)
+
+from cogent.core.tree import PhyloNode
+from cogent.parse.tree import DndParser
+from cogent.core.moltype import DNA, RNA, PROTEIN
+
+
+class FastTree(CommandLineApplication):
+ """FastTree application Controller"""
+
+ _command = 'FastTree'
+ _input_handler = '_input_as_multiline_string'
+ _parameters = {
+ '-quiet':FlagParameter('-',Name='quiet'),
+ '-boot':ValuedParameter('-',Delimiter=' ',Name='boot'),
+ '-seed':ValuedParameter('-',Delimiter=' ',Name='seed'),
+ '-nni':ValuedParameter('-',Delimiter=' ',Name='nni'),
+ '-slow':FlagParameter('-',Name='slow'),
+ '-fastest':FlagParameter('-',Name='fastest'),
+ '-top':FlagParameter('-',Name='top'),
+ '-notop':FlagParameter('-',Name='notop'),
+ '-topm':ValuedParameter('-',Delimiter=' ',Name='topm'),
+ '-close':ValuedParameter('-',Delimiter=' ',Name='close'),
+ '-refresh':ValuedParameter('-',Delimiter=' ',Name='refresh'),
+ '-matrix':ValuedParameter('-',Delimiter=' ',Name='matrix'),
+ '-nomatrix':FlagParameter('-',Name='nomatrix'),
+ '-nj':FlagParameter('-',Name='nj'),
+ '-bionj':FlagParameter('-',Name='bionj'),
+ '-nt':FlagParameter('-',Name='nt'),
+ '-n':ValuedParameter('-',Delimiter=' ',Name='n')}
+
+ #FastTree [-quiet] [-boot 1000] [-seed 1253] [-nni 10] [-slow | -fastest]
+ # [-top | -notop] [-topm 1.0 [-close 0.75] [-refresh 0.8]]
+ # [-matrix Matrix | -nomatrix] [-nj | -bionj]
+ # [-nt] [-n 100] [alignment] > newick_tree
+
+ def __call__(self,data=None, remove_tmp=True):
+ """Run the application with the specified kwargs on data
+
+ data: anything that can be cast into a string or written out to
+ a file. Usually either a list of things or a single string or
+ number. input_handler will be called on this data before it
+ is passed as part of the command-line argument, so by creating
+ your own input handlers you can customize what kind of data
+ you want your application to accept
+
+ remove_tmp: if True, removes tmp files
+
+ NOTE: Override of the base class to handle redirected output
+ """
+ input_handler = self.InputHandler
+ suppress_stderr = self.SuppressStderr
+
+ outfile = self.getTmpFilename(self.TmpDir)
+ self._outfile = outfile
+
+ if suppress_stderr:
+ errfile = FilePath('/dev/null')
+ else:
+ errfile = FilePath(self.getTmpFilename(self.TmpDir))
+ if data is None:
+ input_arg = ''
+ else:
+ input_arg = getattr(self,input_handler)(data)
+
+ # Build up the command, consisting of a BaseCommand followed by
+ # input and output (file) specifications
+ command = self._command_delimiter.join(filter(None,\
+ [self.BaseCommand,str(input_arg),'>',str(outfile),'2>',\
+ str(errfile)]))
+ if self.HaltExec:
+ raise AssertionError, "Halted exec with command:\n" + command
+ # The return value of system is a 16-bit number containing the signal
+ # number that killed the process, and then the exit status.
+ # We only want to keep the exit status so do a right bitwise shift to
+ # get rid of the signal number byte
+ exit_status = system(command) >> 8
+
+ # Determine if an error should be raised due to the exit status of
+ # the application
+ if not self._accept_exit_status(exit_status):
+ raise ApplicationError, \
+ 'Unacceptable application exit status: %s, command: %s'\
+ % (str(exit_status),command)
+
+ out = open(outfile,"r")
+
+ err = None
+ if not suppress_stderr:
+ err = open(errfile,"r")
+
+ result = CommandLineAppResult(out,err,exit_status,\
+ result_paths=self._get_result_paths(data))
+
+ # Clean up the input file if one was created
+ if remove_tmp:
+ if self._input_filename:
+ remove(self._input_filename)
+ self._input_filename = None
+
+ return result
+
+ def _get_result_paths(self, data):
+ result = {}
+ result['Tree'] = ResultPath(Path=self._outfile)
+ return result
+
+def build_tree_from_alignment(aln, moltype=DNA, best_tree=False, params=None):
+ """Returns a tree from alignment
+
+ Will check MolType of aln object
+ """
+ if params is None:
+ params = {}
+
+ if moltype == DNA or moltype == RNA:
+ params['-nt'] = True
+ elif moltype == PROTEIN:
+ params['-nt'] = False
+ else:
+ raise ValueError, \
+ "FastTree does not support moltype: %s" % moltype.label
+
+ app = FastTree(params=params)
+
+ if best_tree:
+ raise NotImplementedError, "best_tree not implemented yet"
+ result = app(aln.toFasta())
+ tree = DndParser(result['Tree'].read(), constructor=PhyloNode)
+ return tree
diff --git a/bfillings/formatdb.py b/bfillings/formatdb.py
new file mode 100755
index 0000000..e089a4b
--- /dev/null
+++ b/bfillings/formatdb.py
@@ -0,0 +1,239 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+"""Application controller for formatdb (BLAST database formatting).
+
+File created on 16 Sep 2009.
+"""
+from __future__ import division
+from optparse import OptionParser
+from os.path import split, splitext
+from os import remove
+from glob import glob
+from tempfile import mkstemp
+
+from burrito.util import CommandLineApplication, ResultPath
+from burrito.parameters import ValuedParameter, FilePath
+
+
+class FormatDb(CommandLineApplication):
+ """ ApplicationController for formatting blast databases
+
+ Currently contains a minimal parameter set.
+ """
+
+ _command = 'formatdb'
+ _parameters = {
+ '-i': ValuedParameter(Prefix='-', Name='i', Delimiter=' ',
+ IsPath=True),
+ '-l': ValuedParameter(Prefix='-', Name='l', Delimiter=' ',
+ IsPath=True),
+ '-o': ValuedParameter(Prefix='-', Name='o', Delimiter=' ', Value='T'),
+ '-p': ValuedParameter(Prefix='-', Name='p', Delimiter=' ', Value='F'),
+ '-n': ValuedParameter(Prefix='-', Name='n', Delimiter=' ')
+ }
+ _input_handler = '_input_as_parameter'
+ _suppress_stdout = True
+ _suppress_stderr = True
+
+ def _input_as_parameter(self, data):
+ """ Set the input path and log path based on data (a fasta filepath)
+ """
+ self.Parameters['-i'].on(data)
+ # access data through self.Parameters so we know it's been cast
+ # to a FilePath
+ input_filepath = self.Parameters['-i'].Value
+ input_file_dir, input_filename = split(input_filepath)
+ input_file_base, input_file_ext = splitext(input_filename)
+ # FIXME: handle the remaining options; formatdb ignores the working
+ # directory if no name is passed.
+ self.Parameters['-l'].on(FilePath('%s.log' % input_filename))
+ self.Parameters['-n'].on(FilePath(input_filename))
+ return ''
+
+ def _get_result_paths(self, data):
+ """ Build the dict of result filepaths
+ """
+ # access data through self.Parameters so we know it's been cast
+ # to a FilePath
+ wd = self.WorkingDir
+ db_name = self.Parameters['-n'].Value
+ log_name = self.Parameters['-l'].Value
+ result = {}
+ result['log'] = ResultPath(Path=wd + log_name, IsWritten=True)
+ if self.Parameters['-p'].Value == 'F':
+ extensions = ['nhr', 'nin', 'nsq', 'nsd', 'nsi']
+ else:
+ extensions = ['phr', 'pin', 'psq', 'psd', 'psi']
+ for extension in extensions:
+ for file_path in glob(wd + (db_name + '*' + extension)):
+ # this will match e.g. nr.01.psd and nr.psd
+ key = file_path.split(db_name + '.')[1]
+ result_path = ResultPath(Path=file_path, IsWritten=True)
+ result[key] = result_path
+ return result
+
+ def _accept_exit_status(self, exit_status):
+ """ Return True when the exit status was 0
+ """
+ return exit_status == 0
+
+
+def build_blast_db_from_fasta_path(fasta_path, is_protein=False,
+ output_dir=None, HALT_EXEC=False):
+ """Build blast db from fasta_path; return db name and list of files created
+
+ **If using to create temporary blast databases, you can call
+ cogent.util.misc.remove_files(db_filepaths) to clean up all the
+ files created by formatdb when you're done with the database.
+
+ fasta_path: path to fasta file of sequences to build database from
+ is_protein: True if working on protein seqs (default: False)
+ output_dir: directory where output should be written
+ (default: directory containing fasta_path)
+ HALT_EXEC: halt just before running the formatdb command and
+ print the command -- useful for debugging
+ """
+ fasta_dir, fasta_filename = split(fasta_path)
+ if not output_dir:
+ output_dir = fasta_dir or '.'
+ # Will cd to this directory, so just pass the filename
+ # so the app is not confused by relative paths
+ fasta_path = fasta_filename
+
+ if not output_dir.endswith('/'):
+ db_name = output_dir + '/' + fasta_filename
+ else:
+ db_name = output_dir + fasta_filename
+
+ # instantiate the object
+ fdb = FormatDb(WorkingDir=output_dir, HALT_EXEC=HALT_EXEC)
+ if is_protein:
+ fdb.Parameters['-p'].on('T')
+ else:
+ fdb.Parameters['-p'].on('F')
+ app_result = fdb(fasta_path)
+ db_filepaths = []
+ for v in app_result.values():
+ try:
+ db_filepaths.append(v.name)
+ except AttributeError:
+ # not a file object, so no path to return
+ pass
+ return db_name, db_filepaths
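+
+# Minimal usage sketch (for illustration only; 'seqs.fasta' is a
+# hypothetical path). The returned db_filepaths list can be handed to
+# cogent.util.misc.remove_files for cleanup, as noted in the docstring.
+def _example_build_blast_db_from_fasta_path():
+    db_name, db_filepaths = build_blast_db_from_fasta_path('seqs.fasta',
+                                                           is_protein=False)
+    return db_name, db_filepaths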
+
+
+def build_blast_db_from_fasta_file(fasta_file, is_protein=False,
+ output_dir=None, HALT_EXEC=False):
+ """Build blast db from fasta_file; return db name and list of files created
+
+ **If using to create temporary blast databases, you can call
+ cogent.util.misc.remove_files(db_filepaths) to clean up all the
+ files created by formatdb when you're done with the database.
+
+ fasta_file: open file (or iterable of lines) of sequences to build the
+ database from
+ is_protein: True if working on protein seqs (default: False)
+ output_dir: directory where output should be written
+ (default: current directory)
+ HALT_EXEC: halt just before running the formatdb command and
+ print the command -- useful for debugging
+ """
+ output_dir = output_dir or '.'
+ _, fasta_path = mkstemp(dir=output_dir, prefix="BLAST_temp_db_",
+ suffix=".fasta")
+
+ fasta_f = open(fasta_path, 'w')
+ for line in fasta_file:
+ fasta_f.write('%s\n' % line.strip())
+ fasta_f.close()
+
+ blast_db, db_filepaths = build_blast_db_from_fasta_path(fasta_path,
+ is_protein=is_protein,
+ output_dir=None,
+ HALT_EXEC=HALT_EXEC
+ )
+
+ db_filepaths.append(fasta_path)
+
+ return blast_db, db_filepaths
+
+
+def build_blast_db_from_seqs(seqs, is_protein=False, output_dir='./',
+ HALT_EXEC=False):
+ """Build blast db from seqs; return db name and list of files created
+
+ **If using to create temporary blast databases, you can call
+ cogent.util.misc.remove_files(db_filepaths) to clean up all the
+ files created by formatdb when you're done with the database.
+
+ seqs: sequence collection or alignment object
+ is_protein: True if working on protein seqs (default: False)
+ output_dir: directory where output should be written
+ (default: current directory)
+ HALT_EXEC: halt just before running the formatdb command and
+ print the command -- useful for debugging
+ """
+
+ # Build a temp filepath
+ _, tmp_fasta_filepath = mkstemp(prefix='Blast_tmp_db', suffix='.fasta')
+ # open the temp file
+ tmp_fasta_file = open(tmp_fasta_filepath, 'w')
+ # write the sequence collection to file
+ tmp_fasta_file.write(seqs.toFasta())
+ tmp_fasta_file.close()
+
+ # build the blast database
+ db_name, db_filepaths = build_blast_db_from_fasta_path(tmp_fasta_filepath,
+ is_protein=is_protein,
+ output_dir=output_dir,
+ HALT_EXEC=HALT_EXEC)
+
+ # clean-up the temporary file
+ remove(tmp_fasta_filepath)
+
+ # return the results
+ return db_name, db_filepaths
+
+
+def parse_command_line_parameters():
+ """ Parses command line arguments """
+ usage = 'usage: %prog [options] fasta_filepath'
+ version = 'Version: %prog 0.1'
+ parser = OptionParser(usage=usage, version=version)
+
+ # A binary 'verbose' flag
+ parser.add_option('-p', '--is_protein', action='store_true',
+ dest='is_protein', default=False,
+ help='Pass if building db of protein sequences [default:'
+ ' False, nucleotide db]')
+
+ parser.add_option('-o', '--output_dir', action='store', type='string',
+ dest='output_dir', default=None,
+ help='the output directory [default: directory '
+ 'containing input fasta_filepath]')
+
+ opts, args = parser.parse_args()
+ num_args = 1
+ if len(args) != num_args:
+ parser.error('Must provide single filepath to build database from.')
+
+ return opts, args
+
+
+if __name__ == "__main__":
+ opts, args = parse_command_line_parameters()
+
+ fasta_filepath = args[0]
+ is_protein = opts.is_protein
+ output_dir = opts.output_dir
+
+ db_name, db_filepaths = build_blast_db_from_fasta_path(fasta_filepath,
+ is_protein=is_protein,
+ output_dir=output_dir
+ )
diff --git a/bfillings/infernal.py b/bfillings/infernal.py
new file mode 100644
index 0000000..788d074
--- /dev/null
+++ b/bfillings/infernal.py
@@ -0,0 +1,1571 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+"""
+Provides an application controller for the commandline version of:
+Infernal 1.0 and 1.0.2 only.
+"""
+from os import remove
+from tempfile import mkstemp
+
+from burrito.parameters import FlagParameter, ValuedParameter, FilePath
+from burrito.util import CommandLineApplication, ResultPath
+
+from cogent.core.alignment import SequenceCollection, Alignment, DataError
+from cogent.parse.rfam import (MinimalRfamParser, ChangedSequence,
+ ChangedRnaSequence, ChangedDnaSequence)
+from cogent.format.stockholm import stockholm_from_alignment
+from cogent.parse.infernal import CmsearchParser
+from cogent.core.moltype import DNA, RNA
+from cogent.struct.rna2d import ViennaStructure, wuss_to_vienna
+
+MOLTYPE_MAP = {'DNA':'--dna',\
+ DNA:'--dna',\
+ 'RNA':'--rna',\
+ RNA:'--rna',\
+ }
+
+SEQ_CONSTRUCTOR_MAP = {'DNA':ChangedDnaSequence,\
+ DNA:ChangedDnaSequence,\
+ 'RNA':ChangedRnaSequence,\
+ RNA:ChangedRnaSequence,\
+ }
+
+class Cmalign(CommandLineApplication):
+ """cmalign application controller."""
+ _options = {
+
+ # -o <f> Save the alignment in Stockholm format to a file <f>. The default
+ # is to write it to standard output.
+ '-o':ValuedParameter(Prefix='-',Name='o',Delimiter=' '),\
+
+ # -l Turn on the local alignment algorithm. Default is global.
+ '-l':FlagParameter(Prefix='-',Name='l'),\
+
+ # -p Annotate the alignment with posterior probabilities calculated using
+ # the Inside and Outside algorithms.
+ '-p':FlagParameter(Prefix='-',Name='p'),\
+
+ # -q Quiet; suppress the verbose banner, and only print the resulting
+ # alignment to stdout.
+ '-q':FlagParameter(Prefix='-',Name='q'),\
+
+ # --informat <s> Assert that the input seqfile is in format <s>. Do not run
+ # Babelfish format autodetection. Acceptable formats are: FASTA, EMBL,
+ # UNIPROT, GENBANK, and DDBJ. <s> is case-insensitive.
+ '--informat':ValuedParameter(Prefix='--',Name='informat',Delimiter=' '),\
+
+ # --mpi Run as an MPI parallel program. (see User's Guide for details).
+ '--mpi':FlagParameter(Prefix='--',Name='mpi'),\
+
+ # Expert Options
+
+ # --optacc Align sequences using the Durbin/Holmes optimal accuracy
+ # algorithm. This is default behavior, so this option is probably useless.
+ '--optacc':FlagParameter(Prefix='--',Name='optacc'),\
+
+ # --cyk Do not use the Durbin/Holmes optimal accuracy alignment to align the
+ # sequences, instead use the CYK algorithm which determines the optimally
+ # scoring alignment of the sequence to the model.
+ '--cyk':FlagParameter(Prefix='--',Name='cyk'),\
+
+ # --sample Sample an alignment from the posterior distribution of
+ # alignments.
+ '--sample':FlagParameter(Prefix='--',Name='sample'),\
+
+ # -s <n> Set the random number generator seed to <n>, where <n> is a
+ # positive integer. This option can only be used in combination with
+ # --sample. The default is to use time() to generate a different seed for
+ # each run, which means that two different runs of cmalign --sample on the
+ # same alignment will give slightly different results. You can use this
+ # option to generate reproducible results.
+ '-s':ValuedParameter(Prefix='-',Name='s',Delimiter=' '),\
+
+ # --viterbi Do not use the CM to align the sequences, instead use the HMM
+ # Viterbi algorithm to align with a CM Plan 9 HMM.
+ '--viterbi':FlagParameter(Prefix='--',Name='viterbi'),\
+
+ # --sub Turn on the sub model construction and alignment procedure.
+ '--sub':FlagParameter(Prefix='--',Name='sub'),\
+
+ # --small Use the divide and conquer CYK alignment algorithm described in
+ # SR Eddy, BMC Bioinformatics 3:18, 2002.
+ '--small':FlagParameter(Prefix='--',Name='small'),\
+
+ # --hbanded This option is turned on by default. Accelerate alignment by
+ # pruning away regions of the CM DP matrix that are deemed negligible by
+ # an HMM.
+ '--hbanded':FlagParameter(Prefix='--',Name='hbanded'),\
+
+ # --nonbanded Turns off HMM banding.
+ '--nonbanded':FlagParameter(Prefix='--',Name='nonbanded'),\
+
+ # --tau <x> Set the tail loss probability used during HMM band calculation
+ # to <x>.
+ '--tau':ValuedParameter(Prefix='--',Name='tau',Delimiter=' '),\
+
+ # --mxsize <x> Set the maximum allowable DP matrix size to <x> megabytes.
+ '--mxsize':ValuedParameter(Prefix='--',Name='mxsize',Delimiter=' '),\
+
+ # --rna Output the alignments as RNA sequence alignments. This is true by
+ # default.
+ '--rna':FlagParameter(Prefix='--',Name='rna'),\
+
+ # --dna Output the alignments as DNA sequence alignments.
+ '--dna':FlagParameter(Prefix='--',Name='dna'),\
+
+ # --matchonly Only include match columns in the output alignment, do not
+ # include any insertions relative to the consensus model.
+ '--matchonly':FlagParameter(Prefix='--',Name='matchonly'),\
+
+ # --resonly Only include match columns in the output alignment that have at
+ # least 1 residue (non-gap character) in them.
+ '--resonly':FlagParameter(Prefix='--',Name='resonly'),\
+
+ # --fins Change the behavior of how insert emissions are placed in the
+ # alignment.
+ '--fins':FlagParameter(Prefix='--',Name='fins'),\
+
+ # --onepost Modifies behavior of the -p option. Use only one character
+ # instead of two to annotate the posterior probability of each aligned
+ # residue.
+ '--onepost':FlagParameter(Prefix='--',Name='onepost'),\
+
+ # --withali <f> Reads an alignment from file <f> and aligns it as a single
+ # object to the CM; e.g. the alignment in <f> is held fixed.
+ '--withali':ValuedParameter(Prefix='--',Name='withali',Delimiter=' '),\
+
+ # --withpknots Must be used in combination with --withali <f>. Propogate
+ # structural information for any pseudoknots that exist in <f> to the
+ # output alignment.
+ '--withpknots':FlagParameter(Prefix='--',Name='withpknots'),\
+
+ # --rf Must be used in combination with --withali <f>. Specify that the
+ # alignment in <f> has the same "#=GC RF" annotation as the alignment file
+ # the CM was built from using cmbuild and further that the --rf option was
+ # supplied to cmbuild when the CM was constructed.
+ '--rf':FlagParameter(Prefix='--',Name='rf'),\
+
+ # --gapthresh <x> Must be used in combination with --withali <f>. Specify
+ # that the --gapthresh <x> option was supplied to cmbuild when the CM was
+ # constructed from the alignment file <f>.
+ '--gapthresh':ValuedParameter(Prefix='--',Name='gapthresh',Delimiter=' '),\
+
+ # --tfile <f> Dump tabular sequence tracebacks for each individual sequence
+ # to a file <f>. Primarily useful for debugging.
+ '--tfile':ValuedParameter(Prefix='--',Name='tfile',Delimiter=' '),\
+
+
+ }
+ _parameters = {}
+ _parameters.update(_options)
+ _command = "cmalign"
+ _suppress_stderr=True
+
+ def getHelp(self):
+ """Method that points to the Infernal documentation."""
+
+ help_str = \
+ """
+ See Infernal documentation at:
+ http://infernal.janelia.org/
+ """
+ return help_str
+
+ def _tempfile_as_multiline_string(self, data):
+ """Write a multiline string to a temp file and return the filename.
+
+ data: a multiline string to be written to a file.
+
+ * Note: the result will be the filename as a FilePath object
+ (which is a string subclass).
+
+ """
+ filename = FilePath(self.getTmpFilename(self.TmpDir))
+ data_file = open(filename,'w')
+ data_file.write(data)
+ data_file.close()
+ return filename
+
+ def _alignment_out_filename(self):
+
+ if self.Parameters['-o'].isOn():
+ refined_filename = self._absolute(str(\
+ self.Parameters['-o'].Value))
+ else:
+ raise ValueError, 'No alignment output file specified.'
+ return refined_filename
+
+ def _get_result_paths(self,data):
+ result = {}
+ if self.Parameters['-o'].isOn():
+ out_name = self._alignment_out_filename()
+ result['Alignment'] = ResultPath(Path=out_name,IsWritten=True)
+
+ return result
+
+class Cmbuild(CommandLineApplication):
+ """cmbuild application controller."""
+ _options = {
+
+ # -n <s> Name the covariance model <s>. (Does not work if alifile contains
+ # more than one alignment).
+ '-n':ValuedParameter(Prefix='-',Name='n',Delimiter=' '),\
+
+ # -A Append the CM to cmfile, if cmfile already exists.
+ '-A':FlagParameter(Prefix='-',Name='A'),\
+
+ # -F Allow cmfile to be overwritten. Normally, if cmfile already exists,
+ # cmbuild exits with an error unless the -A or -F option is set.
+ '-F':FlagParameter(Prefix='-',Name='F'),\
+
+ # -v Run in verbose output mode instead of using the default single line
+ # tabular format. This output format is similar to that used by older
+ # versions of Infernal.
+ '-v':FlagParameter(Prefix='-',Name='v'),\
+
+ # --iins Allow informative insert emissions for the CM. By default, all CM
+ # insert emission scores are set to 0.0 bits.
+ '--iins':FlagParameter(Prefix='--',Name='iins'),\
+
+ # --Wbeta<x> Set the beta tail loss probability for query-dependent banding
+ # (QDB) to <x> The QDB algorithm is used to determine the maximium length
+ # of a hit to the model. For more information on QDB see (Nawrocki and
+ # Eddy, PLoS Computational Biology 3(3): e56).
+ '--Wbeta':ValuedParameter(Prefix='--',Name='Wbeta',Delimiter=' '),\
+
+ # Expert Options
+
+ # --rsearch <f> Parameterize emission scores a la RSEARCH, using the
+ # RIBOSUM matrix in file <f>. For more information see the RSEARCH
+ # publication (Klein and Eddy, BMC Bioinformatics 4:44, 2003). Actually,
+ # the emission scores will not exactly With --rsearch enabled, all
+ # alignments in alifile must contain exactly one sequence or the --call
+ # option must also be enabled.
+ '--rsearch':ValuedParameter(Prefix='--',Name='rsearch',Delimiter=' '),\
+
+ # --binary Save the model in a compact binary format. The default is a more
+ # readable ASCII text format.
+ '--binary':FlagParameter(Prefix='--',Name='binary'),\
+
+ # --rf Use reference coordinate annotation (#=GC RF line, in Stockholm) to
+ # determine which columns are consensus, and which are inserts.
+ '--rf':FlagParameter(Prefix='--',Name='rf'),\
+
+ # --gapthresh <x> Set the gap threshold (used for determining which columns
+ # are insertions versus consensus; see --rf above) to <x>. The default is
+ # 0.5.
+ '--gapthresh':ValuedParameter(Prefix='--',Name='gapthresh',Delimiter=' '),\
+
+ # --ignorant Strip all base pair secondary structure information from all
+ # input alignments in alifile before building the CM(s).
+ '--ignorant':FlagParameter(Prefix='--',Name='ignorant'),\
+
+ # --wgsc Use the Gerstein/Sonnhammer/Chothia (GSC) weighting algorithm.
+ # This is the default unless the number of sequences in the alignment
+ # exceeds a cutoff (see --pbswitch), in which case the default becomes
+ # the faster Henikoff position-based weighting scheme.
+ '--wgsc':FlagParameter(Prefix='--',Name='wgsc'),\
+
+ # --wblosum Use the BLOSUM filtering algorithm to weight the sequences,
+ # instead of the default GSC weighting.
+ '--wblosum':FlagParameter(Prefix='--',Name='wblosum'),\
+
+ # --wpb Use the Henikoff position-based weighting scheme. This weighting
+ # scheme is automatically used (overriding --wgsc and --wblosum) if the
+ # number of sequences in the alignment exceeds a cutoff (see --pbswitch).
+ '--wpb':FlagParameter(Prefix='--',Name='wpb'),\
+
+ # --wnone Turn sequence weighting off; e.g. explicitly set all sequence
+ # weights to 1.0.
+ '--wnone':FlagParameter(Prefix='--',Name='wnone'),\
+
+ # --wgiven Use sequence weights as given in annotation in the input
+ # alignment file. If no weights were given, assume they are all 1.0.
+ # The default is to determine new sequence weights by the Gerstein/
+ # Sonnhammer/Chothia algorithm, ignoring any annotated weights.
+ '--wgiven':FlagParameter(Prefix='--',Name='wgiven'),\
+
+ # --pbswitch <n> Set the cutoff for automatically switching the weighting
+ # method to the Henikoff position-based weighting scheme to <n>. If the
+ # number of sequences in the alignment exceeds <n> Henikoff weighting is
+ # used. By default <n> is 5000.
+ '--pbswitch':ValuedParameter(Prefix='--',Name='pbswitch',Delimiter=' '),\
+
+ # --wid <x> Controls the behavior of the --wblosum weighting option by
+ # setting the percent identity for clustering the alignment to <x>.
+ '--wid':ValuedParameter(Prefix='--',Name='wid',Delimiter=' '),\
+
+ # --eent Use the entropy weighting strategy to determine the effective
+ # sequence number that gives a target mean match state relative entropy.
+ '--wgiven':FlagParameter(Prefix='--',Name='wgiven'),\
+
+ # --enone Turn off the entropy weighting strategy. The effective sequence
+ # number is just the number of sequences in the alignment.
+ '--wgiven':FlagParameter(Prefix='--',Name='wgiven'),\
+
+ # --ere <x> Set the target mean match state entropy as <x>. By default the
+ # target entropy 1.46 bits.
+ '--ere':ValuedParameter(Prefix='--',Name='ere',Delimiter=' '),\
+
+ # --null <f> Read a null model from <f>. The null model defines the
+ # probability of each RNA nucleotide in background sequence, the default
+ # is to use 0.25 for each nucleotide.
+ '--null':ValuedParameter(Prefix='--',Name='null',Delimiter=' '),\
+
+ # --prior <f> Read a Dirichlet prior from <f>, replacing the default mixture
+ # Dirichlet.
+ '--prior':ValuedParameter(Prefix='--',Name='prior',Delimiter=' '),\
+
+ # --ctarget <n> Cluster each alignment in alifile by percent identity.
+ # find a cutoff percent id threshold that gives exactly <n> clusters and
+ # build a separate CM from each cluster. If <n> is greater than the number
+ # of sequences in the alignment the program will not complain, and each
+ # sequence in the alignment will be its own cluster. Each CM will have a
+ # positive integer appended to its name indicating the order in which it
+ # was built.
+ '--ctarget':ValuedParameter(Prefix='--',Name='ctarget',Delimiter=' '),\
+
+ # --cmaxid <x> Cluster each sequence alignment in alifile by percent
+ # identity. Define clusters at the cutoff fractional id similarity of <x>
+ # and build a separate CM from each cluster.
+ '--cmaxid':ValuedParameter(Prefix='--',Name='cmaxid',Delimiter=' '),\
+
+ # --call Build a separate CM from each sequence in each alignment in
+ # alifile. Naming of CMs takes place as described above for --ctarget.
+ '--call':FlagParameter(Prefix='--',Name='call'),\
+
+ # --corig After building multiple CMs using --ctarget, --cmindiff or --call
+ # as described above, build a final CM using the complete original
+ # alignment from alifile.
+ '--corig':FlagParameter(Prefix='--',Name='corig'),\
+
+ # --cdump<f> Dump the multiple alignments of each cluster to <f> in
+ # Stockholm format. This option only works in combination with --ctarget,
+ # --cmindiff or --call.
+ '--cdump':ValuedParameter(Prefix='--',Name='cdump',Delimiter=' '),\
+
+ # --refine <f> Attempt to refine the alignment before building the CM using
+ # expectation-maximization (EM). The final alignment (the alignment used
+ # to build the CM that gets written to cmfile) is written to <f>.
+ '--refine':ValuedParameter(Prefix='--',Name='refine',Delimiter=' '),\
+
+ # --gibbs Modifies the behavior of --refine so Gibbs sampling is used
+ # instead of EM.
+ '--gibbs':FlagParameter(Prefix='--',Name='gibbs'),\
+
+ # -s <n> Set the random seed to <n>, where <n> is a positive integer.
+ # This option can only be used in combination with --gibbs. The default is
+ # to use time() to generate a different seed for each run, which means
+ # that two different runs of cmbuild --refine <f> --gibbs on the same
+ # alignment will give slightly different results. You can use this option
+ # to generate reproducible results.
+ '-s':ValuedParameter(Prefix='-',Name='s',Delimiter=' '),\
+
+ # -l With --refine, turn on the local alignment algorithm, which allows the
+ # alignment to span two or more subsequences if necessary (e.g. if the
+ # structures of the query model and target sequence are only partially
+ # shared), allowing certain large insertions and deletions in the
+ # structure to be penalized differently than normal indels. The default is
+ # to globally align the query model to the target sequences.
+ '-l':ValuedParameter(Prefix='-',Name='l',Delimiter=' '),\
+
+ # -a With --refine, print the scores of each individual sequence alignment.
+ '-a':ValuedParameter(Prefix='-',Name='a',Delimiter=' '),\
+
+ # --cyk With --refine, align with the CYK algorithm.
+ '--cyk':FlagParameter(Prefix='--',Name='cyk'),\
+
+ # --sub With --refine, turn on the sub model construction and alignment
+ # procedure.
+ '--sub':FlagParameter(Prefix='--',Name='sub'),\
+
+ # --nonbanded With --refine, do not use HMM bands to accelerate alignment.
+ # Use the full CYK algorithm which is guaranteed to give the optimal
+ # alignment. This will slow down the run significantly, especially for
+ # large models.
+ '--nonbanded':FlagParameter(Prefix='--',Name='nonbanded'),\
+
+ # --tau <x> With --refine, set the tail loss probability used during HMM
+ # band calculation to <f>. This is the amount of probability mass within
+ # the HMM posterior probabilities that is considered negligible. The
+ # default value is 1E-7. In general, higher values will result in greater
+ # acceleration, but increase the chance of missing the optimal alignment
+ # due to the HMM bands.
+ '--tau':ValuedParameter(Prefix='--',Name='tau',Delimiter=' '),\
+
+ # --fins With --refine, change the behavior of how insert emissions are
+ # placed in the alignment.
+ '--fins':FlagParameter(Prefix='--',Name='fins'),\
+
+ # --mxsize <x> With --refine, set the maximum allowable matrix size for
+ # alignment to <x> megabytes.
+ '--mxsize':ValuedParameter(Prefix='--',Name='mxsize',Delimiter=' '),\
+
+ # --rdump<x> With --refine, output the intermediate alignments at each
+ # iteration of the refinement procedure (as described above for --refine )
+ # to file <f>.
+ '--rdump':ValuedParameter(Prefix='--',Name='rdump',Delimiter=' '),\
+
+ }
+ _parameters = {}
+ _parameters.update(_options)
+ _command = "cmbuild"
+ _suppress_stderr=True
+
+ def getHelp(self):
+ """Method that points to the Infernal documentation."""
+
+ help_str = \
+ """
+ See Infernal documentation at:
+ http://infernal.janelia.org/
+ """
+ return help_str
+
+ def _refine_out_filename(self):
+
+ if self.Parameters['--refine'].isOn():
+ refined_filename = self._absolute(str(\
+ self.Parameters['--refine'].Value))
+ else:
+ raise ValueError, 'No refine output file specified.'
+ return refined_filename
+
+ def _cm_out_filename(self):
+
+ if self.Parameters['-n'].isOn():
+ refined_filename = self._absolute(str(\
+ self.Parameters['-n'].Value))
+ else:
+ raise ValueError, 'No cm output file specified.'
+ return refined_filename
+
+ def _tempfile_as_multiline_string(self, data):
+ """Write a multiline string to a temp file and return the filename.
+
+ data: a multiline string to be written to a file.
+
+ * Note: the result will be the filename as a FilePath object
+ (which is a string subclass).
+
+ """
+ filename = FilePath(self.getTmpFilename(self.TmpDir))
+ data_file = open(filename,'w')
+ data_file.write(data)
+ data_file.close()
+ return filename
+
+ def _get_result_paths(self,data):
+ result = {}
+ if self.Parameters['--refine'].isOn():
+ out_name = self._refine_out_filename()
+ result['Refined'] = ResultPath(Path=out_name,IsWritten=True)
+ if self.Parameters['-n'].isOn():
+ cm_name = self._cm_out_filename()
+ result['CmFile'] = ResultPath(Path=cm_name,IsWritten=True)
+
+ return result
+
+
+class Cmcalibrate(CommandLineApplication):
+ """cmcalibrate application controller."""
+ _options = {
+
+ # -s <n> Set the random number generator seed to <n>, where <n> is a
+ # positive integer. The default is to use time() to generate a different
+ # seed for each run, which means that two different runs of cmcalibrate on
+ # the same CM will give slightly different E-value and HMM filter
+ # threshold parameters. You can use this option to generate reproducible
+ # results.
+ '-s':ValuedParameter(Prefix='-',Name='s',Delimiter=' '),\
+
+ # --forecast <n> Predict the running time of the calibration for cmfile and
+ # provided options and exit, DO NOT perform the calibration.
+ '--forecast':ValuedParameter(Prefix='--',Name='forecast',Delimiter=' '),\
+
+ # --mpi Run as an MPI parallel program.
+ '--mpi':FlagParameter(Prefix='--',Name='mpi'),\
+
+ # Expert Options
+
+ # --exp-cmL-glc <x> Set the length of random sequence to search for the CM
+ # glocal exponential tail fits to <x> megabases (Mb).
+ '--exp-cmL-glc':ValuedParameter(Prefix='--',Name='exp-cmL-glc',\
+ Delimiter=' '),\
+
+ # --exp-cmL-loc <x> Set the length of random sequence to search for the CM
+ # local exponential tail fits to <x> megabases (Mb).
+ '--exp-cmL-loc':ValuedParameter(Prefix='--',Name='exp-cmL-loc',\
+ Delimiter=' '),\
+
+ # --exp-hmmLn-glc <x> Set the minimum random sequence length to search for
+ # the HMM glocal exponential tail fits to <x> megabases (Mb).
+ '--exp-hmmLn-glc':ValuedParameter(Prefix='--',Name='exp-hmmLn-glc',\
+ Delimiter=' '),\
+
+ # --exp-hmmLn-loc <x> Set the minimum random sequence length to search for
+ # the HMM local exponential tail fits to <x> megabases (Mb).
+ '--exp-hmmLn-loc':ValuedParameter(Prefix='--',Name='exp-hmmLn-loc',\
+ Delimiter=' '),\
+
+ # --exp-hmmLx <x> Set the maximum random sequence length to search when
+ # determining HMM E-values to <x> megabases (Mb).
+ '--exp-hmmLx':ValuedParameter(Prefix='--',Name='exp-hmmLx',Delimiter=' '),\
+
+ # --exp-fract <x> Set the HMM/CM fraction of dynamic programming
+ # calculations to <x>.
+ '--exp-fract':ValuedParameter(Prefix='--',Name='exp-fract',Delimiter=' '),\
+
+ # --exp-tailn-cglc <x> During E-value calibration of glocal CM search modes
+ # fit the exponential tail to the high scores in the histogram tail that
+ # includes <x> hits per Mb searched.
+ '--exp-tailn-cglc':ValuedParameter(Prefix='--',Name='exp-tailn-cglc',\
+ Delimiter=' '),\
+
+ # --exp-tailn-cloc <x> During E-value calibration of local CM search modes
+ # fit the exponential tail to the high scores in the histogram tail that
+ # includes <x> hits per Mb searched.
+ '--exp-tailn-cloc':ValuedParameter(Prefix='--',Name='exp-tailn-cloc',\
+ Delimiter=' '),\
+
+ # --exp-tailn-hglc <x> During E-value calibration of glocal HMM search modes
+ # fit the exponential tail to the high scores in the histogram tail that
+ # includes <x> hits per Mb searched.
+ '--exp-tailn-hglc':ValuedParameter(Prefix='--',Name='exp-tailn-hglc',\
+ Delimiter=' '),\
+
+ # --exp-tailn-hloc <x> During E-value calibration of local HMM search modes
+ # fit the exponential tail to the high scores in the histogram tail that
+ # includes <x> hits per Mb searched.
+ '--exp-tailn-hloc':ValuedParameter(Prefix='--',Name='exp-tailn-hloc',\
+ Delimiter=' '),\
+
+ # --exp-tailp <x> Ignore the --exp-tailn prefixed options and fit the <x>
+ # fraction right tail of the histogram to exponential tails, for all
+ # search modes.
+ '--exp-tailp':ValuedParameter(Prefix='--',Name='exp-tailp',Delimiter=' '),\
+
+ # --exp-tailxn <n> With --exp-tailp enforce that the maximum number of hits
+ # in the tail that is fit is <n>.
+ '--exp-tailxn':ValuedParameter(Prefix='--',Name='exp-tailxn',\
+ Delimiter=' '),\
+
+ # --exp-beta <x> During E-value calibration, by default query-dependent
+ # banding (QDB) is used to accelerate the CM search algorithms with a beta
+ # tail loss probability of 1E-15.
+ '--exp-beta':ValuedParameter(Prefix='--',Name='exp-beta',Delimiter=' '),\
+
+ # --exp-no-qdb Turn of QDB during E-value calibration. This will slow down
+ # calibration, and is not recommended unless you plan on using --no-qdb in
+ # cmsearch.
+ '--exp-no-qdb':FlagParameter(Prefix='--',Name='exp-no-qdb'),\
+
+ # --exp-hfile <f> Save the histograms fit for the E-value calibration to
+ # file <f>. The format of this file is two tab delimited columns.
+ '--exp-hfile':ValuedParameter(Prefix='--',Name='exp-hfile',Delimiter=' '),\
+
+ # --exp-sfile <f> Save a survival plot for the E-value calibration to file
+ # <f>. The format of this file is two tab delimited columns.
+ '--exp-sfile':ValuedParameter(Prefix='--',Name='exp-sfile',Delimiter=' '),\
+
+ # --exp-qqfile <f> Save a quantile-quantile plot for the E-value calibration
+ # to file <f>. The format of this file is two tab delimited columns.
+ '--exp-qqfile':ValuedParameter(Prefix='--',Name='exp-qqfile',\
+ Delimiter=' '),\
+
+ # --exp-ffile <f> Save statistics on the exponential tail statistics to file
+ # <f>. The file will contain the lambda and mu values for exponential
+ # tails fit to tails of different sizes.
+ '--exp-ffile':ValuedParameter(Prefix='--',Name='exp-ffile',Delimiter=' '),\
+
+ # --fil-N <n> Set the number of sequences sampled and searched for the HMM
+ # filter threshold calibration to <n>. By default, <n> is 10,000.
+ '--fil-N':ValuedParameter(Prefix='--',Name='fil-N',Delimiter=' '),\
+
+ # --fil-F <x> Set the fraction of sample sequences the HMM filter must be
+ # able to recognize, and allow to survive, to <x>, where <x> is a positive
+ # real number less than or equal to 1.0. By default, <x> is 0.995.
+ '--fil-F':ValuedParameter(Prefix='--',Name='fil-F',Delimiter=' '),\
+
+ # --fil-xhmm <x> Set the target number of dynamic programming calculations
+ # for a HMM filtered CM QDB search with beta = 1E-7 to <x> times the
+ # number of calculations required to do an HMM search. By default, <x> is
+ # 2.0.
+ '--fil-xhmm':ValuedParameter(Prefix='--',Name='fil-xhmm',Delimiter=' '),\
+
+ # --fil-tau <x> Set the tail loss probability during HMM band calculation
+ # for HMM filter threshold calibration to <x>.
+ '--fil-tau':ValuedParameter(Prefix='--',Name='fil-tau',Delimiter=' '),\
+
+ # --fil-gemit During HMM filter calibration, always sample sequences from a
+ # globally configured CM, even when calibrating local modes.
+ '--fil-gemit':FlagParameter(Prefix='--',Name='fil-gemit'),\
+
+ # --fil-dfile <f> Save statistics on filter threshold calibration, including
+ # HMM and CM scores for all sampled sequences, to file <f>.
+ '--fil-dfile':ValuedParameter(Prefix='--',Name='fil-dfile',Delimiter=' '),\
+
+ # --mxsize <x> Set the maximum allowable DP matrix size to <x> megabytes.
+ '--mxsize':ValuedParameter(Prefix='--',Name='mxsize',Delimiter=' '),\
+
+ }
+
+ _parameters = {}
+ _parameters.update(_options)
+ _command = "cmcalibrate"
+ _suppress_stderr=True
+
+ def getHelp(self):
+ """Method that points to the Infernal documentation."""
+
+ help_str = \
+ """
+ See Infernal documentation at:
+ http://infernal.janelia.org/
+ """
+ return help_str
+
+class Cmemit(CommandLineApplication):
+ """cmemit application controller."""
+ _options = {
+
+ # -o <f> Save the synthetic sequences to file <f> rather than writing them
+ # to stdout.
+ '-o':ValuedParameter(Prefix='-',Name='o',Delimiter=' '),\
+
+ # -n <n> Generate <n> sequences. Default is 10.
+ '-n':ValuedParameter(Prefix='-',Name='n',Delimiter=' '),\
+
+ # -u Write the generated sequences in unaligned format (FASTA). This is the
+ # default, so this option is probably useless.
+ '-u':FlagParameter(Prefix='-',Name='u'),\
+
+ # -a Write the generated sequences in an aligned format (STOCKHOLM) with
+ # consensus structure annotation rather than FASTA.
+ '-a':FlagParameter(Prefix='-',Name='a'),\
+
+ # -c Predict a single majority-rule consensus sequence instead of sampling
+ # sequences from the CM's probability distribution.
+ '-c':FlagParameter(Prefix='-',Name='c'),\
+
+ # -l Configure the CMs into local mode before emitting sequences. See the
+ # User's Guide for more information on locally configured CMs.
+ '-l':FlagParameter(Prefix='-',Name='l'),\
+
+ # -s <n> Set the random seed to <n>, where <n> is a positive integer. The
+ # default is to use time() to generate a different seed for each run,
+ # which means that two different runs of cmemit on the same CM will give
+ # different results. You can use this option to generate reproducible
+ # results.
+ '-s':ValuedParameter(Prefix='-',Name='s',Delimiter=' '),\
+
+ # --rna Specify that the emitted sequences be output as RNA sequences. This
+ # is true by default.
+ '--rna':FlagParameter(Prefix='--',Name='rna'),\
+
+ # --dna Specify that the emitted sequences be output as DNA sequences. By
+ # default, the output alphabet is RNA.
+ '--dna':FlagParameter(Prefix='--',Name='dna'),\
+
+ # --tfile <f> Dump tabular sequence parsetrees (tracebacks) for each emitted
+ # sequence to file <f>. Primarily useful for debugging.
+ '--tfile':ValuedParameter(Prefix='--',Name='tfile',Delimiter=' '),\
+
+ # --exp <x> Exponentiate the emission and transition probabilities of the CM
+ # by <x> and then renormalize those distributions before emitting
+ # sequences.
+ '--exp':ValuedParameter(Prefix='--',Name='exp',Delimiter=' '),\
+
+ # --begin <n> Truncate the resulting alignment by removing all residues
+ # before consensus column <n>, where <n> is a positive integer no greater
+ # than the consensus length of the CM. Must be used in combination with
+ # --end and either -a or --shmm (a developer option).
+ '--begin':ValuedParameter(Prefix='--',Name='begin',Delimiter=' '),\
+
+ # --end <n> Truncate the resulting alignment by removing all residues after
+ # consensus column <n>, where <n> is a positive integer no greater than
+ # the consensus length of the CM. Must be used in combination with --begin
+ # and either -a or --shmm (a developer option).
+ '--end':ValuedParameter(Prefix='--',Name='end',Delimiter=' '),\
+
+ }
+ _parameters = {}
+ _parameters.update(_options)
+ _command = "cmemit"
+ _suppress_stderr=True
+
+ def getHelp(self):
+ """Method that points to the Infernal documentation."""
+
+ help_str = \
+ """
+ See Infernal documentation at:
+ http://infernal.janelia.org/
+ """
+ return help_str
+
+class Cmscore(CommandLineApplication):
+ """cmscore application controller."""
+ _options = {
+
+ # -n <n> Set the number of sequences to generate and align to <n>. This
+ # option is incompatible with the --infile option.
+ '-n':ValuedParameter(Prefix='-',Name='n',Delimiter=' '),\
+
+ # -l Turn on the local alignment algorithm, which allows the alignment to
+ # span two or more subsequences if necessary (e.g. if the structures of
+ # the query model and target sequence are only partially shared), allowing
+ # certain large insertions and deletions in the structure to be penalized
+ # differently than normal indels. The default is to globally align the
+ # query model to the target sequences.
+ '-l':FlagParameter(Prefix='-',Name='l'),\
+
+ # -s <n> Set the random seed to <n>, where <n> is a positive integer. The
+ # default is to use time() to generate a different seed for each run,
+ # which means that two different runs of cmscore on the same CM will give
+ # different results. You can use this option to generate reproducible
+ # results. The random number generator is used to generate sequences to
+ # score, so -s is incompatible with the --infile option which supplies
+ # the sequences to score in an input file.
+ '-s':ValuedParameter(Prefix='-',Name='s',Delimiter=' '),\
+
+ # -a Print individual timings and score comparisons for each sequence in
+ # seqfile. By default only summary statistics are printed.
+ '-a':FlagParameter(Prefix='-',Name='a'),\
+
+ # --sub Turn on the sub model construction and alignment procedure.
+ '--sub':FlagParameter(Prefix='--',Name='sub'),\
+
+ # --mxsize <x> Set the maximum allowable DP matrix size to <x> megabytes.
+ '--mxsize':ValuedParameter(Prefix='--',Name='mxsize',Delimiter=' '),\
+
+ # --mpi Run as an MPI parallel program.
+ '--mpi':FlagParameter(Prefix='--',Name='mpi'),\
+
+ # Expert Options
+
+ # --emit Generate sequences to score by sampling from the CM.
+ '--emit':FlagParameter(Prefix='--',Name='emit'),\
+
+ # --random Generate sequences to score by sampling from the CMs null
+ # distribution. This option turns the --emit option off.
+ '--random':FlagParameter(Prefix='--',Name='random'),\
+
+ # --infile <f> Sequences to score are read from the file <f>. All the
+ # sequences from <f> are read and scored, the -n and -s options are
+ # incompatible with --infile.
+ '--infile':ValuedParameter(Prefix='--',Name='infile',Delimiter=' '),\
+
+ # --outfile <f> Save generated sequences that are scored to the file <f> in
+ # FASTA format. This option is incompatible with the --infile option.
+ '--outfile':ValuedParameter(Prefix='--',Name='outfile',Delimiter=' '),\
+
+ # --Lmin <n1> Must be used in combination with --random and --Lmax <n2>.
+ '--Lmin':ValuedParameter(Prefix='--',Name='Lmin',Delimiter=' '),\
+
+ # --pad Must be used in combination with --emit and --search. Add <n> cm->W
+ # (max hit length) minus L (sequence <x> length) residues to the 5' and 3'
+ # end of each emitted sequence <x>.
+ '--pad':FlagParameter(Prefix='--',Name='pad'),\
+
+ # --hbanded Specify that the second stage alignment algorithm be HMM banded
+ # CYK. This option is on by default.
+ '--hbanded':FlagParameter(Prefix='--',Name='hbanded'),\
+
+ # --tau <x> For stage 2 alignment, set the tail loss probability used during
+ # HMM band calculation to <x>.
+ '--tau':ValuedParameter(Prefix='--',Name='tau',Delimiter=' '),\
+
+ # --aln2bands With --search, when calculating HMM bands, use an HMM
+ # alignment algorithm instead of an HMM search algorithm.
+ '--aln2bands':FlagParameter(Prefix='--',Name='aln2bands'),\
+
+ # --hsafe For stage 2 HMM banded alignment, realign any sequences with a
+ # negative alignment score using non-banded CYK to guarantee finding the
+ # optimal alignment.
+ '--hsafe':FlagParameter(Prefix='--',Name='hsafe'),\
+
+ # --nonbanded Specify that the second stage alignment algorithm be standard,
+ # non-banded, non-D&C CYK. When --nonbanded is enabled, the program fails
+ # with a non-zero exit code and prints an error message if the parsetree
+ # score for any sequence from stage 1 D&C alignment and stage 2 alignment
+ # differs by more than 0.01 bits. In theory, this should never happen as
+ # both algorithms are guaranteed to determine the optimal parsetree. For
+ # larger RNAs (more than 300 residues) if memory is limiting, --nonbanded
+ # should be used in combination with --scoreonly.
+ '--nonbanded':FlagParameter(Prefix='--',Name='nonbanded'),\
+
+ # --scoreonly With --nonbanded during the second stage standard non-banded
+ # CYK alignment, use the "score only" variant of the algorithm to save
+ # memory, and don't recover a parse tree.
+ '--scoreonly':FlagParameter(Prefix='--',Name='scoreonly'),\
+
+ # --viterbi Specify that the second stage alignment algorithm be Viterbi to
+ # a CM Plan 9 HMM.
+ '--viterbi':FlagParameter(Prefix='--',Name='viterbi'),\
+
+ # --search Run all algorithms in scanning mode, not alignment mode.
+ '--search':FlagParameter(Prefix='--',Name='search'),\
+
+ # --inside With --search Compare the non-banded scanning Inside algorithm to
+ # the HMM banded scanning Inside algorith, instead of using CYK versions.
+ '--inside':FlagParameter(Prefix='--',Name='inside'),\
+
+ # --forward With --search Compare the scanning Forward scoring algorithm
+ # against CYK.
+ '--forward':FlagParameter(Prefix='--',Name='forward'),\
+
+ # --taus <n> Specify the first alignment algorithm as non-banded D&C CYK,
+ # and multiple stages of HMM banded CYK alignment. The first HMM banded
+ # alignment will use tau=1E-<x>, which will be the highest value of tau
+ # used. Must be used in combination with --taue.
+ '--taus':ValuedParameter(Prefix='--',Name='taus',Delimiter=' '),\
+
+ # --taue <n> Specify the first alignment algorithm as non-banded D&C CYK,
+ # and multiple stages of HMM banded CYK alignment. The final HMM banded
+ # alignment will use tau=1E-<x>, which will be the lowest value of tau
+ # used. Must be used in combination with --taus.
+ '--taue':ValuedParameter(Prefix='--',Name='taue',Delimiter=' '),\
+
+ # --tfile <f> Print the parsetrees for each alignment of each sequence to
+ # file <f>.
+ '--tfile':ValuedParameter(Prefix='--',Name='tfile',Delimiter=' '),\
+
+ }
+ _parameters = {}
+ _parameters.update(_options)
+ _command = "cmscore"
+ _suppress_stderr=True
+
+ def getHelp(self):
+ """Method that points to the Infernal documentation."""
+
+ help_str = \
+ """
+ See Infernal documentation at:
+ http://infernal.janelia.org/
+ """
+ return help_str
+
+class Cmsearch(CommandLineApplication):
+ """cmsearch application controller."""
+ _options = {
+
+ # -o <f> Save the high-scoring alignments of hits to a file <f>. The default
+ # is to write them to standard output.
+ '-o':ValuedParameter(Prefix='-',Name='o',Delimiter=' '),\
+
+ # -g <f> Turn on the 'glocal' alignment algorithm, local with respect to the
+ # target database, and global with respect to the model. By default, the
+ # local alignment algorithm is used which is local with respect to both
+ # the target sequence and the model.
+ '-g':ValuedParameter(Prefix='-',Name='g',Delimiter=' '),\
+
+ # -p Append posterior probabilities to alignments of hits.
+ '-p':FlagParameter(Prefix='-',Name='p'),\
+
+ # -x Annotate non-compensatory basepairs and basepairs that include a gap in
+ # the left and/or right half of the pair with x's in the alignments of
+ # hits.
+ '-x':FlagParameter(Prefix='-',Name='x'),\
+
+ # -Z <x> Calculate E-values as if the target database size was <x> megabases
+ # (Mb). Ignore the actual size of the database. This option is only valid
+ # if the CM file has been calibrated. Warning: the predictions for timings
+ # and survival fractions will be calculated as if the database was of size
+ # <x> Mb, which means they will be inaccurate.
+ '-Z':ValuedParameter(Prefix='-',Name='Z',Delimiter=' '),\
+
+ # --toponly Only search the top (Watson) strand of the sequences in seqfile.
+ # By default, both strands are searched.
+ '--toponly':FlagParameter(Prefix='--',Name='toponly'),\
+
+ # --bottomonly Only search the bottom (Crick) strand of the sequences in
+ # seqfile. By default, both strands are searched.
+ '--bottomonly':FlagParameter(Prefix='--',Name='bottomonly'),\
+
+ # --forecast <n> Predict the running time of the search with provided files
+ # and options and exit, DO NOT perform the search. This option is only
+ # available with calibrated CM files.
+ '--forecast':ValuedParameter(Prefix='--',Name='forecast',Delimiter=' '),\
+
+ # --informat <s> Assert that the input seqfile is in format <s>. Do not run
+ # Babelfish format autodection. This increases the reliability of the
+ # program somewhat, because the Babelfish can make mistakes; particularly
+ # recommended for unattended, high-throughput runs of @PACKAGE at . <s> is
+ # case-insensitive. Acceptable formats are: FASTA, EMBL, UNIPROT, GENBANK,
+ # and DDBJ. <s> is case-insensitive.
+ '--informat':ValuedParameter(Prefix='--',Name='informat',Delimiter=' '),\
+
+ # --mxsize <x> Set the maximum allowable DP matrix size to <x> megabytes.
+ '--mxsize':ValuedParameter(Prefix='--',Name='mxsize',Delimiter=' '),\
+
+ # --mpi Run as an MPI parallel program.
+ '--mpi':FlagParameter(Prefix='--',Name='mpi'),\
+
+ # Expert Options
+
+ # --inside Use the Inside algorithm for the final round of searching. This
+ # is true by default.
+ '--inside':FlagParameter(Prefix='--',Name='inside'),\
+
+ # --cyk Use the CYK algorithm for the final round of searching.
+ '--cyk':FlagParameter(Prefix='--',Name='cyk'),\
+
+ # --viterbi Search only with an HMM. This is much faster but less sensitive
+ # than a CM search. Use the Viterbi algorithm for the HMM search.
+ '--viterbi':FlagParameter(Prefix='--',Name='viterbi'),\
+
+ # --forward Search only with an HMM. This is much faster but less sensitive
+ # than a CM search. Use the Forward algorithm for the HMM search.
+ '--forward':FlagParameter(Prefix='--',Name='forward'),\
+
+ # -E <x> Set the E-value cutoff for the per-sequence/strand ranked hit list
+ # to <x>, where <x> is a positive real number.
+ '-E':ValuedParameter(Prefix='-',Name='E',Delimiter=' '),\
+
+ # -T <x> Set the bit score cutoff for the per-sequence ranked hit list to
+ # <x>, where <x> is a positive real number.
+ '-T':ValuedParameter(Prefix='-',Name='T',Delimiter=' '),\
+
+ # --nc Set the bit score cutoff as the NC cutoff value used by Rfam curators
+ # as the noise cutoff score.
+ '--nc':FlagParameter(Prefix='--',Name='nc'),\
+
+ # --ga Set the bit score cutoff as the GA cutoff value used by Rfam curators
+ # as the gathering threshold.
+ '--ga':FlagParameter(Prefix='--',Name='ga'),\
+
+ # --tc Set the bit score cutoff as the TC cutoff value used by Rfam curators
+ # as the trusted cutoff.
+ '--tc':FlagParameter(Prefix='--',Name='tc'),\
+
+ # --no-qdb Do not use query-dependent banding (QDB) for the final round of
+ # search.
+ '--no-qdb':FlagParameter(Prefix='--',Name='no-qdb'),\
+
+ # --beta " <x>" For query-dependent banding (QDB) during the final round of
+ # search, set the beta parameter to <x> where <x> is any positive real
+ # number less than 1.0.
+ '--beta':ValuedParameter(Prefix='--',Name='beta',Delimiter=' '),\
+
+ # --hbanded Use HMM bands to accelerate the final round of search.
+ # Constraints for the CM search are derived from posterior probabilities
+ # from an HMM. This is an experimental option and it is not recommended
+ # for use unless you know exactly what you're doing.
+ '--hbanded':FlagParameter(Prefix='--',Name='hbanded'),\
+
+ # --tau <x> Set the tail loss probability during HMM band calculation to
+ # <x>.
+ '--tau':ValuedParameter(Prefix='--',Name='tau',Delimiter=' '),\
+
+ # --fil-no-hmm Turn the HMM filter off.
+ '--fil-no-hmm':FlagParameter(Prefix='--',Name='fil-no-hmm'),\
+
+ # --fil-no-qdb Turn the QDB filter off.
+ '--fil-no-qdb':FlagParameter(Prefix='--',Name='fil-no-qdb'),\
+
+ # --fil-beta For the QDB filter, set the beta parameter to <x> where <x> is
+ # any positive real number less than 1.0.
+ '--fil-beta':FlagParameter(Prefix='--',Name='fil-beta'),\
+
+ # --fil-T-qdb <x> Set the bit score cutoff for the QDB filter round to <x>,
+ # where <x> is a positive real number.
+ '--fil-T-qdb':ValuedParameter(Prefix='--',Name='fil-T-qdb',Delimiter=' '),\
+
+ # --fil-T-hmm <x> Set the bit score cutoff for the HMM filter round to <x>,
+ # where <x> is a positive real number.
+ '--fil-T-hmm':ValuedParameter(Prefix='--',Name='fil-T-hmm',Delimiter=' '),\
+
+ # --fil-E-qdb <x> Set the E-value cutoff for the QDB filter round. <x>,
+ # where <x> is a positive real number. Hits with E-values better than
+ # (less than) or equal to this threshold will survive and be passed to the
+ # final round. This option is only available if the CM file has been
+ # calibrated.
+ '--fil-E-qdb':ValuedParameter(Prefix='--',Name='fil-E-qdb',Delimiter=' '),\
+
+ # --fil-E-hmm <x> Set the E-value cutoff for the HMM filter round. <x>,
+ # where <x> is a positive real number. Hits with E-values better than
+ # (less than) or equal to this threshold will survive and be passed to the
+ # next round, either a QDB filter round, or if the QDB filter is disable,
+ # to the final round of search. This option is only available if the CM
+ # file has been calibrated.
+ '--fil-E-hmm':ValuedParameter(Prefix='--',Name='fil-E-hmm',Delimiter=' '),\
+
+ # --fil-Smax-hmm <x> Set the maximum predicted survival fraction for an HMM
+ # filter as <x>, where <x> is a positive real number less than 1.0.
+ '--fil-Smax-hmm':ValuedParameter(Prefix='--',Name='fil-Smax-hmm',\
+ Delimiter=' '),\
+
+ # --noalign Do not calculate and print alignments of each hit, only print
+ # locations and scores.
+ '--noalign':FlagParameter(Prefix='--',Name='noalign'),\
+
+ # --aln-hbanded Use HMM bands to accelerate alignment during the hit
+ # alignment stage.
+ '--aln-hbanded':FlagParameter(Prefix='--',Name='aln-hbanded'),\
+
+ # --aln-optacc Calculate alignments of hits from final round of search using
+ # the optimal accuracy algorithm which computes the alignment that
+ # maximizes the summed posterior probability of all aligned residues given
+ # the model, which can be different from the highest scoring one.
+ '--aln-optacc':FlagParameter(Prefix='--',Name='aln-optacc'),\
+
+ # --tabfile <f> Create a new output file <f> and print tabular results to
+ # it.
+ '--tabfile':ValuedParameter(Prefix='--',Name='tabfile',Delimiter=' '),\
+
+ # --gcfile <f> Create a new output file <f> and print statistics of the GC
+ # content of the sequences in seqfile to it.
+ '--gcfile':ValuedParameter(Prefix='--',Name='gcfile',Delimiter=' '),\
+
+ # --rna Output the hit alignments as RNA sequences alignments. This is true
+ # by default.
+ '--rna':FlagParameter(Prefix='--',Name='rna'),\
+
+ # --dna Output the hit alignments as DNA sequence alignments.
+ '--dna':FlagParameter(Prefix='--',Name='dna'),\
+
+ }
+ _parameters = {}
+ _parameters.update(_options)
+ _command = "cmsearch"
+ _suppress_stderr=True
+
+ def getHelp(self):
+ """Method that points to the Infernal documentation."""
+
+ help_str = \
+ """
+ See Infernal documentation at:
+ http://infernal.janelia.org/
+ """
+ return help_str
+
+ def _tabfile_out_filename(self):
+
+ if self.Parameters['--tabfile'].isOn():
+ tabfile_filename = self._absolute(str(\
+ self.Parameters['--tabfile'].Value))
+ else:
+ raise ValueError, 'No tabfile output file specified.'
+ return tabfile_filename
+
+ def _tempfile_as_multiline_string(self, data):
+ """Write a multiline string to a temp file and return the filename.
+
+ data: a multiline string to be written to a file.
+
+ * Note: the result will be the filename as a FilePath object
+ (which is a string subclass).
+
+ """
+ filename = FilePath(self.getTmpFilename(self.TmpDir))
+ data_file = open(filename,'w')
+ data_file.write(data)
+ data_file.close()
+ return filename
+
+ def _get_result_paths(self,data):
+ result = {}
+ if self.Parameters['--tabfile'].isOn():
+ out_name = self._tabfile_out_filename()
+ result['SearchResults'] = ResultPath(Path=out_name,IsWritten=True)
+
+ return result
+
+class Cmstat(CommandLineApplication):
+ """cmstat application controller."""
+ _options = {
+
+ # -g Turn on the 'glocal' alignment algorithm, local with respect to the
+ # target database, and global with respect to the model. By default, the
+ # model is configured for local alignment which is local with respect to
+ # both the target sequence and the model.
+ '-g':FlagParameter(Prefix='-',Name='g'),\
+
+ # -m print general statistics on the models in cmfile and the alignment it
+ # was built from.
+ '-m':FlagParameter(Prefix='-',Name='m'),\
+
+ # -Z <x> Calculate E-values as if the target database size was <x> megabases
+ # (Mb). Ignore the actual size of the database. This option is only valid
+ # if the CM file has been calibrated.
+ '-Z':ValuedParameter(Prefix='-',Name='Z',Delimiter=' '),\
+
+ # --all print all available statistics
+ '--all':FlagParameter(Prefix='--',Name='all'),\
+
+ # --le print local E-value statistics. This option only works if cmfile has
+ # been calibrated with cmcalibrate.
+ '--le':FlagParameter(Prefix='--',Name='le'),\
+
+ # --ge print glocal E-value statistics. This option only works if cmfile has
+ # been calibrated with cmcalibrate.
+ '--ge':FlagParameter(Prefix='--',Name='ge'),\
+
+ # --beta <x> With the --search option set the beta parameter for the query-
+ # dependent banding algorithm stages to <x> Beta is the probability mass
+ # considered negligible during band calculation. The default is 1E-7.
+ '--beta':ValuedParameter(Prefix='--',Name='beta',Delimiter=' '),\
+
+ # --qdbfile <f> Save the query-dependent bands (QDBs) for each state to file
+ # <f>
+ '--qdbfile':ValuedParameter(Prefix='--',Name='qdbfile',Delimiter=' '),\
+
+ # Expert Options
+
+ # --lfi Print the HMM filter thresholds for the range of relevant CM bit
+ # score cutoffs for searches with locally configured models using the
+ # Inside algorithm.
+ '--lfi':FlagParameter(Prefix='--',Name='lfi'),\
+
+ # --gfi Print the HMM filter thresholds for the range of relevant CM bit
+ # score cutoffs for searches with globally configured models using the
+ # Inside algorithm.
+ '--gfi':FlagParameter(Prefix='--',Name='gfi'),\
+
+ # --lfc Print the HMM filter thresholds for the range of relevant CM bit
+ # score cutoffs for searches with locally configured models using the CYK
+ # algorithm.
+ '--lfc':FlagParameter(Prefix='--',Name='lfc'),\
+
+ # --gfc Print the HMM filter thresholds for the range of relevant CM bit
+ # score cutoffs for searches with globally configured models using the CYK
+ # algorithm.
+ '--gfc':FlagParameter(Prefix='--',Name='gfc'),\
+
+ # -E <x> Print filter threshold statistics for an HMM filter if a final CM
+ # E-value cutoff of <x> were to be used for a run of cmsearch on 1 MB of
+ # sequence.
+ '-E':ValuedParameter(Prefix='-',Name='E',Delimiter=' '),\
+
+ # -T <x> Print filter threshold statistics for an HMM filter if a final CM
+ # bit score cutoff of <x> were to be used for a run of cmsearch.
+ '-T':ValuedParameter(Prefix='-',Name='T',Delimiter=' '),\
+
+ # --nc Print filter threshold statistics for an HMM filter if a CM bit score
+ # cutoff equal to the Rfam NC cutoff were to be used for a run of
+ # cmsearch.
+ '--nc':FlagParameter(Prefix='--',Name='nc'),\
+
+ # --ga Print filter threshold statistics for an HMM filter if a CM bit score
+ # cutoff of Rfam GA cutoff value were to be used for a run of cmsearch.
+ '--ga':FlagParameter(Prefix='--',Name='ga'),\
+
+ # --tc Print filter threshold statistics for an HMM filter if a CM bit score
+ # cutoff equal to the Rfam TC cutoff value were to be used for a run of
+ # cmsearch.
+ '--tc':FlagParameter(Prefix='--',Name='tc'),\
+
+ # --seqfile <x> With the -E option, use the database size of the database in
+ # <x> instead of the default database size of 1 MB.
+ '--seqfile':ValuedParameter(Prefix='--',Name='seqfile',Delimiter=' '),\
+
+ # --toponly In combination with --seqfile <x> option, only consider the top
+ # strand of the database in <x> instead of both strands. --search perform
+ # an experiment to determine how fast the CM(s) can search with different
+ # search algorithms.
+ '--toponly':FlagParameter(Prefix='--',Name='toponly'),\
+
+ # --cmL <n> With the --search option set the length of sequence to search
+ # with CM algorithms as <n> residues. By default, <n> is 1000.
+ '--cmL':ValuedParameter(Prefix='--',Name='cmL',Delimiter=' '),\
+
+ # --hmmL <n> With the --search option set the length of sequence to search
+ # with HMM algorithms as <n> residues. By default, <n> is 100,000.
+ '--hmmL':ValuedParameter(Prefix='--',Name='hmmL',Delimiter=' '),\
+
+ # --efile <f> Save a plot of cmsearch HMM filter E value cutoffs versus CM
+ # E-value cutoffs in xmgrace format to file <f>. This option must be used
+ # in combination with --lfi, --gfi, --lfc or --gfc.
+ '--efile':ValuedParameter(Prefix='--',Name='efile',Delimiter=' '),\
+
+ # --bfile <f> Save a plot of cmsearch HMM bit score cutoffs versus CM bit
+ # score cutoffs in xmgrace format to file <f>. This option must be used in
+ # combination with --lfi, --gfi, --lfc or --gfc.
+ '--bfile':ValuedParameter(Prefix='--',Name='bfile',Delimiter=' '),\
+
+ # --sfile <f> Save a plot of cmsearch predicted survival fraction from the
+ # HMM filter versus CM E value cutoff in xmgrace format to file <f>. This
+ # option must be used in combination with --lfi, --gfi, --lfc or --gfc.
+ '--sfile':ValuedParameter(Prefix='--',Name='sfile',Delimiter=' '),\
+
+ # --xfile <f> Save a plot of 'xhmm' versus CM E value cutoff in xmgrace
+ # format to file <f> 'xhmm' is the ratio of the number of dynamic
+ # programming calculations predicted to be required for the HMM filter and
+ # the CM search of the filter survivors versus the number of dynamic
+ # programming calculations for the filter alone. This option must be
+ # used in combination with --lfi, --gfi, --lfc or --gfc.
+ '--xfile':ValuedParameter(Prefix='--',Name='xfile',Delimiter=' '),\
+
+ # --afile <f> Save a plot of the predicted acceleration for an HMM filtered
+ # search versus CM E value cutoff in xmgrace format to file <f>. This
+ # option must be used in combination with --lfi, --gfi, --lfc or --gfc.
+ '--afile':ValuedParameter(Prefix='--',Name='afile',Delimiter=' '),\
+
+ # --bits With --efile, --sfile, --xfile, and --afile use CM bit score
+ # cutoffs instead of CM E value cutoffs for the x-axis values of the plot.
+ '--bits':FlagParameter(Prefix='--',Name='bits'),\
+
+ }
+ _parameters = {}
+ _parameters.update(_options)
+ _command = "cmstat"
+ _suppress_stderr=True
+
+ def getHelp(self):
+ """Method that points to the Infernal documentation."""
+
+ help_str = \
+ """
+ See Infernal documentation at:
+ http://infernal.janelia.org/
+ """
+ return help_str
+
+def cmbuild_from_alignment(aln, structure_string, refine=False, \
+ return_alignment=False,params=None):
+ """Uses cmbuild to build a CM file given an alignment and structure string.
+
+ - aln: an Alignment object or something that can be used to construct
+ one. All sequences must be the same length.
+ - structure_string: vienna structure string representing the consensus
+ stucture for the sequences in aln. Must be the same length as the
+ alignment.
+ - refine: refine the alignment and realign before building the cm.
+ (Default=False)
+ - return_alignment: Return (in Stockholm format) alignment file used to
+ construct the CM file. This will either be the original alignment
+ and structure string passed in, or the refined alignment if --refine
+ was used. (Default=False)
+ - Note. This will be a string that can either be written to a file
+ or parsed.
+ """
+ aln = Alignment(aln)
+ if len(structure_string) != aln.SeqLen:
+ raise ValueError, """Structure string is not same length as alignment. Structure string is %s long. Alignment is %s long."""%(len(structure_string),\
+ aln.SeqLen)
+ else:
+ struct_dict = {'SS_cons':structure_string}
+ #Make new Cmbuild app instance.
+ app = Cmbuild(InputHandler='_input_as_paths',WorkingDir='/tmp',\
+ params=params)
+
+ #turn on refine flag if True.
+ if refine:
+ _, tmp_file = mkstemp(dir=app.WorkingDir)
+ app.Parameters['--refine'].on(tmp_file)
+
+ #Get alignment in Stockholm format
+ aln_file_string = stockholm_from_alignment(aln,GC_annotation=struct_dict)
+
+ #get path to alignment filename
+ aln_path = app._input_as_multiline_string(aln_file_string)
+ cm_path = aln_path.split('.txt')[0]+'.cm'
+ app.Parameters['-n'].on(cm_path)
+
+ filepaths = [cm_path,aln_path]
+
+ res = app(filepaths)
+
+ cm_file = res['CmFile'].read()
+
+ if return_alignment:
+ #If alignment was refined, return refined alignment and structure,
+ # otherwise return original alignment and structure.
+ if refine:
+ aln_file_string = res['Refined'].read()
+ res.cleanUp()
+ return cm_file, aln_file_string
+ #Just return cm_file
+ else:
+ res.cleanUp()
+ return cm_file
+
+
+def cmbuild_from_file(stockholm_file_path, refine=False,return_alignment=False,\
+ params=None):
+ """Uses cmbuild to build a CM file given a stockholm file.
+
+ - stockholm_file_path: a path to a stockholm file. This file should
+ contain a multiple sequence alignment formated in Stockholm format.
+ This must contain a sequence structure line:
+ #=GC SS_cons <structure string>
+ - refine: refine the alignment and realign before building the cm.
+ (Default=False)
+ - return_alignment: Return alignment and structure string used to
+ construct the CM file. This will either be the original alignment
+ and structure string passed in, or the refined alignment if
+ --refine was used. (Default=False)
+ """
+ #get alignment and structure string from stockholm file.
+ info, aln, structure_string = \
+ list(MinimalRfamParser(open(stockholm_file_path,'U'),\
+ seq_constructor=ChangedSequence))[0]
+
+ #call cmbuild_from_alignment.
+ res = cmbuild_from_alignment(aln, structure_string, refine=refine, \
+ return_alignment=return_alignment,params=params)
+ return res
+
+def cmalign_from_alignment(aln, structure_string, seqs, moltype=DNA,\
+ include_aln=True,refine=False, return_stdout=False,params=None,\
+ cmbuild_params=None):
+ """Uses cmbuild to build a CM file, then cmalign to build an alignment.
+
+ - aln: an Alignment object or something that can be used to construct
+ one. All sequences must be the same length.
+ - structure_string: vienna structure string representing the consensus
+ stucture for the sequences in aln. Must be the same length as the
+ alignment.
+ - seqs: SequenceCollection object or something that can be used to
+ construct one, containing unaligned sequences that are to be aligned
+ to the aligned sequences in aln.
+ - moltype: Cogent moltype object. Must be RNA or DNA.
+ - include_aln: Boolean to include sequences in aln in final alignment.
+ (Default=True)
+ - refine: refine the alignment and realign before building the cm.
+ (Default=False)
+ - return_stdout: Boolean to return standard output from infernal. This
+ includes alignment and structure bit scores and average
+ probabilities for each sequence. (Default=False)
+ """
+ #NOTE: Must degap seqs or Infernal well seg fault!
+ seqs = SequenceCollection(seqs,MolType=moltype).degap()
+ #Create mapping between abbreviated IDs and full IDs
+ int_map, int_keys = seqs.getIntMap()
+ #Create SequenceCollection from int_map.
+ int_map = SequenceCollection(int_map,MolType=moltype)
+
+ cm_file, aln_file_string = cmbuild_from_alignment(aln, structure_string,\
+ refine=refine,return_alignment=True,params=cmbuild_params)
+
+ if params is None:
+ params = {}
+ params.update({MOLTYPE_MAP[moltype]:True})
+
+ app = Cmalign(InputHandler='_input_as_paths',WorkingDir='/tmp',\
+ params=params)
+ app.Parameters['--informat'].on('FASTA')
+
+ #files to remove that aren't cleaned up by ResultPath object
+ to_remove = []
+ #turn on --withali flag if True.
+ if include_aln:
+ app.Parameters['--withali'].on(\
+ app._tempfile_as_multiline_string(aln_file_string))
+ #remove this file at end
+ to_remove.append(app.Parameters['--withali'].Value)
+
+ seqs_path = app._input_as_multiline_string(int_map.toFasta())
+ cm_path = app._tempfile_as_multiline_string(cm_file)
+
+ #add cm_path to to_remove
+ to_remove.append(cm_path)
+ paths = [cm_path,seqs_path]
+
+ _, tmp_file = mkstemp(dir=app.WorkingDir)
+ app.Parameters['-o'].on(tmp_file)
+
+ res = app(paths)
+
+ info, aligned, struct_string = \
+ list(MinimalRfamParser(res['Alignment'].readlines(),\
+ seq_constructor=SEQ_CONSTRUCTOR_MAP[moltype]))[0]
+
+ #Make new dict mapping original IDs
+ new_alignment={}
+ for k,v in aligned.NamedSeqs.items():
+ new_alignment[int_keys.get(k,k)]=v
+ #Create an Alignment object from alignment dict
+ new_alignment = Alignment(new_alignment,MolType=moltype)
+
+ std_out = res['StdOut'].read()
+ #clean up files
+ res.cleanUp()
+ for f in to_remove: remove(f)
+
+ if return_stdout:
+ return new_alignment, struct_string, std_out
+ else:
+ return new_alignment, struct_string
+
+
+def cmalign_from_file(cm_file_path, seqs, moltype=DNA, alignment_file_path=None,\
+ include_aln=False,return_stdout=False,params=None):
+ """Uses cmalign to align seqs to alignment in cm_file_path.
+
+ - cm_file_path: path to the file created by cmbuild, containing aligned
+ sequences. This will be used to align sequences in seqs.
+ - seqs: unaligned sequendes that are to be aligned to the sequences in
+ cm_file.
+ - moltype: cogent.core.moltype object. Must be DNA or RNA
+ - alignment_file_path: path to stockholm alignment file used to create
+ cm_file.
+ __IMPORTANT__: This MUST be the same file used by cmbuild
+ originally. Only need to pass in this file if include_aln=True.
+ This helper function will NOT check if the alignment file is correct
+ so you must use it correctly.
+ - include_aln: Boolean to include sequences in aln_file in final
+ alignment. (Default=False)
+ - return_stdout: Boolean to return standard output from infernal. This
+ includes alignment and structure bit scores and average
+ probabilities for each sequence. (Default=False)
+ """
+ #NOTE: Must degap seqs or Infernal well seg fault!
+ seqs = SequenceCollection(seqs,MolType=moltype).degap()
+
+ #Create mapping between abbreviated IDs and full IDs
+ int_map, int_keys = seqs.getIntMap()
+ #Create SequenceCollection from int_map.
+ int_map = SequenceCollection(int_map,MolType=moltype)
+
+ if params is None:
+ params = {}
+ params.update({MOLTYPE_MAP[moltype]:True})
+
+ app = Cmalign(InputHandler='_input_as_paths',WorkingDir='/tmp',\
+ params=params)
+ app.Parameters['--informat'].on('FASTA')
+
+ #turn on --withali flag if True.
+ if include_aln:
+ if alignment_file_path is None:
+ raise DataError, """Must have path to alignment file used to build CM if include_aln=True."""
+ else:
+ app.Parameters['--withali'].on(alignment_file_path)
+
+ seqs_path = app._input_as_multiline_string(int_map.toFasta())
+ paths = [cm_file_path,seqs_path]
+
+ _, tmp_file = mkstemp(dir=app.WorkingDir)
+ app.Parameters['-o'].on(tmp_file)
+ res = app(paths)
+
+ info, aligned, struct_string = \
+ list(MinimalRfamParser(res['Alignment'].readlines(),\
+ seq_constructor=SEQ_CONSTRUCTOR_MAP[moltype]))[0]
+
+
+ #Make new dict mapping original IDs
+ new_alignment={}
+ for k,v in aligned.items():
+ new_alignment[int_keys.get(k,k)]=v
+ #Create an Alignment object from alignment dict
+ new_alignment = Alignment(new_alignment,MolType=moltype)
+ std_out = res['StdOut'].read()
+ res.cleanUp()
+ if return_stdout:
+ return new_alignment, struct_string, std_out
+ else:
+ return new_alignment, struct_string
+
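+# Example usage (a minimal sketch, assuming Infernal is installed and
+# 'motif.cm' was built previously with cmbuild; the sequences are
+# illustrative):
+#
+#     from cogent import RNA
+#     from bfillings.infernal import cmalign_from_file
+#
+#     seqs = {'seq_1': 'ACGGCUAUGCGC', 'seq_2': 'GGCUAAUGCGCC'}
+#     aln, struct = cmalign_from_file('motif.cm', seqs, moltype=RNA)
+#     print aln.toFasta()
+#     print struct
+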
+def cmsearch_from_alignment(aln, structure_string, seqs, moltype, cutoff=0.0,\
+ refine=False,params=None):
+ """Uses cmbuild to build a CM file, then cmsearch to find homologs.
+
+ - aln: an Alignment object or something that can be used to construct
+ one. All sequences must be the same length.
+ - structure_string: Vienna structure string representing the consensus
+ structure for the sequences in aln. Must be the same length as the
+ alignment.
+ - seqs: SequenceCollection object or something that can be used to
+ construct one, containing unaligned sequences that are to be
+ searched.
+ - moltype: cogent.core.moltype object. Must be DNA or RNA
+ - cutoff: bit score cutoff. Sequences scoring below the cutoff are
+ dropped from the search results. (Default=0.0). The Infernal
+ documentation suggests a cutoff of log2(number of nucleotides being
+ searched) will return the most likely true homologs.
+ - refine: refine the alignment and realign before building the cm.
+ (Default=False)
+ """
+ #NOTE: Must degap seqs or Infernal will seg fault!
+ seqs = SequenceCollection(seqs,MolType=moltype).degap()
+ #Create mapping between abbreviated IDs and full IDs
+ int_map, int_keys = seqs.getIntMap()
+ #Create SequenceCollection from int_map.
+ int_map = SequenceCollection(int_map,MolType=moltype)
+
+ cm_file, aln_file_string = cmbuild_from_alignment(aln, structure_string,\
+ refine=refine,return_alignment=True)
+
+ app = Cmsearch(InputHandler='_input_as_paths',WorkingDir='/tmp',\
+ params=params)
+ app.Parameters['--informat'].on('FASTA')
+ app.Parameters['-T'].on(cutoff)
+
+ to_remove = []
+
+ seqs_path = app._input_as_multiline_string(int_map.toFasta())
+ cm_path = app._tempfile_as_multiline_string(cm_file)
+ paths = [cm_path,seqs_path]
+ to_remove.append(cm_path)
+
+ _, tmp_file = mkstemp(dir=app.WorkingDir)
+ app.Parameters['--tabfile'].on(tmp_file)
+ res = app(paths)
+
+ search_results = list(CmsearchParser(res['SearchResults'].readlines()))
+ if search_results:
+ for i,line in enumerate(search_results):
+ label = line[1]
+ search_results[i][1]=int_keys.get(label,label)
+
+ res.cleanUp()
+ for f in to_remove:remove(f)
+
+ return search_results
+
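+# Example usage (a minimal sketch, assuming Infernal is installed; the
+# alignment, consensus structure and query sequences are illustrative):
+#
+#     from cogent import RNA
+#     from bfillings.infernal import cmsearch_from_alignment
+#
+#     aln = {'s1': 'ACGGCUAUGCGC', 's2': 'ACGGCAAUGCGC'}
+#     struct = '((((....))))'
+#     hits = cmsearch_from_alignment(aln, struct,
+#                                    {'q1': 'ACGGCUAUGCGCAA'}, moltype=RNA)
+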
+def cmsearch_from_file(cm_file_path, seqs, moltype, cutoff=0.0, params=None):
+ """Uses cmbuild to build a CM file, then cmsearch to find homologs.
+
+ - cm_file_path: path to the covariance model (CM) file created by
+ cmbuild. This will be used to search sequences in seqs.
+ - seqs: SequenceCollection object or something that can be used to
+ construct one, containing unaligned sequences that are to be
+ searched.
+ - moltype: cogent.core.moltype object. Must be DNA or RNA
+ - cutoff: bit score cutoff. Sequences scoring below the cutoff are
+ dropped from the search results. (Default=0.0). The Infernal
+ documentation suggests a cutoff of log2(number of nucleotides being
+ searched) will return the most likely true homologs.
+ """
+ #NOTE: Must degap seqs or Infernal will seg fault!
+ seqs = SequenceCollection(seqs,MolType=moltype).degap()
+ #Create mapping between abbreviated IDs and full IDs
+ int_map, int_keys = seqs.getIntMap()
+ #Create SequenceCollection from int_map.
+ int_map = SequenceCollection(int_map,MolType=moltype)
+
+ app = Cmsearch(InputHandler='_input_as_paths',WorkingDir='/tmp',\
+ params=params)
+ app.Parameters['--informat'].on('FASTA')
+ app.Parameters['-T'].on(cutoff)
+
+ seqs_path = app._input_as_multiline_string(int_map.toFasta())
+
+ paths = [cm_file_path,seqs_path]
+
+ _, tmp_file = mkstemp(dir=app.WorkingDir)
+ app.Parameters['--tabfile'].on(tmp_file)
+ res = app(paths)
+
+ search_results = list(CmsearchParser(res['SearchResults'].readlines()))
+
+ if search_results:
+ for i,line in enumerate(search_results):
+ label = line[1]
+ search_results[i][1]=int_keys.get(label,label)
+
+ res.cleanUp()
+
+ return search_results
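+
+# Example usage (a minimal sketch, assuming 'motif.cm' already exists;
+# each row of the result is a CmsearchParser record whose second field
+# has been mapped back to the original sequence label):
+#
+#     from cogent import RNA
+#     from bfillings.infernal import cmsearch_from_file
+#
+#     hits = cmsearch_from_file('motif.cm', {'q1': 'ACGGCUAUGCGC'},
+#                               moltype=RNA, cutoff=10.0)
+#     for hit in hits:
+#         print hit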
diff --git a/bfillings/mafft.py b/bfillings/mafft.py
new file mode 100644
index 0000000..417bc08
--- /dev/null
+++ b/bfillings/mafft.py
@@ -0,0 +1,470 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+"""
+Provides an application controller for the commandline version of:
+MAFFT v6.602
+"""
+from random import choice
+from os import remove
+
+from burrito.parameters import FlagParameter, ValuedParameter, FilePath
+from burrito.util import CommandLineApplication, ResultPath, get_tmp_filename
+from skbio.parse.sequences import parse_fasta
+
+from cogent.core.moltype import DNA, RNA, PROTEIN
+from cogent.core.alignment import SequenceCollection, Alignment
+from cogent.core.tree import PhyloNode
+from cogent.parse.tree import DndParser
+
+
+MOLTYPE_MAP = {'DNA':'--nuc',\
+ 'RNA':'--nuc',\
+ 'PROTEIN':'--amino',\
+ }
+
+class Mafft(CommandLineApplication):
+ """Mafft application controller"""
+
+
+ _options ={
+ # Algorithm
+
+ # Automatically selects an appropriate strategy from L-INS-i, FFT-NS-i
+ # and FFT-NS-2, according to data size. Default: off (always FFT-NS-2)
+ '--auto':FlagParameter(Prefix='--',Name='auto'),\
+
+ # Distance is calculated based on the number of shared 6mers. Default: on
+ '--6merpair':FlagParameter(Prefix='--',Name='6merpair'),\
+
+ # All pairwise alignments are computed with the Needleman-Wunsch algorithm.
+ # More accurate but slower than --6merpair. Suitable for a set of globally
+ # alignable sequences. Applicable to up to ~200 sequences. A combination
+ # with --maxiterate 1000 is recommended (G-INS-i). Default: off
+ # (6mer distance is used)
+ '--globalpair':FlagParameter(Prefix='--',Name='globalpair'),\
+
+ # All pairwise alignments are computed with the Smith-Waterman algorithm.
+ # More accurate but slower than --6merpair. Suitable for a set of locally
+ # alignable sequences. Applicable to up to ~200 sequences. A combination
+ # with --maxiterate 1000 is recommended (L-INS-i). Default: off
+ # (6mer distance is used)
+ '--localpair':FlagParameter(Prefix='--',Name='localpair'),\
+
+ # All pairwise alignments are computed with a local algorithm with the
+ # generalized affine gap cost (Altschul 1998). More accurate but slower than
+ # --6merpair. Suitable when large internal gaps are expected. Applicable to
+ # up to ~200 sequences. A combination with --maxiterate 1000 is recommended
+ # (E-INS-i). Default: off (6mer distance is used)
+ '--genafpair':FlagParameter(Prefix='--',Name='genafpair'),\
+
+ # All pairwise alignments are computed with FASTA (Pearson and Lipman 1988).
+ # FASTA is required. Default: off (6mer distance is used)
+ '--fastapair':FlagParameter(Prefix='--',Name='fastapair'),\
+
+ # Weighting factor for the consistency term calculated from pairwise
+ # alignments. Valid when either of --globalpair, --localpair, --genafpair,
+ # --fastapair or --blastpair is selected. Default: 2.7
+ '--weighti':ValuedParameter(Prefix='--',Name='weighti',Delimiter=' '),\
+
+ # Guide tree is built number times in the progressive stage. Valid with 6mer
+ # distance. Default: 2
+ '--retree':ValuedParameter(Prefix='--',Name='retree',Delimiter=' '),\
+
+ # number cycles of iterative refinement are performed. Default: 0
+ '--maxiterate':ValuedParameter(Prefix='--',Name='maxiterate',\
+ Delimiter=' '),\
+
+ # Use FFT approximation in group-to-group alignment. Default: on
+ '--fft':FlagParameter(Prefix='--',Name='fft'),\
+
+ # Do not use FFT approximation in group-to-group alignment. Default: off
+ '--nofft':FlagParameter(Prefix='--',Name='nofft'),\
+
+ #Alignment score is not checked in the iterative refinement stage. Default:
+ # off (score is checked)
+ '--noscore':FlagParameter(Prefix='--',Name='noscore'),\
+
+ # Use the Myers-Miller (1988) algorithm. Default: automatically turned on
+ # when the alignment length exceeds 10,000 (aa/nt).
+ '--memsave':FlagParameter(Prefix='--',Name='memsave'),\
+
+ # Use a fast tree-building method (PartTree, Katoh and Toh 2007) with the
+ # 6mer distance. Recommended when a large number (> ~10,000) of sequences
+ # is input. Default: off
+ '--parttree':FlagParameter(Prefix='--',Name='parttree'),\
+
+ # The PartTree algorithm is used with distances based on DP. Slightly more
+ # accurate and slower than --parttree. Recommended when a large number
+ # (> ~10,000) of sequences is input. Default: off
+ '--dpparttree':FlagParameter(Prefix='--',Name='dpparttree'),\
+
+ # The PartTree algorithm is used with distances based on FASTA. Slightly
+ # more accurate and slower than --parttree. Recommended when a large
+ # number (> ~10,000) of sequences is input. FASTA is required. Default: off
+ '--fastaparttree':FlagParameter(Prefix='--',Name='fastaparttree'),\
+
+ # The number of partitions in the PartTree algorithm. Default: 50
+ '--partsize':ValuedParameter(Prefix='--',Name='partsize',Delimiter=' '),\
+
+ # Do not make alignment larger than number sequences. Valid only with the
+ # --*parttree options. Default: the number of input sequences
+ '--groupsize':ValuedParameter(Prefix='--',Name='groupsize',Delimiter=' '),\
+
+ # Parameter
+
+ # Gap opening penalty at group-to-group alignment. Default: 1.53
+ '--op':ValuedParameter(Prefix='--',Name='op',Delimiter=' '),\
+
+ # Offset value, which works like gap extension penalty, for group-to-group
+ # alignment. Default: 0.123
+ '--ep':ValuedParameter(Prefix='--',Name='ep',Delimiter=' '),\
+
+ # Gap opening penalty at local pairwise alignment. Valid when the
+ # --localpair or --genafpair option is selected. Default: -2.00
+ '--lop':ValuedParameter(Prefix='--',Name='lop',Delimiter=' '),\
+
+ # Offset value at local pairwise alignment. Valid when the --localpair or
+ # --genafpair option is selected. Default: 0.1
+ '--lep':ValuedParameter(Prefix='--',Name='lep',Delimiter=' '),\
+
+ # Gap extension penalty at local pairwise alignment. Valid when the
+ # --localpair or --genafpair option is selected. Default: -0.1
+ '--lexp':ValuedParameter(Prefix='--',Name='lexp',Delimiter=' '),\
+
+ # Gap opening penalty to skip the alignment. Valid when the --genafpair
+ # option is selected. Default: -6.00
+ '--LOP':ValuedParameter(Prefix='--',Name='LOP',Delimiter=' '),\
+
+ # Gap extension penalty to skip the alignment. Valid when the --genafpair
+ # option is selected. Default: 0.00
+ '--LEXP':ValuedParameter(Prefix='--',Name='LEXP',Delimiter=' '),\
+
+ # BLOSUM number matrix (Henikoff and Henikoff 1992) is used. number=30, 45,
+ # 62 or 80. Default: 62
+ '--bl':ValuedParameter(Prefix='--',Name='bl',Delimiter=' '),\
+
+ # JTT PAM number (Jones et al. 1992) matrix is used. number>0.
+ # Default: BLOSUM62
+ '--jtt':ValuedParameter(Prefix='--',Name='jtt',Delimiter=' '),\
+
+ # Transmembrane PAM number (Jones et al. 1994) matrix is used. number>0.
+ # Default: BLOSUM62
+ '--tm':ValuedParameter(Prefix='--',Name='tm',Delimiter=' '),\
+
+ # Use a user-defined AA scoring matrix. The format of matrixfile is the same
+ # as that of BLAST. Ignored when nucleotide sequences are input.
+ # Default: BLOSUM62
+ '--aamatrix':ValuedParameter(Prefix='--',Name='aamatrix',Delimiter=' '),\
+
+ # Incorporate the AA/nuc composition information into the scoring matrix.
+ # Default: off
+ '--fmodel':FlagParameter(Prefix='--',Name='fmodel'),\
+
+ # Output
+
+ # Output format: clustal format. Default: off (fasta format)
+ '--clustalout':FlagParameter(Prefix='--',Name='clustalout'),\
+
+ # Output order: same as input. Default: on
+ '--inputorder':FlagParameter(Prefix='--',Name='inputorder'),\
+
+ # Output order: aligned. Default: off (inputorder)
+ '--reorder':FlagParameter(Prefix='--',Name='reorder'),\
+
+ # Guide tree is output to the input.tree file. Default: off
+ '--treeout':FlagParameter(Prefix='--',Name='treeout'),\
+
+ # Do not report progress. Default: off
+ '--quiet':FlagParameter(Prefix='--',Name='quiet'),\
+
+ # Input
+
+ # Assume the sequences are nucleotide. Default: auto
+ '--nuc':FlagParameter(Prefix='--',Name='nuc'),\
+
+ # Assume the sequences are amino acid. Default: auto
+ '--amino':FlagParameter(Prefix='--',Name='amino'),\
+
+ # Seed alignments given in alignment_n (fasta format) are aligned with
+ # sequences in input. The alignment within every seed is preserved.
+ '--seed':ValuedParameter(Prefix='--',Name='seed',Delimiter=' '),\
+ }
+
+ _parameters = {}
+ _parameters.update(_options)
+ _command = "mafft"
+ _suppress_stderr=True
+
+ def _input_as_seqs(self,data):
+ lines = []
+ for i,s in enumerate(data):
+ #will number the sequences 1,2,3,etc.
+ lines.append(''.join(['>',str(i+1)]))
+ lines.append(s)
+ return self._input_as_lines(lines)
+
+ def _tree_out_filename(self):
+ if self.Parameters['--treeout'].isOn():
+ tree_filename = self._absolute(str(self._input_filename))+'.tree'
+ else:
+ raise ValueError, "No tree output file specified."
+ return tree_filename
+
+ def _tempfile_as_multiline_string(self, data):
+ """Write a multiline string to a temp file and return the filename.
+
+ data: a multiline string to be written to a file.
+
+ * Note: the result will be the filename as a FilePath object
+ (which is a string subclass).
+
+ """
+ filename = FilePath(self.getTmpFilename(self.TmpDir))
+ data_file = open(filename,'w')
+ data_file.write(data)
+ data_file.close()
+ return filename
+
+ def getHelp(self):
+ """Method that points to the Mafft documentation."""
+
+ help_str = \
+ """
+ See Mafft documentation at:
+ http://align.bmr.kyushu-u.ac.jp/mafft/software/manual/manual.html
+ """
+ return help_str
+
+ def _get_result_paths(self,data):
+ result = {}
+ if self.Parameters['--treeout'].isOn():
+ out_name = self._tree_out_filename()
+ result['Tree'] = ResultPath(Path=out_name,IsWritten=True)
+ return result
+
+def align_unaligned_seqs(seqs,moltype=DNA,params=None,accurate=False):
+ """Aligns unaligned sequences
+
+ seqs: either a list of sequence objects or a list of strings
+ moltype: a cogent.core.moltype object (DNA, RNA, or PROTEIN)
+ params: dict of parameters to pass in to the Mafft app controller
+ accurate: if True, use a slower but more accurate alignment
+ (--globalpair with --maxiterate 1000)
+ """
+ #create SequenceCollection object from seqs
+ seq_collection = SequenceCollection(seqs,MolType=moltype)
+ #Create mapping between abbreviated IDs and full IDs
+ int_map, int_keys = seq_collection.getIntMap()
+ #Create SequenceCollection from int_map.
+ int_map = SequenceCollection(int_map,MolType=moltype)
+ #Create Mafft app.
+ app = Mafft(InputHandler='_input_as_multiline_string',params=params)
+
+ #Turn on correct moltype
+ moltype_string = moltype.label.upper()
+ app.Parameters[MOLTYPE_MAP[moltype_string]].on()
+
+ #Do not report progress
+ app.Parameters['--quiet'].on()
+
+ #More accurate alignment, sacrificing performance.
+ if accurate:
+ app.Parameters['--globalpair'].on()
+ app.Parameters['--maxiterate'].Value=1000
+
+ #Get results using int_map as input to app
+ res = app(int_map.toFasta())
+ #Get alignment as dict out of results
+ alignment = dict(parse_fasta(res['StdOut']))
+ #Make new dict mapping original IDs
+ new_alignment = {}
+ for k,v in alignment.items():
+ new_alignment[int_keys[k]]=v
+ #Create an Alignment object from alignment dict
+ new_alignment = Alignment(new_alignment,MolType=moltype)
+ #Clean up
+ res.cleanUp()
+ del(seq_collection,int_map,int_keys,app,res,alignment)
+
+ return new_alignment
+
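+# Example usage (a minimal sketch, assuming MAFFT is on the PATH; the
+# input sequences are illustrative):
+#
+#     from cogent import DNA
+#     from bfillings.mafft import align_unaligned_seqs
+#
+#     seqs = {'a': 'ACCGTT', 'b': 'ACGTT', 'c': 'ACGGTT'}
+#     aln = align_unaligned_seqs(seqs, moltype=DNA)
+#     print aln.toFasta()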
+
+def align_and_build_tree(seqs, moltype, best_tree=False, params={}):
+ """Returns an alignment and a tree from Sequences object seqs.
+
+ seqs: SequenceCollection object, or data that can be used to build one.
+
+ best_tree: if True (default:False), uses a slower but more accurate
+ algorithm to build the tree.
+
+ params: dict of parameters to pass in to the Mafft app controller.
+
+ The result will be a tuple containing an Alignment object and a
+ cogent.core.tree.PhyloNode object (or None for the alignment and/or tree
+ if either fails).
+ """
+ #Current version of Mafft does not support tree building.
+ raise NotImplementedError, """Current version of Mafft does not support tree building."""
+
+def build_tree_from_alignment(aln, moltype=DNA, best_tree=False, params={},\
+ working_dir='/tmp'):
+ """Returns a tree from Alignment object aln.
+
+ aln: a cogent.core.alignment.Alignment object, or data that can be used
+ to build one.
+
+ best_tree: if True (default:False), uses a slower but more accurate
+ algorithm to build the tree.
+ NOTE: Mafft does not directly support a best_tree option; it can
+ only return the guide tree used to align the sequences. Passing
+ best_tree=True constructs the guide tree 100 times instead of the
+ default 2 times.
+
+ ***Mafft does allow you to get the guide tree back, but the IDs in the
+ output guide tree do not match the original IDs in the fasta file
+ and are impossible to map. Sent bug report to Mafft authors; possibly
+ expect this option in future version.***
+
+ params: dict of parameters to pass in to the Mafft app controller.
+
+ The result will be a cogent.core.tree.PhyloNode object, or None if the
+ tree fails.
+ """
+ #Current version of Mafft does not support tree building.
+ raise NotImplementedError, """Current version of Mafft does not support tree building."""
+
+def add_seqs_to_alignment(seqs, aln, moltype, params=None, accurate=False):
+ """Returns an Alignment object from seqs and existing Alignment.
+
+ seqs: a cogent.core.sequence.Sequence object, or data that can be used
+ to build one.
+
+ aln: an cogent.core.alignment.Alignment object, or data that can be used
+ to build one
+
+ params: dict of parameters to pass in to the Mafft app controller.
+ """
+ #create SequenceCollection object from seqs
+ seq_collection = SequenceCollection(seqs,MolType=moltype)
+ #Create mapping between abbreviated IDs and full IDs
+ seq_int_map, seq_int_keys = seq_collection.getIntMap()
+ #Create SequenceCollection from int_map.
+ seq_int_map = SequenceCollection(seq_int_map,MolType=moltype)
+
+ #create Alignment object from aln
+ aln = Alignment(aln,MolType=moltype)
+ #Create mapping between abbreviated IDs and full IDs
+ aln_int_map, aln_int_keys = aln.getIntMap(prefix='seqn_')
+ #Create SequenceCollection from int_map.
+ aln_int_map = Alignment(aln_int_map,MolType=moltype)
+
+ #Update seq_int_keys with aln_int_keys
+ seq_int_keys.update(aln_int_keys)
+
+ #Create Mafft app.
+ app = Mafft(InputHandler='_input_as_multiline_string',\
+ params=params,
+ SuppressStderr=True)
+
+ #Turn on correct moltype
+ moltype_string = moltype.label.upper()
+ app.Parameters[MOLTYPE_MAP[moltype_string]].on()
+
+ #Do not report progress
+ app.Parameters['--quiet'].on()
+
+ #Add aln_int_map as seed alignment
+ app.Parameters['--seed'].on(\
+ app._tempfile_as_multiline_string(aln_int_map.toFasta()))
+
+ #More accurate alignment, sacrificing performance.
+ if accurate:
+ app.Parameters['--globalpair'].on()
+ app.Parameters['--maxiterate'].Value=1000
+
+ #Get results using int_map as input to app
+ res = app(seq_int_map.toFasta())
+ #Get alignment as dict out of results
+ alignment = dict(parse_fasta(res['StdOut']))
+
+ #Make new dict mapping original IDs
+ new_alignment = {}
+ for k,v in alignment.items():
+ key = k.replace('_seed_','')
+ new_alignment[seq_int_keys[key]]=v
+ #Create an Alignment object from alignment dict
+ new_alignment = Alignment(new_alignment,MolType=moltype)
+ #Clean up
+ res.cleanUp()
+ remove(app.Parameters['--seed'].Value)
+ del(seq_collection,seq_int_map,seq_int_keys,\
+ aln,aln_int_map,aln_int_keys,app,res,alignment)
+
+ return new_alignment
+
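+# Example usage (a minimal sketch, assuming MAFFT is on the PATH; the
+# existing alignment and the new sequences are illustrative):
+#
+#     from cogent import DNA
+#     from bfillings.mafft import add_seqs_to_alignment
+#
+#     aln = {'a': 'AC-GT', 'b': 'ACGGT'}
+#     new_seqs = {'c': 'ACGT'}
+#     extended = add_seqs_to_alignment(new_seqs, aln, DNA)
+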
+def align_two_alignments(aln1, aln2, moltype, params=None):
+ """Returns an Alignment object from two existing Alignments.
+
+ aln1, aln2: cogent.core.alignment.Alignment objects, or data that can be
+ used to build them.
+ - Mafft profile alignment only works with aligned sequences, so
+ Alignment objects (rather than SequenceCollections) are used here.
+
+ params: dict of parameters to pass in to the Mafft app controller.
+ """
+ #create SequenceCollection object from seqs
+ aln1 = Alignment(aln1,MolType=moltype)
+ #Create mapping between abbreviated IDs and full IDs
+ aln1_int_map, aln1_int_keys = aln1.getIntMap()
+ #Create SequenceCollection from int_map.
+ aln1_int_map = Alignment(aln1_int_map,MolType=moltype)
+
+ #create Alignment object from aln
+ aln2 = Alignment(aln2,MolType=moltype)
+ #Create mapping between abbreviated IDs and full IDs
+ aln2_int_map, aln2_int_keys = aln2.getIntMap(prefix='seqn_')
+ #Create SequenceCollection from int_map.
+ aln2_int_map = Alignment(aln2_int_map,MolType=moltype)
+
+ #Update aln1_int_keys with aln2_int_keys
+ aln1_int_keys.update(aln2_int_keys)
+
+ #Create Mafft app.
+ app = Mafft(InputHandler='_input_as_paths',\
+ params=params,
+ SuppressStderr=False)
+ app._command = 'mafft-profile'
+
+ aln1_path = app._tempfile_as_multiline_string(aln1_int_map.toFasta())
+ aln2_path = app._tempfile_as_multiline_string(aln2_int_map.toFasta())
+ filepaths = [aln1_path,aln2_path]
+
+ #Get results using int_map as input to app
+ res = app(filepaths)
+
+ #Get alignment as dict out of results
+ alignment = dict(parse_fasta(res['StdOut']))
+
+ #Make new dict mapping original IDs
+ new_alignment = {}
+ for k,v in alignment.items():
+ key = k.replace('_seed_','')
+ new_alignment[aln1_int_keys[key]]=v
+ #Create an Alignment object from alignment dict
+ new_alignment = Alignment(new_alignment,MolType=moltype)
+ #Clean up
+ res.cleanUp()
+ remove(aln1_path)
+ remove(aln2_path)
+ remove('pre')
+ remove('trace')
+ del(aln1,aln1_int_map,aln1_int_keys,\
+ aln2,aln2_int_map,aln2_int_keys,app,res,alignment)
+
+ return new_alignment
diff --git a/bfillings/mothur.py b/bfillings/mothur.py
new file mode 100644
index 0000000..ae6aca9
--- /dev/null
+++ b/bfillings/mothur.py
@@ -0,0 +1,589 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+"""Provides an application controller for the commandline version of
+mothur Version 1.6.0
+"""
+
+
+from __future__ import with_statement, division
+from os import path, getcwd, mkdir, remove, listdir
+import re
+from shutil import copyfile, rmtree
+from subprocess import Popen
+from tempfile import NamedTemporaryFile, mkdtemp, gettempdir
+
+from skbio.parse.sequences import parse_fasta
+from burrito.parameters import ValuedParameter
+from burrito.util import (CommandLineApplication, ResultPath,
+ CommandLineAppResult, ApplicationError)
+
+
+def is_empty(line):
+ """Returns True empty lines and lines consisting only of whitespace."""
+ return (not line) or line.isspace()
+
+
+def parse_otu_list(lines, precision=0.0049):
+ """Parser for mothur *.list file
+
+ To ensure all distances are of type float, the parser returns a
+ distance of 0.0 for the unique groups. However, if some sequences
+ are very similar, mothur may return a grouping at zero distance.
+ What Mothur really means by this, however, is that the clustering
+ is at the level of Mothur's precision. In this case, the parser
+ returns the distance explicitly.
+
+ If you are parsing OTUs with a non-default precision, you must
+ specify the precision here to ensure that the parsed distances are
+ in order.
+
+ Returns an iterator over (distance, otu_list)
+ """
+ for line in lines:
+ if is_empty(line):
+ continue
+ tokens = line.strip().split('\t')
+
+ distance_str = tokens.pop(0)
+ if distance_str.lstrip().lower().startswith('u'):
+ distance = 0.0
+ elif distance_str == '0.0':
+ distance = float(precision)
+ else:
+ distance = float(distance_str)
+
+ num_otus = int(tokens.pop(0))
+ otu_list = [t.split(',') for t in tokens]
+
+ yield (distance, otu_list)
+
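+# Example (an illustrative sketch of the *.list format this parser
+# expects; the first token is a distance or 'unique', the second the
+# OTU count, and the rest are comma-separated OTU members):
+#
+#     lines = ['unique\t2\ta\tb,c\n', '0.01\t1\ta,b,c\n']
+#     for distance, otus in parse_otu_list(lines):
+#         print distance, otus
+#     # 0.0 [['a'], ['b', 'c']]
+#     # 0.01 [['a', 'b', 'c']]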
+
+class Mothur(CommandLineApplication):
+
+ """Mothur application controller
+ """
+ _options = {
+ # Clustering algorithm. Choices are furthest, nearest, and
+ # average
+ 'method': ValuedParameter(
+ Name='method', Value='furthest', Delimiter='=', Prefix=''),
+ # Cutoff distance for the distance matrix
+ 'cutoff': ValuedParameter(
+ Name='cutoff', Value=None, Delimiter='=', Prefix=''),
+ # Minimum pairwise distance to consider for clustering
+ 'precision': ValuedParameter(
+ Name='precision', Value=None, Delimiter='=', Prefix=''),
+ }
+ _parameters = {}
+ _parameters.update(_options)
+ _input_handler = '_input_as_multiline_string'
+ _command = 'mothur'
+
+ def __init__(self, params=None, InputHandler=None, SuppressStderr=None,
+ SuppressStdout=None, WorkingDir=None, TmpDir=None,
+ TmpNameLen=20, HALT_EXEC=False):
+ """Initialize a Mothur application controller
+
+ params: a dictionary mapping the Parameter id or synonym to its
+ value (or None for FlagParameters or MixedParameters in flag
+ mode) for Parameters that should be turned on
+ InputHandler: this is the method to be run on data when it is
+ passed into call. This should be a string containing the
+ method name. The default is _input_as_string which casts data
+ to a string before appending it to the command line argument
+ SuppressStderr: if set to True, will route standard error to
+ /dev/null, False by default
+ SuppressStdout: if set to True, will route standard out to
+ /dev/null, False by default
+ WorkingDir: the directory where you want the application to run,
+ default is the current working directory, but is useful to
+ change in cases where the program being run creates output
+ to its current working directory and you either don't want
+ it to end up where you are running the program, or the user
+ running the script doesn't have write access to the current
+ working directory
+ WARNING: WorkingDir MUST be an absolute path!
+ TmpDir: the directory where temp files will be created, default
+ value is determined by environment variables.
+ TmpNameLen: the length of the temp file name
+ HALT_EXEC: if True, raises exception w/ command output just
+ before execution, doesn't clean up temp files. Default False.
+
+ Note: Mothur input files are copied to the working directory,
+ not the temp directory, when the application controller is
+ called. Output files generated by mothur are generated based on
+ the name of the input file, and if written to the tmp directory
+ could collide with another filename. Our strategy is to allow
+ the user to specify a working directory where input, output, and
+ log files are created.
+
+ File cleanup is handled in the same way as other app
+ controllers: input files are removed when the controller is
+ called if remove_tmp is True; output and log files are
+ cleaned up when the cleanUp method is called on the results.
+ """
+ super(Mothur, self).__init__(
+ params=params, InputHandler=InputHandler,
+ SuppressStderr=SuppressStderr, SuppressStdout=SuppressStdout,
+ WorkingDir='', TmpDir='', TmpNameLen=TmpNameLen,
+ HALT_EXEC=HALT_EXEC)
+ # Prevent self.WorkingDir from being explicitly cast as a
+ # FilePath object. This behavior does not seem necessary in
+ # the parent's __init__() method, since the casting is
+ # repeated in _set_WorkingDir().
+ if WorkingDir is not None:
+ working_dir = WorkingDir
+ else:
+ working_dir = self._working_dir or getcwd()
+ self.WorkingDir = working_dir
+ if TmpDir is not None:
+ self.TmpDir = TmpDir
+ else:
+ self.TmpDir = gettempdir()
+
+ @staticmethod
+ def getHelp():
+ """Returns link to online manual"""
+ help = (
+ 'See manual, available on the MOTHUR wiki:\n'
+ 'http://schloss.micro.umass.edu/mothur/'
+ )
+ return help
+
+ def __call__(self, data=None, remove_tmp=True):
+ """Run the application with the specified kwargs on data
+
+ data: anything that can be cast into a string or written out to
+ a file. Usually either a list of things or a single string or
+ number. input_handler will be called on this data before it
+ is passed as part of the command-line argument, so by creating
+ your own input handlers you can customize what kind of data
+ you want your application to accept
+
+ remove_tmp: if True, removes tmp files
+ """
+ # Process the input data. Input filepath is stored in
+ # self._input_filename
+ getattr(self, self.InputHandler)(data)
+
+ if self.SuppressStdout:
+ outfile = None
+ else:
+ outfile = open(self.getTmpFilename(self.TmpDir), 'w+')
+ if self.SuppressStderr:
+ errfile = None
+ else:
+ errfile = open(self.getTmpFilename(self.TmpDir), 'w+')
+
+ args = [self._command, self._compile_mothur_script()]
+ process = Popen(
+ args, stdout=outfile, stderr=errfile, cwd=self.WorkingDir)
+ exit_status = process.wait()
+ if not self._accept_exit_status(exit_status):
+ raise ApplicationError(
+ 'Unacceptable application exit status: %s, command: %s' %
+ (exit_status, args))
+
+ if outfile is not None:
+ outfile.seek(0)
+ if errfile is not None:
+ errfile.seek(0)
+ result = CommandLineAppResult(
+ outfile, errfile, exit_status, result_paths=self._get_result_paths())
+
+ # Clean up the input file if one was created
+ if remove_tmp:
+ if self._input_filename:
+ remove(self._input_filename)
+ self._input_filename = None
+
+ return result
+
+ def _accept_exit_status(self, status):
+ return int(status) == 0
+
+ def _compile_mothur_script(self):
+ """Returns a Mothur batch script as a string"""
+ def format_opts(*opts):
+ """Formats a series of options for a Mothur script"""
+ return ', '.join(filter(None, map(str, opts)))
+ vars = {
+ 'in': self._input_filename,
+ 'unique': self._derive_unique_path(),
+ 'dist': self._derive_dist_path(),
+ 'names': self._derive_names_path(),
+ 'cluster_opts': format_opts(
+ self.Parameters['method'],
+ self.Parameters['cutoff'],
+ self.Parameters['precision'],
+ ),
+ }
+ script = (
+ '#'
+ 'unique.seqs(fasta=%(in)s); '
+ 'dist.seqs(fasta=%(unique)s); '
+ 'read.dist(column=%(dist)s, name=%(names)s); '
+ 'cluster(%(cluster_opts)s)' % vars
+ )
+ return script
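+
+ # For example, with the default parameters and an input file named
+ # seqs.fasta, the compiled batch script is (illustrative):
+ #
+ # #unique.seqs(fasta=seqs.fasta); dist.seqs(fasta=seqs.unique.fasta);
+ # read.dist(column=seqs.unique.dist, name=seqs.names);
+ # cluster(method=furthest)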
+
+ def _get_result_paths(self):
+ paths = {
+ 'distance matrix': self._derive_dist_path(),
+ 'otu list': self._derive_list_path(),
+ 'rank abundance': self._derive_rank_abundance_path(),
+ 'species abundance': self._derive_species_abundance_path(),
+ 'unique names': self._derive_names_path(),
+ 'unique seqs': self._derive_unique_path(),
+ 'log': self._derive_log_path(),
+ }
+ return dict([(k, ResultPath(v)) for (k, v) in paths.items()])
+
+ # Methods to derive/guess output pathnames produced by MOTHUR.
+ # TODO: test for input files that do not have a filetype extension
+
+ def _derive_log_path(self):
+ """Guess logfile path produced by Mothur
+
+ This method checks the working directory for log files
+ generated by Mothur. It will raise an ApplicationError if no
+ log file can be found.
+
+ Mothur generates log files named in a nondeterministic way,
+ using the current time. We return the log file with the most
+ recent time, although this may lead to incorrect log file
+ detection if you are running many instances of mothur
+ simultaneously.
+ """
+ filenames = listdir(self.WorkingDir)
+ lognames = [
+ x for x in filenames if re.match(
+ "^mothur\.\d+\.logfile$",
+ x)]
+ if not lognames:
+ raise ApplicationError(
+ 'No log file detected in directory %s. Contents: \n\t%s' % (
+ self.WorkingDir, '\n\t'.join(filenames)))
+ most_recent_logname = sorted(lognames, reverse=True)[0]
+ return path.join(self.WorkingDir, most_recent_logname)
+
+ def _derive_unique_path(self):
+ """Guess unique sequences path produced by Mothur"""
+ base, ext = path.splitext(self._input_filename)
+ return '%s.unique%s' % (base, ext)
+
+ def _derive_dist_path(self):
+ """Guess distance matrix path produced by Mothur"""
+ base, ext = path.splitext(self._input_filename)
+ return '%s.unique.dist' % base
+
+ def _derive_names_path(self):
+ """Guess unique names file path produced by Mothur"""
+ base, ext = path.splitext(self._input_filename)
+ return '%s.names' % base
+
+ def __get_method_abbrev(self):
+ """Abbreviated form of clustering method parameter.
+
+ Used to guess output filenames for MOTHUR.
+ """
+ abbrevs = {
+ 'furthest': 'fn',
+ 'nearest': 'nn',
+ 'average': 'an',
+ }
+ if self.Parameters['method'].isOn():
+ method = self.Parameters['method'].Value
+ else:
+ method = self.Parameters['method'].Default
+ return abbrevs[method]
+
+ def _derive_list_path(self):
+ """Guess otu list file path produced by Mothur"""
+ base, ext = path.splitext(self._input_filename)
+ return '%s.unique.%s.list' % (base, self.__get_method_abbrev())
+
+ def _derive_rank_abundance_path(self):
+ """Guess rank abundance file path produced by Mothur"""
+ base, ext = path.splitext(self._input_filename)
+ return '%s.unique.%s.rabund' % (base, self.__get_method_abbrev())
+
+ def _derive_species_abundance_path(self):
+ """Guess species abundance file path produced by Mothur"""
+ base, ext = path.splitext(self._input_filename)
+ return '%s.unique.%s.sabund' % (base, self.__get_method_abbrev())
+
+ def getTmpFilename(self, tmp_dir=None, prefix='tmp', suffix='.txt'):
+ """Returns a temporary filename
+
+ Similar interface to tempfile.mktemp()
+ """
+ # Override to change default constructor to str(). FilePath
+ # objects muck up the Mothur script.
+ return super(Mothur, self).getTmpFilename(
+ tmp_dir=tmp_dir, prefix=prefix, suffix=suffix,
+ result_constructor=str)
+
+ # Temporary input file needs to be in the working directory, so we
+ # override all input handlers.
+
+ def _input_as_multiline_string(self, data):
+ """Write multiline string to temp file, return filename
+
+ data: a multiline string to be written to a file.
+ """
+ self._input_filename = self.getTmpFilename(
+ self.WorkingDir, suffix='.fasta')
+ with open(self._input_filename, 'w') as f:
+ f.write(data)
+ return self._input_filename
+
+ def _input_as_lines(self, data):
+ """Write sequence of lines to temp file, return filename
+
+ data: a sequence to be written to a file, each element of the
+ sequence will compose a line in the file
+
+ * Note: '\n' will be stripped off the end of each sequence
+ element before writing to a file in order to avoid
+ multiple newlines accidentally being written to a file
+ """
+ self._input_filename = self.getTmpFilename(
+ self.WorkingDir, suffix='.fasta')
+ with open(self._input_filename, 'w') as f:
+ # Use lazy iteration instead of list comprehension to
+ # prevent reading entire file into memory
+ for line in data:
+ f.write(str(line).strip('\n'))
+ f.write('\n')
+ return self._input_filename
+
+ def _input_as_path(self, data):
+ """Copys the provided file to WorkingDir and returns the new filename
+
+ data: path or filename
+ """
+ self._input_filename = self.getTmpFilename(
+ self.WorkingDir, suffix='.fasta')
+ copyfile(data, self._input_filename)
+ return self._input_filename
+
+ def _input_as_paths(self, data):
+ raise NotImplementedError('Not applicable for MOTHUR controller.')
+
+ def _input_as_string(self, data):
+ raise NotImplementedError('Not applicable for MOTHUR controller.')
+
+ # FilePath objects muck up the Mothur script, so we override the
+ # property methods for self.WorkingDir
+
+ def _get_WorkingDir(self):
+ """Gets the working directory"""
+ return self._curr_working_dir
+
+ def _set_WorkingDir(self, path):
+ """Sets the working directory
+ """
+ self._curr_working_dir = path
+ try:
+ mkdir(self.WorkingDir)
+ except OSError:
+ # Directory already exists
+ pass
+
+ WorkingDir = property(_get_WorkingDir, _set_WorkingDir)
+
+
+def mothur_from_file(file):
+ app = Mothur(InputHandler='_input_as_lines')
+ result = app(file)
+ # Force evaluation, so we can safely clean up files
+ otus = list(parse_otu_list(result['otu list']))
+ result.cleanUp()
+ return otus
+
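+# Example usage (a minimal sketch, assuming mothur is on the PATH; the
+# FASTA lines are illustrative):
+#
+#     fasta_lines = ['>a', 'ACGT', '>b', 'ACGA']
+#     otus = mothur_from_file(fasta_lines)
+#     # otus is a list of (distance, otu_list) tuples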
+
+# Files with dashes currently break MOTHUR -- in the upcoming version
+# of the software, they may be escaped with a backslash. We implement
+# and test for this now, since it's broken anyway!
+
+
+class _MothurFilepathParameter(ValuedParameter):
+
+ """Inserts escape characters in filepath parameters for Mothur."""
+
+ def _get_value(self):
+ return self._Value
+
+ def _set_value(self, val):
+ if val:
+ self._Value = str(val).replace("-", "\\-")
+ else:
+ self._Value = val
+
+ Value = property(_get_value, _set_value)
+
+
+class MothurClassifySeqs(Mothur):
+ _options = {
+ 'reference': _MothurFilepathParameter(
+ Name='reference', Value=None, Delimiter='=', Prefix=''),
+ 'taxonomy': _MothurFilepathParameter(
+ Name='taxonomy', Value=None, Delimiter='=', Prefix=''),
+ 'cutoff': ValuedParameter(
+ Name='cutoff', Value=None, Delimiter='=', Prefix=''),
+ 'iters': ValuedParameter(
+ Name='iters', Value=None, Delimiter='=', Prefix=''),
+ 'ksize': ValuedParameter(
+ Name='ksize', Value=None, Delimiter='=', Prefix=''),
+ }
+ _parameters = {}
+ _parameters.update(_options)
+ _filepath_parameters = set(['reference', 'taxonomy'])
+
+ def _format_function_arguments(self, opts):
+ """Format a series of function arguments in a Mothur script."""
+ params = [self.Parameters[x] for x in opts]
+ return ', '.join(filter(None, map(str, params)))
+
+ def _compile_mothur_script(self):
+ """Returns a Mothur batch script as a string"""
+ fasta = self._input_filename
+
+ required_params = ["reference", "taxonomy"]
+ for p in required_params:
+ if self.Parameters[p].Value is None:
+ raise ValueError("Must provide value for parameter %s" % p)
+ optional_params = ["ksize", "cutoff", "iters"]
+ args = self._format_function_arguments(
+ required_params + optional_params)
+ script = '#classify.seqs(fasta=%s, %s)' % (fasta, args)
+ return script
+
+ def _get_result_paths(self):
+ input_base, ext = path.splitext(path.basename(self._input_filename))
+ result_by_suffix = {
+ ".summary": "summary",
+ ".taxonomy": "assignments",
+ ".accnos": "accnos",
+ }
+
+ paths = {'log': self._derive_log_path()}
+ input_dir = path.dirname(self._input_filename)
+ for fn in listdir(input_dir):
+ if fn.startswith(input_base):
+ for suffix, result_key in result_by_suffix.items():
+ if fn.endswith(suffix):
+ paths[result_key] = path.join(input_dir, fn)
+ return dict([(k, ResultPath(v)) for (k, v) in paths.items()])
+
+
+def parse_mothur_assignments(lines):
+ for line in lines:
+ line = line.strip()
+ if not line:
+ continue
+ seq_id, _, assignment = line.partition("\t")
+
+ # Special case: unidentified sequences should be given a
+ # confidence of 0.0. Newer versions of MOTHUR return a real
+ # value for the confidence -- maybe we should consider keeping
+ # the value if present, because a sequence may conceivably be
+ # unknown with 85% confidence.
+ if re.match('unknown', assignment, re.IGNORECASE):
+ yield seq_id, ["Unknown"], 0.0
+ continue
+
+ toks = assignment.rstrip(";").split(";")
+ lineage = []
+ conf = 0.0
+ for tok in toks:
+ matchobj = re.match("(.+)\((\d+)\)$", tok)
+ if matchobj:
+ lineage.append(matchobj.group(1))
+ pct_conf = int(matchobj.group(2))
+ conf = pct_conf / 100.0
+ yield seq_id, lineage, conf
+
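+# Example (an illustrative assignment line and its parsed form):
+#
+#     line = 'seq1\tBacteria(100);Firmicutes(85);\n'
+#     for seq_id, lineage, conf in parse_mothur_assignments([line]):
+#         print seq_id, lineage, conf
+#     # seq1 ['Bacteria', 'Firmicutes'] 0.85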
+
+def mothur_classify_file(
+ query_file, ref_fp, tax_fp, cutoff=None, iters=None, ksize=None,
+ output_fp=None, tmp_dir=None):
+ """Classify a set of sequences using Mothur's naive bayes method
+
+ Dashes are used in Mothur to provide multiple filenames. A
+ filepath with a dash typically breaks an otherwise valid command
+ in Mothur. This wrapper script makes a copy of both files, ref_fp
+ and tax_fp, to ensure that the path has no dashes.
+
+ For convenience, we also ensure that each taxon list in the
+ id-to-taxonomy file ends with a semicolon.
+ """
+ if tmp_dir is None:
+ tmp_dir = gettempdir()
+
+ ref_seq_ids = set()
+
+ user_ref_file = open(ref_fp)
+ tmp_ref_file = NamedTemporaryFile(dir=tmp_dir, suffix=".ref.fa")
+ for seq_id, seq in parse_fasta(user_ref_file):
+ id_token = seq_id.split()[0]
+ ref_seq_ids.add(id_token)
+ tmp_ref_file.write(">%s\n%s\n" % (seq_id, seq))
+ tmp_ref_file.seek(0)
+
+ user_tax_file = open(tax_fp)
+ tmp_tax_file = NamedTemporaryFile(dir=tmp_dir, suffix=".tax.txt")
+ for line in user_tax_file:
+ line = line.rstrip()
+ if not line:
+ continue
+
+ # MOTHUR is particular that each assignment end with a semicolon.
+ if not line.endswith(";"):
+ line = line + ";"
+
+ id_token, _, _ = line.partition("\t")
+ if id_token in ref_seq_ids:
+ tmp_tax_file.write(line)
+ tmp_tax_file.write("\n")
+ tmp_tax_file.seek(0)
+
+ params = {"reference": tmp_ref_file.name, "taxonomy": tmp_tax_file.name}
+ if cutoff is not None:
+ params["cutoff"] = cutoff
+ if ksize is not None:
+ params["ksize"] = ksize
+ if iters is not None:
+ params["iters"] = iters
+
+ # Create a temporary working directory to accommodate mothur's output
+ # files, which are generated automatically based on the input
+ # file.
+ work_dir = mkdtemp(dir=tmp_dir)
+
+ app = MothurClassifySeqs(
+ params, InputHandler='_input_as_lines', WorkingDir=work_dir,
+ TmpDir=tmp_dir)
+ result = app(query_file)
+
+ # Force evaluation so we can safely clean up files
+ assignments = list(parse_mothur_assignments(result['assignments']))
+ result.cleanUp()
+ rmtree(work_dir)
+
+ if output_fp is not None:
+ f = open(output_fp, "w")
+ for query_id, taxa, conf in assignments:
+ taxa_str = ";".join(taxa)
+ f.write("%s\t%s\t%.2f\n" % (query_id, taxa_str, conf))
+ f.close()
+ return None
+ return dict((a, (b, c)) for a, b, c in assignments)
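+
+# Example usage (a minimal sketch, assuming mothur is installed; the
+# file paths are illustrative):
+#
+#     query = open('query.fasta')
+#     assignments = mothur_classify_file(query, 'ref_seqs.fasta',
+#                                        'ref_taxonomy.txt',
+#                                        output_fp='assignments.txt')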
diff --git a/bfillings/muscle_v38.py b/bfillings/muscle_v38.py
new file mode 100644
index 0000000..3af9971
--- /dev/null
+++ b/bfillings/muscle_v38.py
@@ -0,0 +1,777 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+"""Application controller for muscle 3.8
+"""
+from os import remove
+from random import choice
+import tempfile
+
+from skbio.parse.sequences import parse_fasta
+from burrito.parameters import FlagParameter, ValuedParameter
+from burrito.util import (CommandLineApplication, ResultPath,
+ get_tmp_filename, guess_input_handler)
+
+from cogent.core.alignment import SequenceCollection, Alignment
+from cogent.parse.tree import DndParser
+from cogent.core.tree import PhyloNode
+from cogent import DNA
+
+
+class Muscle(CommandLineApplication):
+ """Muscle application controller"""
+
+ _options ={
+ # Minimum spacing between anchor columns. [Integer]
+ '-anchorspacing':ValuedParameter('-',Name='anchorspacing',Delimiter=' '),
+ # Center parameter. Should be negative [Float]
+ '-center':ValuedParameter('-',Name='center',Delimiter=' '),
+
+ # Clustering method. cluster1 is used in iteration 1
+ # and 2, cluster2 in later iterations
+ '-cluster1':ValuedParameter('-',Name='cluster1',Delimiter=' '),
+ '-cluster2':ValuedParameter('-',Name='cluster2',Delimiter=' '),
+
+ # Minimum length of diagonal.
+ '-diaglength':ValuedParameter('-',Name='diaglength',Delimiter=' '),
+
+ # Discard this many positions at ends of diagonal.
+ '-diagmargin':ValuedParameter('-',Name='diagmargin',Delimiter=' '),
+
+ # Distance measure for iteration 1.
+ '-distance1':ValuedParameter('-',Name='distance1',Delimiter=' '),
+
+ # Distance measure for iterations 2, 3 ...
+ '-distance2':ValuedParameter('-',Name='distance2',Delimiter=' '),
+
+ # The gap open score. Must be negative.
+ '-gapopen':ValuedParameter('-',Name='gapopen',Delimiter=' '),
+
+ # Window size for determining whether a region is hydrophobic.
+ '-hydro':ValuedParameter('-',Name='hydro',Delimiter=' '),
+
+ # Multiplier for gap open/close penalties in hydrophobic regions.
+ '-hydrofactor':ValuedParameter('-',Name='hydrofactor',Delimiter=' '),
+
+ # Where to find the input sequences.
+ '-in':ValuedParameter('-',Name='in',Delimiter=' ', Quote="\""),
+ '-in1':ValuedParameter('-',Name='in1',Delimiter=' ', Quote="\""),
+ '-in2':ValuedParameter('-',Name='in2',Delimiter=' ', Quote="\""),
+
+ # Log file name (delete existing file).
+ '-log':ValuedParameter('-',Name='log',Delimiter=' '),
+
+ # Log file name (append to existing file).
+ '-loga':ValuedParameter('-',Name='loga',Delimiter=' '),
+
+ # Maximum distance between two diagonals that allows them to merge
+ # into one diagonal.
+ '-maxdiagbreak':ValuedParameter('-',Name='maxdiagbreak',Delimiter=' '),
+
+ # Maximum time to run in hours. The actual time may exceed the
+ # requested limit by a few minutes. Decimals are allowed, so 1.5
+ # means one hour and 30 minutes.
+ '-maxhours':ValuedParameter('-',Name='maxhours',Delimiter=' '),
+
+ # Maximum number of iterations.
+ '-maxiters':ValuedParameter('-',Name='maxiters',Delimiter=' '),
+
+ # Maximum memory in Mb
+ '-maxmb': ValuedParameter('-', Name='maxmb', Delimiter=' '),
+
+ # Maximum number of new trees to build in iteration 2.
+ '-maxtrees':ValuedParameter('-',Name='maxtrees',Delimiter=' '),
+
+ # Minimum score a column must have to be an anchor.
+ '-minbestcolscore':ValuedParameter('-',Name='minbestcolscore',Delimiter=' '),
+
+ # Minimum smoothed score a column must have to be an anchor.
+ '-minsmoothscore':ValuedParameter('-',Name='minsmoothscore',Delimiter=' '),
+
+ # Objective score used by tree dependent refinement.
+ # sp=sum-of-pairs score.
+ # spf=sum-of-pairs score (dimer approximation)
+ # spm=sp for < 100 seqs, otherwise spf
+ # dp=dynamic programming score.
+ # ps=average profile-sequence score.
+ # xp=cross profile score.
+ '-objscore':ValuedParameter('-',Name='objscore',Delimiter=' '),
+
+ # Where to write the alignment.
+ '-out':ValuedParameter('-',Name='out',Delimiter=' ', Quote="\""),
+
+ # Where to write the file in phylip sequential format (v3.6 only).
+ '-physout':ValuedParameter('-',Name='physout',Delimiter=' '),
+
+ # Where to write the file in phylip interleaved format (v3.6 only).
+ '-phyiout':ValuedParameter('-',Name='phyiout',Delimiter=' '),
+
+ # Set to profile for aligning two alignments and adding seqs to an
+ # existing alignment
+ '-profile':FlagParameter(Prefix='-',Name='profile'),
+
+ # Method used to root tree; root1 is used in iteration 1 and 2, root2
+ # in later iterations.
+ '-root1':ValuedParameter('-',Name='root1',Delimiter=' '),
+ '-root2':ValuedParameter('-',Name='root2',Delimiter=' '),
+
+ # Sequence type.
+ '-seqtype':ValuedParameter('-',Name='seqtype',Delimiter=' '),
+
+ # Maximum value of column score for smoothing purposes.
+ '-smoothscoreceil':ValuedParameter('-',Name='smoothscoreceil',Delimiter=' '),
+
+ # Constant used in UPGMB clustering. Determines the relative fraction
+ # of average linkage (SUEFF) vs. nearest-neighbor linkage (1 - SUEFF).
+ '-SUEFF':ValuedParameter('-',Name='SUEFF',Delimiter=' '),
+
+ # Save tree produced in first or second iteration to given file in
+ # Newick (Phylip-compatible) format.
+ '-tree1':ValuedParameter('-',Name='tree1',Delimiter=' ', Quote="\""),
+ '-tree2':ValuedParameter('-',Name='tree2',Delimiter=' ', Quote="\""),
+
+ # Sequence weighting scheme.
+ # weight1 is used in iterations 1 and 2.
+ # weight2 is used for tree-dependent refinement.
+ # none=all sequences have equal weight.
+ # henikoff=Henikoff & Henikoff weighting scheme.
+ # henikoffpb=Modified Henikoff scheme as used in PSI-BLAST.
+ # clustalw=CLUSTALW method.
+ # threeway=Gotoh three-way method.
+ '-weight1':ValuedParameter('-',Name='weight1',Delimiter=' '),
+ '-weight2':ValuedParameter('-',Name='weight2',Delimiter=' '),
+
+ # Use anchor optimization in tree dependent refinement iterations
+ '-anchors':FlagParameter(Prefix='-',Name='anchors'),
+
+ # Write output in CLUSTALW format (default is FASTA).
+ '-clw':FlagParameter(Prefix='-',Name='clw'),
+
+ # Cluster sequences
+ '-clusteronly':FlagParameter(Prefix='-',Name='clusteronly'),
+ # neighborjoining is "unrecognized"
+ #'-neighborjoining':FlagParameter(Prefix='-',Name='neighborjoining'),
+
+
+ # Write output in CLUSTALW format with the "CLUSTAL W (1.81)" header
+ # rather than the MUSCLE version. This is useful when a post-processing
+ # step is picky about the file header.
+ '-clwstrict':FlagParameter(Prefix='-',Name='clwstrict'),
+
+ # Do not catch exceptions.
+ '-core':FlagParameter(Prefix='-',Name='core'),
+
+ # Write output in FASTA format. Alternatives include -clw,
+ # -clwstrict, -msf and -html.
+ '-fasta':FlagParameter(Prefix='-',Name='fasta'),
+
+ # Group similar sequences together in the output. This is the default.
+ # See also -stable.
+ '-group':FlagParameter(Prefix='-',Name='group'),
+
+ # Write output in HTML format (default is FASTA).
+ '-html':FlagParameter(Prefix='-',Name='html'),
+
+ # Use log-expectation profile score (VTML240). Alternatives are to use
+ # -sp or -sv. This is the default for amino acid sequences.
+ '-le':FlagParameter(Prefix='-',Name='le'),
+
+ # Write output in MSF format (default is FASTA).
+ '-msf':FlagParameter(Prefix='-',Name='msf'),
+
+ # Disable anchor optimization. Default is -anchors.
+ '-noanchors':FlagParameter(Prefix='-',Name='noanchors'),
+
+ # Catch exceptions and give an error message if possible.
+ '-nocore':FlagParameter(Prefix='-',Name='nocore'),
+
+ # Do not display progress messages.
+ '-quiet':FlagParameter(Prefix='-',Name='quiet'),
+
+ # Input file is already aligned, skip first two iterations and begin
+ # tree dependent refinement.
+ '-refine':FlagParameter(Prefix='-',Name='refine'),
+
+ # Use sum-of-pairs protein profile score (PAM200). Default is -le.
+ '-sp':FlagParameter(Prefix='-',Name='sp'),
+
+ # Use sum-of-pairs nucleotide profile score (BLASTZ parameters). This
+ # is the only option for nucleotides, and is therefore the default.
+ '-spn':FlagParameter(Prefix='-',Name='spn'),
+
+ # Preserve input order of sequences in output file. Default is to group
+ # sequences by similarity (-group).
+ '-stable':FlagParameter(Prefix='-',Name='stable'),
+
+ # Use sum-of-pairs profile score (VTML240). Default is -le.
+ '-sv':FlagParameter(Prefix='-',Name='sv'),
+
+ # Diagonal optimization
+ '-diags':FlagParameter(Prefix='-',Name='diags'),
+ '-diags1':FlagParameter(Prefix='-',Name='diags1'),
+ '-diags2':FlagParameter(Prefix='-',Name='diags2'),
+
+
+ # Terminal gaps penalized with full penalty.
+ # [1] Not fully supported in this version.
+ '-termgapsfull':FlagParameter(Prefix='-',Name='termgapsfull'),
+
+ # Terminal gaps penalized with half penalty.
+ # [1] Not fully supported in this version.
+ '-termgapshalf':FlagParameter(Prefix='-',Name='termgapshalf'),
+
+ # Terminal gaps penalized with half penalty if gap relative to
+ # longer sequence, otherwise with full penalty.
+ # [1] Not fully supported in this version.
+ '-termgapshalflonger':FlagParameter(Prefix='-',Name='termgapshalflonger'),
+
+ # Write parameter settings and progress messages to log file.
+ '-verbose':FlagParameter(Prefix='-',Name='verbose'),
+
+ # Write version string to stdout and exit.
+ '-version':FlagParameter(Prefix='-',Name='version'),
+ }
+
+ _parameters = {}
+ _parameters.update(_options)
+ _command = "muscle"
+
+ def _input_as_seqs(self,data):
+ lines = []
+ for i,s in enumerate(data):
+ #will number the sequences 1,2,3,etc.
+ lines.append(''.join(['>',str(i+1)]))
+ lines.append(s)
+ return self._input_as_lines(lines)
+
+ def _input_as_lines(self,data):
+ if data:
+ self.Parameters['-in']\
+ .on(super(Muscle,self)._input_as_lines(data))
+
+ return ''
+
+ def _input_as_string(self,data):
+ """Makes data the value of a specific parameter
+
+ This method returns the empty string. The parameter will be printed
+ automatically once set.
+ """
+ if data:
+ self.Parameters['-in'].on(str(data))
+ return ''
+
+ def _input_as_multiline_string(self, data):
+ if data:
+ self.Parameters['-in']\
+ .on(super(Muscle,self)._input_as_multiline_string(data))
+ return ''
+
+ def _input_as_multifile(self, data):
+ """For use with the -profile option
+
+ This input handler expects data to be a tuple containing two
+ filenames. Index 0 will be set to -in1 and index 1 to -in2
+ """
+ if data:
+ try:
+ filename1, filename2 = data
+ except:
+ raise ValueError, "Expected two filenames"
+
+ self.Parameters['-in'].off()
+ self.Parameters['-in1'].on(filename1)
+ self.Parameters['-in2'].on(filename2)
+ return ''
+
+ def _align_out_filename(self):
+
+ if self.Parameters['-out'].isOn():
+ aln_filename = self._absolute(str(self.Parameters['-out'].Value))
+ else:
+ raise ValueError, "No output file specified."
+ return aln_filename
+
+ def _tree1_out_filename(self):
+
+ if self.Parameters['-tree1'].isOn():
+ aln_filename = self._absolute(str(self.Parameters['-tree1'].Value))
+ else:
+ raise ValueError, "No tree output file specified."
+ return aln_filename
+
+ def _tree2_out_filename(self):
+
+ if self.Parameters['-tree2'].isOn():
+ tree_filename = self._absolute(str(self.Parameters['-tree2'].Value))
+ else:
+ raise ValueError, "No tree output file specified."
+ return tree_filename
+
+ def _get_result_paths(self,data):
+
+ result = {}
+ if self.Parameters['-out'].isOn():
+ out_name = self._align_out_filename()
+ result['MuscleOut'] = ResultPath(Path=out_name,IsWritten=True)
+ if self.Parameters['-tree1'].isOn():
+ out_name = self._tree1_out_filename()
+ result['Tree1Out'] = ResultPath(Path=out_name,IsWritten=True)
+ if self.Parameters['-tree2'].isOn():
+ out_name = self._tree2_out_filename()
+ result['Tree2Out'] = ResultPath(Path=out_name,IsWritten=True)
+ return result
+
+
+ def getHelp(self):
+ """Muscle help"""
+
+ help_str = """
+"""
+ return help_str
+
+#SOME FUNCTIONS TO EXECUTE THE MOST COMMON TASKS
+def muscle_seqs(seqs,
+ add_seq_names=False,
+ out_filename=None,
+ input_handler=None,
+ params={},
+ WorkingDir=tempfile.gettempdir(),
+ SuppressStderr=None,
+ SuppressStdout=None):
+ """Muscle align list of sequences.
+
+ seqs: a list of sequences as strings or objects, you must set add_seq_names=True
+ or sequences in a multiline string, as read() from a fasta file
+ or sequences in a list of lines, as readlines() from a fasta file
+ or a fasta seq filename.
+
+ == e.g., test code for guessing the input handler
+ #guess_input_handler should correctly identify input
+ gih = guess_input_handler
+ self.assertEqual(gih('abc.txt'), '_input_as_string')
+ self.assertEqual(gih('>ab\nTCAG'), '_input_as_multiline_string')
+ self.assertEqual(gih(['ACC','TGA'], True), '_input_as_seqs')
+ self.assertEqual(gih(['>a','ACC','>b','TGA']), '_input_as_lines')
+
+ == docstring for blast_seqs, apply to muscle_seqs ==
+ seqs: either file name or list of sequence objects or list of strings or
+ single multiline string containing sequences.
+
+ WARNING: DECISION RULES FOR INPUT HANDLING HAVE CHANGED. Decision rules
+ for data are as follows. If it's a list, treat as lines, unless
+ add_seq_names is true (in which case treat as list of seqs). If it's a
+ string, test whether it has newlines. If it doesn't have newlines, assume
+ it's a filename. If it does have newlines, it can't be a filename, so
+ assume it's a multiline string containing sequences.
+
+ If you want to skip the detection and force a specific type of input
+ handler, use input_handler='your_favorite_handler'.
+
+ add_seq_names: boolean. if True, sequence names are inserted in the list
+ of sequences. if False, it assumes seqs is a list of lines of some
+ proper format that the program can handle
+
+ Addl docs coming soon
+ """
+
+ if out_filename:
+ params["-out"] = out_filename
+ #else:
+ # params["-out"] = get_tmp_filename(WorkingDir)
+
+ ih = input_handler or guess_input_handler(seqs, add_seq_names)
+ muscle_app = Muscle(
+ params=params,
+ InputHandler=ih,
+ WorkingDir=WorkingDir,
+ SuppressStderr=SuppressStderr,
+ SuppressStdout=SuppressStdout)
+ return muscle_app(seqs)
+
+
+def cluster_seqs(seqs,
+ neighbor_join=False,
+ params={},
+ add_seq_names=True,
+ WorkingDir=tempfile.gettempdir(),
+ SuppressStderr=None,
+ SuppressStdout=None,
+ max_chars=1000000,
+ max_hours=1.0,
+ constructor=PhyloNode,
+ clean_up=True
+ ):
+ """Muscle cluster list of sequences.
+
+ seqs: either file name or list of sequence objects or list of strings or
+ single multiline string containing sequences.
+
+ Addl docs coming soon
+ """
+ num_seqs = len(seqs)
+ if num_seqs < 2:
+ raise ValueError, "Muscle requres 2 or more sequences to cluster."
+
+
+ num_chars = sum(map(len, seqs))
+ if num_chars > max_chars:
+ params["-maxiters"] = 2
+ params["-diags1"] = True
+ params["-sv"] = True
+ #params["-distance1"] = "kmer6_6"
+ #params["-distance1"] = "kmer20_3"
+ #params["-distance1"] = "kbit20_3"
+ print "lots of chars, using fast align", num_chars
+
+
+ params["-maxhours"] = max_hours
+ #params["-maxiters"] = 10
+
+ #cluster_type = "upgmb"
+ #if neighbor_join:
+ # cluster_type = "neighborjoining"
+
+ params["-clusteronly"] = True
+ params["-tree1"] = get_tmp_filename(WorkingDir)
+
+ muscle_res = muscle_seqs(seqs,
+ params=params,
+ add_seq_names=add_seq_names,
+ WorkingDir=WorkingDir,
+ SuppressStderr=SuppressStderr,
+ SuppressStdout=SuppressStdout)
+
+ tree = DndParser(muscle_res["Tree1Out"], constructor=constructor)
+
+ if clean_up:
+ muscle_res.cleanUp()
+ return tree
+
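+# A minimal usage sketch for cluster_seqs, assuming muscle is on PATH; the
+# toy sequences are illustrative only.
+def _example_cluster_seqs():
+    seqs = ['>a', 'ACGGCTACGTTA', '>b', 'ACGGCTACGATA', '>c', 'TCGGCTACGTTC']
+    tree = cluster_seqs(seqs, add_seq_names=False)
+    return tree.getNewick(with_distances=True)
+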
+def aln_tree_seqs(seqs,
+ input_handler=None,
+ tree_type='neighborjoining',
+ params={},
+ add_seq_names=True,
+ WorkingDir=tempfile.gettempdir(),
+ SuppressStderr=None,
+ SuppressStdout=None,
+ max_hours=5.0,
+ constructor=PhyloNode,
+ clean_up=True
+ ):
+ """Muscle align sequences and report tree from iteration2.
+
+    Unlike cluster_seqs, this returns tree2, the tree made during the second
+    muscle iteration (it should be more accurate than the cluster tree from
+    the first iteration, which is built quickly from k-mer words).
+
+ seqs: either file name or list of sequence objects or list of strings or
+ single multiline string containing sequences.
+ tree_type: can be either neighborjoining (default) or upgmb for UPGMA
+ clean_up: When true, will clean up output files
+ """
+
+ params["-maxhours"] = max_hours
+ if tree_type:
+ params["-cluster2"] = tree_type
+ params["-tree2"] = get_tmp_filename(WorkingDir)
+ params["-out"] = get_tmp_filename(WorkingDir)
+
+ muscle_res = muscle_seqs(seqs,
+ input_handler=input_handler,
+ params=params,
+ add_seq_names=add_seq_names,
+ WorkingDir=WorkingDir,
+ SuppressStderr=SuppressStderr,
+ SuppressStdout=SuppressStdout)
+ tree = DndParser(muscle_res["Tree2Out"], constructor=constructor)
+ aln = [line for line in muscle_res["MuscleOut"]]
+
+ if clean_up:
+ muscle_res.cleanUp()
+ return tree, aln
+
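+# A minimal usage sketch for aln_tree_seqs, assuming muscle is on PATH and
+# that the input fasta file exists (illustrative path only).
+def _example_aln_tree_seqs():
+    tree, aln = aln_tree_seqs('/tmp/unaligned.fasta',
+                              tree_type='neighborjoining')
+    return tree.getNewick(with_distances=True), ''.join(aln)
+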
+def fastest_aln_seqs(seqs,
+ params={},
+ out_filename=None,
+ add_seq_names=True,
+ WorkingDir=tempfile.gettempdir(),
+ SuppressStderr=None,
+ SuppressStdout=None
+ ):
+ """Fastest (and least accurate) version of muscle
+
+ seqs: either file name or list of sequence objects or list of strings or
+ single multiline string containing sequences.
+
+ Addl docs coming soon
+ """
+
+ params["-maxiters"] = 1
+ params["-diags1"] = True
+ params["-sv"] = True
+ params["-distance1"] = "kbit20_3"
+
+ muscle_res = muscle_seqs(seqs,
+ params=params,
+ add_seq_names=add_seq_names,
+ out_filename=out_filename,
+ WorkingDir=WorkingDir,
+ SuppressStderr=SuppressStderr,
+ SuppressStdout=SuppressStdout)
+ return muscle_res
+
+def align_unaligned_seqs(seqs, moltype=DNA, params=None):
+ """Returns an Alignment object from seqs.
+
+ seqs: SequenceCollection object, or data that can be used to build one.
+
+ moltype: a MolType object. DNA, RNA, or PROTEIN.
+
+ params: dict of parameters to pass in to the Muscle app controller.
+
+ Result will be an Alignment object.
+ """
+ if not params:
+ params = {}
+ #create SequenceCollection object from seqs
+ seq_collection = SequenceCollection(seqs,MolType=moltype)
+ #Create mapping between abbreviated IDs and full IDs
+ int_map, int_keys = seq_collection.getIntMap()
+ #Create SequenceCollection from int_map.
+ int_map = SequenceCollection(int_map,MolType=moltype)
+ #get temporary filename
+ params.update({'-out':get_tmp_filename()})
+ #Create Muscle app.
+ app = Muscle(InputHandler='_input_as_multiline_string',\
+ params=params, WorkingDir=tempfile.gettempdir())
+ #Get results using int_map as input to app
+ res = app(int_map.toFasta())
+ #Get alignment as dict out of results
+ alignment = dict(parse_fasta(res['MuscleOut']))
+ #Make new dict mapping original IDs
+ new_alignment = {}
+ for k,v in alignment.items():
+ new_alignment[int_keys[k]]=v
+ #Create an Alignment object from alignment dict
+ new_alignment = Alignment(new_alignment,MolType=moltype)
+ #Clean up
+ res.cleanUp()
+ del(seq_collection,int_map,int_keys,app,res,alignment,params)
+
+ return new_alignment
+
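+# A minimal usage sketch for align_unaligned_seqs, assuming muscle is on
+# PATH; the toy sequences are illustrative only.
+def _example_align_unaligned_seqs():
+    seqs = {'seq1': 'ACGTACGTTACG', 'seq2': 'ACGTACGTACG'}
+    aln = align_unaligned_seqs(seqs, moltype=DNA)
+    return aln.toFasta()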
+
+def align_and_build_tree(seqs, moltype, best_tree=False, params=None):
+ """Returns an alignment and a tree from Sequences object seqs.
+
+ seqs: a cogent.core.alignment.SequenceCollection object, or data that can
+ be used to build one.
+
+ moltype: cogent.core.moltype.MolType object
+
+ best_tree: if True (default:False), uses a slower but more accurate
+ algorithm to build the tree.
+
+ params: dict of parameters to pass in to the Muscle app controller.
+
+ The result will be a tuple containing a cogent.core.alignment.Alignment
+ and a cogent.core.tree.PhyloNode object (or None for the alignment
+ and/or tree if either fails).
+ """
+ aln = align_unaligned_seqs(seqs, moltype=moltype, params=params)
+ tree = build_tree_from_alignment(aln, moltype, best_tree, params)
+ return {'Align':aln, 'Tree':tree}
+
+def build_tree_from_alignment(aln, moltype=DNA, best_tree=False, params=None):
+ """Returns a tree from Alignment object aln.
+
+ aln: a cogent.core.alignment.Alignment object, or data that can be used
+ to build one.
+
+ moltype: cogent.core.moltype.MolType object
+
+ best_tree: unsupported
+
+ params: dict of parameters to pass in to the Muscle app controller.
+
+    The result will be a cogent.core.tree.PhyloNode object, or None if tree
+    building fails.
+ """
+ # Create instance of app controller, enable tree, disable alignment
+ app = Muscle(InputHandler='_input_as_multiline_string', params=params, \
+ WorkingDir=tempfile.gettempdir())
+
+ app.Parameters['-clusteronly'].on()
+ app.Parameters['-tree1'].on(get_tmp_filename(app.WorkingDir))
+ app.Parameters['-seqtype'].on(moltype.label)
+
+ seq_collection = SequenceCollection(aln, MolType=moltype)
+
+ #Create mapping between abbreviated IDs and full IDs
+ int_map, int_keys = seq_collection.getIntMap()
+ #Create SequenceCollection from int_map.
+ int_map = SequenceCollection(int_map,MolType=moltype)
+
+
+ # Collect result
+ result = app(int_map.toFasta())
+
+ # Build tree
+ tree = DndParser(result['Tree1Out'].read(), constructor=PhyloNode)
+
+ for tip in tree.tips():
+ tip.Name = int_keys[tip.Name]
+
+ # Clean up
+ result.cleanUp()
+ del(seq_collection, app, result)
+
+ return tree
+
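+# A minimal usage sketch for build_tree_from_alignment, assuming muscle is on
+# PATH; the toy alignment is illustrative only.
+def _example_build_tree_from_alignment():
+    aln = {'a': 'ACGTACGTTA', 'b': 'ACGTACATTA', 'c': 'ACGAACGTTA'}
+    tree = build_tree_from_alignment(aln, moltype=DNA)
+    return tree.getNewick(with_distances=True)
+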
+def add_seqs_to_alignment(seqs, aln, params=None):
+ """Returns an Alignment object from seqs and existing Alignment.
+
+ seqs: a cogent.core.alignment.SequenceCollection object, or data that can
+ be used to build one.
+
+ aln: a cogent.core.alignment.Alignment object, or data that can be used
+ to build one
+
+ params: dict of parameters to pass in to the Muscle app controller.
+ """
+ if not params:
+ params = {}
+
+ #create SequenceCollection object from seqs
+ seqs_collection = SequenceCollection(seqs)
+ #Create mapping between abbreviated IDs and full IDs
+ seqs_int_map, seqs_int_keys = seqs_collection.getIntMap(prefix='seq_')
+ #Create SequenceCollection from int_map.
+ seqs_int_map = SequenceCollection(seqs_int_map)
+
+ #create SequenceCollection object from aln
+ aln_collection = SequenceCollection(aln)
+ #Create mapping between abbreviated IDs and full IDs
+ aln_int_map, aln_int_keys = aln_collection.getIntMap(prefix='aln_')
+ #Create SequenceCollection from int_map.
+ aln_int_map = SequenceCollection(aln_int_map)
+
+ #set output and profile options
+ params.update({'-out':get_tmp_filename(), '-profile':True})
+
+ #save seqs to tmp file
+ seqs_filename = get_tmp_filename()
+ seqs_out = open(seqs_filename,'w')
+ seqs_out.write(seqs_int_map.toFasta())
+ seqs_out.close()
+
+ #save aln to tmp file
+ aln_filename = get_tmp_filename()
+ aln_out = open(aln_filename, 'w')
+ aln_out.write(aln_int_map.toFasta())
+ aln_out.close()
+
+ #Create Muscle app and get results
+ app = Muscle(InputHandler='_input_as_multifile', params=params,
+ WorkingDir=tempfile.gettempdir())
+ res = app((aln_filename, seqs_filename))
+
+ #Get alignment as dict out of results
+ alignment = dict(parse_fasta(res['MuscleOut']))
+ #Make new dict mapping original IDs
+ new_alignment = {}
+ for k,v in alignment.items():
+ if k in seqs_int_keys:
+ new_alignment[seqs_int_keys[k]] = v
+ else:
+ new_alignment[aln_int_keys[k]] = v
+
+ #Create an Alignment object from alignment dict
+ new_alignment = Alignment(new_alignment)
+
+ #Clean up
+ res.cleanUp()
+ del(seqs_collection, seqs_int_map, seqs_int_keys)
+ del(aln_collection, aln_int_map, aln_int_keys)
+ del(app, res, alignment, params)
+ remove(seqs_filename)
+ remove(aln_filename)
+
+ return new_alignment
+
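+# A minimal usage sketch for add_seqs_to_alignment, assuming muscle is on
+# PATH; the toy data are illustrative only.
+def _example_add_seqs_to_alignment():
+    aln = {'a': 'ACGTACGT-A', 'b': 'ACGTACATTA'}
+    seqs = {'c': 'ACGTACGTA'}
+    return add_seqs_to_alignment(seqs, aln).toFasta()
+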
+def align_two_alignments(aln1, aln2, params=None):
+ """Returns an Alignment object from two existing Alignments.
+
+ aln1, aln2: cogent.core.alignment.Alignment objects, or data that can be
+ used to build them.
+
+ params: dict of parameters to pass in to the Muscle app controller.
+ """
+ if not params:
+ params = {}
+
+ #create SequenceCollection object from aln1
+ aln1_collection = SequenceCollection(aln1)
+ #Create mapping between abbreviated IDs and full IDs
+ aln1_int_map, aln1_int_keys = aln1_collection.getIntMap(prefix='aln1_')
+ #Create SequenceCollection from int_map.
+ aln1_int_map = SequenceCollection(aln1_int_map)
+
+ #create SequenceCollection object from aln2
+ aln2_collection = SequenceCollection(aln2)
+ #Create mapping between abbreviated IDs and full IDs
+ aln2_int_map, aln2_int_keys = aln2_collection.getIntMap(prefix='aln2_')
+ #Create SequenceCollection from int_map.
+ aln2_int_map = SequenceCollection(aln2_int_map)
+
+ #set output and profile options
+ params.update({'-out':get_tmp_filename(), '-profile':True})
+
+ #save aln1 to tmp file
+ aln1_filename = get_tmp_filename()
+ aln1_out = open(aln1_filename,'w')
+ aln1_out.write(aln1_int_map.toFasta())
+ aln1_out.close()
+
+ #save aln2 to tmp file
+ aln2_filename = get_tmp_filename()
+ aln2_out = open(aln2_filename, 'w')
+ aln2_out.write(aln2_int_map.toFasta())
+ aln2_out.close()
+
+ #Create Muscle app and get results
+ app = Muscle(InputHandler='_input_as_multifile', params=params,
+ WorkingDir=tempfile.gettempdir())
+ res = app((aln1_filename, aln2_filename))
+
+ #Get alignment as dict out of results
+ alignment = dict(parse_fasta(res['MuscleOut']))
+
+ #Make new dict mapping original IDs
+ new_alignment = {}
+ for k,v in alignment.items():
+ if k in aln1_int_keys:
+ new_alignment[aln1_int_keys[k]] = v
+ else:
+ new_alignment[aln2_int_keys[k]] = v
+
+ #Create an Alignment object from alignment dict
+ new_alignment = Alignment(new_alignment)
+
+ #Clean up
+ res.cleanUp()
+ del(aln1_collection, aln1_int_map, aln1_int_keys)
+ del(aln2_collection, aln2_int_map, aln2_int_keys)
+ del(app, res, alignment, params)
+ remove(aln1_filename)
+ remove(aln2_filename)
+
+ return new_alignment
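+
+# A minimal usage sketch for align_two_alignments, assuming muscle is on
+# PATH; the toy alignments are illustrative only.
+def _example_align_two_alignments():
+    aln1 = {'a': 'ACGTACGTTA', 'b': 'ACGTACATTA'}
+    aln2 = {'c': 'ACGAACGT-A', 'd': 'ACGAACTTTA'}
+    return align_two_alignments(aln1, aln2).toFasta()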
diff --git a/bfillings/parsinsert.py b/bfillings/parsinsert.py
new file mode 100644
index 0000000..ce33754
--- /dev/null
+++ b/bfillings/parsinsert.py
@@ -0,0 +1,92 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+"""Application controller for ParsInsert
+
+designed for ParsInsert v1.03 """
+
+from StringIO import StringIO
+from os.path import splitext, join, abspath
+
+from burrito.parameters import ValuedParameter, FlagParameter, MixedParameter
+from burrito.util import (CommandLineApplication, FilePath, system,
+ CommandLineAppResult, ResultPath, remove,
+ ApplicationError)
+
+from cogent.core.tree import PhyloNode
+from cogent.parse.tree import DndParser
+from cogent.core.moltype import DNA, RNA, PROTEIN
+from cogent.core.alignment import SequenceCollection, Alignment
+from cogent.parse.phylip import get_align_for_phylip
+
+
+class ParsInsert(CommandLineApplication):
+ """ParsInsert application Controller"""
+
+ _command = 'ParsInsert'
+ _input_handler = '_input_as_multiline_string'
+ _parameters = {
+ # read mask from this file
+ '-m':ValuedParameter('-',Name='m',Delimiter=' '),
+
+ # read core tree sequences from this file
+ '-s':ValuedParameter('-',Name='s',Delimiter=' '),
+
+ # read core tree from this file
+ '-t':ValuedParameter('-',Name='t',Delimiter=' '),
+
+        # read core tree taxonomy from this file
+ '-x':ValuedParameter('-',Name='x',Delimiter=' '),
+
+ # output taxonomy for each insert sequence to this file
+ '-o':ValuedParameter('-',Name='o',Delimiter=' '),
+
+ # create log file
+ '-l':ValuedParameter('-',Name='l',Delimiter=' '),
+
+ # number of best matches to display
+ '-n':ValuedParameter('-',Name='n',Delimiter=' '),
+
+        # percent threshold cutoff
+ '-c':ValuedParameter('-',Name='c',Delimiter=' '),
+ }
+
+ def _handle_app_result_build_failure(self,out,err,exit_status,result_paths):
+ """ Catch the error when files are not produced """
+ raise ApplicationError, \
+ 'ParsInsert failed to produce an output file due to the following error: \n\n%s ' \
+ % err.read()
+
+ def _get_result_paths(self,data):
+ """ Get the resulting tree"""
+ result = {}
+ result['Tree'] = ResultPath(Path=splitext(self._input_filename)[0] + \
+ '.tree')
+ return result
+
+def insert_sequences_into_tree(aln, moltype, params={}):
+ """Returns a tree from placement of sequences
+ """
+    # convert aln to phylip, since seq names need to be fixed before running through ParsInsert
+ new_aln=get_align_for_phylip(StringIO(aln))
+
+ # convert aln to fasta in case it is not already a fasta file
+ aln2 = Alignment(new_aln)
+ seqs = aln2.toFasta()
+
+ parsinsert_app = ParsInsert(params=params)
+ result = parsinsert_app(seqs)
+
+ # parse tree
+ tree = DndParser(result['Tree'].read(), constructor=PhyloNode)
+
+ # cleanup files
+ result.cleanUp()
+
+ return tree
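+
+# A minimal usage sketch, assuming ParsInsert v1.03 is installed and that the
+# reference files below exist; all paths are illustrative only.
+def _example_insert_sequences_into_tree():
+    params = {'-t': '/tmp/core.tree',          # reference tree
+              '-s': '/tmp/core_seqs.fasta',    # reference sequences
+              '-x': '/tmp/core_taxonomy.txt'}  # reference taxonomy
+    query_aln = open('/tmp/query_aln.phy').read()  # phylip-formatted
+    tree = insert_sequences_into_tree(query_aln, DNA, params=params)
+    return tree.getNewick(with_distances=True)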
diff --git a/bfillings/pplacer.py b/bfillings/pplacer.py
new file mode 100644
index 0000000..66992dd
--- /dev/null
+++ b/bfillings/pplacer.py
@@ -0,0 +1,201 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+"""Application controller for pplacer 1.1"""
+
+from os.path import splitext, abspath, join, split
+from StringIO import StringIO
+
+from burrito.parameters import ValuedParameter, FlagParameter
+from burrito.util import (CommandLineApplication, FilePath, system,
+ CommandLineAppResult, ResultPath, remove,
+ ApplicationError, get_tmp_filename)
+
+from cogent.core.alignment import Alignment
+from cogent.app.guppy import build_tree_from_json_using_params
+from cogent.parse.phylip import get_align_for_phylip
+from cogent.parse.tree import DndParser
+from cogent.core.tree import PhyloNode
+
+class Pplacer(CommandLineApplication):
+ """pplacer Application Controller
+ """
+
+ _command = 'pplacer'
+ _input_handler = '_input_as_multiline_string'
+ _parameters = {
+ # -c Specify the path to the reference package.
+ '-c': ValuedParameter('-', Name='c', Delimiter=' ', IsPath=True),
+
+ # -t Specify the reference tree filename.
+ '-t': ValuedParameter('-', Name='t', Delimiter=' ', IsPath=True),
+
+ # -r Specify the reference alignment filename.
+ '-r': ValuedParameter('-', Name='r', Delimiter=' ', IsPath=True),
+
+ # -s Supply a phyml stats.txt or a RAxML info file giving the model parameters.
+ '-s': ValuedParameter('-', Name='s', Delimiter=' ', IsPath=True),
+
+ # -d Specify the directory containing the reference information.
+ '-d': ValuedParameter('-', Name='d', Delimiter=' ', IsPath=True),
+
+ # -p Calculate posterior probabilities.
+ '-p': FlagParameter('-', Name='p'),
+
+        # -m Substitution model. Protein models: LG, WAG, or JTT. Nucleotide model: GTR.
+ '-m': ValuedParameter('-', Name='m', Delimiter=' '),
+
+ # --model-freqs Use model frequencies instead of reference alignment frequencies.
+ '--model-freqs': FlagParameter('--', Name='model-freqs'),
+
+ # --gamma-cats Number of categories for discrete gamma model.
+ '--gamma-cats': ValuedParameter('--', Name='gamma-cats', Delimiter=' '),
+
+ # --gamma-alpha Specify the shape parameter for a discrete gamma model.
+ '--gamma-alpha': ValuedParameter('--', Name='gamma-alpha', Delimiter=' '),
+
+ # --ml-tolerance 1st stage branch len optimization tolerance (2nd stage to 1e-5). Default: 0.01.
+ '--ml-tolerance': ValuedParameter('--', Name='ml-tolerance', Delimiter=' '),
+
+ # --pp-rel-err Relative error for the posterior probability calculation. Default is 0.01.
+ '--pp-rel-err': ValuedParameter('--', Name='pp-rel-err', Delimiter=' '),
+
+ # --unif-prior Use a uniform prior rather than exponential.
+ '--unif-prior': FlagParameter('--', Name='unif-prior'),
+
+ # --start-pend Starting pendant branch length. Default is 0.1.
+ '--start-pend': ValuedParameter('--', Name='start-pend', Delimiter=' '),
+
+ # --max-pend Set the maximum ML pendant branch length. Default is 2.
+ '--max-pend': ValuedParameter('--', Name='max-pend', Delimiter=' '),
+
+ # --max-strikes Maximum number of strikes for baseball. 0 -> no ball playing. Default is 6.
+ '--max-strikes': ValuedParameter('--', Name='max-strikes', Delimiter=' '),
+
+ # --strike-box Set the size of the strike box in log likelihood units. Default is 3.
+ '--strike-box': ValuedParameter('--', Name='strike-box', Delimiter=' '),
+
+ # --max-pitches Set the maximum number of pitches for baseball. Default is 40.
+ '--max-pitches': ValuedParameter('--', Name='max-pitches', Delimiter=' '),
+
+ # --fantasy Desired likelihood cutoff for fantasy baseball mode. 0 -> no fantasy.
+ '--fantasy': ValuedParameter('--', Name='fantasy', Delimiter=' '),
+
+ # --fantasy-frac Fraction of fragments to use when running fantasy baseball. Default is 0.1.
+ '--fantasy-frac': ValuedParameter('--', Name='fantasy-frac', Delimiter=' '),
+
+ # --write-masked Write alignment masked to the region without gaps in the query.
+ '--write-masked': FlagParameter('--', Name='write-masked'),
+
+ # --verbosity Set verbosity level. 0 is silent, and 2 is quite a lot. Default is 1.
+ '--verbosity': ValuedParameter('--', Name='verbosity', Delimiter=' '),
+
+ # --unfriendly Do not run friend finder pre-analysis.
+ '--unfriendly': FlagParameter('--', Name='unfriendly'),
+
+ # --out-dir Specify the directory to write place files to.
+ '--out-dir': ValuedParameter('--', Name='out-dir', Delimiter=' ', IsPath=True),
+
+ # --pretend Only check out the files then report. Do not run the analysis.
+ '--pretend': FlagParameter('--', Name='pretend'),
+
+ # --csv Make a CSV file with the results.
+ '--csv': FlagParameter('--', Name='csv'),
+
+        # --old-format Make an old-format placefile with the results.
+ '--old-format': FlagParameter('--', Name='old-format'),
+
+ # --diagnostic Write file describing the 'diagnostic' mutations for various clades.
+ '--diagnostic': FlagParameter('--', Name='diagnostic'),
+
+ # --check-like Write out the likelihood of the reference tree, calculated two ways.
+ '--check-like': FlagParameter('--', Name='check-like'),
+
+ # --version Write out the version number and exit.
+ '--version': FlagParameter('--', Name='version'),
+
+ # --help Display this list of options
+ '--help': FlagParameter('--', Name='help'),
+ }
+
+ def getTmpFilename(self, tmp_dir="/tmp",prefix='tmp',suffix='.fasta',\
+ include_class_id=False,result_constructor=FilePath):
+ """ Define Tmp filename to contain .fasta suffix, since pplacer requires
+ the suffix to be .fasta """
+
+ return super(Pplacer,self).getTmpFilename(tmp_dir=tmp_dir,
+ prefix=prefix,
+ suffix=suffix,
+ include_class_id=include_class_id,
+ result_constructor=result_constructor)
+
+ def _handle_app_result_build_failure(self,out,err,exit_status,result_paths):
+ """ Catch the error when files are not produced """
+ raise ApplicationError, \
+ 'Pplacer failed to produce an output file due to the following error: \n\n%s ' \
+ % out.read()
+
+ def _get_result_paths(self,data):
+ """ Define the output filepaths """
+ output_dir = self.Parameters['--out-dir'].Value
+ result = {}
+ result['json'] = ResultPath(Path=join(output_dir,
+ splitext(split(self._input_filename)[-1])[0] + \
+ '.jplace'))
+ return result
+
+def insert_sequences_into_tree(aln, moltype, params={},
+ write_log=True):
+ """Returns a tree from Alignment object aln.
+
+ aln: an xxx.Alignment object, or data that can be used to build one.
+
+ moltype: cogent.core.moltype.MolType object
+
+    params: dict of parameters to pass in to the Pplacer app controller.
+
+    The result will be a tree (cogent.core.tree.PhyloNode), or None if
+    placement fails.
+ """
+
+    # convert aln to phylip, since seq names need to be fixed before running through pplacer
+
+ new_aln=get_align_for_phylip(StringIO(aln))
+
+ # convert aln to fasta in case it is not already a fasta file
+ aln2 = Alignment(new_aln)
+ seqs = aln2.toFasta()
+
+ ih = '_input_as_multiline_string'
+
+ pplacer_app = Pplacer(params=params,
+ InputHandler=ih,
+ WorkingDir=None,
+ SuppressStderr=False,
+ SuppressStdout=False)
+
+ pplacer_result = pplacer_app(seqs)
+
+ # write a log file
+ if write_log:
+ log_fp = join(params["--out-dir"],'log_pplacer_' + \
+ split(get_tmp_filename())[-1])
+ log_file=open(log_fp,'w')
+ log_file.write(pplacer_result['StdOut'].read())
+ log_file.close()
+
+ # use guppy to convert json file into a placement tree
+ guppy_params={'tog':None}
+
+ new_tree=build_tree_from_json_using_params(pplacer_result['json'].name, \
+ output_dir=params['--out-dir'], \
+ params=guppy_params)
+
+ pplacer_result.cleanUp()
+
+ return new_tree
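+
+# A minimal usage sketch, assuming pplacer and guppy are installed and that
+# the reference package and query alignment below exist (illustrative paths).
+def _example_insert_sequences_into_tree():
+    params = {'-c': '/tmp/my.refpkg', '--out-dir': '/tmp'}
+    query_aln = open('/tmp/query_aln.phy').read()  # phylip-formatted
+    # moltype is accepted for interface compatibility but is unused here
+    return insert_sequences_into_tree(query_aln, None, params=params)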
diff --git a/bfillings/raxml_v730.py b/bfillings/raxml_v730.py
new file mode 100644
index 0000000..84a7356
--- /dev/null
+++ b/bfillings/raxml_v730.py
@@ -0,0 +1,875 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+"""Application controller for RAxML (v7.3.0).
+
+WARNING: Because of the use of the -x option, this version is no longer
+compatible with RAxML version VI.
+"""
+from random import choice, randint
+from os import walk, listdir
+from os.path import isabs, join, split
+import re
+
+from cogent.core.tree import PhyloNode
+from cogent.core.alignment import Alignment
+from cogent.core.moltype import DNA, RNA, PROTEIN
+from cogent.parse.tree import DndParser
+from cogent.app.guppy import build_tree_from_json_using_params
+
+from burrito.parameters import FlagParameter, ValuedParameter, FilePath
+from burrito.util import (CommandLineApplication, ResultPath,
+ get_tmp_filename, ApplicationError)
+
+
+class Raxml(CommandLineApplication):
+ """RAxML application controller"""
+
+ _options ={
+
+        # Specify a column weight file name to assign individual weights to
+        # each column of the alignment. Those weights must be integers
+        # separated by any number and type of whitespace within a separate
+ # file, see file "example_weights" for an example.
+ '-a':ValuedParameter('-',Name='a',Delimiter=' '),
+
+ # Specify one of the secondary structure substitution models implemented
+ # in RAxML. The same nomenclature as in the PHASE manual is used,
+ # available models: S6A, S6B, S6C, S6D, S6E, S7A, S7B, S7C, S7D, S7E,
+ # S7F, S16, S16A, S16B
+ # DEFAULT: 16-state GTR model (S16)
+ '-A':ValuedParameter('-',Name='A',Delimiter=' '),
+
+ # Specify an integer number (random seed) for bootstrapping
+ '-b':ValuedParameter('-',Name='b',Delimiter=' '),
+
+ # specify a floating point number between 0.0 and 1.0 that will be used
+ # as cutoff threshold for the MR-based bootstopping criteria. The
+ # recommended setting is 0.03.
+ '-B':ValuedParameter('-',Name='B',Delimiter=' '),
+
+        # Specify number of distinct rate categories for raxml when
+ # ModelOfEvolution is set to GTRCAT or HKY85CAT.
+ # Individual per-site rates are categorized into numberOfCategories
+ # rate categories to accelerate computations. (Default = 50)
+ '-c':ValuedParameter('-',Name='c',Delimiter=' '),
+
+ # Conduct model parameter optimization on gappy, partitioned multi-gene
+ # alignments with per-partition branch length estimates (-M enabled)
+ # using the fast method with pointer meshes described in:
+ # Stamatakis and Ott: "Efficient computation of the phylogenetic
+ # likelihood function on multi-gene alignments and multi-core
+ # processors"
+ # WARNING: We can not conduct useful tree searches using this method
+ # yet! Does not work with Pthreads version.
+ '-C':ValuedParameter('-',Name='C',Delimiter=' '),
+
+ # This option allows you to start the RAxML search with a complete
+ # random starting tree instead of the default Maximum Parsimony
+ # Starting tree. On smaller datasets (around 100-200 taxa) it has
+ # been observed that this might sometimes yield topologies of distinct
+ # local likelihood maxima which better correspond to empirical
+ # expectations.
+ '-d':FlagParameter('-',Name='d'),
+
+ # ML search convergence criterion. This will break off ML searches if
+ # the relative Robinson-Foulds distance between the trees obtained from
+ # two consecutive lazy SPR cycles is smaller or equal to 1%. Usage
+ # recommended for very large datasets in terms of taxa. On trees with
+ # more than 500 taxa this will yield execution time improvements of
+        # approximately 50%, while yielding only slightly worse trees.
+ # DEFAULT: OFF
+ '-D':ValuedParameter('-',Name='D'),
+
+        # Set the model optimization precision in log likelihood units.
+ # Default is 0.1 log likelihood units, author recommends 1 or 2 to
+ # rapidly evaluate different trees.
+ '-e':ValuedParameter('-',Name='e',Delimiter=' '),
+
+        # specify an exclude file name that contains a specification of
+        # alignment positions you wish to exclude. Format is similar to
+        # Nexus; the file shall contain entries like "100-200 300-400". To
+        # exclude a single column write, e.g., "100-100". If you use a mixed
+        # model, an appropriately adapted model file will be written.
+ '-E':ValuedParameter('-',Name='E',Delimiter=' '),
+
+ # select search algorithm:
+ # a rapid Bootstrap analysis and search for best-scoring ML tree in
+ # one program run
+ # A compute marginal ancestral states on a ROOTED reference tree
+ # provided with "t" - ONLY IN 7.3.0
+ # b draw bipartition information on a tree provided with "-t" based on
+        #       multiple trees (e.g., from a bootstrap) in a file specified by
+ # "-z"
+ # c check if the alignment can be properly read by RAxML
+ # d for normal hill-climbing search (Default)
+ # when -f option is omitted this algorithm will be used
+ # e optimize model+branch lengths for given input tree under
+ # GAMMA/GAMMAI only
+ # E execute very fast experimental tree search, at present only for
+ # testing
+ # F execute fast experimental tree search, at present only for testing
+        #   g   compute per-site log likelihoods for one or more trees passed via
+ # "-z" and write them to a file that can be read by CONSEL
+ # WARNING: does not print likelihoods in the original column order
+ # h compute log likelihood test (SH-test) between best tree passed via
+ # "-t" and a bunch of other trees passed via "-z"
+ # i EXPERIMENTAL do not use for real tree inferences: conducts a
+ # single cycle of fast lazy SPR moves on a given input tree, to be
+ # used in combination with -C and -M
+ # I EXPERIMENTAL do not use for real tree inferences: conducts a
+ # single cycle of thorough lazy SPR moves on a given input tree,
+ # to be used in combination with -C and -M
+ # j generate a bunch of bootstrapped alignment files from an original
+        #       alignment file. You need to specify a seed with "-b" and the
+ # number of replicates with "-#"
+ # following "J" is for version 7.2.8
+ # J Compute SH-like support values on a given tree passed via "-t".
+ # m compare bipartitions between two bunches of trees passed via "-t"
+ # and "-z" respectively. This will return the Pearson correlation
+ # between all bipartitions found in the two tree files. A file
+        #       called RAxML_bipartitionFrequencies.outputFileName will be
+ # printed that contains the pair-wise bipartition frequencies of
+ # the two sets
+ # n compute the log likelihood score of all trees contained in a tree
+ # file provided by "-z" under GAMMA or GAMMA+P-Invar
+ # o old (slower) algorithm from v. 2.1.3
+ # p perform pure stepwise MP addition of new sequences to an
+ # incomplete starting tree and exit
+ # r compute pairwise Robinson-Foulds (RF) distances between all pairs
+ # of trees in a tree file passed via "-z" if the trees have node
+        #       labels represented as integer support values the program will
+ # also compute two flavors of the weighted Robinson-Foulds (WRF)
+ # distance
+ # following "R" is for version 7.2.8
+ # R compute rogue taxa using new statistical method based on the
+ # evolutionary placement algorithm
+ # WARNING: this is experimental code - DEPRECATED IN 7.3.0
+ # s (split) splits into individual genes, provided with model file
+ # following "S" is for version 7.2.8
+ # S compute site-specific placement bias using a leave one out test
+ # inspired by the evolutionary placement algorithm
+ # t do randomized tree searches on one fixed starting tree
+ # u execute morphological weight calibration using maximum likelihood,
+ # this will return a weight vector. you need to provide a
+ # morphological alignment and a reference tree via "-t"
+        #   U   execute morphological weight calibration using parsimony, this
+ # will return a weight vector. you need to provide a morphological
+ # alignment and a reference tree via "-t" - DEPRECATED IN 7.3.0
+ # v classify a bunch of environmental sequences into a reference tree
+ # using the slow heuristics without dynamic alignment you will
+ # need to start RAxML with a non-comprehensive reference tree and
+ # an alignment containing all sequences (reference + query)
+ # w compute ELW test on a bunch of trees passed via "-z"
+ # x compute pair-wise ML distances, ML model parameters will be
+ # estimated on an MP starting tree or a user-defined tree passed
+ # via "-t", only allowed for GAMMA-based models of rate
+ # heterogeneity
+ # y classify a bunch of environmental sequences into a reference tree
+ # using the fast heuristics without dynamic alignment you will
+ # need to start RAxML with a non-comprehensive reference tree and
+ # an alignment containing all sequences (reference + query)
+ '-f':ValuedParameter('-',Name='f',Delimiter=' ', Value="d"),
+
+ # enable ML tree searches under CAT model for very large trees without
+ # switching to GAMMA in the end (saves memory). This option can also be
+ # used with the GAMMA models in order to avoid the thorough optimization
+ # of the best-scoring ML tree in the end.
+ # DEFAULT: OFF
+ '-F':FlagParameter('-',Name='F'),
+
+ # select grouping file name: allows incomplete multifurcating constraint
+ # tree in newick format -- resolves multifurcations randomly, adds
+ # other taxa using parsimony insertion
+ '-g':ValuedParameter('-', Name='g',Delimiter=' '),
+
+ # enable the ML-based evolutionary placement algorithm heuristics by
+        # specifying a threshold value (fraction of insertion branches to be
+ # evaluated using slow insertions under ML).
+ '-G':FlagParameter('-', Name='G'),
+
+ # prints help and exits
+ '-h':FlagParameter('-', Name='h'),
+
+ # enable the MP-based evolutionary placement algorithm heuristics
+        # by specifying a threshold value (fraction of insertion branches to be
+ # evaluated using slow insertions under ML) - DEPRECATED IN 7.3.0
+ #'-H':ValuedParameter('-', Name='H',Delimiter=' '),
+
+ # allows initial rearrangement to be constrained, e.g. 10 means
+ # insertion will not be more than 10 nodes away from original.
+ # default is to pick a "good" setting.
+ '-i':ValuedParameter('-', Name='i', Delimiter=' '),
+
+ # a posteriori bootstopping analysis. Use:
+ # "-I autoFC" for the frequency-based criterion
+ # "-I autoMR" for the majority-rule consensus tree criterion
+ # "-I autoMRE" for the extended majority-rule consensus tree criterion
+ # "-I autoMRE_IGN" for metrics similar to MRE, but include
+ # bipartitions under the threshold whether they are compatible
+ # or not. This emulates MRE but is faster to compute.
+        #   You also need to pass a tree file containing several bootstrap
+ # replicates via "-z"
+ '-I':ValuedParameter('-', Name='I', Delimiter=' '),
+
+ # writes checkpoints (off by default)
+ '-j':FlagParameter('-', Name='j'),
+
+ # Compute majority rule consensus tree with "-J MR" or extended majority
+ # rule consensus tree with "-J MRE" or strict consensus tree with "-J
+ # STRICT" You will need to provide a tree file containing several
+ # UNROOTED trees via "-z"
+ '-J':ValuedParameter('-', Name='J', Delimiter=' '),
+
+        # Specifies that RAxML will optimize model parameters (for GTRMIX and
+ # GTRGAMMA) as well as calculating likelihoods for bootstrapped trees.
+ '-k':FlagParameter('-', Name='k'),
+
+ # Specify one of the multi-state substitution models (max 32 states)
+ # implemented in RAxML. Available models are: ORDERED, MK, GTR
+ '-K':ValuedParameter('-', Name='K', Delimiter=' '),
+
+ # Model of Binary (Morphological), Nucleotide, Multi-State, or Amino
+ # Acid Substitution::
+ # BINARY:
+ # -m BINCAT : Optimization of site-specific evolutionary rates which
+ # are categorized into numberOfCategories distinct rate categories
+ # for greater computational efficiency. Final tree might be
+ # evaluated automatically under BINGAMMA, depending on the tree
+ # search option
+ # -m BINCATI : Optimization of site-specific evolutionary rates which
+ # are categorized into numberOfCategories distinct rate categories
+ # for greater computational efficiency. Final tree might be
+ # evaluated automatically under BINGAMMAI, depending on the tree
+ # search option
+ # -m BINGAMMA : GAMMA model of rate heterogeneity (alpha parameter
+ # will be estimated)
+ # -m BINGAMMAI : Same as BINGAMMA, but with estimate of proportion of
+ # invariable sites
+ # NUCLEOTIDES
+ # -m GTRCAT: GTR + Optimization of substitution rates + Optimization
+ # of site-specific evolutionary rates which are categorized into
+ # numberOfCategories distinct rate categories for greater
+ # computational efficiency
+        # -m GTRCAT_FLOAT : Same as above but uses single-precision floating
+        #    point arithmetic instead of double precision. Usage is only
+        #    recommended for testing; the code will run slower, but can save
+ # almost 50% of memory. If you have problems with phylogenomic
+ # datasets and large memory requirements you may give it a shot.
+ # Keep in mind that numerical stability seems to be okay but needs
+ # further testing. - DEPRECATED IN 7.3.0
+ # -m GTRCATI : GTR + Optimization of substitution rates + Optimization
+ # of site-specific evolutionary rates which are categorized into
+ # numberOfCategories distinct rate categories for greater
+ # computational efficiency. Final tree might be evaluated under
+ # GTRGAMMAI, depending on the tree search option
+ # -m GTRGAMMA: GTR + Optimization of substitution rates + Gamma
+ # -m GTRGAMMA_FLOAT : Same as GTRGAMMA, but also with
+ # single-precision arithmetics, same cautionary notes as for
+ # GTRCAT_FLOAT apply. - DEPRECATED IN 7.3.0
+ # -m GTRGAMMAI : Same as GTRGAMMA, but with estimate of proportion of
+ # invariable sites
+ # MULTI-STATE:
+ # -m MULTICAT : Optimization of site-specific evolutionary rates which
+ # are categorized into numberOfCategories distinct rate categories
+ # for greater computational efficiency. Final tree might be
+ # evaluated automatically under MULTIGAMMA, depending on the tree
+ # search option
+ # -m MULTICATI : Optimization of site-specific evolutionary rates
+ # which are categorized into numberOfCategories distinct rate
+ # categories for greater computational efficiency. Final tree
+ # might be evaluated automatically under MULTIGAMMAI, depending on
+ # the tree search option
+ # -m MULTIGAMMA : GAMMA model of rate heterogeneity (alpha parameter
+ # will be estimated)
+ # -m MULTIGAMMAI : Same as MULTIGAMMA, but with estimate of proportion
+ # of invariable sites
+ # You can use up to 32 distinct character states to encode multi-state
+ # regions, they must be used in the following order: 0, 1, 2, 3, 4, 5,
+ # 6, 7, 8, 9, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S,
+ # T, U, V i.e., if you have 6 distinct character states you would use 0,
+ # 1, 2, 3, 4, 5 to encode these. The substitution model for the
+ # multi-state regions can be selected via the "-K" option
+ # Amino Acid Models:
+ # -m PROTCATmatrixName[F] : specified AA matrix + Optimization of
+ # substitution rates + Optimization of site-specific evolutionary
+ # rates which are categorized into numberOfCategories distinct
+ # rate categories for greater computational efficiency. Final
+ # tree might be evaluated automatically under
+ # PROTGAMMAmatrixName[f], depending on the tree search option
+ # -m PROTCATmatrixName[F]_FLOAT : PROTCAT with single precision
+ # arithmetics, same cautionary notes as for GTRCAT_FLOAT apply
+ # - DEPRECATED IN 7.3.0
+ # -m PROTCATImatrixName[F] : specified AA matrix + Optimization of
+ # substitution rates + Optimization of site-specific
+ # evolutionary rates which are categorized into numberOfCategories
+ # distinct rate categories for greater computational efficiency.
+ # Final tree might be evaluated automatically under
+ # PROTGAMMAImatrixName[f], depending on the tree search option
+ # -m PROTGAMMAmatrixName[F] : specified AA matrix + Optimization of
+ # substitution rates + GAMMA model of rate heterogeneity (alpha
+ # parameter will be estimated)
+ # -m PROTGAMMAmatrixName[F]_FLOAT : PROTGAMMA with single precision
+ # arithmetics, same cautionary notes as for GTRCAT_FLOAT apply
+ # - DEPRECATED IN 7.3.0
+ # -m PROTGAMMAImatrixName[F] : Same as PROTGAMMAmatrixName[F], but
+ # with estimate of proportion of invariable sites
+ # Available AA substitution models: DAYHOFF, DCMUT, JTT, MTREV, WAG,
+ # RTREV, CPREV, VT, BLOSUM62, MTMAM, LG, GTR. With the optional "F"
+ # appendix you can specify if you want to use empirical base frequencies
+ # Please note that for mixed models you can in addition specify the
+ # per-gene AA model in the mixed model file (see manual for details).
+ # Also note that if you estimate AA GTR parameters on a partitioned
+ # dataset, they will be linked (estimated jointly) across all partitions
+ # to avoid over-parametrization
+ '-m':ValuedParameter('-',Name='m',Delimiter=' '),
+
+ # Switch on estimation of individual per-partition branch lengths. Only
+ # has effect when used in combination with "-q". Branch lengths for
+ # individual partitions will be printed to separate files. A weighted
+ # average of the branch lengths is computed by using the respective
+ # partition lengths.
+ # DEFAULT: OFF
+ '-M':FlagParameter('-',Name='M'),
+
+ # Specifies the name of the output file.
+ '-n':ValuedParameter('-',Name='n',Delimiter=' '),
+
+ # Specifies the name of the outgroup (or outgroups: comma-delimited,
+ # no spaces, should be monophyletic).
+ '-o':ValuedParameter('-',Name='o',Delimiter=' '),
+
+ # Enable checkpointing using the dmtcp library available at
+        # http://dmtcp.sourceforge.net/. This only works if you precede the
+        # program call with the command "dmtcp_checkpoint" and if you compile
+        # a dedicated binary using the appropriate Makefile. With "-O" you can
+ # specify the interval between checkpoints in seconds.
+ # DEFAULT: 3600.0 seconds - DEPRECATED IN 7.3.0
+ #'-O':ValuedParameter('-',Name='O',Delimiter=' ',Value=3600.0),
+
+ # Specify a random number seed for the parsimony inferences. This allows
+ # you to reproduce your results and will help me debug the program.
+ '-p':ValuedParameter('-',Name='p',Delimiter=' '),
+
+ # Specify the file name of a user-defined AA (Protein) substitution
+ # model. This file must contain 420 entries, the first 400 being the AA
+ # substitution rates (this must be a symmetric matrix) and the last 20
+ # are the empirical base frequencies
+ '-P':ValuedParameter('-',Name='P',Delimiter=' '),
+
+ # Specified MultipleModel file name, in format:
+ # gene1 = 1-500
+ # gene2 = 501-1000
+ # (note: ranges can also be discontiguous, e.g. 1-100, 200-300,
+ # or can specify codon ranges as e.g. 1-100/3, 2-100/3, 3-100/3))
+ '-q':ValuedParameter('-', Name='q', Delimiter=' '),
+
+ # THE FOLLOWING "Q" is DEPRECATED IN 7.2.8
+ # Turn on computation of SH-like support values on tree.
+ # DEFAULT: OFF
+ '-Q':FlagParameter('-', Name='Q'),
+
+ # Constraint file name: allows a bifurcating Newick tree to be passed
+ # in as a constraint file, other taxa will be added by parsimony.
+ '-r':ValuedParameter('-',Name='r',Delimiter=' '),
+
+ # THE FOLLOWING "R" is IN 7.2.8
+ # Specify the file name of a binary model parameter file that has
+ # previously been generated with RAxML using the -f e tree evaluation
+ # option. The file name should be: RAxML_binaryModelParameters.runID
+ '-R':ValuedParameter('-',Name='R',Delimiter=' '),
+
+ # specify the name of the alignment data file, in relaxed PHYLIP
+ # format.
+ '-s':ValuedParameter('-',Name='s',Delimiter=' '),
+
+ # Specify the name of a secondary structure file. The file can contain
+ # "." for alignment columns that do not form part of a stem and
+ # characters "()<>[]{}" to define stem regions and pseudoknots
+ '-S':ValuedParameter('-',Name='S',Delimiter=' '),
+
+ # Specify a user starting tree file name in Newick format
+ '-t':ValuedParameter('-',Name='t',Delimiter=' '),
+
+ # PTHREADS VERSION ONLY! Specify the number of threads you want to run.
+ # Make sure to set "-T" to at most the number of CPUs you have on your
+ # machine, otherwise, there will be a huge performance decrease!
+ '-T':ValuedParameter('-',Name='T',Delimiter=' '),
+
+ # THE FOLLOWING "U" is IN 7.2.8
+ # Try to save memory by using SEV-based implementation for gap columns
+ # on large gappy alignments
+ # WARNING: this will only work for DNA under GTRGAMMA and is still in an
+ # experimental state.
+ '-U':ValuedParameter('-',Name='U',Delimiter=' '),
+
+ # Print the version
+ '-v':FlagParameter('-',Name='v'),
+
+ # Name of the working directory where RAxML-V will write its output
+ # files.
+ '-w':ValuedParameter('-',Name='w',Delimiter=' '),
+
+ # THE FOLLOWING "W" is IN 7.2.8
+ # Sliding window size for leave-one-out site-specific placement bias
+ # algorithm only effective when used in combination with "-f S"
+ # DEFAULT: 100 sites
+ '-W':ValuedParameter('-',Name='W',Delimiter=' '),
+
+ # Specify an integer number (random seed) and turn on rapid
+ # bootstrapping. CAUTION: unlike in version 7.0.4 RAxML will conduct
+ # rapid BS replicates under the model of rate heterogeneity you
+ # specified via "-m" and not by default under CAT
+ '-x':ValuedParameter('-',Name='x',Delimiter=' '),
+
+ # EXPERIMENTAL OPTION: This option will do a per-site estimate of
+ # protein substitution models by looping over all given, fixed models
+ # LG, WAG, JTT, etc and using their respective base frequencies to
+ # independently assign a prot subst. model to each site via ML
+ # optimization. At present this option only works with the GTR+GAMMA
+ # model, unpartitioned datasets, and in the sequential version only.
+ # DEFAULT: OFF
+ '-X':FlagParameter('-', Name='X'),
+
+ # Compute only randomized starting parsimony tree with RAxML, do not
+ # optimize an ML analysis of the tree
+ '-y':FlagParameter('-', Name='y'),
+
+ # Do a more thorough parsimony tree search using a parsimony ratchet and
+ # exit. Specify the number of ratchet searches via "-#" or "-N". This
+ # has just been implemented for completeness, if you want a fast MP
+ # implementation use TNT
+ # DEFAULT: OFF - DEPRECATED IN 7.3.0
+ #'-Y':FlagParameter('-', Name='Y'),
+
+ # Multiple tree file, for use with -f b (to draw bipartitions onto the
+ # common tree specified with -t)
+ '-z':ValuedParameter('-', Name='z', Delimiter=' '),
+
+ # Specifies number of runs on distinct starting trees.
+ '-#':ValuedParameter('-', Name='#', Delimiter=' ',Value=1),
+
+ # Specifies number of runs on distinct starting trees.
+ '-N':ValuedParameter('-', Name='N', Delimiter=' '),
+
+ }
+
+ _parameters = {}
+ _parameters.update(_options)
+ _command = "raxmlHPC"
+ _out_format = "RAxML_%s.%s"
+
+ def _format_output(self, outfile_name, out_type):
+ """ Prepend proper output prefix to output filename """
+
+ outfile_name = self._absolute(outfile_name)
+ outparts = outfile_name.split("/")
+ outparts[-1] = self._out_format % (out_type, outparts[-1] )
+
+ return '/'.join(outparts)
+
+ def _input_as_seqs(self,data):
+ lines = []
+ for i,s in enumerate(data):
+ #will number the sequences 1,2,3,etc.
+ lines.append(''.join(['>',str(i+1)]))
+ lines.append(s)
+ return self._input_as_lines(lines)
+
+ def _input_as_lines(self,data):
+ if data:
+ self.Parameters['-s']\
+ .on(super(Raxml,self)._input_as_lines(data))
+ return ''
+
+ def _input_as_string(self,data):
+ """Makes data the value of a specific parameter
+
+ This method returns the empty string. The parameter will be printed
+ automatically once set.
+ """
+        if data:
+            # the alignment file is passed to RAxML via "-s"; no "-in"
+            # parameter is defined in _options
+            self.Parameters['-s'].on(str(data))
+ return ''
+
+ def _input_as_multiline_string(self, data):
+ if data:
+ self.Parameters['-s']\
+ .on(super(Raxml,self)._input_as_multiline_string(data))
+ return ''
+
+ def _absolute(self,path):
+ path = FilePath(path)
+ if isabs(path):
+ return path
+ elif self.Parameters['-w'].isOn():
+ return self.Parameters['-w'].Value + path
+ else:
+ return self.WorkingDir + path
+
+ def _log_out_filename(self):
+ if self.Parameters['-n'].isOn():
+ return self._format_output(str(self.Parameters['-n'].Value), "log")
+ else:
+ raise ValueError, "No output file specified."
+
+ def _info_out_filename(self):
+ if self.Parameters['-n'].isOn():
+ return self._format_output(str(self.Parameters['-n'].Value), "info")
+ else:
+ raise ValueError, "No output file specified."
+
+ def _parsimony_tree_out_filename(self):
+ if self.Parameters['-n'].isOn():
+ return self._format_output(str(self.Parameters['-n'].Value), \
+ "parsimonyTree")
+ else:
+ raise ValueError, "No output file specified."
+
+ # added for tree-insertion
+ def _originallabelled_tree_out_filename(self):
+ if self.Parameters['-n'].isOn():
+ return self._format_output(str(self.Parameters['-n'].Value), \
+ "originalLabelledTree")
+ else:
+ raise ValueError, "No output file specified."
+
+ # added for tree-insertion
+ def _labelled_tree_out_filename(self):
+ if self.Parameters['-n'].isOn():
+ return self._format_output(str(self.Parameters['-n'].Value), \
+ "labelledTree")
+ else:
+ raise ValueError, "No output file specified."
+
+ # added for tree-insertion
+ def _classification_out_filename(self):
+ if self.Parameters['-n'].isOn():
+ return self._format_output(str(self.Parameters['-n'].Value), \
+ "classification")
+ else:
+ raise ValueError, "No output file specified."
+
+ # added for tree-insertion
+ def _classificationlikelihoodweights_out_filename(self):
+ if self.Parameters['-n'].isOn():
+ return self._format_output(str(self.Parameters['-n'].Value), \
+ "classificationLikelihoodWeights")
+ else:
+ raise ValueError, "No output file specified."
+
+ # added for tree-insertion
+ def _best_tree_out_filename(self):
+ if self.Parameters['-n'].isOn():
+ return self._format_output(str(self.Parameters['-n'].Value), \
+ "bestTree")
+ else:
+ raise ValueError, "No output file specified."
+
+ # added for tree-insertion
+ def _entropy_out_filename(self):
+ if self.Parameters['-n'].isOn():
+ return self._format_output(str(self.Parameters['-n'].Value), \
+ "entropy")
+ else:
+ raise ValueError, "No output file specified."
+
+ # added for tree-insertion
+ def _json_out_filename(self):
+ if self.Parameters['-n'].isOn():
+ return self._format_output(str(self.Parameters['-n'].Value), \
+ "portableTree")
+ else:
+ raise ValueError, "No output file specified."
+
+ # added for tree-insertion
+ def _parsimony_out_filename(self):
+ if self.Parameters['-n'].isOn():
+ return self._format_output(str(self.Parameters['-n'].Value), \
+ "equallyParsimoniousPlacements")
+ else:
+ raise ValueError, "No output file specified."
+
+ def _result_tree_out_filename(self):
+ if self.Parameters['-n'].isOn():
+ return self._format_output(str(self.Parameters['-n'].Value), \
+ "result")
+ else:
+ raise ValueError, "No output file specified."
+
+ def _result_bootstrap_out_filename(self):
+ if self.Parameters['-n'].isOn():
+ return self._format_output(str(self.Parameters['-n'].Value), \
+ "bootstrap")
+ else:
+ raise ValueError, "No output file specified"
+
+ def _checkpoint_out_filenames(self):
+ """
+        RAxML generates a large number of checkpoint files, so we need to
+        walk the directory to collect the names of all of them.
+ """
+ out_filenames = []
+ if self.Parameters['-n'].isOn():
+ out_name = str(self.Parameters['-n'].Value)
+ walk_root = self.WorkingDir
+ if self.Parameters['-w'].isOn():
+ walk_root = str(self.Parameters['-w'].Value)
+ for tup in walk(walk_root):
+ dpath, dnames, dfiles = tup
+ if dpath == walk_root:
+ for gen_file in dfiles:
+ if out_name in gen_file and "checkpoint" in gen_file:
+ out_filenames.append(walk_root + gen_file)
+ break
+
+ else:
+ raise ValueError, "No output file specified."
+ return out_filenames
+
+ def _handle_app_result_build_failure(self,out,err,exit_status,result_paths):
+ """ Catch the error when files are not produced """
+
+        # read stderr first; if reading fails (e.g., stderr was suppressed),
+        # fall back to a generic error message
+        try:
+            err_msg = err.read()
+        except Exception:
+            raise ApplicationError, 'RAxML failed to run properly.'
+        raise ApplicationError, \
+            'RAxML failed to produce an output file due to the following error: \n\n%s ' \
+            % err_msg
+
+ def _get_result_paths(self,data):
+
+ result = {}
+ result['Info'] = ResultPath(Path=self._info_out_filename(),
+ IsWritten=True)
+ if self.Parameters['-k'].isOn():
+ result['Bootstrap'] = ResultPath(
+ Path=self._result_bootstrap_out_filename(),
+ IsWritten=True)
+ elif self.Parameters["-f"].Value == 'v':
+ #these were added to handle the results from tree-insertion
+ result['Classification'] = ResultPath(
+ Path=self._classification_out_filename(),
+ IsWritten=True)
+ result['ClassificationLikelihoodWeights'] = ResultPath(
+ Path=self._classificationlikelihoodweights_out_filename(),
+ IsWritten=True)
+ result['OriginalLabelledTree'] = ResultPath(
+ Path=self._originallabelled_tree_out_filename(),
+ IsWritten=True)
+ result['Result'] = ResultPath(
+ Path=self._labelled_tree_out_filename(),IsWritten=True)
+ result['entropy'] = ResultPath(
+ Path=self._entropy_out_filename(),IsWritten=True)
+ result['json'] = ResultPath(
+ Path=self._json_out_filename()+'.jplace',IsWritten=True)
+ elif self.Parameters["-f"].Value == 'y':
+ #these were added to handle the results from tree-insertion
+
+ result['Parsimony'] = ResultPath(
+ Path=self._parsimony_out_filename(),
+ IsWritten=True)
+ result['OriginalLabelledTree'] = ResultPath(
+ Path=self._originallabelled_tree_out_filename(),
+ IsWritten=True)
+ result['json'] = ResultPath(
+ Path=self._json_out_filename()+'.jplace',IsWritten=True)
+ else:
+ result['Log'] = ResultPath(Path=self._log_out_filename(),
+ IsWritten=True)
+ result['ParsimonyTree'] = ResultPath(
+ Path=self._parsimony_tree_out_filename(),
+ IsWritten=True)
+ result['Result'] = ResultPath(
+ Path=self._result_tree_out_filename(),
+ IsWritten=True)
+ #
+ result['besttree'] = ResultPath(
+ Path=self._best_tree_out_filename(),
+ IsWritten=True)
+
+ for checkpoint_file in self._checkpoint_out_filenames():
+ checkpoint_num = checkpoint_file.split(".")[-1]
+ try:
+ checkpoint_num = int(checkpoint_num)
+            except ValueError:
+                raise ValueError, \
+                    "%s does not appear to be a valid checkpoint file" % checkpoint_file
+ result['Checkpoint%d' % checkpoint_num] = ResultPath(
+ Path=checkpoint_file,
+ IsWritten=True)
+
+ return result
+
+
+#SOME FUNCTIONS TO EXECUTE THE MOST COMMON TASKS
+def raxml_alignment(align_obj,
+ raxml_model="GTRCAT",
+ params={},
+ SuppressStderr=True,
+ SuppressStdout=True):
+ """Run raxml on alignment object
+
+ align_obj: Alignment object
+ params: you can set any params except -w and -n
+
+ returns: tuple (phylonode,
+ parsimonyphylonode,
+ log likelihood,
+ total exec time)
+ """
+
+ # generate temp filename for output
+ params["-w"] = "/tmp/"
+ params["-n"] = get_tmp_filename().split("/")[-1]
+ params["-m"] = raxml_model
+ params["-p"] = randint(1,100000)
+ ih = '_input_as_multiline_string'
+ seqs, align_map = align_obj.toPhylip()
+
+ #print params["-n"]
+
+ # set up command
+ raxml_app = Raxml(
+ params=params,
+ InputHandler=ih,
+ WorkingDir=None,
+ SuppressStderr=SuppressStderr,
+ SuppressStdout=SuppressStdout)
+
+ # run raxml
+ ra = raxml_app(seqs)
+
+ # generate tree
+ tree_node = DndParser(ra["Result"])
+
+ # generate parsimony tree
+ parsimony_tree_node = DndParser(ra["ParsimonyTree"])
+
+ # extract log likelihood from log file
+ log_file = ra["Log"]
+ total_exec_time = exec_time = log_likelihood = 0.0
+ for line in log_file:
+ exec_time, log_likelihood = map(float, line.split())
+ total_exec_time += exec_time
+
+ # remove output files
+ ra.cleanUp()
+
+ return tree_node, parsimony_tree_node, log_likelihood, total_exec_time
+
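+# A minimal usage sketch for raxml_alignment, assuming raxmlHPC is on PATH;
+# the toy alignment is illustrative only.
+def _example_raxml_alignment():
+    aln = Alignment({'a': 'ACGTACGTTA', 'b': 'ACGTACATTA',
+                     'c': 'ACGAACGTTA'})
+    tree, parsimony_tree, lnl, secs = raxml_alignment(aln)
+    return tree.getNewick(with_distances=True), lnl
+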
+def build_tree_from_alignment(aln, moltype=DNA, best_tree=False, params={}):
+ """Returns a tree from Alignment object aln.
+
+ aln: an xxx.Alignment object, or data that can be used to build one.
+
+ moltype: cogent.core.moltype.MolType object
+
+    best_tree: best_tree support is currently not implemented
+
+ params: dict of parameters to pass in to the RAxML app controller.
+
+ The result will be an xxx.Alignment object, or None if tree fails.
+ """
+ if best_tree:
+ raise NotImplementedError
+
+ if '-m' not in params:
+ if moltype == DNA or moltype == RNA:
+ #params["-m"] = 'GTRMIX'
+ # in version 7.2.3, GTRMIX is no longer supported but says GTRCAT
+ # behaves like GTRMIX (http://www.phylo.org/tools/raxmlhpc2.html)
+ params["-m"] = 'GTRGAMMA'
+        elif moltype == PROTEIN:
+            # "matrixName" here is a literal placeholder; supply a real AA
+            # matrix via params instead, e.g. params["-m"] = "PROTGAMMAWAG"
+            params["-m"] = 'PROTGAMMAmatrixName'
+ else:
+ raise ValueError, "Moltype must be either DNA, RNA, or PROTEIN"
+
+ if not hasattr(aln, 'toPhylip'):
+ aln = Alignment(aln)
+ seqs, align_map = aln.toPhylip()
+
+ # generate temp filename for output
+ params["-w"] = "/tmp/"
+ params["-n"] = get_tmp_filename().split("/")[-1]
+ params["-k"] = True
+ params["-p"] = randint(1,100000)
+ params["-x"] = randint(1,100000)
+
+ ih = '_input_as_multiline_string'
+
+ raxml_app = Raxml(params=params,
+ InputHandler=ih,
+ WorkingDir=None,
+ SuppressStderr=True,
+ SuppressStdout=True)
+
+ raxml_result = raxml_app(seqs)
+
+ tree = DndParser(raxml_result['Bootstrap'], constructor=PhyloNode)
+
+ for node in tree.tips():
+ node.Name = align_map[node.Name]
+
+ raxml_result.cleanUp()
+
+ return tree
+
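+# A minimal usage sketch for build_tree_from_alignment, assuming raxmlHPC is
+# on PATH; the toy alignment is illustrative only.
+def _example_build_tree_from_alignment():
+    aln = {'a': 'ACGTACGTTA', 'b': 'ACGTACATTA', 'c': 'ACGAACGTTA'}
+    tree = build_tree_from_alignment(aln, moltype=DNA)
+    return tree.getNewick(with_distances=True)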
+
+def insert_sequences_into_tree(seqs, moltype, params={},
+ write_log=True):
+ """Insert sequences into Tree.
+
+    seqs: the query sequences to place, in a form accepted by the
+        '_input_as_multiline_string' input handler.
+
+ moltype: cogent.core.moltype.MolType object
+
+ params: dict of parameters to pass in to the RAxML app controller.
+
+ The result will be a tree.
+ """
+
+ ih = '_input_as_multiline_string'
+
+ raxml_app = Raxml(params=params,
+ InputHandler=ih,
+ WorkingDir=None,
+ SuppressStderr=False,
+ SuppressStdout=False,
+ HALT_EXEC=False)
+
+ raxml_result = raxml_app(seqs)
+
+ # write a log file
+ if write_log:
+ log_fp = join(params["-w"],'log_raxml_'+split(get_tmp_filename())[-1])
+ log_file=open(log_fp,'w')
+ log_file.write(raxml_result['StdOut'].read())
+ log_file.close()
+
+ '''
+    # kept for future use: parsimony placement outputs no tree, only a
+    # jplace file, and that jplace output is currently corrupt
+
+ # use guppy to convert json file into a placement tree
+ guppy_params={'tog':None}
+
+ new_tree=build_tree_from_json_using_params(raxml_result['json'].name, \
+ output_dir=params["-w"], \
+ params=guppy_params)
+ '''
+
+    # get the tree from the 'Result' output, stripping the [I###]
+    # insertion labels that RAxML adds
+ new_tree=raxml_result['Result'].readlines()
+ filtered_tree=re.sub('\[I\d+\]','',str(new_tree))
+ tree = DndParser(filtered_tree, constructor=PhyloNode)
+
+ raxml_result.cleanUp()
+
+ return tree
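+
+# A minimal usage sketch, assuming raxmlHPC (v7.3.0) is on PATH and that the
+# reference files below exist; all paths are illustrative. "-f v" selects
+# placement of query sequences into a reference tree, which produces the
+# 'Result' file read above.
+def _example_insert_sequences_into_tree():
+    params = {'-w': '/tmp/', '-n': 'placement_run', '-f': 'v',
+              '-m': 'GTRGAMMA', '-t': '/tmp/reference.tree'}
+    seqs = open('/tmp/reference_plus_query.phy').read()
+    tree = insert_sequences_into_tree(seqs, DNA, params=params,
+                                      write_log=False)
+    return tree.getNewick(with_distances=True)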
diff --git a/bfillings/rdp_classifier.py b/bfillings/rdp_classifier.py
new file mode 100644
index 0000000..6c57fcf
--- /dev/null
+++ b/bfillings/rdp_classifier.py
@@ -0,0 +1,589 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+"""Application controller for rdp_classifier-2.0
+"""
+
+
+import os.path
+import re
+from os import environ, getenv
+from optparse import OptionParser
+from shutil import rmtree
+import tempfile
+import warnings
+
+from burrito.parameters import ValuedParameter
+from skbio.parse.sequences import parse_fasta
+from burrito.util import (CommandLineApplication, FilePath, ResultPath,
+ ApplicationNotFoundError, ApplicationError)
+
+from burrito.util import which
+
+
+class RdpClassifier(CommandLineApplication):
+
+ """RDP Classifier application controller
+
+ The RDP Classifier program is distributed as a java archive (.jar)
+ file. If the file 'rdp_classifier-2.2.jar' is not found in the
+ current directory, the app controller uses the JAR file specified
+ by the environment variable RDP_JAR_PATH. If this variable is not
+ set, and 'rdp_classifier-2.2.jar' is not found in the current
+ directory, the application controller raises an
+ ApplicationNotFoundError.
+
+ The RDP Classifier often requires memory in excess of Java's
+ default 64M. To correct this situation, the authors recommend
+ increasing the maximum heap size for the java virtual machine. An
+ option '-Xmx' (default 1000M) is provided for this purpose.
+ Details on this option may be found at
+ http://java.sun.com/j2se/1.5.0/docs/tooldocs/solaris/java.html
+
+ The classifier may optionally use a custom training set. The full
+ path to the training set properties file may be provided in the
+ option '-t'.
+ """
+ _input_handler = '_input_as_lines'
+ _command = "rdp_classifier-2.2.jar"
+ _options = {
+ # output file name for classification assignment
+ '-o': ValuedParameter('-', Name='o', Delimiter=' ', IsPath=True),
+ # a property file contains the mapping of the training
+ # files. Note: the training files and the property file should
+ # be in the same directory. The default property file is set
+ # to data/classifier/rRNAClassifier.properties.
+ '-t': ValuedParameter('-', Name='t', Delimiter=' ', IsPath=True),
+ # all tab delimited output format: [allrank|fixrank|db].
+ # Default is allrank.
+ #
+ # allrank: outputs the results for all ranks applied for
+ # each sequence: seqname, orientation, taxon name, rank,
+ # conf, ...
+ #
+ # fixrank: only outputs the results for fixed ranks in
+ # order: no rank, domain, phylum, class, order, family,
+ # genus
+ #
+ # db: outputs the seqname, trainset_no, tax_id, conf. This
+ # is good for storing in a database
+ '-f': ValuedParameter('-', Name='f', Delimiter=' '),
+ }
+
+ # The following are available in the attributes JvmParameters,
+ # JarParameters, and PositionalParameters
+
+ _jvm_synonyms = {}
+ _jvm_parameters = {
+ # Maximum heap size for JVM.
+ '-Xmx': ValuedParameter('-', Name='Xmx', Delimiter='', Value='1000m'),
+ }
+
+ _parameters = {}
+ _parameters.update(_options)
+ _parameters.update(_jvm_parameters)
+
+ def getHelp(self):
+ """Returns documentation string"""
+ # Summary paragraph copied from rdp_classifier-2.0, which is
+ # licensed under the GPL 2.0 and Copyright 2008 Michigan State
+ # University Board of Trustees
+ help_str = """\
+ usage: ClassifierCmd [-f <arg>] [-o <arg>] [-q <arg>] [-t <arg>]
+
+ -f,--format <arg> all tab delimited output format:
+ [allrank|fixrank|db]. Default is allrank.
+
+ allrank: outputs the results for all ranks applied for each
+ sequence: seqname, orientation, taxon name, rank, conf, ...
+
+ fixrank: only outputs the results for fixed ranks in order:
+ no rank, domain, phylum, class, order, family, genus
+
+ db: outputs the seqname, trainset_no, tax_id, conf. This is
+ good for storing in a database
+
+ -o,--outputFile <arg> output file name for classification
+ assignment
+
+ -q,--queryFile <arg> query file contains sequences in one of
+ the following formats: Fasta, Genbank and EMBL
+
+ -t,--train_propfile <arg> a property file contains the mapping
+ of the training files.
+
+ Note: the training files and the property file should be in
+ the same directory. The default property file is set to
+ data/classifier/rRNAClassifier.properties."""
+ return help_str
+
+ def _accept_exit_status(self, status):
+ """Returns false if an error occurred in execution
+ """
+ return (status == 0)
+
+ def _error_on_missing_application(self, params):
+ """Raise an ApplicationNotFoundError if the app is not accessible
+
+ In this case, checks for the java runtime and the RDP jar file.
+ """
+ if not (os.path.exists('java') or which('java')):
+ raise ApplicationNotFoundError(
+ "Cannot find java runtime. Is it installed? Is it in your "
+ "path?")
+ jar_fp = self._get_jar_fp()
+ if jar_fp is None:
+ raise ApplicationNotFoundError(
+ "JAR file not found in current directory and the RDP_JAR_PATH "
+ "environment variable is not set. Please set RDP_JAR_PATH to "
+ "the full pathname of the JAR file.")
+ if not os.path.exists(jar_fp):
+ raise ApplicationNotFoundError(
+ "JAR file %s does not exist." % jar_fp)
+
+ def _get_jar_fp(self):
+ """Returns the full path to the JAR file.
+
+ If the JAR file cannot be found in the current directory and
+ the environment variable RDP_JAR_PATH is not set, returns
+ None.
+ """
+ # handles case where the jar file is in the current working directory
+ if os.path.exists(self._command):
+ return self._command
+ # handles the case where the user has specified the location via
+ # an environment variable
+ elif 'RDP_JAR_PATH' in environ:
+ return getenv('RDP_JAR_PATH')
+ else:
+ return None
+
+ # Overridden to pull out JVM-specific command-line arguments.
+ def _get_base_command(self):
+ """Returns the base command plus command-line options.
+
+ Does not include input file, output file, and training set.
+ """
+ cd_command = ''.join(['cd ', str(self.WorkingDir), ';'])
+ jvm_command = "java"
+ jvm_arguments = self._commandline_join(
+ [self.Parameters[k] for k in self._jvm_parameters])
+ jar_arguments = '-jar "%s"' % self._get_jar_fp()
+ rdp_arguments = self._commandline_join(
+ [self.Parameters[k] for k in self._options])
+
+ command_parts = [
+ cd_command, jvm_command, jvm_arguments, jar_arguments,
+ rdp_arguments, '-q']
+ return self._commandline_join(command_parts).strip()
+
+ BaseCommand = property(_get_base_command)
+
+ def _commandline_join(self, tokens):
+ """Formats a list of tokens as a shell command
+
+ This seems to be a repeated pattern; may be useful in
+ superclass.
+ """
+ commands = filter(None, map(str, tokens))
+ return self._command_delimiter.join(commands).strip()
+
+ def _get_result_paths(self, data):
+ """ Return a dict of ResultPath objects representing all possible output
+ """
+ assignment_fp = str(self.Parameters['-o'].Value).strip('"')
+ if not os.path.isabs(assignment_fp):
+ assignment_fp = os.path.relpath(assignment_fp, self.WorkingDir)
+ return {'Assignments': ResultPath(assignment_fp, IsWritten=True)}
+
+
+class RdpTrainer(RdpClassifier):
+ _input_handler = '_input_as_lines'
+ TrainingClass = 'edu.msu.cme.rdp.classifier.train.ClassifierTraineeMaker'
+ PropertiesFile = 'RdpClassifier.properties'
+
+ _parameters = {
+ 'taxonomy_file': ValuedParameter(None, None, IsPath=True),
+ 'model_output_dir': ValuedParameter(None, None, IsPath=True),
+ 'training_set_id': ValuedParameter(None, None, Value='1'),
+ 'taxonomy_version': ValuedParameter(None, None, Value='version1'),
+ 'modification_info': ValuedParameter(None, None, Value='cogent'),
+ }
+ _jvm_parameters = {
+ # Maximum heap size for JVM.
+ '-Xmx': ValuedParameter('-', Name='Xmx', Delimiter='', Value='1000m'),
+ }
+ _parameters.update(_jvm_parameters)
+
+ def _get_base_command(self):
+ """Returns the base command plus command-line options.
+
+ Handles everything up to and including the classpath. The
+ positional training parameters are added by the
+ _input_handler_decorator method.
+ """
+ cd_command = ''.join(['cd ', str(self.WorkingDir), ';'])
+ jvm_command = "java"
+ jvm_args = self._commandline_join(
+ [self.Parameters[k] for k in self._jvm_parameters])
+ cp_args = '-cp "%s" %s' % (self._get_jar_fp(), self.TrainingClass)
+
+ command_parts = [cd_command, jvm_command, jvm_args, cp_args]
+ return self._commandline_join(command_parts).strip()
+
+ BaseCommand = property(_get_base_command)
+
+ def _set_input_handler(self, method_name):
+ """Stores the selected input handler in a private attribute.
+ """
+ self.__InputHandler = method_name
+
+ def _get_input_handler(self):
+ """Returns decorator that wraps the requested input handler.
+ """
+ return '_input_handler_decorator'
+
+ InputHandler = property(_get_input_handler, _set_input_handler)
+
+ @property
+ def ModelDir(self):
+ """Absolute FilePath to the training output directory.
+ """
+ model_dir = self.Parameters['model_output_dir'].Value
+ absolute_model_dir = os.path.abspath(model_dir)
+ return FilePath(absolute_model_dir)
+
+ def _input_handler_decorator(self, data):
+ """Adds positional parameters to selected input_handler's results.
+ """
+ input_handler = getattr(self, self.__InputHandler)
+ input_parts = [
+ self.Parameters['taxonomy_file'],
+ input_handler(data),
+ self.Parameters['training_set_id'],
+ self.Parameters['taxonomy_version'],
+ self.Parameters['modification_info'],
+ self.ModelDir,
+ ]
+ return self._commandline_join(input_parts)
+
+ def _get_result_paths(self, output_dir):
+ """Return a dict of output files.
+ """
+ # Only include the properties file here. Add the other result
+ # paths in the __call__ method, so we can catch errors if an
+ # output file is not written.
+ self._write_properties_file()
+ properties_fp = os.path.join(self.ModelDir, self.PropertiesFile)
+ result_paths = {
+ 'properties': ResultPath(properties_fp, IsWritten=True,)
+ }
+ return result_paths
+
+ def _write_properties_file(self):
+ """Write an RDP training properties file manually.
+ """
+ # The properties file specifies the names of the files in the
+ # training directory. We use the example properties file
+ # directly from the rdp_classifier distribution, which lists
+ # the default set of files created by the application. We
+ # must write this file manually after generating the
+ # training data.
+ properties_fp = os.path.join(self.ModelDir, self.PropertiesFile)
+ properties_file = open(properties_fp, 'w')
+ properties_file.write(
+ "# Sample ResourceBundle properties file\n"
+ "bergeyTree=bergeyTrainingTree.xml\n"
+ "probabilityList=genus_wordConditionalProbList.txt\n"
+ "probabilityIndex=wordConditionalProbIndexArr.txt\n"
+ "wordPrior=logWordPrior.txt\n"
+ "classifierVersion=Naive Bayesian rRNA Classifier Version 1.0, "
+ "November 2003\n"
+ )
+ properties_file.close()
+
+ def __call__(self, data=None, remove_tmp=True):
+ """Run the application with the specified kwargs on data
+
+ data: anything that can be cast into a string or written out
+ to a file. Usually either a list of things or a single
+ string or number. input_handler will be called on this data
+ before it is passed as part of the command-line argument, so
+ by creating your own input handlers you can customize what
+ kind of data you want your application to accept
+
+ remove_tmp: if True, removes tmp files
+ """
+ result = super(
+ RdpClassifier,
+ self).__call__(
+ data=data,
+ remove_tmp=remove_tmp)
+ training_files = {
+ 'bergeyTree': 'bergeyTrainingTree.xml',
+ 'probabilityList': 'genus_wordConditionalProbList.txt',
+ 'probabilityIndex': 'wordConditionalProbIndexArr.txt',
+ 'wordPrior': 'logWordPrior.txt',
+ }
+ for key, training_fn in sorted(training_files.items()):
+ training_fp = os.path.join(self.ModelDir, training_fn)
+ if not os.path.exists(training_fp):
+ exception_msg = (
+ "Training output file %s not found. This may "
+ "happen if an error occurred during the RDP training "
+ "process. More details may be available in the "
+ "standard error, printed below.\n\n" % training_fp
+ )
+ stderr_msg = result["StdErr"].read()
+ result["StdErr"].seek(0)
+ raise ApplicationError(exception_msg + stderr_msg)
+ # Not in try/except clause because we already know the
+ # file exists. Failure would be truly exceptional, and we
+ # want to maintain the original exception in that case.
+ result[key] = open(training_fp)
+ return result
+
+
+def parse_command_line_parameters(argv=None):
+ """ Parses command line arguments """
+ usage =\
+ 'usage: %prog [options] input_sequences_filepath'
+ version = 'Version: %prog ' + __version__
+ parser = OptionParser(usage=usage, version=version)
+
+ parser.add_option('-o', '--output_fp', action='store',
+ type='string', dest='output_fp', help='Path to store ' +
+ 'output file [default: generated from input_sequences_filepath]')
+
+ parser.add_option('-c', '--min_confidence', action='store',
+ type='float', dest='min_confidence', help='minimum confidence ' +
+ 'level to return a classification [default: %default]')
+
+ parser.set_defaults(verbose=False, min_confidence=0.80)
+
+ opts, args = parser.parse_args(argv)
+ if len(args) != 1:
+ parser.error('Exactly one argument is required.')
+
+ return opts, args
+
+
+def assign_taxonomy(
+ data, min_confidence=0.80, output_fp=None, training_data_fp=None,
+ fixrank=True, max_memory=None, tmp_dir=tempfile.gettempdir()):
+ """Assign taxonomy to each sequence in data with the RDP classifier
+
+ data: open fasta file object or list of fasta lines
+ min_confidence: minimum support threshold to assign taxonomy to a sequence
+ output_fp: path to write output; if not provided, result will be
+ returned in a dict of {seq_id:(taxonomy_assignment,confidence)}
+ """
+ # Going to iterate through this twice in succession, best to force
+ # evaluation now
+ data = list(data)
+
+ # RDP classifier doesn't preserve identifiers with spaces
+ # Use lookup table
+ seq_id_lookup = {}
+ for seq_id, seq in parse_fasta(data):
+ seq_id_lookup[seq_id.split()[0]] = seq_id
+
+ app_kwargs = {}
+ if tmp_dir is not None:
+ app_kwargs['TmpDir'] = tmp_dir
+ app = RdpClassifier(**app_kwargs)
+
+ if max_memory is not None:
+ app.Parameters['-Xmx'].on(max_memory)
+
+ temp_output_file = tempfile.NamedTemporaryFile(
+ prefix='RdpAssignments_', suffix='.txt', dir=tmp_dir)
+ app.Parameters['-o'].on(temp_output_file.name)
+ if training_data_fp is not None:
+ app.Parameters['-t'].on(training_data_fp)
+
+ if fixrank:
+ app.Parameters['-f'].on('fixrank')
+ else:
+ app.Parameters['-f'].on('allrank')
+
+ app_result = app(data)
+
+ assignments = {}
+
+ # ShortSequenceException messages are written to stdout
+ # Tag these IDs as unassignable
+ for line in app_result['StdOut']:
+ excep = parse_rdp_exception(line)
+ if excep is not None:
+ _, rdp_id = excep
+ orig_id = seq_id_lookup[rdp_id]
+ assignments[orig_id] = ('Unassignable', 1.0)
+
+ for line in app_result['Assignments']:
+ rdp_id, direction, taxa = parse_rdp_assignment(line)
+ if taxa[0][0] == "Root":
+ taxa = taxa[1:]
+ orig_id = seq_id_lookup[rdp_id]
+ lineage, confidence = get_rdp_lineage(taxa, min_confidence)
+ if lineage:
+ assignments[orig_id] = (';'.join(lineage), confidence)
+ else:
+ assignments[orig_id] = ('Unclassified', 1.0)
+
+ if output_fp:
+ try:
+ output_file = open(output_fp, 'w')
+ except OSError:
+ raise OSError("Can't open output file for writing: %s" % output_fp)
+ for seq_id, assignment in assignments.items():
+ lineage, confidence = assignment
+ output_file.write(
+ '%s\t%s\t%1.3f\n' % (seq_id, lineage, confidence))
+ output_file.close()
+ return None
+ else:
+ return assignments
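+
+# Example usage (an illustrative sketch; 'seqs.fasta' is a hypothetical
+# input file, and the RDP jar must be discoverable as described in the
+# RdpClassifier docstring):
+#
+# assignments = assign_taxonomy(open('seqs.fasta'), min_confidence=0.80)
+# for seq_id, (lineage, confidence) in assignments.items():
+#     print '%s\t%s\t%.3f' % (seq_id, lineage, confidence)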
+
+
+def train_rdp_classifier(
+ training_seqs_file, taxonomy_file, model_output_dir, max_memory=None,
+ tmp_dir=tempfile.gettempdir()):
+ """ Train RDP Classifier, saving to model_output_dir
+
+ training_seqs_file, taxonomy_file: file-like objects used to
+ train the RDP Classifier (see RdpTrainer documentation for
+ format of training data)
+
+ model_output_dir: directory in which to save the files
+ necessary to classify sequences according to the training
+ data
+
+ Once the model data has been generated, the RDP Classifier may be
+ used to classify new sequences by passing the resulting properties
+ file to assign_taxonomy via training_data_fp.
+ """
+ app_kwargs = {}
+ if tmp_dir is not None:
+ app_kwargs['TmpDir'] = tmp_dir
+ app = RdpTrainer(**app_kwargs)
+
+ if max_memory is not None:
+ app.Parameters['-Xmx'].on(max_memory)
+
+ temp_taxonomy_file = tempfile.NamedTemporaryFile(
+ prefix='RdpTaxonomy_', suffix='.txt', dir=tmp_dir)
+ temp_taxonomy_file.write(taxonomy_file.read())
+ temp_taxonomy_file.seek(0)
+
+ app.Parameters['taxonomy_file'].on(temp_taxonomy_file.name)
+ app.Parameters['model_output_dir'].on(model_output_dir)
+ return app(training_seqs_file)
+
+
+def train_rdp_classifier_and_assign_taxonomy(
+ training_seqs_file, taxonomy_file, seqs_to_classify, min_confidence=0.80,
+ model_output_dir=None, classification_output_fp=None, max_memory=None,
+ tmp_dir=tempfile.gettempdir()):
+ """ Train RDP Classifier and assign taxonomy in one fell swoop
+
+ The file objects training_seqs_file and taxonomy_file are used to
+ train the RDP Classifier (see RdpTrainer documentation for
+ details). Model data is stored in model_output_dir. If
+ model_output_dir is not provided, a temporary directory is created
+ and removed after classification.
+
+ The sequences in seqs_to_classify are classified according to the
+ model and filtered at the desired confidence level (default:
+ 0.80).
+
+ The results are saved to classification_output_fp if provided,
+ otherwise a dict of {seq_id:(taxonomy_assignment,confidence)} is
+ returned.
+ """
+ if model_output_dir is None:
+ training_dir = tempfile.mkdtemp(prefix='RdpTrainer_', dir=tmp_dir)
+ else:
+ training_dir = model_output_dir
+
+ training_results = train_rdp_classifier(
+ training_seqs_file, taxonomy_file, training_dir, max_memory=max_memory,
+ tmp_dir=tmp_dir)
+ training_data_fp = training_results['properties'].name
+
+ assignment_results = assign_taxonomy(
+ seqs_to_classify, min_confidence=min_confidence,
+ output_fp=classification_output_fp, training_data_fp=training_data_fp,
+ max_memory=max_memory, fixrank=False, tmp_dir=tmp_dir)
+
+ if model_output_dir is None:
+ # Forum user reported an error on the call to os.rmtree:
+ # https://groups.google.com/d/topic/qiime-forum/MkNe7-JtSBw/discussion
+ # We were not able to replicate the problem and fix it
+ # properly. However, even if an error occurs, we would like
+ # to return results, along with a warning.
+ try:
+ rmtree(training_dir)
+ except OSError:
+ msg = (
+ "Temporary training directory %s not removed" % training_dir)
+ if os.path.isdir(training_dir):
+ training_dir_files = os.listdir(training_dir)
+ msg += "\nDetected files %s" % training_dir_files
+ warnings.warn(msg, RuntimeWarning)
+
+ return assignment_results
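+
+# Example usage (an illustrative sketch; all paths are hypothetical, and the
+# training files must follow the format expected by RdpTrainer):
+#
+# assignments = train_rdp_classifier_and_assign_taxonomy(
+#     open('training_seqs.fasta'), open('taxonomy.txt'),
+#     open('seqs_to_classify.fasta'), min_confidence=0.80)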
+
+
+def get_rdp_lineage(rdp_taxa, min_confidence):
+ lineage = []
+ obs_confidence = 1.0
+ for taxon, rank, confidence in rdp_taxa:
+ if confidence >= min_confidence:
+ obs_confidence = confidence
+ lineage.append(taxon)
+ else:
+ break
+ return lineage, obs_confidence
+
+
+def parse_rdp_exception(line):
+ if line.startswith('ShortSequenceException'):
+ matchobj = re.search('recordID=(\S+)', line)
+ if matchobj:
+ rdp_id = matchobj.group(1)
+ return ('ShortSequenceException', rdp_id)
+ return None
+
+
+def parse_rdp_assignment(line):
+ """Returns a list of assigned taxa from an RDP classification line
+ """
+ toks = line.strip().split('\t')
+ seq_id = toks.pop(0)
+ direction = toks.pop(0)
+ if ((len(toks) % 3) != 0):
+ raise ValueError(
+ "Expected assignments in a repeating series of (rank, name, "
+ "confidence), received %s" % toks)
+ assignments = []
+ # Fancy way to create list of triples using consecutive items from
+ # input. See grouper function in documentation for itertools for
+ # more general example.
+ itoks = iter(toks)
+ for taxon, rank, confidence_str in zip(itoks, itoks, itoks):
+ if not taxon:
+ continue
+ assignments.append((taxon.strip('"'), rank, float(confidence_str)))
+ return seq_id, direction, assignments
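+
+# For example, given the fabricated allrank-style line
+# 'seq1\t-\tBacteria\tdomain\t1.0\tFirmicutes\tphylum\t0.9',
+# parse_rdp_assignment returns
+# ('seq1', '-', [('Bacteria', 'domain', 1.0), ('Firmicutes', 'phylum', 0.9)]).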
+
+
+if __name__ == "__main__":
+ opts, args = parse_command_line_parameters()
+ assign_taxonomy(
+ open(args[0]), min_confidence=opts.min_confidence,
+ output_fp=opts.output_fp)
diff --git a/bfillings/rtax.py b/bfillings/rtax.py
new file mode 100644
index 0000000..36f3c56
--- /dev/null
+++ b/bfillings/rtax.py
@@ -0,0 +1,293 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+"""Application controller for RTAX version 1.0
+
+Includes application controller for RTAX.
+
+Modified from uclust.py and rdp_classifier.py on 12-27-11
+"""
+
+from os import remove, makedirs
+from os.path import exists, split, splitext, basename, isdir, abspath, isfile
+import tempfile
+import os.path
+import re
+from sys import stderr
+from shutil import rmtree
+
+from skbio.parse.sequences import parse_fasta
+from burrito.parameters import ValuedParameter, FlagParameter
+from burrito.util import (CommandLineApplication, ResultPath,
+ get_tmp_filename, ApplicationError,
+ ApplicationNotFoundError)
+from skbio.util import remove_files
+
+from cogent.util.misc import app_path
+from cogent import DNA
+
+
+class RtaxParseError(Exception):
+ pass
+
+
+class Rtax(CommandLineApplication):
+ """ Rtax ApplicationController
+
+ """
+
+ _command = 'rtax'
+ _input_handler = '_input_as_parameters'
+ _parameters = {
+ # -r a reference database in FASTA format
+ '-r':ValuedParameter('-',Name='r',Delimiter=' ', IsPath=True),
+
+ # -t a taxonomy file with sequence IDs matching the reference database
+ '-t':ValuedParameter('-',Name='t',Delimiter=' ', IsPath=True),
+
+ # -a a FASTA file containing query sequences (single-ended, read 1, or paired-end delimited)
+ '-a':ValuedParameter('-',Name='a',Delimiter=' ', IsPath=True),
+
+ # -b a FASTA file containing query sequences (read 2, with matching IDs)
+ '-b':ValuedParameter('-',Name='b',Delimiter=' ', IsPath=True),
+
+ # -l a text file containing sequence IDs to process, one per line
+ '-l':ValuedParameter('-',Name='l',Delimiter=' ', IsPath=True),
+
+ # -d a delimiter separating the two reads when provided in a single file
+ '-d':ValuedParameter('-',Name='d',Delimiter=' ', IsPath=False, Quote="\""),
+
+ # -i a regular expression used to select part of the fasta header to use as the sequence id.
+ '-i':ValuedParameter('-',Name='i',Delimiter=' ', IsPath=False, Quote="'"),
+
+ # -o output file name for classification assignment
+ '-o': ValuedParameter('-', Name='o', Delimiter=' ', IsPath=True),
+
+ # -m temporary directory
+ '-m': ValuedParameter('-', Name='m', Delimiter=' ', IsPath=True),
+
+ # -f allow fallback from paired-end to single-ended classification when one read is missing
+ '-f':FlagParameter(Prefix='-',Name='f'),
+
+ # -g do not allow fallback from paired-end to single-ended classification when one read is too generic
+ '-g':FlagParameter(Prefix='-',Name='g')
+ }
+
+ _suppress_stdout = False
+ _suppress_stderr = False
+
+ #def __init__(self):
+ # super().__init__()...
+ # usearch_command = "usearch"
+ # if not (exists(usearch_command) or app_path(usearch_command)):
+ # raise ApplicationNotFoundError,\
+ # "Cannot find %s. Is it installed? Is it in your path?"\
+ # % usearch_command
+
+
+ def _input_as_parameters(self,data):
+ """ Set the input path (a fasta filepath)
+ """
+ # The list of values which can be passed on a per-run basis
+ allowed_values = ['-r', '-t', '-a', '-b', '-l', '-d', '-i', '-o',
+ '-m', '-f', '-g']
+
+ unsupported_parameters = set(data.keys()) - set(allowed_values)
+ if unsupported_parameters:
+ raise ApplicationError(
+ "Unsupported parameter(s) passed when calling rtax: %s" %
+ ' '.join(unsupported_parameters))
+
+ for v in allowed_values:
+ # turn the parameter off so subsequent runs are not
+ # affected by parameter settings from previous runs
+ self.Parameters[v].off()
+ if v in data:
+ # turn the parameter on if specified by the user
+ self.Parameters[v].on(data[v])
+
+ return ''
+
+ def _get_result_paths(self,data):
+ """ Return a dict of ResultPath objects representing all possible output
+ """
+ assignment_fp = str(self.Parameters['-o'].Value).strip('"')
+ if not os.path.isabs(assignment_fp):
+ assignment_fp = os.path.relpath(assignment_fp, self.WorkingDir)
+ return {'Assignments': ResultPath(assignment_fp, IsWritten=True)}
+
+
+
+ def _accept_exit_status(self,exit_status):
+ """ Test for acceptable exit status
+
+ rtax, like uclust (from which this controller was adapted), may
+ fail and still leave partial output behind, so we explicitly
+ check the exit status
+
+ """
+ return exit_status == 0
+
+ def getHelp(self):
+ """Method that points to documentation"""
+ help_str =\
+ """
+ RTAX is hosted at:
+ http://dev.davidsoergel.com/rtax/
+
+ The following paper should be cited if this resource is used:
+
+ Soergel D.A.W., Dey N., Knight R., and Brenner S.E. 2012.
+ Selection of primers for optimal taxonomic classification
+ of environmental 16S rRNA gene sequences. ISME J (6), 1440-1444
+ """
+ return help_str
+
+def assign_taxonomy(dataPath, reference_sequences_fp, id_to_taxonomy_fp,
+ read_1_seqs_fp, read_2_seqs_fp, single_ok=False,
+ no_single_ok_generic=False, header_id_regex=None,
+ read_id_regex="\S+\s+(\S+)", amplicon_id_regex="(\S+)\s+(\S+?)\/",
+ output_fp=None, log_path=None, HALT_EXEC=False,
+ base_tmp_dir='/tmp'):
+ """Assign taxonomy to each sequence in data with the RTAX classifier
+
+ dataPath: path to a fasta file of query sequences
+
+ output_fp: path to write output; if not provided, result will be
+ returned in a dict of {seq_id:(taxonomy_assignment,confidence)}
+ """
+
+ usearch_command = "usearch"
+ if not (exists(usearch_command) or app_path(usearch_command)):
+ raise ApplicationNotFoundError(
+ "Cannot find %s. Is it installed? Is it in your path?"
+ % usearch_command)
+
+ my_tmp_dir = get_tmp_filename(tmp_dir=base_tmp_dir, prefix='rtax_',
+ suffix='', result_constructor=str)
+ os.makedirs(my_tmp_dir)
+
+ try:
+ # The RTAX classifier doesn't necessarily preserve identifiers: it
+ # reports back only the id extracted as $1 using header_id_regex.
+ # Since rtax takes the original unclustered sequence files as input,
+ # the usual case is that the regex extracts the amplicon ID from the
+ # second field. Use a lookup table to restore the original IDs.
+ read_1_id_to_orig_id = {}
+ readIdExtractor = re.compile(read_id_regex) # OTU clustering produces ">clusterID read_1_id"
+ data = open(dataPath,'r')
+ for seq_id, seq in parse_fasta(data):
+ # apply the regex
+ extract = readIdExtractor.match(seq_id)
+ if extract is None:
+ stderr.write("Matched no ID with read_id_regex " + read_id_regex +" in '" + seq_id + "' from file " + dataPath + "\n")
+ else:
+ read_1_id_to_orig_id[extract.group(1)] = seq_id
+ #stderr.write(extract.group(1) + " => " + seq_id + "\n")
+ #seq_id_lookup[seq_id.split()[1]] = seq_id
+ data.close()
+
+ # make list of amplicon IDs to pass to RTAX
+
+ id_list_fp = open(my_tmp_dir+"/ampliconIdsToClassify", "w")
+
+ # Establish mapping of amplicon IDs to read_1 IDs
+ # simultaneously write the amplicon ID file for those IDs found in the input mapping above
+
+ amplicon_to_read_1_id = {}
+ ampliconIdExtractor = re.compile(amplicon_id_regex) # split_libraries produces >read_1_id ampliconID/1 ... // see also assign_taxonomy 631
+ read_1_data = open(read_1_seqs_fp,'r')
+ for seq_id, seq in parse_fasta(read_1_data):
+ # apply the regex
+ extract = ampliconIdExtractor.match(seq_id)
+ if extract is None:
+ stderr.write("Matched no ID with amplicon_id_regex " + amplicon_id_regex + " in '" + seq_id + "' from file " + read_1_seqs_fp + "\n")
+ else:
+ read_1_id = extract.group(1)
+ amplicon_id = extract.group(2)
+ try:
+ amplicon_to_read_1_id[amplicon_id] = read_1_id
+ bogus = read_1_id_to_orig_id[read_1_id] # verify that the id is valid
+ id_list_fp.write('%s\n' % (amplicon_id))
+ except KeyError:
+ pass
+ read_1_data.close()
+ id_list_fp.close()
+
+ app = Rtax(HALT_EXEC=HALT_EXEC)
+
+ temp_output_file = tempfile.NamedTemporaryFile(
+ prefix='RtaxAssignments_', suffix='.txt')
+ app.Parameters['-o'].on(temp_output_file.name)
+ app.Parameters['-r'].on(reference_sequences_fp)
+ app.Parameters['-t'].on(id_to_taxonomy_fp)
+ # app.Parameters['-d'].on(delimiter)
+ app.Parameters['-l'].on(id_list_fp.name) # these are amplicon IDs
+ app.Parameters['-a'].on(read_1_seqs_fp)
+ if read_2_seqs_fp is not None:
+ app.Parameters['-b'].on(read_2_seqs_fp)
+ app.Parameters['-i'].on(header_id_regex)
+ app.Parameters['-m'].on(my_tmp_dir)
+ if single_ok:
+ app.Parameters['-f'].on()
+ if no_single_ok_generic:
+ app.Parameters['-g'].on()
+ #app.Parameters['-v'].on()
+
+ app_result = app()
+
+ if log_path:
+ log_file = open(log_path, 'a')
+ log_file.write(''.join(app_result['StdErr'].readlines()) + '\n')
+ log_file.close()
+
+ assignments = {}
+
+ # restore original sequence IDs with spaces
+
+ for line in app_result['Assignments']:
+ toks = line.strip().split('\t')
+ rtax_id = toks.pop(0)
+ if len(toks):
+ bestpcid = toks.pop(0) # ignored
+ lineage = toks
+
+ # RTAX does not provide a measure of confidence. We could pass one in,
+ # based on the choice of primers, or even look it up on the fly in the tables
+ # from the "optimal primers" paper; but it would be the same for every
+ # query sequence anyway.
+ # we could also return bestpcid, but that's not the same thing as confidence.
+ confidence = 1.0
+
+ read_1_id = amplicon_to_read_1_id[rtax_id]
+ orig_id = read_1_id_to_orig_id[read_1_id]
+ if lineage:
+ assignments[orig_id] = (';'.join(lineage), confidence)
+ else:
+ assignments[orig_id] = ('Unclassified', 1.0)
+
+ if output_fp:
+ try:
+ output_file = open(output_fp, 'w')
+ except OSError:
+ raise OSError("Can't open output file for writing: %s" % output_fp)
+ for seq_id, assignment in assignments.items():
+ lineage, confidence = assignment
+ output_file.write(
+ '%s\t%s\t%1.3f\n' % (seq_id, lineage, confidence))
+ output_file.close()
+ return None
+ else:
+ return assignments
+ finally:
+ try:
+ rmtree(my_tmp_dir)
+ except OSError:
+ pass
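+
+# Example usage (an illustrative sketch; all paths below are hypothetical,
+# and both the usearch and rtax executables must be on the PATH):
+#
+# assignments = assign_taxonomy(
+#     'rep_set.fasta', 'ref_seqs.fasta', 'id_to_taxonomy.txt',
+#     'reads1.fasta', 'reads2.fasta', single_ok=True)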
diff --git a/bfillings/seqprep.py b/bfillings/seqprep.py
new file mode 100644
index 0000000..2ee9d48
--- /dev/null
+++ b/bfillings/seqprep.py
@@ -0,0 +1,351 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+
+# Application controller for SeqPrep
+# https://github.com/jstjohn/SeqPrep
+
+import os
+import tempfile
+
+from burrito.parameters import ValuedParameter, FlagParameter
+from burrito.util import CommandLineApplication, ResultPath
+
+# SeqPrep help:
+# Usage:
+# SeqPrep [Required Args] [Options]
+# NOTE 1: The output is always gziped compressed.
+# NOTE 2: If the quality strings in the output contain characters less than
+# ascii 33 on an ascii table (they look like lines from a binary file), try
+# running again with or without the -6 option.
+#
+
+
+class SeqPrep(CommandLineApplication):
+
+ """SeqPrep application controller for joining paired-end reads"""
+ _command = 'SeqPrep'
+ _parameters = {
+ # Required Arguments
+ # -f <first read input fastq filename>
+ # -r <second read input fastq filename>
+ # -1 <first read output fastq filename>
+ # -2 <second read output fastq filename>
+ '-f': ValuedParameter(Prefix='-', Delimiter=' ', Name='f'),
+ '-r': ValuedParameter(Prefix='-', Delimiter=' ', Name='r'),
+ '-1': ValuedParameter(Prefix='-', Delimiter=' ', Name='1'),
+ '-2': ValuedParameter(Prefix='-', Delimiter=' ', Name='2'),
+
+ # General Arguments (Optional):
+ # -3 <first read discarded fastq filename>
+ # -4 <second read discarded fastq filename>
+ # -h Display this help message and exit (also works with no args)
+ # -6 Input sequence is in phred+64 rather than phred+33 format, the
+ # output will still be phred+33
+ # -q <Quality score cutoff for mismatches to be counted in overlap; default = 13>
+ # -L <Minimum length of a trimmed or merged read to print it; default = 30>
+ '-3': ValuedParameter(Prefix='-', Delimiter=' ', Name='3'),
+ '-4': ValuedParameter(Prefix='-', Delimiter=' ', Name='4'),
+ '-h': FlagParameter(Prefix='-', Name='h'),
+ '-6': FlagParameter(Prefix='-', Name='6'),
+ '-q': ValuedParameter(Prefix='-', Delimiter=' ', Name='q'),
+ '-L': ValuedParameter(Prefix='-', Delimiter=' ', Name='L'),
+
+ # Arguments for Adapter/Primer Trimming (Optional):
+ # -A <forward read primer/adapter sequence to trim as it would appear at the
+ # end of a read (recommend about 20bp of this)
+ # (should validate by grepping a file);
+ # default (genomic non-multiplexed adapter1) = AGATCGGAAGAGCGGTTCAG>
+ # -B <reverse read primer/adapter sequence to trim as it would appear at the
+ # end of a read (recommend about 20bp of this)
+ # (should validate by grepping a file);
+ # default (genomic non-multiplexed adapter2) = AGATCGGAAGAGCGTCGTGT>
+ # -O <minimum overall base pair overlap with adapter sequence to trim;
+ # default = 10>
+ # -M <maximum fraction of good quality mismatching bases for primer/adapter
+ # overlap; default = 0.020000>
+ # -N <minimum fraction of matching bases for primer/adapter overlap;
+ # default = 0.870000>
+ # -b <adapter alignment band-width; default = 50>
+ # -Q <adapter alignment gap-open; default = 8>
+ # -t <adapter alignment gap-extension; default = 2>
+ # -e <adapter alignment gap-end; default = 2>
+ # -Z <adapter alignment minimum local alignment score cutoff
+ # [roughly (2*num_hits) - (num_gaps*gap_open) - (num_gaps*gap_close) -
+ # (gap_len*gap_extend) - (2*num_mismatches)]; default = 26>
+ # -w <read alignment band-width; default = 50>
+ # -W <read alignment gap-open; default = 26>
+ # -p <read alignment gap-extension; default = 9>
+ # -P <read alignment gap-end; default = 5>
+ # -X <read alignment maximum fraction gap cutoff; default = 0.125000>
+ '-A': ValuedParameter(Prefix='-', Delimiter=' ', Name='A'),
+ '-B': ValuedParameter(Prefix='-', Delimiter=' ', Name='B'),
+ '-O': ValuedParameter(Prefix='-', Delimiter=' ', Name='O'),
+ '-M': ValuedParameter(Prefix='-', Delimiter=' ', Name='M'),
+ '-N': ValuedParameter(Prefix='-', Delimiter=' ', Name='N'),
+ '-b': ValuedParameter(Prefix='-', Delimiter=' ', Name='b'),
+ '-Q': ValuedParameter(Prefix='-', Delimiter=' ', Name='Q'),
+ '-t': ValuedParameter(Prefix='-', Delimiter=' ', Name='t'),
+ '-e': ValuedParameter(Prefix='-', Delimiter=' ', Name='e'),
+ '-Z': ValuedParameter(Prefix='-', Delimiter=' ', Name='Z'),
+ '-w': ValuedParameter(Prefix='-', Delimiter=' ', Name='w'),
+ '-W': ValuedParameter(Prefix='-', Delimiter=' ', Name='W'),
+ '-p': ValuedParameter(Prefix='-', Delimiter=' ', Name='p'),
+ '-P': ValuedParameter(Prefix='-', Delimiter=' ', Name='P'),
+ '-X': ValuedParameter(Prefix='-', Delimiter=' ', Name='X'),
+
+ # Optional Arguments for Merging:
+ # -y <maximum quality score in output ((phred 33) default = ']' )>
+ # -g <print overhang when adapters are present and stripped (use this if
+ # reads are different length)>
+ # -s <perform merging and output the merged reads to this file>
+ # -E <write pretty alignments to this file for visual Examination>
+ # -x <max number of pretty alignments to write (if -E provided);
+ # default = 10000>
+ # -o <minimum overall base pair overlap to merge two reads; default = 15>
+ # -m <maximum fraction of good quality mismatching bases to overlap reads;
+ # default = 0.020000>
+ # -n <minimum fraction of matching bases to overlap reads;
+ # default = 0.900000>
+ '-y': ValuedParameter(Prefix='-', Delimiter=' ', Name='y'),
+ '-g': FlagParameter(Prefix='-', Name='g'),
+ '-s': ValuedParameter(Prefix='-', Delimiter=' ', Name='s'),
+ '-E': ValuedParameter(Prefix='-', Delimiter=' ', Name='E'),
+ '-x': ValuedParameter(Prefix='-', Delimiter=' ', Name='x'),
+ '-o': ValuedParameter(Prefix='-', Delimiter=' ', Name='o'),
+ '-m': ValuedParameter(Prefix='-', Delimiter=' ', Name='m'),
+ '-n': ValuedParameter(Prefix='-', Delimiter=' ', Name='n')}
+
+ def _unassembled_reads1_out_file_name(self):
+ """Checks file name is set for reads1 output.
+ Returns absolute path."""
+ if self.Parameters['-1'].isOn():
+ unassembled_reads1 = self._absolute(
+ str(self.Parameters['-1'].Value))
+ else:
+ raise ValueError("No reads1 (flag: -1) output path specified")
+ return unassembled_reads1
+
+ def _unassembled_reads2_out_file_name(self):
+ """Checks if file name is set for reads2 output.
+ Returns absolute path."""
+ if self.Parameters['-2'].isOn():
+ unassembled_reads2 = self._absolute(
+ str(self.Parameters['-2'].Value))
+ else:
+ raise ValueError("No reads2 (flag -2) output path specified")
+ return unassembled_reads2
+
+ def _discarded_reads1_out_file_name(self):
+ """Checks if file name is set for discarded reads1 output.
+ Returns absolute path."""
+ if self.Parameters['-3'].isOn():
+ discarded_reads1 = self._absolute(str(self.Parameters['-3'].Value))
+ else:
+ raise ValueError(
+ "No discarded-reads1 (flag -3) output path specified")
+ return discarded_reads1
+
+ def _discarded_reads2_out_file_name(self):
+ """Checks if file name is set for discarded reads2 output.
+ Returns absolute path."""
+ if self.Parameters['-4'].isOn():
+ discarded_reads2 = self._absolute(str(self.Parameters['-4'].Value))
+ else:
+ raise ValueError(
+ "No discarded-reads2 (flag -4) output path specified")
+ return discarded_reads2
+
+ def _assembled_out_file_name(self):
+ """Checks file name is set for assembled output.
+ Returns absolute path."""
+ if self.Parameters['-s'].isOn():
+ assembled_reads = self._absolute(str(self.Parameters['-s'].Value))
+ else:
+ raise ValueError(
+ "No assembled-reads (flag -s) output path specified")
+ return assembled_reads
+
+ def _pretty_alignment_out_file_name(self):
+ """Checks file name is set for pretty alignment output.
+ Returns absolute path."""
+ if self.Parameters['-E'].isOn():
+ pretty_alignment = self._absolute(str(self.Parameters['-E'].Value))
+ else:
+ raise ValueError(
+ "No pretty-=alignment (flag -E) output path specified")
+ return pretty_alignment
+
+ def _get_result_paths(self, data):
+ """Captures SeqPrep output.
+
+ """
+ result = {}
+
+ # Always output:
+ result['UnassembledReads1'] = ResultPath(
+ Path=self._unassembled_reads1_out_file_name(), IsWritten=True)
+ result['UnassembledReads2'] = ResultPath(
+ Path=self._unassembled_reads2_out_file_name(), IsWritten=True)
+
+ # optional output, so we check for each
+ # check for assembled reads file
+ if self.Parameters['-s'].isOn():
+ result['Assembled'] = ResultPath(
+ Path=self._assembled_out_file_name(), IsWritten=True)
+
+ # check for discarded (unassembled) reads1 file
+ if self.Parameters['-3'].isOn():
+ result['Reads1Discarded'] = ResultPath(
+ Path=self._discarded_reads1_out_file_name(), IsWritten=True)
+
+ # check for discarded (unassembled) reads2 file
+ if self.Parameters['-4'].isOn():
+ result['Reads2Discarded'] = ResultPath(
+ Path=self._discarded_reads2_out_file_name(), IsWritten=True)
+
+ # check for pretty-alignment file
+ if self.Parameters['-E'].isOn():
+ result['PrettyAlignments'] = ResultPath(
+ Path=self._pretty_alignment_out_file_name(), IsWritten=True)
+
+ return result
+
+ def getHelp(self):
+ """seqprep help"""
+ help_str = """
+ For basic help, type the following at the command line:
+ 'SeqPrep -h'
+
+ Website:
+ https://github.com/jstjohn/SeqPrep
+ """
+ return help_str
+
+
+def join_paired_end_reads_seqprep(
+ reads1_infile_path,
+ reads2_infile_path,
+ outfile_label='seqprep',
+ max_overlap_ascii_q_score='J',
+ min_overlap=None, # typical default is 15
+ max_mismatch_good_frac=None, # typical default is 0.02,
+ min_frac_matching=None, # typical default is 0.9,
+ phred_64=False,
+ params={},
+ working_dir=tempfile.gettempdir(),
+ SuppressStderr=True,
+ SuppressStdout=True,
+ HALT_EXEC=False):
+ """ Runs SeqPrep parameters to assemble paired-end reads.
+ -reads1_infile_path : reads1.fastq infile path
+ -reads2_infile_path : reads2.fastq infile path
+ -max_overlap_ascii_q_score : 'J' for Illumina 1.8+ phred+33,
+ representing a score of 41. See:
+ http://en.wikipedia.org/wiki/FASTQ_format
+ -min_overlap : minimum overall base pair overlap to merge two reads
+ -max_mismatch_good_frac : maximum fraction of good quality mismatching
+ bases to overlap reads
+ -min_frac_matching : minimum fraction of matching bases to overlap
+ reads
+ -phred_64 : if input is in phred+64. Output will always be phred+33.
+ -params : other optional SeqPrep parameters
+
+ NOTE: SeqPrep always outputs gzipped files
+ """
+
+ abs_r1_path = os.path.abspath(reads1_infile_path)
+ abs_r2_path = os.path.abspath(reads2_infile_path)
+
+ infile_paths = [abs_r1_path, abs_r2_path]
+
+ # check / make absolute infile paths
+ for p in infile_paths:
+ if not os.path.exists(p):
+ raise IOError('Infile not found at: %s' % p)
+
+ # set up controller
+ seqprep_app = SeqPrep(params=params,
+ WorkingDir=working_dir,
+ SuppressStderr=SuppressStderr,
+ SuppressStdout=SuppressStdout,
+ HALT_EXEC=HALT_EXEC)
+
+ # required by SeqPrep to assemble:
+ seqprep_app.Parameters['-f'].on(abs_r1_path)
+ seqprep_app.Parameters['-r'].on(abs_r2_path)
+
+ if outfile_label is not None:
+ seqprep_app.Parameters['-s'].on(outfile_label + '_assembled.fastq.gz')
+ seqprep_app.Parameters[
+ '-1'].on(outfile_label + '_unassembled_R1.fastq.gz')
+ seqprep_app.Parameters[
+ '-2'].on(outfile_label + '_unassembled_R2.fastq.gz')
+ else:
+ raise ValueError("Must set an outfile_label in order to set",
+ " the -s, -1, & -2 options!")
+
+ if min_overlap is not None:
+ if isinstance(min_overlap, int) and min_overlap > 0:
+ seqprep_app.Parameters['-o'].on(min_overlap)
+ else:
+ raise ValueError("min_overlap must be an int >= 0!")
+
+ if max_mismatch_good_frac is not None:
+ if isinstance(max_mismatch_good_frac, float) and 0.0 < max_mismatch_good_frac <= 1.0:
+ seqprep_app.Parameters['-m'].on(max_mismatch_good_frac)
+ else:
+ raise ValueError(
+ "max_mismatch_good_frac must be a float between 0.0-1.0!")
+
+ if min_frac_matching is not None:
+ if isinstance(min_frac_matching, float) and 0.0 < min_frac_matching <= 1.0:
+ seqprep_app.Parameters['-n'].on(min_frac_matching)
+ else:
+ raise ValueError(
+ "min_frac_matching must be a float between 0.0-1.0!")
+
+ if max_overlap_ascii_q_score is not None:
+ if isinstance(max_overlap_ascii_q_score, str) \
+ and len(max_overlap_ascii_q_score) == 1:
+ seqprep_app.Parameters['-y'].on(max_overlap_ascii_q_score)
+ else:
+ raise ValueError("max_overlap_ascii_q_score must be a single",
+ " ASCII character string. e.g. \'J\'!")
+
+ # if input is phred+64
+ if phred_64 is True:
+ seqprep_app.Parameters['-6'].on()
+
+ # run assembler
+ result = seqprep_app()
+
+ # Store output file path data to dict
+ path_dict = {}
+ path_dict['Assembled'] = result['Assembled'].name
+ path_dict['UnassembledReads1'] = result['UnassembledReads1'].name
+ path_dict['UnassembledReads2'] = result['UnassembledReads2'].name
+
+ # sanity check that files actually exist in path locations
+ for path in path_dict.values():
+ if not os.path.exists(path):
+ raise IOError('Output file not found at: %s' % path)
+
+ return path_dict
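+
+# Example usage (an illustrative sketch; the fastq paths are hypothetical
+# and SeqPrep must be installed):
+#
+# paths = join_paired_end_reads_seqprep('reads1.fastq', 'reads2.fastq',
+#                                       outfile_label='run1')
+# merged_fp = paths['Assembled']  # gzip-compressed fastq of merged reads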
diff --git a/bfillings/sortmerna_v2.py b/bfillings/sortmerna_v2.py
new file mode 100644
index 0000000..e6cac68
--- /dev/null
+++ b/bfillings/sortmerna_v2.py
@@ -0,0 +1,544 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+
+"""
+Application controller for SortMeRNA version 2.0
+================================================
+"""
+
+
+from os.path import split, splitext, dirname, join
+from glob import glob
+import re
+import tempfile
+
+from burrito.util import CommandLineApplication, ResultPath
+from burrito.parameters import ValuedParameter, FlagParameter
+from skbio.parse.sequences import parse_fasta
+
+
+class IndexDB(CommandLineApplication):
+ """ SortMeRNA generic application controller for building databases
+ """
+ _command = 'indexdb_rna'
+ _command_delimiter = ' '
+ _parameters = {
+ # Fasta reference file followed by indexed reference
+ # (ex. /path/to/refseqs.fasta,/path/to/refseqs.idx)
+ '--ref': ValuedParameter('--', Name='ref', Delimiter=' ', IsPath=True),
+
+ # Maximum number of positions to store for each unique seed
+ '--max_pos': ValuedParameter('--', Name='max_pos', Delimiter=' ',
+ IsPath=False, Value="10000"),
+
+ # tmp folder for storing unique L-mers (prior to calling CMPH
+ # in indexdb_rna), this tmp file is removed by indexdb_rna
+ # after it is not used any longer
+ '--tmpdir': ValuedParameter('--', Name='tmpdir', Delimiter=' ',
+ IsPath=True)
+ }
+
+ def _get_result_paths(self, data):
+ """ Build the dict of result filepaths
+ """
+ # get the filepath of the indexed database (after comma)
+ # /path/to/refseqs.fasta,/path/to/refseqs.idx
+ # ^------------------^
+ db_name = (self.Parameters['--ref'].Value).split(',')[1]
+
+ result = {}
+ extensions = ['bursttrie', 'kmer', 'pos', 'stats']
+ for extension in extensions:
+ for file_path in glob("%s.%s*" % (db_name, extension)):
+ # this will match e.g. nr.bursttrie_0.dat, nr.bursttrie_1.dat
+ # and nr.stats
+ key = file_path.split(db_name + '.')[1]
+ result[key] = ResultPath(Path=file_path, IsWritten=True)
+ return result
+
+
+def build_database_sortmerna(fasta_path,
+ max_pos=None,
+ output_dir=None,
+ temp_dir=tempfile.gettempdir(),
+ HALT_EXEC=False):
+ """ Build sortmerna db from fasta_path; return db name
+ and list of files created
+
+ Parameters
+ ----------
+ fasta_path : string
+ path to fasta file of sequences to build database.
+ max_pos : integer, optional
+ maximum positions to store per seed in index
+ [default: 10000].
+ output_dir : string, optional
+ directory where output should be written
+ [default: same directory as fasta_path].
+ temp_dir : string, optional
+ temporary directory used while indexing to store unique L-mers
+ [default: tempfile.gettempdir()].
+ HALT_EXEC : boolean, optional
+ halt just before running the indexdb_rna command
+ and print the command -- useful for debugging
+ [default: False].
+
+ Returns
+ -------
+ db_name : string
+ filepath to the indexed database.
+ db_filepaths : list
+ paths to the files written by indexdb_rna.
+ """
+
+ if fasta_path is None:
+ raise ValueError("Error: path to fasta reference "
+ "sequences must exist.")
+
+ fasta_dir, fasta_filename = split(fasta_path)
+ if not output_dir:
+ output_dir = fasta_dir or '.'
+ # Will cd to this directory, so just pass the filename
+ # so the app is not confused by relative paths
+ fasta_path = fasta_filename
+
+ index_basename = splitext(fasta_filename)[0]
+
+ db_name = join(output_dir, index_basename)
+
+ # Instantiate the object
+ sdb = IndexDB(WorkingDir=output_dir, HALT_EXEC=HALT_EXEC)
+
+ # The parameter --ref STRING must follow the format where
+ # STRING = /path/to/ref.fasta,/path/to/ref.idx
+ sdb.Parameters['--ref'].on("%s,%s" % (fasta_path, db_name))
+
+ # Set temporary directory
+ sdb.Parameters['--tmpdir'].on(temp_dir)
+
+ # Override --max_pos parameter
+ if max_pos is not None:
+ sdb.Parameters['--max_pos'].on(max_pos)
+
+ # Run indexdb_rna
+ app_result = sdb()
+
+ # Return all output files (written by indexdb_rna) as a list,
+ # excluding the StdErr and StdOut filepaths, as those files are
+ # destroyed on exit from this function (IndexDB is a local instance)
+ db_filepaths = [v.name for k, v in app_result.items()
+ if k not in {'StdErr', 'StdOut'} and hasattr(v, 'name')]
+
+ return db_name, db_filepaths
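+
+# Example usage (an illustrative sketch; 'refseqs.fasta' is a hypothetical
+# input and indexdb_rna must be on the PATH):
+#
+# db_name, db_filepaths = build_database_sortmerna('refseqs.fasta',
+#                                                  max_pos=10000,
+#                                                  output_dir='/tmp/smr_db')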
+
+
+class Sortmerna(CommandLineApplication):
+ """ SortMeRNA generic application controller for OTU picking
+ """
+
+ _command = 'sortmerna'
+ _command_delimiter = ' '
+ _parameters = {
+ # Verbose (log to stdout)
+ '-v': FlagParameter('-', Name='v', Value=True),
+
+ # Fasta or Fastq input query sequences file
+ '--reads': ValuedParameter('--', Name='reads', Delimiter=' ',
+ IsPath=True, Value=None),
+
+ # Fasta reference file followed by indexed reference
+ '--ref': ValuedParameter('--', Name='ref', Delimiter=' ',
+ IsPath=True, Value=None),
+
+ # File path + base name for all output files
+ '--aligned': ValuedParameter('--', Name='aligned', Delimiter=' ',
+ IsPath=True, Value=None),
+
+ # Output log file with parameters used to launch sortmerna and
+ # statistics on final results (the log file takes on
+ # the basename given in --aligned and the extension '.log')
+ '--log': FlagParameter('--', Name='log', Value=True),
+
+ # Output Fasta or Fastq file of aligned reads (flag)
+ '--fastx': FlagParameter('--', Name='fastx', Value=True),
+
+ # Output BLAST alignment file, options include [0,3] where:
+ # 0: Blast-like pairwise alignment,
+ # 1: Blast tabular format,
+ # 2: 1 + extra column for CIGAR string,
+ # 3: 2 + extra column for query coverage
+ '--blast': ValuedParameter('--', Name='blast', Delimiter=' ',
+ IsPath=False, Value=None),
+
+ # Output SAM alignment file
+ '--sam': FlagParameter('--', Name='sam', Value=False),
+
+ # Output SQ tags in the SAM file (useful for whole-genome alignment)
+ '--SQ': FlagParameter('--', Name='SQ', Value=False),
+
+ # Report the best INT number of alignments
+ '--best': ValuedParameter('--', Name='best', Delimiter=' ',
+ IsPath=False, Value="1"),
+
+ # Report first INT number of alignments
+ '--num_alignments': ValuedParameter('--', Name='num_alignments',
+ Delimiter=' ', IsPath=False,
+ Value=None),
+
+ # Number of threads
+ '-a': ValuedParameter('-', Name='a', Delimiter=' ',
+ IsPath=False, Value="1"),
+
+ # E-value threshold
+ '-e': ValuedParameter('-', Name='e', Delimiter=' ',
+ IsPath=False, Value="1"),
+
+ # Similarity threshold
+ '--id': ValuedParameter('--', Name='id', Delimiter=' ',
+ IsPath=False, Value="0.97"),
+
+ # Query coverage threshold
+ '--coverage': ValuedParameter('--', Name='coverage', Delimiter=' ',
+ IsPath=False, Value="0.97"),
+
+ # Output Fasta/Fastq file with reads failing to pass the --id and
+ # --coverage thresholds for de novo clustering
+ '--de_novo_otu': FlagParameter('--', Name='de_novo_otu', Value=True),
+
+ # Output an OTU map
+ '--otu_map': FlagParameter('--', Name='otu_map', Value=True),
+
+ # Print a NULL alignment string for non-aligned reads
+ '--print_all_reads': FlagParameter('--', Name='print_all_reads',
+ Value=False)
+ }
+ _synonyms = {}
+ _input_handler = '_input_as_string'
+ _suppress_stdout = False
+ _suppress_stderr = False
+
+ def _get_result_paths(self, data):
+ """ Set the result paths """
+
+ result = {}
+
+ # get the file extension of the reads file (sortmerna
+ # internally outputs all results with this extension)
+ fileExtension = splitext(self.Parameters['--reads'].Value)[1]
+
+ # at this point the parameter --aligned should be set as
+ # sortmerna will not run without it
+ if self.Parameters['--aligned'].isOff():
+ raise ValueError("Error: the --aligned parameter must be set.")
+
+ # file base name for aligned reads
+ output_base = self.Parameters['--aligned'].Value
+
+ # Blast alignments
+ result['BlastAlignments'] =\
+ ResultPath(Path=output_base + '.blast',
+ IsWritten=self.Parameters['--blast'].isOn())
+
+ # SAM alignments
+ result['SAMAlignments'] =\
+ ResultPath(Path=output_base + '.sam',
+ IsWritten=self.Parameters['--sam'].isOn())
+
+ # OTU map (mandatory output)
+ result['OtuMap'] =\
+ ResultPath(Path=output_base + '_otus.txt',
+ IsWritten=self.Parameters['--otu_map'].isOn())
+
+ # FASTA file of sequences in the OTU map (mandatory output)
+ result['FastaMatches'] =\
+ ResultPath(Path=output_base + fileExtension,
+ IsWritten=self.Parameters['--fastx'].isOn())
+
+ # FASTA file of sequences not in the OTU map (mandatory output)
+ result['FastaForDenovo'] =\
+ ResultPath(Path=output_base + '_denovo' +
+ fileExtension,
+ IsWritten=self.Parameters['--de_novo_otu'].isOn())
+ # Log file
+ result['LogFile'] =\
+ ResultPath(Path=output_base + '.log',
+ IsWritten=self.Parameters['--log'].isOn())
+
+ return result
+
+ def getHelp(self):
+ """Method that points to documentation"""
+ help_str = ("SortMeRNA is hosted at:\n"
+ "http://bioinfo.lifl.fr/RNA/sortmerna/\n"
+ "https://github.com/biocore/sortmerna\n\n"
+ "The following paper should be cited if this resource is "
+ "used:\n\n"
+ "Kopylova, E., Noe L. and Touzet, H.,\n"
+ "SortMeRNA: fast and accurate filtering of ribosomal RNAs "
+ "in\n"
+ "metatranscriptomic data, Bioinformatics (2012) 28(24)\n"
+ )
+ return help_str
+
+
+def sortmerna_ref_cluster(seq_path=None,
+ sortmerna_db=None,
+ refseqs_fp=None,
+ result_path=None,
+ tabular=False,
+ max_e_value=1,
+ similarity=0.97,
+ coverage=0.97,
+ threads=1,
+ best=1,
+ HALT_EXEC=False
+ ):
+ """Launch sortmerna OTU picker
+
+ Parameters
+ ----------
+ seq_path : str
+ filepath to query sequences.
+ sortmerna_db : str
+ indexed reference database.
+ refseqs_fp : str
+ filepath of reference sequences.
+ result_path : str
+ filepath to output OTU map.
+ max_e_value : float, optional
+ E-value threshold [default: 1].
+ similarity : float, optional
+ similarity %id threshold [default: 0.97].
+ coverage : float, optional
+ query coverage % threshold [default: 0.97].
+ threads : int, optional
+ number of threads to use (OpenMP) [default: 1].
+ tabular : bool, optional
+ output BLAST tabular alignments [default: False].
+ best : int, optional
+ number of best alignments to output per read
+ [default: 1].
+
+ Returns
+ -------
+ clusters : dict of lists
+ OTU ids and reads mapping to them
+
+ failures : list
+ reads which did not align
+
+ smr_files_to_remove : list
+ filepaths of sortmerna output files left for the caller to remove
+ """
+
+ # Instantiate the object
+ smr = Sortmerna(HALT_EXEC=HALT_EXEC)
+
+ # Set input query sequences path
+ if seq_path is not None:
+ smr.Parameters['--reads'].on(seq_path)
+ else:
+ raise ValueError("Error: a read file is mandatory input.")
+
+ # Set the input reference sequence + indexed database path
+ if sortmerna_db is not None:
+ smr.Parameters['--ref'].on("%s,%s" % (refseqs_fp, sortmerna_db))
+ else:
+ raise ValueError("Error: an indexed database for reference set %s must"
+ " already exist.\nUse indexdb_rna to index the"
+ " database." % refseqs_fp)
+
+ if result_path is None:
+ raise ValueError("Error: the result path must be set.")
+
+ # Set output results path (for Blast alignments, clusters and failures)
+ output_file = join(dirname(result_path), "sortmerna_otus")
+ smr.Parameters['--aligned'].on(output_file)
+
+ # Set E-value threshold
+ if max_e_value is not None:
+ smr.Parameters['-e'].on(max_e_value)
+
+ # Set similarity threshold
+ if similarity is not None:
+ smr.Parameters['--id'].on(similarity)
+
+ # Set query coverage threshold
+ if coverage is not None:
+ smr.Parameters['--coverage'].on(coverage)
+
+ # Set number of best alignments to output
+ if best is not None:
+ smr.Parameters['--best'].on(best)
+
+ # Set Blast tabular output
+ # The option --blast 3 represents an
+ # m8 blast tabular output + two extra
+ # columns containing the CIGAR string
+ # and the query coverage
+ if tabular:
+ smr.Parameters['--blast'].on("3")
+
+ # Set number of threads
+ if threads is not None:
+ smr.Parameters['-a'].on(threads)
+
+ # Run sortmerna
+ app_result = smr()
+
+ # Put clusters into a map of lists
+ f_otumap = app_result['OtuMap']
+ rows = (line.strip().split('\t') for line in f_otumap)
+ clusters = {r[0]: r[1:] for r in rows}
+
+ # Put failures into a list
+ f_failure = app_result['FastaForDenovo']
+ failures = [re.split('>| ', label)[0]
+ for label, seq in parse_fasta(f_failure)]
+
+ # remove the aligned FASTA file and failures FASTA file
+ # (currently these are re-constructed using pick_rep_set.py
+ # further in the OTU-picking pipeline)
+ smr_files_to_remove = [app_result['FastaForDenovo'].name,
+ app_result['FastaMatches'].name,
+ app_result['OtuMap'].name]
+
+ return clusters, failures, smr_files_to_remove
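+
+# Example usage (an illustrative sketch; the paths are hypothetical, and the
+# indexed database must first be built with build_database_sortmerna):
+#
+# db_name, db_filepaths = build_database_sortmerna('refseqs.fasta')
+# clusters, failures, to_remove = sortmerna_ref_cluster(
+#     seq_path='queries.fasta', sortmerna_db=db_name,
+#     refseqs_fp='refseqs.fasta', result_path='/tmp/otus/otu_map.txt')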
+
+
+def sortmerna_map(seq_path,
+ output_dir,
+ refseqs_fp,
+ sortmerna_db,
+ e_value=1,
+ threads=1,
+ best=None,
+ num_alignments=None,
+ HALT_EXEC=False,
+ output_sam=False,
+ sam_SQ_tags=False,
+ blast_format=3,
+ print_all_reads=True,
+ ):
+ """Launch sortmerna mapper
+
+ Parameters
+ ----------
+ seq_path : str
+ filepath to reads.
+ output_dir : str
+ dirpath to sortmerna output.
+ refseqs_fp : str
+ filepath of reference sequences.
+ sortmerna_db : str
+ indexed reference database.
+ e_value : float, optional
+ E-value threshold [default: 1].
+ threads : int, optional
+ number of threads to use (OpenMP) [default: 1].
+ best : int, optional
+ number of best alignments to output per read
+ [default: None].
+ num_alignments : int, optional
+ number of first alignments passing E-value threshold to
+ output per read [default: None].
+ HALT_EXEC : bool, debugging parameter
+ If passed, will exit just before the sortmerna command
+ is issued and will print out the command that would
+ have been called to stdout [default: False].
+ output_sam : bool, optional
+ flag to set SAM output format [default: False].
+    sam_SQ_tags : bool, optional
+        add SQ field to SAM output (if output_sam is True)
+        [default: False].
+    blast_format : int, optional
+        Output Blast m8 tabular + 2 extra columns for CIGAR
+        string and query coverage [default: 3].
+ print_all_reads : bool, optional
+ output NULL alignments for non-aligned reads
+ [default: True].
+
+ Returns
+ -------
+ dict of result paths set in _get_result_paths()
+ """
+
+ if not (blast_format or output_sam):
+ raise ValueError("Either Blast or SAM output alignment "
+ "format must be chosen.")
+
+ if (best and num_alignments):
+ raise ValueError("Only one of --best or --num_alignments "
+ "options must be chosen.")
+
+ # Instantiate the object
+ smr = Sortmerna(HALT_EXEC=HALT_EXEC)
+
+ # Set the input reference sequence + indexed database path
+ smr.Parameters['--ref'].on("%s,%s" % (refseqs_fp, sortmerna_db))
+
+ # Set input query sequences path
+ smr.Parameters['--reads'].on(seq_path)
+
+ # Set Blast tabular output
+ # The option --blast 3 represents an
+ # m8 blast tabular output + two extra
+ # columns containing the CIGAR string
+ # and the query coverage
+ if blast_format:
+ smr.Parameters['--blast'].on(blast_format)
+
+ # Output alignments in SAM format
+ if output_sam:
+ smr.Parameters['--sam'].on()
+ if sam_SQ_tags:
+ smr.Parameters['--SQ'].on()
+
+ # Turn on NULL string alignment output
+ if print_all_reads:
+ smr.Parameters['--print_all_reads'].on()
+
+ # Set output results path (for Blast alignments and log file)
+ output_file = join(output_dir, "sortmerna_map")
+ smr.Parameters['--aligned'].on(output_file)
+
+ # Set E-value threshold
+ if e_value is not None:
+ smr.Parameters['-e'].on(e_value)
+
+ # Set number of best alignments to output per read
+ if best is not None:
+ smr.Parameters['--best'].on(best)
+
+ # Set number of first alignments passing E-value threshold
+ # to output per read
+ if num_alignments is not None:
+ smr.Parameters['--num_alignments'].on(num_alignments)
+
+ # Set number of threads
+ if threads is not None:
+ smr.Parameters['-a'].on(threads)
+
+ # Turn off parameters related to OTU-picking
+ smr.Parameters['--fastx'].off()
+ smr.Parameters['--otu_map'].off()
+ smr.Parameters['--de_novo_otu'].off()
+ smr.Parameters['--id'].off()
+ smr.Parameters['--coverage'].off()
+
+ # Run sortmerna
+ app_result = smr()
+
+ return app_result
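+
+# Hypothetical usage sketch (the filepaths below are placeholders, not
+# part of this module): map reads against a database previously indexed
+# with indexdb_rna, then inspect the returned result paths.
+#
+#   app_result = sortmerna_map('/data/reads.fasta',
+#                              output_dir='/tmp/smr_out',
+#                              refseqs_fp='/data/refs.fasta',
+#                              sortmerna_db='/tmp/refs_idx',
+#                              threads=4)
+#   # result keys are those defined in Sortmerna._get_result_paths()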
diff --git a/bfillings/sumaclust_v1.py b/bfillings/sumaclust_v1.py
new file mode 100644
index 0000000..aa5d2b9
--- /dev/null
+++ b/bfillings/sumaclust_v1.py
@@ -0,0 +1,173 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+
+"""
+Application controller for SumaClust version 1.0
+================================================
+"""
+
+# ----------------------------------------------------------------------------
+# Copyright (c) 2014--, biocore development team
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+# ----------------------------------------------------------------------------
+
+from os.path import split, isdir, dirname, isfile, exists, realpath
+
+from burrito.util import CommandLineApplication, ResultPath
+from burrito.parameters import ValuedParameter, FlagParameter
+
+
+class Sumaclust(CommandLineApplication):
+ """ SumaClust generic application controller for de novo OTU picking
+ """
+
+ _command = 'sumaclust'
+ _command_delimiter = ' '
+ _parameters = {
+ # Reference sequence length is the shortest
+ '-l': FlagParameter('-', Name='l', Value=True),
+
+ # Filepath of the OTU-map
+ '-O': ValuedParameter('-', Name='O', Delimiter=' ',
+ Value=None, IsPath=True),
+
+ # Flag '-f' must be passed to deactivate FASTA output
+ '-f': FlagParameter('-', Name='f', Value=True),
+
+ # Number of threads
+ '-p': ValuedParameter('-', Name='p', Delimiter=' ',
+ Value=1, IsPath=False),
+
+ # Assign sequence to the best matching cluster seed, rather
+ # than the first matching cluster (having >= similarity threshold)
+ '-e': FlagParameter('-', Name='e', Value=False),
+
+ # Similarity threshold
+ '-t': ValuedParameter('-', Name='t', Delimiter=' ',
+ Value=0.97, IsPath=False),
+
+ # Maximum ratio between abundance of two sequences so that the
+ # less abundant one can be considered as a variant of the more
+ # abundant one.
+ '-R': ValuedParameter('-', Name='R', Delimiter=' ',
+ Value=1, IsPath=False)
+ }
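+    # With these defaults and an OTU map path set via '-O', the
+    # assembled command looks roughly like (illustrative; burrito may
+    # order the flags differently):
+    #   sumaclust -l -f -p 1 -t 0.97 -R 1 -O otu_map.txt reads.fasta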
+
+ _synonyms = {}
+ _input_handler = '_input_as_string'
+    _suppress_stdout = False
+    _suppress_stderr = False
+
+ def _get_result_paths(self, data):
+ """ Set the result paths
+ """
+
+ result = {}
+
+ # OTU map (mandatory output)
+ result['OtuMap'] = ResultPath(Path=self.Parameters['-O'].Value,
+ IsWritten=True)
+
+ # SumaClust will not produce any output file if the
+ # input file was empty, so we create an empty
+ # output file
+ if not isfile(result['OtuMap'].Path):
+ otumap_f = open(result['OtuMap'].Path, 'w')
+ otumap_f.close()
+
+ return result
+
+ def getHelp(self):
+ """ Method that points to documentation
+ """
+ help_str = ("SumaClust is hosted at:\n"
+ "http://metabarcoding.org/sumatra/\n\n"
+ "The following paper should be cited if this resource "
+ "is used:\n\n"
+ "SUMATRA and SUMACLUST: fast and exact comparison and "
+ "clustering "
+ "of full-length barcode sequences\n"
+ "Mercier, C., Boyer, F., Kopylova, E., Taberlet, P., "
+ "Bonin, A. and Coissac E.,"
+ "2014 (in preparation)\n"
+ )
+
+ return help_str
+
+
+def sumaclust_denovo_cluster(seq_path=None,
+ result_path=None,
+ shortest_len=True,
+ similarity=0.97,
+ threads=1,
+ exact=False,
+ HALT_EXEC=False
+ ):
+ """ Function : launch SumaClust de novo OTU picker
+
+ Parameters: seq_path, filepath to reads;
+ result_path, filepath to output OTU map;
+ shortest_len, boolean;
+ similarity, the similarity threshold (between (0,1]);
+ threads, number of threads to use;
+ exact, boolean to perform exact matching
+
+ Return : clusters, list of lists
+ """
+
+ # Sequence path is mandatory
+ if (seq_path is None
+ or not exists(seq_path)):
+ raise ValueError("Error: FASTA query sequence filepath is "
+ "mandatory input.")
+
+ # Output directory is mandatory
+ if (result_path is None
+ or not isdir(dirname(realpath(result_path)))):
+ raise ValueError("Error: output directory is mandatory input.")
+
+ # Instantiate the object
+ sumaclust = Sumaclust(HALT_EXEC=HALT_EXEC)
+
+ # Set the OTU-map filepath
+ sumaclust.Parameters['-O'].on(result_path)
+
+ # Set the similarity threshold
+ if similarity is not None:
+ sumaclust.Parameters['-t'].on(similarity)
+
+ # Set the option to perform exact clustering (default: False)
+ if exact:
+ sumaclust.Parameters['-e'].on()
+
+ # Turn off option for reference sequence length to be the shortest
+ if not shortest_len:
+ sumaclust.Parameters['-l'].off()
+
+ # Set the number of threads
+ if threads > 0:
+ sumaclust.Parameters['-p'].on(threads)
+ else:
+ raise ValueError("Number of threads must be positive.")
+
+ # Launch SumaClust,
+ # set the data string to include the read filepath
+ # (to be passed as final arguments in the sumaclust command)
+ app_result = sumaclust(seq_path)
+
+ # Put clusters into a list of lists
+ f_otumap = app_result['OtuMap']
+ clusters = [line.strip().split('\t')[1:] for line in f_otumap]
+
+ # Return clusters
+ return clusters
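+
+# Illustrative call (placeholder paths): cluster reads de novo at the
+# default 97% similarity using 4 threads.
+#
+#   clusters = sumaclust_denovo_cluster(seq_path='/data/reads.fasta',
+#                                       result_path='/tmp/otu_map.txt',
+#                                       threads=4)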
diff --git a/bfillings/swarm_v127.py b/bfillings/swarm_v127.py
new file mode 100644
index 0000000..820d3e6
--- /dev/null
+++ b/bfillings/swarm_v127.py
@@ -0,0 +1,299 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+
+"""
+Application controller for Swarm version 1.2.7
+==============================================
+"""
+
+# ----------------------------------------------------------------------------
+# Copyright (c) 2014--, biocore development team
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+# ----------------------------------------------------------------------------
+
+from os.path import exists
+from tempfile import mkstemp
+from os import close, linesep
+from subprocess import Popen, PIPE
+import re
+
+from burrito.util import (CommandLineApplication, ResultPath,
+ ApplicationNotFoundError)
+from burrito.parameters import ValuedParameter
+from skbio.parse.sequences import parse_fasta
+from skbio.util import remove_files
+
+
+class Swarm(CommandLineApplication):
+ """ Swarm generic application controller for de novo OTU picking
+ """
+
+ _command = 'swarm'
+ _command_delimiter = ' '
+ _parameters = {
+ # Resolution
+ '-d': ValuedParameter('-', Name='d', Delimiter=' ',
+ Value=1, IsPath=False),
+ # OTU-map result filename
+ '-o': ValuedParameter('-', Name='o', Delimiter=' ',
+ Value=None, IsPath=True),
+ # Threads
+ '-t': ValuedParameter('-', Name='t', Delimiter=' ',
+ Value=1, IsPath=False),
+ }
+
+ _synonyms = {}
+ _input_handler = '_input_as_string'
+    _suppress_stdout = False
+    _suppress_stderr = False
+ files_to_remove = []
+
+ def __call__(self, seq_path):
+ """
+ Input : seq_path, a filepath to input FASTA reads
+
+ Method: de-replicate FASTA reads,
+ launch Swarm followed by swarm_breaker.py,
+ expand clusters
+
+ Return: clusters, a list of lists
+ """
+
+ # De-replicate query sequences
+ exact_match_id_map, seq_path =\
+ self._apply_identical_sequences_prefilter(seq_path)
+
+ # Run Swarm
+ super(Swarm, self).__call__(seq_path)
+
+ # Run swarm_breaker.py to refine the clusters
+ clusters = self._swarm_breaker(seq_path)
+
+ # Expand clusters
+ clusters = self._map_filtered_clusters_to_full_clusters(
+ clusters, exact_match_id_map)
+
+ return clusters
+
+ def _accept_exit_status(self, exit_status):
+ """ Test for acceptable exit status
+ """
+ return exit_status == 0
+
+ def _swarm_breaker(self,
+ seq_path):
+ """
+ Input : seq_path, a filepath to de-replicated
+ input FASTA reads
+
+ Method: using swarm_breaker.py, break
+ chains of amplicons based on
+ abundance information. Abundance
+ is stored after the final
+ underscore '_' in each sequence
+ label (recommended procedure for
+ Swarm)
+
+ Return: clusters, a list of lists
+ """
+ swarm_breaker_command = ["swarm_breaker.py",
+ "-f",
+ seq_path,
+ "-s",
+ self.Parameters['-o'].Value,
+ "-d",
+ str(self.Parameters['-d'].Value)]
+
+ try:
+ # launch swarm_breaker.py as a subprocess,
+ # pipe refined OTU-map to the standard stream
+ proc = Popen(swarm_breaker_command,
+ stdout=PIPE,
+ stderr=PIPE,
+ close_fds=True)
+
+ stdout, stderr = proc.communicate()
+
+            if stderr:
+                # RuntimeError subclasses StandardError on Python 2,
+                # so existing callers remain compatible
+                raise RuntimeError(
+                    "swarm_breaker.py reported an error: %s" % stderr)
+
+ # store refined clusters in list of lists
+ clusters = []
+ for line in stdout.split(linesep):
+                # skip blank lines (e.g. the trailing empty string
+                # left by a final line separator)
+                if not line:
+                    continue
+ seq_ids = re.split("\t| ", line.strip())
+ # remove the abundance information from the labels
+ for i in range(len(seq_ids)):
+ seq_ids[i] = seq_ids[i].rsplit("_", 1)[0]
+ clusters.append(seq_ids)
+ except OSError:
+ raise ApplicationNotFoundError("Cannot find swarm_breaker.py "
+ "in the $PATH directories.")
+
+ return clusters
+
+ def _prefilter_exact_matches(self,
+ seqs):
+ """
+ """
+ unique_sequences = {}
+ seq_id_map = {}
+ filtered_seqs = []
+ for seq_id, seq in seqs:
+ seq_id = seq_id.split()[0]
+ try:
+ temp_seq_id = unique_sequences[seq]
+ except KeyError:
+ temp_seq_id = 'ExactMatch.%s' % seq_id
+ unique_sequences[seq] = temp_seq_id
+ seq_id_map[temp_seq_id] = []
+ filtered_seqs.append((temp_seq_id, seq))
+ seq_id_map[temp_seq_id].append(seq_id)
+ return filtered_seqs, seq_id_map
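+        # e.g. [('s1', 'ACGT'), ('s2', 'ACGT'), ('s3', 'TTGG')] yields
+        # filtered_seqs [('ExactMatch.s1', 'ACGT'), ('ExactMatch.s3', 'TTGG')]
+        # and seq_id_map {'ExactMatch.s1': ['s1', 's2'],
+        #                 'ExactMatch.s3': ['s3']}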
+
+ def _apply_identical_sequences_prefilter(self,
+ seq_path):
+ """
+ Input : seq_path, a filepath to input FASTA reads
+ Method: prepares and writes de-replicated reads
+ to a temporary FASTA file, calls
+ parent method to do the actual
+ de-replication
+ Return: exact_match_id_map, a dictionary storing
+ de-replicated amplicon ID as key and
+ all original FASTA IDs with identical
+ sequences as values;
+ unique_seqs_fp, filepath to FASTA file
+ holding only de-replicated sequences
+ """
+ # creating mapping for de-replicated reads
+ seqs_to_cluster, exact_match_id_map =\
+ self._prefilter_exact_matches(parse_fasta(seq_path))
+
+ # create temporary file for storing the de-replicated reads
+ fd, unique_seqs_fp = mkstemp(
+ prefix='SwarmExactMatchFilter', suffix='.fasta')
+ close(fd)
+
+ self.files_to_remove.append(unique_seqs_fp)
+
+ # write de-replicated reads to file
+ unique_seqs_f = open(unique_seqs_fp, 'w')
+ for seq_id, seq in seqs_to_cluster:
+ unique_seqs_f.write('>%s_%d\n%s\n'
+ % (seq_id,
+ len(exact_match_id_map[seq_id]),
+ seq))
+ unique_seqs_f.close()
+
+ return exact_match_id_map, unique_seqs_fp
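+        # the abundance suffix written above ('>ExactMatch.s1_2' for a
+        # read seen twice) is what swarm_breaker.py later strips with
+        # rsplit('_', 1)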
+
+ def _map_filtered_clusters_to_full_clusters(self,
+ clusters,
+ filter_map):
+ """
+ Input: clusters, a list of cluster lists
+ filter_map, the seq_id in each clusters
+ is the key to the filter_map
+ containing all seq_ids with
+ duplicate FASTA sequences
+ Output: an extended list of cluster lists
+ """
+ results = []
+ for cluster in clusters:
+ full_cluster = []
+ for seq_id in cluster:
+ full_cluster += filter_map[seq_id]
+ results.append(full_cluster)
+ return results
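+        # e.g. clusters = [['ExactMatch.s1']] with
+        # filter_map = {'ExactMatch.s1': ['s1', 's2']}
+        # expands to [['s1', 's2']]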
+
+ def _get_result_paths(self, data):
+ """ Set the result paths
+ """
+
+ # Swarm OTU map (mandatory output)
+ return {'OtuMap': ResultPath(Path=self.Parameters['-o'].Value,
+ IsWritten=True)}
+
+ def getHelp(self):
+ """ Method that points to documentation
+ """
+ help_str = ("Swarm is hosted at:\n"
+ "https://github.com/torognes/swarm\n\n"
+ "The following paper should be cited if this resource "
+ "is used:\n\n"
+ "Swarm: robust and fast clustering method for "
+ "amplicon-based studies\n"
+ "Mahe, F., Rognes, T., Quince, C., de Vargas, C., "
+ "and Dunthorn, M."
+ "2014 (submitted)\n"
+ )
+
+ return help_str
+
+
+def swarm_denovo_cluster(seq_path,
+ d=1,
+ threads=1,
+ HALT_EXEC=False):
+ """ Function : launch the Swarm de novo OTU picker
+
+ Parameters: seq_path, filepath to reads
+ d, resolution
+ threads, number of threads to use
+
+ Return : clusters, list of lists
+ """
+
+ # Check sequence file exists
+ if not exists(seq_path):
+ raise ValueError("%s does not exist" % seq_path)
+
+ # Instantiate the object
+ swarm = Swarm(HALT_EXEC=HALT_EXEC)
+
+ # Set the resolution
+ if d > 0:
+ swarm.Parameters['-d'].on(d)
+ else:
+ raise ValueError("Resolution -d must be a positive integer.")
+
+ # Set the number of threads
+ if threads > 0:
+ swarm.Parameters['-t'].on(threads)
+ else:
+ raise ValueError("Number of threads must be a positive integer.")
+
+ # create temporary file for Swarm OTU-map
+ f, tmp_swarm_otumap = mkstemp(prefix='temp_otumap_',
+ suffix='.swarm')
+ close(f)
+
+ swarm.Parameters['-o'].on(tmp_swarm_otumap)
+
+ # Remove this file later, the final OTU-map
+ # is output by swarm_breaker.py and returned
+ # as a list of lists (clusters)
+ swarm.files_to_remove.append(tmp_swarm_otumap)
+
+ # Launch Swarm
+ # set the data string to include the read filepath
+ # (to be passed as final arguments in the swarm command)
+ clusters = swarm(seq_path)
+
+ remove_files(swarm.files_to_remove, error_on_missing=False)
+
+ # Return clusters
+ return clusters
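+
+# Illustrative call (placeholder path): pick OTUs at resolution d=1 on
+# 4 threads; both swarm and swarm_breaker.py must be on the PATH.
+#
+#   clusters = swarm_denovo_cluster('/data/reads.fasta', d=1, threads=4)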
diff --git a/bfillings/tests/__init__.py b/bfillings/tests/__init__.py
new file mode 100644
index 0000000..0b50c1b
--- /dev/null
+++ b/bfillings/tests/__init__.py
@@ -0,0 +1,9 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
diff --git a/bfillings/tests/test_blast.py b/bfillings/tests/test_blast.py
new file mode 100644
index 0000000..f98e8e9
--- /dev/null
+++ b/bfillings/tests/test_blast.py
@@ -0,0 +1,256 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+
+from string import split, strip
+from os import popen, remove
+from glob import glob
+from unittest import TestCase, main
+
+from cogent.parse.blast import QMEBlast9
+
+from bfillings.blast import (seqs_to_stream, make_subject_match_scorer,
+ make_shotgun_scorer, keep_everything_scorer,
+ ids_from_seq_lower_threshold, PsiBlast,
+ psiblast_n_neighbors)
+
+
+class BlastTests(TestCase):
+ """Tests of top-level functions"""
+
+ def setUp(self):
+ """Define some standard data"""
+ self.rec = """# BLASTP 2.2.10 [Oct-19-2004]
+# Iteration: 1
+# Query: ece:Z4181
+# Database: db/everything.faa
+# Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score
+ece:Z4181 ece:Z4181 100.00 110 0 0 1 110 1 110 3e-47 187
+ece:Z4181 ecs:ECs3717 100.00 110 0 0 1 110 1 110 3e-47 187
+ece:Z4181 cvi:CV2421 41.67 72 42 0 39 110 29 100 2e-06 52.8
+# BLASTP 2.2.10 [Oct-19-2004]
+# Iteration: 2
+# Query: ece:Z4181
+# Database: db/everything.faa
+# Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score
+ece:Z4181 ece:Z4181 100.00 110 0 0 1 110 1 110 3e-54 211
+ece:Z4181 ecs:ECs3717 100.00 110 0 0 1 110 1 110 3e-54 211
+ece:Z4181 cvi:CV2421 41.67 72 42 0 39 110 29 100 2e-08 59.0
+ece:Z4181 sfl:CP0138 33.98 103 57 2 8 110 6 97 6e-06 50.5
+ece:Z4181 spt:SPA2730 37.50 72 45 0 39 110 30 101 1e-05 49.8
+ece:Z4181 sec:SC2804 37.50 72 45 0 39 110 30 101 1e-05 49.8
+ece:Z4181 stm:STM2872 37.50 72 45 0 39 110 30 101 1e-05 49.8
+# BLASTP 2.2.10 [Oct-19-2004]
+# Iteration: 1
+# Query: ece:Z4182
+# Database: db/everything.faa
+# Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score
+ece:Z4182 ece:Z4182 100.00 110 0 0 1 110 1 110 3e-47 187
+ece:Z4182 ecs:ECs3718 100.00 110 0 0 1 110 1 110 3e-47 187
+ece:Z4182 cvi:CV2422 41.67 72 42 0 39 110 29 100 2e-06 52.8""".split('\n')
+
+ self.rec2 = """# BLASTP 2.2.10 [Oct-19-2004]
+# Iteration: 1
+# Query: ece:Z4181
+# Database: db/everything.faa
+# Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score
+ece:Z4181 ece:Z4181 100.00 110 0 0 1 110 1 110 3e-47 187
+ece:Z4181 ecs:ECs3717 100.00 110 0 0 1 110 1 110 3e-47 187
+ece:Z4181 spt:SPA2730 37.50 72 45 0 39 110 30 101 1e-05 49.8
+# BLASTP 2.2.10 [Oct-19-2004]
+# Iteration: 2
+# Query: ece:Z4181
+# Database: db/everything.faa
+# Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score
+ece:Z4181 ecs:ECs3717 100.00 110 0 0 1 110 1 110 3e-54 211
+ece:Z4181 cvi:CV2421 41.67 72 42 0 39 110 29 100 2e-08 59.0
+# BLASTP 2.2.10 [Oct-19-2004]
+# Iteration: 1
+# Query: ece:Z4182
+# Database: db/everything.faa
+# Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score
+ece:Z4182 ece:Z4182 100.00 110 0 0 1 110 1 110 3e-47 187
+ece:Z4182 cvi:CV2421 41.67 72 42 0 39 110 29 100 2e-06 52.8""".split('\n')
+
+ self.rec3 = """# BLASTP 2.2.10 [Oct-19-2004]
+# BLASTP 2.2.10 [Oct-19-2004]
+# Query: ece:Z4181
+# Database: db/everything.faa
+# Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score
+ece:Z4181 ece:Z4182 100.00 110 0 0 1 110 1 110 3e-47 187
+ece:Z4181 ecs:ECs3717 100.00 110 0 0 1 110 1 110 3e-47 187
+ece:Z4181 spt:SPA2730 37.50 72 45 0 39 110 30 101 1e-05 49.8
+# BLASTP 2.2.10 [Oct-19-2004]
+# Iteration: 1
+# Query: ece:Z4182
+# Database: db/everything.faa
+# Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score
+ece:Z4182 ece:Z4182 100.00 110 0 0 1 110 1 110 3e-47 187
+ece:Z4182 cvi:CV2421 41.67 72 42 0 39 110 29 100 2e-06 52.8
+# BLASTP 2.2.10 [Oct-19-2004]
+# Query: ece:Z4183
+# Database: db/everything.faa
+# Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score
+ece:Z4183 ece:Z4182 100.00 110 0 0 1 110 1 110 3e-47 187
+ece:Z4183 ecs:ECs3717 100.00 110 0 0 1 110 1 110 3e-54 211
+ece:Z4183 cvi:CV2421 41.67 72 42 0 39 110 29 100 2e-08 59.0""".split('\n')
+
+ self.query_1 = """>gi|100002553| Bd2556c Bd2556c two-component system sensor histidine kinase 3092017:3094158 reverse MW:81963
+MRLKNRLNNWISIRMGMVIVIFLGVSCGSMRSSTPPPAKDRLTEIDSLERLLPDCPTIASTLPLLRRLAFLYQQQSEMKVYNERLYENAMAVDSISVAYLGLKNLAEYYYDQSVRDSLEYYCSLVDSIAKARHEYPNVLFDVKSLSSQDLLWLGNYELAMSEAMDLYRLASNLDHRYGLLRCSETLGLIYQRIRRDSDAVVSFQESLDLLKDIKDVPDIMDTKVRLTSYQLESSVRTKQYASTERILGQYMALLDEQYKIYQEKNDLLSIKREYWLLYSFYTSFYLSQGDLENAKRSLDQASSYADSNWVEGDYAINTYLTVKARYHKAAGDIPLALHCINEVLETERLPEDIQFKADILKEQGQLGEVMALYDELYSTLTKRRGTSFLRQVNQLRTLHELHEKELKETELKEAGQRIARKQDLLIFILSISVVLLILLYVLFLYYRHLRSLKNQLQREKELLLESQRQLIKEKTRAEEASLMKSAFLANMS [...]
+
+ self.query_2 = """>gi|100002557| Bd2560c Bd2560c conserved hypothetical protein 3097971:3098210 reverse MW:8927
+MGKNQLIHGNEFHLLKQAEIHKATGKLVESLNLAAGSTGGFDIYKVVEAYFTDLEKRKEINDLLGISEPCETRVTEECFS
+"""
+
+ self.fasta_recs = """>gi|100002550| Bd2553c Bd2553c conserved hypothetical protein 3090609:3091013 reverse MW:14682
+MMDFISVPLVVGIVCAGIYGLFELFVRKRERLAIIEKIGDKLDTSAFDGKLGLPNYMRNFSFSSLKAGCLLAGIGLGLLVGFIINMCMATNSYYDDGWYRHEVAGTAYGASVLLFGGIGLIIAFVIELKLGKNNK
+>gi|100002551| Bd2554 Bd2554 RNA polymerase ECF-type sigma factor 3091112:3091717 forward MW:23408
+LLPQVVTYLPGLRPLSTMELYTDTYYIQRIQAGDVACFACLLDKYSRPIHSLILKVVRSQEEAEELAQDTFMKVFKNLASFKGDCSFSTWIYRIAYNTAISSVRKKRYEFLAIEETTLENVSEEEITNLFGQTESTEQVQRLEVALEQLLPDERALILLFYWKEKTIEELVSITGLTASNIKVKLHRIRKKLFVLLNGMDHE
+>gi|100002552| Bd2555 Bd2555 conserved hypothetical protein 3091713:3092066 forward MW:13332
+MSKINTNKEQPDLLGDLFKRIPEEELPASFRSNVMRQIMLESAKAKKRDERFSLLAAIVASLIMISLAIVSFVYMEIPKIAIPTISTSALAFYLYIGAITLILLLADYKLRNLFHKKG
+>gi|100002553| Bd2556c Bd2556c two-component system sensor histidine kinase 3092017:3094158 reverse MW:81963
+MRLKNRLNNWISIRMGMVIVIFLGVSCGSMRSSTPPPAKDRLTEIDSLERLLPDCPTIASTLPLLRRLAFLYQQQSEMKVYNERLYENAMAVDSISVAYLGLKNLAEYYYDQSVRDSLEYYCSLVDSIAKARHEYPNVLFDVKSLSSQDLLWLGNYELAMSEAMDLYRLASNLDHRYGLLRCSETLGLIYQRIRRDSDAVVSFQESLDLLKDIKDVPDIMDTKVRLTSYQLESSVRTKQYASTERILGQYMALLDEQYKIYQEKNDLLSIKREYWLLYSFYTSFYLSQGDLENAKRSLDQASSYADSNWVEGDYAINTYLTVKARYHKAAGDIPLALHCINEVLETERLPEDIQFKADILKEQGQLGEVMALYDELYSTLTKRRGTSFLRQVNQLRTLHELHEKELKETELKEAGQRIARKQDLLIFILSISVVLLILLYVLFLYYRHLRSLKNQLQREKELLLESQRQLIKEKTRAEEASLMKSAFLANMS [...]
+>gi|100002554| Bd2557c Bd2557c two-component system sensor histidine kinase 3094158:3095507 reverse MW:51247
+LERKYNGEGKIFPVKRHRCLMSCYYCELYTMKGNSGKAQAYLDQATAYLDSSFGDRVEAQYLRTKSFYYWKEKDYRHALSAVNLALKINRDLDKLEMKKAVLQSSGQLQEAVTIYEEIINKTETINTDAFDRQIEQLRVLNDLNDLEKQDRELKLKSEQEALKQKQIVVSIGLLLVLMGLLYMLWRIYMHTKRLRNELLQEKDSLTASEKQLRVVTKEAEAANKKKSAFIANISHEVRTPLNAIVGFSELLASSEYSEEEKIRFAGEVNHSSELLLNLVNDVLDLSRLESGKIKFSVKPNDLVACCQRALDSIRHRVKPGVRLTFTPSIESYTLNTDALRLQQLLTNLLSNAAKFTSEGEINLSFTVDEGKEEVCFSVTDTGCGIPEDKCEKIFERFEKLDDFIQGTGLGLSVCQIISEQLNGSLSVDISYKDGARFVFIHPTNLIETPI
+>gi|100002555| Bd2558c Bd2558c hypothetical protein 3095527:3095985 reverse MW:17134
+LRGKNIHLGRVGCNYGKLLIFIDIYFVSLRIVSDKSMSRGFLRKSSVNTFIGIVWILFAVGTSAQNAVSKFRADSIRQSLSRIQKPQDKIPLLKELIGLYWQLPEEVLALKEIIDIAMPLDSIGIVYDAMAGLSRYYPAIRTFVRVGGALETV
+>gi|100002556| Bd2559 Bd2559 30S ribosomal protein S1 3096095:3097882 forward MW:67092
+MENLKNIQPVEDFNWDAFEQGETYTEVSKDDLVKTYDETLNTVKDKEVVMGTVTSMNKREVVVNIGFKSDGVVPMSEFRYNPDLKIGDEVEVYIESQEDKKGQLILSHKKARATRSWDRVNEALEKDEIIKGYIKCRTKGGMIVDVFGIEAFLPGSQIDVKPIRDYDVFVGKTMEFKIVKINQEFKNVVVSHKALIEAELEQQKKDIISKLEKGQVLEGTVKNITSYGVFIDLGGVDGLIHITDLSWGRVSHPEEIVQLDQKINVVILDFDDEKKRIALGLKQLTPHPWDALDTNLKVGDKVKGKVVVMADYGAFIEIAPGVEGLIHVSEMSWTQHLRSAQDFMKVGDEIEAVILTLDRDERKMSLGIKQLKADPWENIEERFPVGSRHAAKVRNFTNFGVFVEIEEGVDGLIHISDLSWTKKIKHPSEFTQIGAEIEVQVLEIDKENRRLSLGHKQLEENPWDVFETIFTVGSIHEGTIIEVLDKGAVISL [...]
+>gi|100002557| Bd2560c Bd2560c conserved hypothetical protein 3097971:3098210 reverse MW:8927
+MGKNQLIHGNEFHLLKQAEIHKATGKLVESLNLAAGSTGGFDIYKVVEAYFTDLEKRKEINDLLGISEPCETRVTEECFS
+>gi|100002558| Bd2561 Bd2561 phosphoglycolate phosphatase 3098389:3099033 forward MW:24182
+MKKLVIFDLDGTLLNTIADLAHSTNHALRQNGFPTHDVKEYNFFVGNGINKLFERALPEGEKTAENILKVREEFLKHYDLHNTDRSVPYPGVPELLALLQERGIKLAVASNKYQAATRKLIAHFFPSIQFTEVLGQREGVKAKPDPSIVNEIVERASISKESTLYVGDSDVDMQTAINSEVTSCGVTWGFRPRTELEKYAPDHIAEKAEDILKFI
+>gi|100002559| Bd2562 Bd2562 conserved hypothetical protein 3099382:3100299 forward MW:35872
+MSGNIKKIVEPNSGIDYSLEKDFKIFTLSKELPITTYPSYIRLGIVIYCVKGNAKIDIYSNKHIITPKELIIILPGQLVALTDVSVDFQIRYFTITESFYSDILSGISRFSPHFFFYMRQHYYFKMEDVETLSFVDFFELLIRKAVDPENQYRRESVILLLRILFLDIYNHYKVNSLDSTATIDVHKKELTHKFFQLVMSNYKVNRSVTFYANSLCITPKYLTMVVKEVSGKSAKDWITEYMILELKGLLTNSTLNIQEIVEKTQFSNQSSLGRFFRRHTGLSPLQYRKKYLTTEQRTNFSKNNTI
+"""
+
+ def test_seqs_to_stream(self):
+ """seqs_to_stream should iterate over seqs"""
+ sts = seqs_to_stream
+ self.assertEqual(list(sts('>a\nTG\n>b\nWW\n', \
+ '_input_as_multiline_string')),\
+ [['>a','TG'],['>b','WW']])
+ #skipping test for file open
+ self.assertEqual(list(sts(['TG','WW'], '_input_as_seqs')), \
+ [['>0','TG'],['>1','WW']])
+ self.assertEqual(list(sts(['>a','TG','>b','WW'], \
+ '_input_as_lines')),\
+ [['>a','TG'],['>b','WW']])
+ self.assertRaises(TypeError, sts, 'abc', 'xyz')
+
+ def test_make_subject_match_scorer(self):
+ """make_subject_match_scorer should keep ids matching n queries"""
+ qm1 = make_subject_match_scorer(1)
+ qm3 = make_subject_match_scorer(3)
+ qm5 = make_subject_match_scorer(5)
+ qmes = wrap_qmes(QMEBlast9(self.rec3))
+ self.assertItemsEqual(qm1(qmes), ['ece:Z4181','ece:Z4182','ece:Z4183'])
+ self.assertItemsEqual(qm3(qmes), ['ece:Z4181','ece:Z4183'])
+ self.assertItemsEqual(qm5(qmes), [])
+
+ def test_make_shotgun_scorer(self):
+ """make_shotgun_scorer should keep ids matching n queries"""
+ sg1 = make_shotgun_scorer(1)
+ sg2 = make_shotgun_scorer(2)
+ sg3 = make_shotgun_scorer(3)
+ sg4 = make_shotgun_scorer(4)
+ sg5 = make_shotgun_scorer(5)
+ qmes = wrap_qmes(QMEBlast9(self.rec3))
+ self.assertItemsEqual(sg1(qmes), keep_everything_scorer(qmes))
+ self.assertItemsEqual(sg2(qmes), \
+ ['ece:Z4181','ece:Z4182','ece:Z4183','cvi:CV2421','ecs:ECs3717'])
+ self.assertItemsEqual(sg3(qmes), \
+ ['ece:Z4181','ece:Z4182','ece:Z4183'])
+ self.assertItemsEqual(sg4(qmes), \
+ ['ece:Z4182'])
+ self.assertItemsEqual(sg5(qmes), [])
+
+ def test_keep_everything_scorer(self):
+ """keep_everything_scorer should keep all ids found."""
+ k = keep_everything_scorer(wrap_qmes(QMEBlast9(self.rec2)))
+ self.assertItemsEqual(k, \
+ ['ece:Z4181','ecs:ECs3717','spt:SPA2730','cvi:CV2421','ece:Z4182'])
+
+ def test_ids_from_seq_lower_threshold(self):
+ "ids_from_seq_lower_threshold returns psiblast hits, decreasing sens"
+ bdb_seqs = self.fasta_recs
+ f = open('test_bdb', 'w')
+ f.write(bdb_seqs)
+ f.close()
+ temp = popen('formatdb -i test_bdb -o T -p T')
+ params = {'-j':2,
+ '-d':'test_bdb'}
+ query = self.query_1.split('\n')
+ app = PsiBlast(params=params,
+ InputHandler='_input_as_lines')
+ #the command below should result in finding itself and 2554
+ #it should run for max_iterations
+ result = ids_from_seq_lower_threshold(query, n=12, \
+ max_iterations=4, app=app, core_threshold=1e-50, \
+ lower_threshold=1e-20, step=10000)
+ self.assertEqual(result[0],\
+ [('gi|100002553', '0.0'), ('gi|100002554', '0.0')])
+ self.assertEqual(result[1], 4)
+ #if n=2, it should find the same sequences but only run for 1 iteration
+ #since it would hit n after the first blast search
+ result = ids_from_seq_lower_threshold(query, n=2, \
+ max_iterations=4, app=app, core_threshold=1e-50, \
+ lower_threshold=1e-20, step=10000)
+ self.assertEqual(result[0],\
+ [('gi|100002553', '0.0'), ('gi|100002554', '0.0')])
+ self.assertEqual(result[1], 1)
+ query = self.query_2.split('\n')
+        #query_2's e-value for itself is 9e-47, so it should not be found
+        #with the lower_threshold set to 1e-48
+ result = ids_from_seq_lower_threshold(query, n=12, \
+ max_iterations=4, app=app, core_threshold=1e-50, \
+ lower_threshold=1e-48, step=10000)
+ self.assertEqual(result[0], [])
+ #it also should not be found if the max_iterations is set to 1
+ result = ids_from_seq_lower_threshold(query, n=12, \
+ max_iterations=1, app=app, core_threshold=1e-50, \
+ lower_threshold=1e-20, step=10000)
+ self.assertEqual(result[0], [])
+ for fname in ['formatdb.log'] + glob('test_bdb*'):
+ remove(fname)
+
+ def test_psiblast_n_neighbors(self):
+ "psiblast_n_neighbors psiblasts and stops when n neighbors are reached"
+ bdb_seqs = self.fasta_recs
+ f = open('test_bdb', 'w')
+ f.write(bdb_seqs)
+ f.close()
+ temp = popen('formatdb -i test_bdb -o T -p T')
+ params = {'-j':11}
+ lines = bdb_seqs.split('\n')
+ results = psiblast_n_neighbors(lines, n=12, blast_db='test_bdb', \
+ method='lower_threshold', params=params,\
+ core_threshold=1e-50, step=10000)
+ #there should be 10 result entries since there were 10 queries
+ self.assertEqual(len(results), 10)
+ for i in results:
+ #each query should at least find itself
+            self.assertTrue(len(results[i][0]) >= 1)
+            #each query should iterate 11 times (the -j value) since it
+            #can never reach n
+            self.assertEqual(results[i][1], 11)
+ for fname in ['formatdb.log'] + glob('test_bdb*'):
+ remove(fname)
+
+
+def wrap_qmes(qmes):
+ """Converts qmes into a dict of {q:{m:e}}"""
+ d = {}
+ for q, m, e in qmes:
+ if q not in d:
+ d[q] = {}
+ d[q][m] = e
+ return d
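+    # e.g. wrap_qmes([('q1', 'm1', '1e-5'), ('q1', 'm2', '0.0')]) returns
+    # {'q1': {'m1': '1e-5', 'm2': '0.0'}}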
+
+if __name__ == "__main__":
+ main()
diff --git a/bfillings/tests/test_blat.py b/bfillings/tests/test_blat.py
new file mode 100755
index 0000000..190d15a
--- /dev/null
+++ b/bfillings/tests/test_blat.py
@@ -0,0 +1,346 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+
+from unittest import TestCase, main
+from os.path import exists
+from os import remove
+from re import search
+from tempfile import mkstemp
+
+from bfillings.blat import (Blat, assign_reads_to_database,
+ assign_dna_reads_to_dna_database,
+ assign_dna_reads_to_protein_database)
+
+__author__ = "Adam Robbins-Pianka"
+__copyright__ = "Copyright 2007-2012, The Cogent Project"
+__credits__ = ["Adam Robbins-Pianka", "Daniel McDonald"]
+__license__ = "GPL"
+__version__ = "1.5.3-dev"
+__maintainer__ = "Adam Robbins-Pianka"
+__email__ = "adam.robbinspianka at colorado.edu"
+__status__ = "Production"
+
+
+class BlatTests(TestCase):
+ files_to_remove = []
+
+ def setUp(self):
+ """Sets up files for testing.
+ """
+ _, self.test_db_prot_filename = mkstemp()
+ self.test_db_prot = open(self.test_db_prot_filename, 'w')
+
+ _, self.test_db_dna_filename = mkstemp()
+ self.test_db_dna = open(self.test_db_dna_filename, 'w')
+
+ _, self.test_query_filename = mkstemp()
+ self.test_query = open(self.test_query_filename, 'w')
+
+ # write the global variables at the bottom of this file to the
+ # temporary test files. Can't use file-like objects because the
+ # external application needs actual files.
+ self.test_db_prot.write('\n'.join(test_db_prot))
+ self.test_db_dna.write('\n'.join(test_db_dna))
+ self.test_query.write('\n'.join(test_query))
+
+ # close the files
+ self.test_db_prot.close()
+ self.test_db_dna.close()
+ self.test_query.close()
+
+ # prepare output file path
+ _, self.testout = mkstemp()
+
+ self.files_to_remove += [self.test_db_prot_filename,
+ self.test_db_dna_filename,
+ self.test_query_filename, self.testout]
+
+ def tearDown(self):
+ """Removes temporary files created during the tests
+ """
+ for filename in self.files_to_remove:
+ if exists(filename):
+ remove(filename)
+
+ def test_assign_reads_to_database(self):
+ """Tests that assign_reads_to_database works as expected.
+
+ Checks the output file against the expected result when known
+ database and query files are used.
+ """
+ exp = [l for l in assign_reads_exp if not l.startswith('#')]
+ obs_lines = assign_reads_to_database(self.test_query_filename,
+ self.test_db_dna_filename,
+ self.testout).read().splitlines()
+ obs = [l for l in obs_lines if not l.startswith('#')]
+
+ self.assertEqual(obs, exp)
+
+ def test_assign_dna_reads_to_dna_database(self):
+ """Tests that assign_dna_reads_to_dna_database works as expected.
+
+ Checks the output file against the expected result when known
+ database and query files are used.
+ """
+ exp = [l for l in assign_reads_exp if not l.startswith('#')]
+
+ obs_lines = assign_dna_reads_to_dna_database(self.test_query_filename,
+ self.test_db_dna_filename,
+ self.testout).read().splitlines()
+ obs = [l for l in obs_lines if not l.startswith('#')]
+
+ self.assertEqual(obs, exp)
+
+ def test_assign_dna_reads_to_protein_database(self):
+ """Tests that assign_dna_reads_to_protein_database works as expected.
+
+ Checks the output file against the expected result when known
+ database and query files are used.
+ """
+ exp = [l for l in assign_reads_prot_exp if not l.startswith('#')]
+
+ obs_lines = assign_dna_reads_to_protein_database(
+ self.test_query_filename,
+ self.test_db_prot_filename,
+ self.testout).read().splitlines()
+ obs = [l for l in obs_lines if not l.startswith('#')]
+
+ self.assertEqual(obs, exp)
+
+ def test_get_base_command(self):
+ """Tests that _get_base_command generates the proper command given
+ various inputs.
+ """
+ test_parameters_blank = {}
+ files = (self.test_query_filename, self.test_db_dna_filename,
+ self.testout)
+ exp_blank = 'blat %s %s %s' % (files[1], files[0], files[2])
+
+ # initialize a Blat instance with these parameters and get the
+ # command string
+ b = Blat(params={}, HALT_EXEC=True)
+ # need to set the positional parameters' values
+ b._input_as_list(files)
+ cmd = b._get_base_command()
+
+ # find the end of the cd command and trim the base command
+ cmd_index = search('cd ".+"; ', cmd).end()
+ cmd = cmd[cmd_index:]
+ self.assertEqual(cmd, exp_blank)
+
+ test_parameters_1 = {
+ '-t': 'dna',
+ '-q': 'dna',
+ '-ooc': '11.ooc',
+ '-tileSize': 1,
+ '-stepSize': 2,
+ '-oneOff': 1,
+ '-minMatch': 2,
+ '-minScore': 3,
+ '-minIdentity': 4,
+ '-maxGap': 5,
+ '-makeOoc': 'N.ooc',
+ '-repMatch': 6,
+ '-mask': 'lower',
+ '-qMask': 'lower',
+ '-repeats': 'lower',
+ '-minRepDivergence': 7,
+ '-dots': 8,
+ '-out': 'psl',
+ '-maxIntron': 9}
+ exp_1 = 'blat %s %s ' % (files[1], files[0]) + \
+ '-dots=8 -makeOoc="N.ooc" -mask=lower -maxGap=5 ' + \
+ '-maxIntron=9 -minIdentity=4 -minMatch=2 ' + \
+ '-minRepDivergence=7 -minScore=3 -oneOff=1 -ooc="11.ooc" ' + \
+ '-out=psl -q=dna -qMask=lower -repMatch=6 -repeats=lower ' + \
+ '-stepSize=2 -t=dna -tileSize=1 %s' % files[2]
+
+ # initialize a Blat instance with these parameters and get the
+ # command string
+ b = Blat(params=test_parameters_1, HALT_EXEC=True)
+ # need to set the positional parameters' values
+ b._input_as_list(files)
+ cmd = b._get_base_command()
+
+ # find the end of the cd command and trim the base command
+ cmd_index = search('cd ".+"; ', cmd).end()
+ cmd = cmd[cmd_index:]
+ self.assertEqual(cmd, exp_1)
+
+ test_parameters_2 = {
+ '-tileSize': 1,
+ '-stepSize': 2,
+ '-minMatch': 2,
+ '-minScore': 3,
+ '-minIdentity': 4,
+ '-maxGap': 5,
+ '-makeOoc': 'N.ooc',
+ '-out': 'psl',
+ '-maxIntron': 9}
+ exp_2 = 'blat %s %s ' % (files[1], files[0]) + \
+ '-makeOoc="N.ooc" -maxGap=5 -maxIntron=9 -minIdentity=4 ' + \
+ '-minMatch=2 -minScore=3 -out=psl -stepSize=2 ' + \
+ '-tileSize=1 %s' % files[2]
+
+ # initialize a Blat instance with these parameters and get the
+ # command string
+ b = Blat(params=test_parameters_2, HALT_EXEC=True)
+ # need to set the positional parameters' values
+ b._input_as_list(files)
+ cmd = b._get_base_command()
+
+ # find the end of the cd command and trim the base command
+ cmd_index = search('cd ".+"; ', cmd).end()
+ cmd = cmd[cmd_index:]
+ self.assertEqual(cmd, exp_2)
+
+assign_reads_exp = """# BLAT 34 [2006/03/10]
+# Query: NZ_GG770509_647533119
+# Database: test_db.fasta
+# Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score
+NZ_GG770509_647533119 NZ_GG770509_647533119 100.00 1371 0 0 1 1371 1 1371 0.0e+00 2187.0
+NZ_GG770509_647533119 NZ_ACIZ01000148_643886127 85.49 634 92 0 336 969 337 970 4.5e-234 807.0
+NZ_GG770509_647533119 NZ_ACIZ01000148_643886127 86.08 237 33 0 1135 1371 1137 1373 1.2e-77 287.0
+NZ_GG770509_647533119 NZ_ACIZ01000148_643886127 83.12 154 26 0 976 1129 977 1130 2.2e-48 190.0
+NZ_GG770509_647533119 NZ_GG739926_647533195 78.42 329 71 0 656 984 657 985 4.8e-97 351.0
+NZ_GG770509_647533119 NZ_GG739926_647533195 89.09 110 11 1 1138 1246 1141 1250 1.1e-30 131.0
+NZ_GG770509_647533119 NZ_GG739926_647533195 86.96 69 9 0 1021 1089 1023 1091 3.2e-20 96.0
+NZ_GG770509_647533119 NZ_GG739926_647533195 75.26 97 22 2 356 450 356 452 2.3e-13 73.0
+NZ_GG770509_647533119 NZ_GG739926_647533195 90.57 53 5 0 1319 1371 1315 1367 2.5e-10 63.0
+NZ_GG770509_647533119 NZ_GG739926_647533195 81.82 22 4 0 989 1010 992 1013 1.5e+02 24.0
+# BLAT 34 [2006/03/10]
+# Query: NZ_GG739926_647533195
+# Database: test_db.fasta
+# Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score
+NZ_GG739926_647533195 NZ_GG739926_647533195 100.00 1367 0 0 1 1367 1 1367 0.0e+00 2235.0
+NZ_GG739926_647533195 NZ_ACIZ01000148_643886127 76.22 572 136 0 414 985 414 985 1.7e-158 556.0
+NZ_GG739926_647533195 NZ_ACIZ01000148_643886127 76.80 181 42 0 1023 1203 1022 1202 6.4e-53 205.0
+NZ_GG739926_647533195 NZ_ACIZ01000148_643886127 96.00 50 2 0 1209 1258 1207 1256 6.4e-14 75.0
+NZ_GG739926_647533195 NZ_ACIZ01000148_643886127 88.68 53 6 0 1315 1367 1321 1373 1.6e-09 61.0
+NZ_GG739926_647533195 NZ_ACIZ01000148_643886127 77.27 22 5 0 992 1013 990 1011 8.5e+02 22.0
+NZ_GG739926_647533195 NZ_GG770509_647533119 79.29 280 58 0 657 936 656 935 9.9e-82 301.0
+NZ_GG739926_647533195 NZ_GG770509_647533119 89.09 110 11 1 1141 1250 1138 1246 1.1e-30 131.0
+NZ_GG739926_647533195 NZ_GG770509_647533119 86.96 69 9 0 1023 1091 1021 1089 3.2e-20 96.0
+NZ_GG739926_647533195 NZ_GG770509_647533119 75.26 97 22 2 356 452 356 450 2.3e-13 73.0
+NZ_GG739926_647533195 NZ_GG770509_647533119 90.57 53 5 0 1315 1367 1319 1371 2.5e-10 63.0
+NZ_GG739926_647533195 NZ_GG770509_647533119 80.00 30 6 0 956 985 955 984 1.2e-03 41.0
+NZ_GG739926_647533195 NZ_GG770509_647533119 81.82 22 4 0 992 1013 989 1010 1.5e+02 24.0
+# BLAT 34 [2006/03/10]
+# Query: NZ_ACIZ01000148_643886127
+# Database: test_db.fasta
+# Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score
+NZ_ACIZ01000148_643886127 NZ_ACIZ01000148_643886127 100.00 1373 0 0 1 1373 1 1373 0.0e+00 2165.0
+NZ_ACIZ01000148_643886127 NZ_GG770509_647533119 85.49 634 92 0 337 970 336 969 4.5e-234 807.0
+NZ_ACIZ01000148_643886127 NZ_GG770509_647533119 86.08 237 33 0 1137 1373 1135 1371 1.2e-77 287.0
+NZ_ACIZ01000148_643886127 NZ_GG770509_647533119 83.12 154 26 0 977 1130 976 1129 2.2e-48 190.0
+NZ_ACIZ01000148_643886127 NZ_GG739926_647533195 76.22 572 136 0 414 985 414 985 1.7e-158 556.0
+NZ_ACIZ01000148_643886127 NZ_GG739926_647533195 76.80 181 42 0 1022 1202 1023 1203 6.4e-53 205.0
+NZ_ACIZ01000148_643886127 NZ_GG739926_647533195 96.00 50 2 0 1207 1256 1209 1258 6.4e-14 75.0
+NZ_ACIZ01000148_643886127 NZ_GG739926_647533195 88.68 53 6 0 1321 1373 1315 1367 1.6e-09 61.0
+NZ_ACIZ01000148_643886127 NZ_GG739926_647533195 77.27 22 5 0 990 1011 992 1013 8.5e+02 22.0
+""".splitlines()
+
+assign_reads_prot_exp = """# BLAT 34x13 [2009/02/26]
+# Query: NZ_GG770509_647533119_frame_1
+# Database: /home/adro2179/metagenome/test_db_prot.fasta
+# Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score
+NZ_GG770509_647533119_frame_1 NZ_GG770509_647533119 96.83 441 0 7 1 427 1 441 8.9e-254 872.0
+# BLAT 34x13 [2009/02/26]
+# Query: NZ_GG770509_647533119_frame_2
+# Database: /home/adro2179/metagenome/test_db_prot.fasta
+# Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score
+NZ_GG770509_647533119_frame_2 NZ_ACIZ01000148_643886127 85.37 41 6 0 359 399 362 402 8.0e-13 72.0
+NZ_GG770509_647533119_frame_2 NZ_ACIZ01000148_643886127 93.75 16 1 0 419 434 421 436 1.3e+00 31.0
+NZ_GG770509_647533119_frame_2 NZ_GG739926_647533195 75.86 29 7 0 320 348 326 354 2.9e-04 43.0
+# BLAT 34x13 [2009/02/26]
+# Query: NZ_GG770509_647533119_frame_3
+# Database: /home/adro2179/metagenome/test_db_prot.fasta
+# Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score
+NZ_GG770509_647533119_frame_3 NZ_ACIZ01000148_643886127 80.61 98 19 0 210 307 209 306 7.5e-39 158.0
+NZ_GG770509_647533119_frame_3 NZ_ACIZ01000148_643886127 66.33 98 33 0 43 140 44 141 8.9e-27 118.0
+NZ_GG770509_647533119_frame_3 NZ_ACIZ01000148_643886127 78.95 38 8 0 310 347 308 345 2.3e-08 57.0
+NZ_GG770509_647533119_frame_3 NZ_ACIZ01000148_643886127 66.67 30 10 0 178 207 178 207 2.5e-01 33.0
+NZ_GG770509_647533119_frame_3 NZ_GG739926_647533195 53.00 100 47 0 131 230 134 233 1.9e-18 90.0
+NZ_GG770509_647533119_frame_3 NZ_GG739926_647533195 68.89 45 14 0 238 282 241 285 5.9e-09 59.0
+NZ_GG770509_647533119_frame_3 NZ_GG739926_647533195 72.09 43 12 0 63 105 66 108 3.0e-08 56.0
+# BLAT 34x13 [2009/02/26]
+# Query: NZ_GG739926_647533195_frame_1
+# Database: /home/adro2179/metagenome/test_db_prot.fasta
+# Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score
+NZ_GG739926_647533195_frame_1 NZ_GG739926_647533195 100.00 437 0 0 1 437 1 437 1.7e-263 904.0
+NZ_GG739926_647533195_frame_1 NZ_ACIZ01000148_643886127 69.86 73 22 0 213 285 209 281 1.1e-20 98.0
+NZ_GG739926_647533195_frame_1 NZ_ACIZ01000148_643886127 53.33 60 28 0 148 207 145 204 1.3e-06 51.0
+NZ_GG739926_647533195_frame_1 NZ_ACIZ01000148_643886127 60.53 38 15 0 66 103 64 101 1.9e-03 41.0
+NZ_GG739926_647533195_frame_1 NZ_ACIZ01000148_643886127 76.92 26 6 0 2 27 3 28 9.7e-03 38.0
+NZ_GG739926_647533195_frame_1 NZ_ACIZ01000148_643886127 69.57 23 7 0 288 310 285 307 4.8e+00 29.0
+NZ_GG739926_647533195_frame_1 NZ_ACIZ01000148_643886127 90.00 10 1 0 134 143 132 141 1.6e+04 18.0
+# BLAT 34x13 [2009/02/26]
+# Query: NZ_GG739926_647533195_frame_2
+# Database: /home/adro2179/metagenome/test_db_prot.fasta
+# Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score
+NZ_GG739926_647533195_frame_2 NZ_GG770509_647533119 66.67 42 14 0 270 311 276 317 2.3e-08 57.0
+NZ_GG739926_647533195_frame_2 NZ_GG770509_647533119 60.00 45 18 0 185 229 188 232 3.9e-06 49.0
+NZ_GG739926_647533195_frame_2 NZ_GG770509_647533119 80.00 20 4 0 247 266 251 270 5.6e-01 32.0
+# BLAT 34x13 [2009/02/26]
+# Query: NZ_GG739926_647533195_frame_3
+# Database: /home/adro2179/metagenome/test_db_prot.fasta
+# Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score
+NZ_GG739926_647533195_frame_3 NZ_ACIZ01000148_643886127 94.44 18 1 0 390 407 385 402 4.3e-03 39.0
+# BLAT 34x13 [2009/02/26]
+# Query: NZ_ACIZ01000148_643886127_frame_1
+# Database: /home/adro2179/metagenome/test_db_prot.fasta
+# Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score
+NZ_ACIZ01000148_643886127_frame_1 NZ_ACIZ01000148_643886127 100.00 436 0 0 1 436 1 436 2.1e-261 897.0
+NZ_ACIZ01000148_643886127_frame_1 NZ_GG739926_647533195 78.57 42 9 0 240 281 244 285 4.0e-10 63.0
+NZ_ACIZ01000148_643886127_frame_1 NZ_GG739926_647533195 60.53 38 15 0 64 101 66 103 1.9e-03 41.0
+NZ_ACIZ01000148_643886127_frame_1 NZ_GG739926_647533195 76.92 26 6 0 3 28 2 27 9.7e-03 38.0
+NZ_ACIZ01000148_643886127_frame_1 NZ_GG739926_647533195 69.57 23 7 0 285 307 288 310 4.8e+00 29.0
+# BLAT 34x13 [2009/02/26]
+# Query: NZ_ACIZ01000148_643886127_frame_2
+# Database: /home/adro2179/metagenome/test_db_prot.fasta
+# Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score
+NZ_ACIZ01000148_643886127_frame_2 NZ_GG770509_647533119 79.59 147 26 2 182 324 189 335 2.3e-61 233.0
+NZ_ACIZ01000148_643886127_frame_2 NZ_GG770509_647533119 72.73 33 9 0 128 160 137 169 5.0e-04 42.0
+NZ_ACIZ01000148_643886127_frame_2 NZ_GG770509_647533119 90.91 22 2 0 70 91 76 97 2.5e-03 40.0
+# BLAT 34x13 [2009/02/26]
+# Query: NZ_ACIZ01000148_643886127_frame_3
+# Database: /home/adro2179/metagenome/test_db_prot.fasta
+# Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score
+NZ_ACIZ01000148_643886127_frame_3 NZ_GG770509_647533119 84.21 38 4 1 360 395 367 404 3.0e-08 56.0
+NZ_ACIZ01000148_643886127_frame_3 NZ_GG770509_647533119 94.12 17 1 0 413 429 425 441 1.6e+00 31.0
+NZ_ACIZ01000148_643886127_frame_3 NZ_GG739926_647533195 78.57 28 5 1 321 347 326 353 1.5e-03 41.0"""
+assign_reads_prot_exp = assign_reads_prot_exp.splitlines()
+
+test_db_prot = """>NZ_GG770509_647533119
+YLEFDPGSERTLAAGLTHASRASGRRVSNAWERTICYGITQGNLCYRMetWKVGKSARVGLASWWGKGSPRRRSIAGLRGSATLGLRHGPDSYGRQQWGILDNGRKPDPAMetPRERPGCKALSPVKMetTVTGEEAPANFVPAAAVIRRGLALFGFTGRKAHVGGLLSQGNPGAQPRNCLYWKSVWRVEFRVRNSIFGGTPVAKAAHWTNRGAKAWGANRIRYPGSPRRKRMetLAVGASVAQLTHTFRLGSAVARLKLKGIDGGPHKRWSMetWFNSKQRAEPYQPLTSTGAAWLSSARVVRCWVKSRNERNPRPLPAWALGDCRAGGRWGRQVLMetALTGWATHVLQWWSVGSEHASVSSPPSQFGCTLQLECRSWNRSRISMetPRIRSRALYTPPVTPWELVLPEGACAGDHGRVSDWGEVVTRPGNLRLDHLLS
+>NZ_GG739926_647533195
+WEFDPGSGTLATGLTHASRGTGARVSNAYPTFPRPRDNLPKGRLIPYVQSRSRMGMRPISLLAGQRPTKASIGRGSERKAPHTGTETRSRLLREAAVRNIGQWAEATSQVACRTTAYGLTAFMRGYAGTAIRTGFRASSRGNTEGPGVIRIYWVRERRPPCKRAVKSSGPTAALRRELLGLSAPEAGGIRGVAVKCLDITKNPDCEGSPLWRLTLRLEGAGIEQDIPWSARTMDTRCPALGGQAKALSIPPGEYAGNGETQRNRGPAQAEEHVVFDDTRGTLPGLELRCCMVVVSSCREVSAQVPRAQPLSAVAIGRALCGHCRRKVEEGGDDVKSARPLRPGPHTCYNGRQRAVRAQVRVNPLRSQFGWGLQPDPRSWIRSRISHGAVNTFPGLVHTARQAMKAGGASPCRPRAKPVIGAKSQGSRTGRCGWNTSF
+>NZ_ACIZ01000148_643886127
+NMEFDPGSGTLAACLIHASRTSGGRVSNTWVTCPVGDNIWKQMLIPHKESRFWMDPRRISLVRRLTKAMIRSRTERLIGHIGTETRPKLLREAAVGNLPQWTQVWSNAAVKKAFGSNSVVGEDDGIQPESHGLRASSRGNTVASVIRIYWASERRRFFKSDVKALGLTEEVHRKLGNLSAEEDSGTPCVAVKCVDIWKNTSGEGGCLVLTLRLESMGSEQDIPWSMPTMNARCWSFSAAANALSIPPGEYDRKVETQRNRGPAQAVEHVVFEATRRTLPGLDIDRWCMVVVSSCREMLGVPQRAQPLLVASMGTLVRLPVTNRRKVGMTSNHHAPYDLGYTRATMDGNELRDREVKLISSILSSDVGCNSPTEVGIASNRGSARRGEYVPGPCTHRPSHHESLHPKPVRSEPSKVGQMIRVKSQGSRRRTCGWITS"""
+test_db_prot = test_db_prot.splitlines()
+
+test_db_dna = """>NZ_GG770509_647533119
+UACUUGGAGUUUGAUCCUGGCUCAGAACGAACGCUGGCGGCAGGCUUAACACAUGCAAGUCGAGCGAGCGGCAGACGGGUGAGUAACGCGUGGGAACGUACCAUUUGCUACGGAAUAACUCAGGGAAACUUGUGCUAAUACCGUAUGUGGAAAGUCGGCAAAUGAUCGGCCCGCGUUGGAUUAGCUAGUUGGUGGGGUAAAGGCUCACCAAGGCGACGAUCCAUAGCUGGUCUGAGAGGAUGAUCAGCCACACUGGGACUGAGACACGGCCCAGACUCCUACGGGAGGCAGCAGUGGGGAAUAUUGGACAAUGGGCGCAAGCCUGAUCCAGCCAUGCCGCGUGAGUGAUGAAGGCCCUAGGGUUGUAAAGCUCUUUCACCGGUGAAGAUGACGGUAACCGGAGAAGAAGCCCCGGCUAACUUCGUGCCAGCAGCCGCGGUAAUACGAAGGGGGCUAGCGUUGUUCGGAUUUACUGGGCGUAAAGCGCACGUA [...]
+>NZ_GG739926_647533195
+UAAUGGGAGUUUGAUCCUGGCUCAGGAUGAACGCUGGCUACAGGCUUAACACAUGCAAGUCGAGGGACCGGCGCACGGGUGAGUAACGCGUAUCCAACCUUCCCGCGACCAAGGGAUAACCUGCCGAAAGGCAGACUAAUACCUUAUGUCCAAAGUCGGUCACGGAUGGGGAUGCGUCCGAUUAGCUUGUUGGCGGGGCAACGGCCCACCAAGGCAUCGAUCGGUAGGGGUUCUGAGAGGAAGGCCCCCCACACUGGAACUGAGACACGGUCCAGACUCCUACGGGAGGCAGCAGUGAGGAAUAUUGGUCAAUGGGCGGAAGCCUGAACCAGCCAAGUAGCGUGCAGGACGACGGCCUACGGGUUGUAAACUGCUUUUAUGCGGGGAUAUGCAGGUACCGCAUGAAUAAGGACCGGCUAAUUCCGUGCCAGCAGCCGCGGUAAUACGGAAGGUCCGGGCGUUAUCCGGAUUUAUUGGGUUUAAAGGGAGCGC [...]
+>NZ_ACIZ01000148_643886127
+AAUAUGGAGUUUGAUCCUGGCUCAGGAUGAACGCUGGCGGCGUGCCUAAUACAUGCAAGUCGAACGAGUGGCGGACGGGUGAGUAACACGUGGGUAACCUGCCCUUAAGUGGGGGAUAACAUUUGGAAACAGAUGCUAAUACCGCAUAAAGAAAGUCGCUUUUGGAUGGACCCGCGGCGUAUUAGCUAGUUGGUGAGGUAACGGCUCACCAAGGCAAUGAUACGUAGCCGAACUGAGAGGUUGAUCGGCCACAUUGGGACUGAGACACGGCCCAAACUCCUACGGGAGGCAGCAGUAGGGAAUCUUCCACAAUGGACGCAAGUCUGAUGGAGCAACGCCGCGUGAGUGAAGAAGGCUUUCGGGUCGUAAAACUCUGUUGUUGGAGAAGAUGACGGUAUCCAACCAGAAAGCCACGGCUAACUACGUGCCAGCAGCCGCGGUAAUACGUAGGUGGCAAGCGUUAUCCGGAUUUAUUGGGCGUAAAGCGAGCGC [...]
+test_db_dna = test_db_dna.splitlines()
+
+test_query = """>NZ_GG770509_647533119
+UACUUGGAGUUUGAUCCUGGCUCAGAACGAACGCUGGCGGCAGGCUUAACACAUGCAAGUCGAGCGAGCGGCAGACGGGUGAGUAACGCGUGGGAACGUACCAUUUGCUACGGAAUAACUCAGGGAAACUUGUGCUAAUACCGUAUGUGGAAAGUCGGCAAAUGAUCGGCCCGCGUUGGAUUAGCUAGUUGGUGGGGUAAAGGCUCACCAAGGCGACGAUCCAUAGCUGGUCUGAGAGGAUGAUCAGCCACACUGGGACUGAGACACGGCCCAGACUCCUACGGGAGGCAGCAGUGGGGAAUAUUGGACAAUGGGCGCAAGCCUGAUCCAGCCAUGCCGCGUGAGUGAUGAAGGCCCUAGGGUUGUAAAGCUCUUUCACCGGUGAAGAUGACGGUAACCGGAGAAGAAGCCCCGGCUAACUUCGUGCCAGCAGCCGCGGUAAUACGAAGGGGGCUAGCGUUGUUCGGAUUUACUGGGCGUAAAGCGCACGUA [...]
+>NZ_GG739926_647533195
+UAAUGGGAGUUUGAUCCUGGCUCAGGAUGAACGCUGGCUACAGGCUUAACACAUGCAAGUCGAGGGACCGGCGCACGGGUGAGUAACGCGUAUCCAACCUUCCCGCGACCAAGGGAUAACCUGCCGAAAGGCAGACUAAUACCUUAUGUCCAAAGUCGGUCACGGAUGGGGAUGCGUCCGAUUAGCUUGUUGGCGGGGCAACGGCCCACCAAGGCAUCGAUCGGUAGGGGUUCUGAGAGGAAGGCCCCCCACACUGGAACUGAGACACGGUCCAGACUCCUACGGGAGGCAGCAGUGAGGAAUAUUGGUCAAUGGGCGGAAGCCUGAACCAGCCAAGUAGCGUGCAGGACGACGGCCUACGGGUUGUAAACUGCUUUUAUGCGGGGAUAUGCAGGUACCGCAUGAAUAAGGACCGGCUAAUUCCGUGCCAGCAGCCGCGGUAAUACGGAAGGUCCGGGCGUUAUCCGGAUUUAUUGGGUUUAAAGGGAGCGC [...]
+>NZ_ACIZ01000148_643886127
+AAUAUGGAGUUUGAUCCUGGCUCAGGAUGAACGCUGGCGGCGUGCCUAAUACAUGCAAGUCGAACGAGUGGCGGACGGGUGAGUAACACGUGGGUAACCUGCCCUUAAGUGGGGGAUAACAUUUGGAAACAGAUGCUAAUACCGCAUAAAGAAAGUCGCUUUUGGAUGGACCCGCGGCGUAUUAGCUAGUUGGUGAGGUAACGGCUCACCAAGGCAAUGAUACGUAGCCGAACUGAGAGGUUGAUCGGCCACAUUGGGACUGAGACACGGCCCAAACUCCUACGGGAGGCAGCAGUAGGGAAUCUUCCACAAUGGACGCAAGUCUGAUGGAGCAACGCCGCGUGAGUGAAGAAGGCUUUCGGGUCGUAAAACUCUGUUGUUGGAGAAGAUGACGGUAUCCAACCAGAAAGCCACGGCUAACUACGUGCCAGCAGCCGCGGUAAUACGUAGGUGGCAAGCGUUAUCCGGAUUUAUUGGGCGUAAAGCGAGCGC [...]
+test_query = test_query.splitlines()
+
+if __name__ == '__main__':
+ main()
diff --git a/bfillings/tests/test_bwa.py b/bfillings/tests/test_bwa.py
new file mode 100755
index 0000000..91d495e
--- /dev/null
+++ b/bfillings/tests/test_bwa.py
@@ -0,0 +1,319 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+
+from unittest import TestCase, main
+from os.path import exists
+from os import remove
+from tempfile import mkstemp
+
+from bfillings.bwa import (BWA_index, BWA_aln, BWA_samse, BWA_sampe, BWA_bwasw,
+ create_bwa_index_from_fasta_file,
+ assign_reads_to_database,
+ InvalidArgumentApplicationError,
+ MissingRequiredArgumentApplicationError)
+
+
+class BWAtests(TestCase):
+
+ """Tests for the BWA app controller
+ """
+
+ # keeps track of which files are created during the tests so that they
+ # can be removed during tearDown
+ files_to_remove = []
+
+ def setUp(self):
+ """Performs setup for the tests.
+
+ Nothing to set up for these tests.
+ """
+
+ pass
+
+ def tearDown(self):
+ """Properly and politely terminates the test.
+
+ Removes files created during the tests.
+ """
+
+ for f in self.files_to_remove:
+ if exists(f):
+ remove(f)
+
+ def test_check_arguments(self):
+ """Tests the "check_arguments" method of the BWA base class.
+
+ Arguments passed to certain parameters of the various subcommands can
+ take only certain values. The check_arguments function enforces these
+ constraints. This function ensures that the rules are being enforced
+ as expected.
+ """
+
+ # set up test parameters
+ # should pass
+ index_params_is = {'-a': 'is'}
+ # should pass
+ index_params_bwtsw = {'-a': 'bwtsw'}
+ # should fail, -a must be one of "is" or "bwtsw"
+ index_params_invalid = {'-a': 'invalid'}
+ # should fail, -p must specify a prefix that is an absolute path
+ index_params_invalid_prefix = {'-p': 'invalid'}
+ # should pass
+ index_params_valid_prefix = {'-p': '/prefix'}
+
+ # instantiate objects built from the above parameters
+ index_is = BWA_index(params=index_params_is, HALT_EXEC=True)
+ index_bwtsw = BWA_index(params=index_params_bwtsw, HALT_EXEC=True)
+ index_invalid = BWA_index(params=index_params_invalid, HALT_EXEC=True)
+ index_invalid_prefix = BWA_index(params=index_params_invalid_prefix,
+ HALT_EXEC=True)
+ index_valid_prefix = BWA_index(params=index_params_valid_prefix,
+ HALT_EXEC=True)
+
+ # Should not be allowed
+ self.assertRaises(InvalidArgumentApplicationError,
+ index_invalid.check_arguments)
+ self.assertRaises(InvalidArgumentApplicationError,
+ index_invalid_prefix.check_arguments)
+
+ # Should execute and not raise any exceptions
+ index_is.check_arguments()
+ index_bwtsw.check_arguments()
+ index_valid_prefix.check_arguments()
+
+ # The rest of the _valid_arguments are for checking is_int and is_float
+ # and they all use the same function from the base-class, so testing
+ # just one of the subcommands should suffice
+
+ # -n must be a float (expressed either as a float or as a string)
+ # -o must be an int (expressed either as an int or as a string)
+ # pass, both valid
+ aln_params_valid = {'-n': 3.0, '-o': 5, '-f': '/sai_out'}
+ # fail, second invalid
+ aln_params_invalid1 = {'-n': 3.0, '-o': 'nope', '-f': '/sai_out'}
+ # fail, first invalid
+ aln_params_invalid2 = {'-n': '3.5.1', '-o': 4, '-f': '/sai_out'}
+ # fail, did not specify -f
+ aln_params_invalid3 = {'-n': 3.0, '-o': 5}
+
+ # instantiate objects
+ aln_valid = BWA_aln(params=aln_params_valid, HALT_EXEC=True)
+ aln_invalid1 = BWA_aln(params=aln_params_invalid1, HALT_EXEC=True)
+ aln_invalid2 = BWA_aln(params=aln_params_invalid2, HALT_EXEC=True)
+ aln_invalid3 = BWA_aln(params=aln_params_invalid3, HALT_EXEC=True)
+
+ test_paths = {'prefix': '/fa_in', 'fastq_in': '/fq_in'}
+
+ # Should Halt Exec (AssertionError) right before execution
+ self.assertRaisesRegexp(AssertionError, 'Halted exec', aln_valid,
+ test_paths)
+ # also need to make sure the base command is correct
+ self.assertIn('; bwa aln -f /sai_out -n 3.0 -o 5 /fa_in /fq_in',
+ aln_valid.BaseCommand)
+
+ # Should fail
+ self.assertRaises(InvalidArgumentApplicationError, aln_invalid1,
+ test_paths)
+
+ self.assertRaises(InvalidArgumentApplicationError, aln_invalid2,
+ test_paths)
+
+ self.assertRaises(InvalidArgumentApplicationError, aln_invalid3,
+ test_paths)
+
+ def test_input_as_dict(self):
+ """Tests the input handler (_input_as_dict)
+
+ The input handler should throw exceptions if there are not enough
+ arguments, or if there are unrecognized arguments, or if a file path
+ appears to be a relative filepath.
+ """
+
+ # Arguments for BWA_bwasw, which was chosen since it is the only one
+ # that also has an optional argument (optional arguments are denoted
+ # by a leading underscore)
+ missing = {'prefix': '/fa_in', '_query_fasta_2': '/mate'}
+ extra = {'prefix': '/fa_in', 'query_fasta': '/query_fasta',
+ 'extra': '/param'}
+ rel_fp = {'prefix': 'fa_in', 'query_fasta': '/query_fasta'}
+ valid = {'prefix': '/fa_in', 'query_fasta': '/query_fasta'}
+ valid_with_mate = {'prefix': '/fa_in', 'query_fasta': '/query_fasta',
+ '_query_fasta_2': '/mate'}
+
+ # instantiate the object
+ bwasw = BWA_bwasw(params={'-f': '/sam_out'}, HALT_EXEC=True)
+
+ # should raise ApplicationError for wrong I/O files; failure
+ self.assertRaises(MissingRequiredArgumentApplicationError, bwasw,
+ missing)
+ self.assertRaises(InvalidArgumentApplicationError, bwasw, extra)
+ self.assertRaises(InvalidArgumentApplicationError, bwasw, rel_fp)
+
+ # should raise AssertionError (Halt Exec); success
+ # tests valid arguments with and without the optional
+ # _query_fasta_2 argument
+ self.assertRaisesRegexp(AssertionError, 'Halted exec', bwasw, valid)
+ self.assertRaisesRegexp(AssertionError, 'Halted exec', bwasw,
+ valid_with_mate)
+
+ def test_get_base_command(self):
+ """Tests the function that generates the command string.
+
+ Tests whether an object can be instantiated and then called using
+ one set of files, and then another set of files.
+
+    Since the structure of the various subclasses is consistent, testing
+ that the correct command is generated by one of the subclasses should
+ suffice here.
+ """
+
+ # instantiate one instance
+ aln = BWA_aln(params={'-n': 1.0, '-f': '/sai_out'}, HALT_EXEC=True)
+
+ # set up two different sets of files
+ first_files = {'prefix': '/fa_in1', 'fastq_in': '/fq_in1'}
+ second_files = {'prefix': '/fa_in2', 'fastq_in': '/fq_in2'}
+
+ # make sure both sets run, and that the command appears to be correct
+ self.assertRaisesRegexp(AssertionError,
+ 'Halted exec', aln, first_files)
+ self.assertIn('; bwa aln -f /sai_out -n 1.0 /fa_in1 /fq_in1',
+ aln.BaseCommand)
+
+ self.assertRaisesRegexp(AssertionError, 'Halted exec', aln,
+ second_files)
+ self.assertIn('; bwa aln -f /sai_out -n 1.0 /fa_in2 /fq_in2',
+ aln.BaseCommand)
+
+        # instantiate another object, to test that there is no cross-talk
+        # between instances with the same base class
+ aln2 = BWA_aln(params={'-n': 2.5, '-o': 7, '-f': '/sai_out'},
+ HALT_EXEC=True)
+
+ self.assertRaisesRegexp(AssertionError, 'Halted exec', aln2,
+ first_files)
+ self.assertIn('; bwa aln -f /sai_out -n 2.5 -o 7 /fa_in1 /fq_in1',
+ aln2.BaseCommand)
+
+ def test_get_result_paths(self):
+ """Tests the function that retrieves the result paths.
+
+ aln, sampe, samse, bwasw return only one file.
+        BWA_index returns 5 files, whose names depend on whether or not the
+        -p option is set
+ """
+
+ # instantiate objects
+ index = BWA_index(params={}, HALT_EXEC=True)
+ index2 = BWA_index(params={'-p': '/prefix'}, HALT_EXEC=True)
+ aln = BWA_aln(params={'-f': '/sai_out'}, HALT_EXEC=True)
+ samse = BWA_samse(params={'-f': '/sam_out'}, HALT_EXEC=True)
+ sampe = BWA_sampe(params={'-f': '/sam_out'}, HALT_EXEC=True)
+ bwasw = BWA_bwasw(params={'-f': '/sam_out'}, HALT_EXEC=True)
+
+ # pass in the data, and make sure the output paths are as expected.
+ # -p is off here
+ index_data = {'fasta_in': '/fa_in'}
+ results = index._get_result_paths(index_data)
+ self.assertEqual(results['.amb'].Path, '/fa_in.amb')
+ self.assertEqual(results['.ann'].Path, '/fa_in.ann')
+ self.assertEqual(results['.bwt'].Path, '/fa_in.bwt')
+ self.assertEqual(results['.pac'].Path, '/fa_in.pac')
+ self.assertEqual(results['.sa'].Path, '/fa_in.sa')
+
+ # pass in the data, and make sure the output paths are as expected.
+ # -p is on here
+ results = index2._get_result_paths(index_data)
+ self.assertEqual(results['.amb'].Path, '/prefix.amb')
+ self.assertEqual(results['.ann'].Path, '/prefix.ann')
+ self.assertEqual(results['.bwt'].Path, '/prefix.bwt')
+ self.assertEqual(results['.pac'].Path, '/prefix.pac')
+ self.assertEqual(results['.sa'].Path, '/prefix.sa')
+
+ # pass in the data, and make sure the output path is as expected
+ aln_data = {'prefix': '/fa_in', 'fastq_in': '/fq_in'}
+ results = aln._get_result_paths(aln_data)
+ self.assertEqual(results['output'].Path, '/sai_out')
+
+ samse_data = {'prefix': '/fa_in', 'sai_in': '/sai_in',
+ 'fastq_in': '/fq_in'}
+ results = samse._get_result_paths(samse_data)
+ self.assertEqual(results['output'].Path, '/sam_out')
+
+ sampe_data = {'prefix': '/fa_in', 'sai1_in': '/sai1_in',
+ 'sai2_in': '/sai2_in', 'fastq1_in': '/fq1_in',
+ 'fastq2_in': '/fq2_in'}
+ results = sampe._get_result_paths(sampe_data)
+ self.assertEqual(results['output'].Path, '/sam_out')
+
+ def test_create_bwa_index_from_fasta_file(self):
+ """Test create_bwa_index_from_fasta_file
+
+ Makes sure that the file paths are as expected.
+ """
+
+ # get a new temp file for the input fasta
+ _, fasta_in = mkstemp(suffix=".fna")
+ # write the test fasta (see end of this file) to the temp file
+ fasta = open(fasta_in, 'w')
+ fasta.write(test_fasta)
+ fasta.close()
+
+ # make sure to remove this fasta file upon tearDown
+ self.files_to_remove.append(fasta_in)
+
+ # run the function
+ results = create_bwa_index_from_fasta_file(fasta_in, {})
+
+ # for each of the 5 output files (not counting stdout, stderr, and
+        # the exitStatus), make sure the file paths are as expected.
+ for filetype, result in results.iteritems():
+            # note the trailing comma: ('ExitStatus',) is a tuple; without it
+            # the parenthesized string would make this a substring test
+            if filetype not in ('ExitStatus',):
+ # be sure to remove these 5 files
+ self.files_to_remove.append(result.name)
+ if filetype not in ('StdOut', 'ExitStatus', 'StdErr'):
+ self.assertEqual(fasta_in + filetype, result.name)
+
+ def test_assign_reads_to_database(self):
+ """Tests for proper failure in assign_reads_to_database
+ """
+
+ # sets of params that should cause failure
+ no_alg = {}
+ wrong_alg = {'algorithm': 'not_an_algorithm'}
+ no_aln_params = {'algorithm': 'bwa-short'}
+
+ # dummy files -- checking for failure as expected, so the function
+ # won't get as far as actually running the program
+ database = '/db'
+ query = '/query'
+ out = '/sam'
+
+ self.assertRaises(InvalidArgumentApplicationError,
+ assign_reads_to_database, query, database,
+ out, no_alg)
+
+ self.assertRaises(InvalidArgumentApplicationError,
+ assign_reads_to_database, query, database, out,
+ wrong_alg)
+
+ self.assertRaises(InvalidArgumentApplicationError,
+ assign_reads_to_database, query, database, out,
+ no_aln_params)
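+        # For contrast, a params dict this function would accept looks
+        # roughly like (values hypothetical):
+        #   {'algorithm': 'bwa-short', 'aln_params': {'-n': 1}}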
+
+test_fasta = '''>NZ_GG770509_647533119
+UACUUGGAGUUUGAUCCUGGCUCAGAACGAACGCUGGCGGCAGGCUUAACACAUGCAAGUCGAGCGAGCGGCAGACGGGUGAGUAACGCGUGGGAACGUACCAUUUGCUACGGAAUAACUCAGGGAAACUUGUGCUAAUACCGUAUGUGGAAAGUCGGCAAAUGAUCGGCCCGCGUUGGAUUAGCUAGUUGGUGGGGUAAAGGCUCACCAAGGCGACGAUCCAUAGCUGGUCUGAGAGGAUGAUCAGCCACACUGGGACUGAGACACGGCCCAGACUCCUACGGGAGGCAGCAGUGGGGAAUAUUGGACAAUGGGCGCAAGCCUGAUCCAGCCAUGCCGCGUGAGUGAUGAAGGCCCUAGGGUUGUAAAGCUCUUUCACCGGUGAAGAUGACGGUAACCGGAGAAGAAGCCCCGGCUAACUUCGUGCCAGCAGCCGCGGUAAUACGAAGGGGGCUAGCGUUGUUCGGAUUUACUGGGCGUAAAGCGCACGUA [...]
+>NZ_GG739926_647533195
+UAAUGGGAGUUUGAUCCUGGCUCAGGAUGAACGCUGGCUACAGGCUUAACACAUGCAAGUCGAGGGACCGGCGCACGGGUGAGUAACGCGUAUCCAACCUUCCCGCGACCAAGGGAUAACCUGCCGAAAGGCAGACUAAUACCUUAUGUCCAAAGUCGGUCACGGAUGGGGAUGCGUCCGAUUAGCUUGUUGGCGGGGCAACGGCCCACCAAGGCAUCGAUCGGUAGGGGUUCUGAGAGGAAGGCCCCCCACACUGGAACUGAGACACGGUCCAGACUCCUACGGGAGGCAGCAGUGAGGAAUAUUGGUCAAUGGGCGGAAGCCUGAACCAGCCAAGUAGCGUGCAGGACGACGGCCUACGGGUUGUAAACUGCUUUUAUGCGGGGAUAUGCAGGUACCGCAUGAAUAAGGACCGGCUAAUUCCGUGCCAGCAGCCGCGGUAAUACGGAAGGUCCGGGCGUUAUCCGGAUUUAUUGGGUUUAAAGGGAGCGC [...]
+>NZ_ACIZ01000148_643886127
+AAUAUGGAGUUUGAUCCUGGCUCAGGAUGAACGCUGGCGGCGUGCCUAAUACAUGCAAGUCGAACGAGUGGCGGACGGGUGAGUAACACGUGGGUAACCUGCCCUUAAGUGGGGGAUAACAUUUGGAAACAGAUGCUAAUACCGCAUAAAGAAAGUCGCUUUUGGAUGGACCCGCGGCGUAUUAGCUAGUUGGUGAGGUAACGGCUCACCAAGGCAAUGAUACGUAGCCGAACUGAGAGGUUGAUCGGCCACAUUGGGACUGAGACACGGCCCAAACUCCUACGGGAGGCAGCAGUAGGGAAUCUUCCACAAUGGACGCAAGUCUGAUGGAGCAACGCCGCGUGAGUGAAGAAGGCUUUCGGGUCGUAAAACUCUGUUGUUGGAGAAGAUGACGGUAUCCAACCAGAAAGCCACGGCUAACUACGUGCCAGCAGCCGCGGUAAUACGUAGGUGGCAAGCGUUAUCCGGAUUUAUUGGGCGUAAAGCGAGCGC [...]
+
+if __name__ == "__main__":
+ main()
diff --git a/bfillings/tests/test_cd_hit.py b/bfillings/tests/test_cd_hit.py
new file mode 100644
index 0000000..68753a0
--- /dev/null
+++ b/bfillings/tests/test_cd_hit.py
@@ -0,0 +1,214 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+
+from os import getcwd, rmdir
+from unittest import TestCase, main
+
+from cogent.core.moltype import PROTEIN, DNA
+
+from bfillings.cd_hit import (CD_HIT, CD_HIT_EST, cdhit_from_seqs,
+ cdhit_clusters_from_seqs, clean_cluster_seq_id,
+ parse_cdhit_clstr_file)
+
+
+class CD_HIT_Tests(TestCase):
+ """Tests for the CD-HIT application controller"""
+
+ def test_base_command(self):
+ """CD_HIT BaseCommand should return the correct BaseCommand"""
+ c = CD_HIT()
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "',getcwd(),'/"; ','cd-hit']))
+ c.Parameters['-i'].on('seq.txt')
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "',getcwd(),'/"; ','cd-hit -i "seq.txt"']))
+ c.Parameters['-c'].on(0.8)
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "',getcwd(),'/"; ','cd-hit -c 0.8' +
+ ' -i "seq.txt"']))
+
+ def test_changing_working_dir(self):
+ """CD_HIT BaseCommand should change according to WorkingDir"""
+ c = CD_HIT(WorkingDir='/tmp/cdhit_test')
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "','/tmp/cdhit_test','/"; ','cd-hit']))
+ c = CD_HIT()
+ c.WorkingDir = '/tmp/cdhit_test2'
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "','/tmp/cdhit_test2','/"; ','cd-hit']))
+
+        # removing the dirs is proof that they were created when WorkingDir
+        # was set; if the dirs are not there, rmdir raises an OSError
+ rmdir('/tmp/cdhit_test')
+ rmdir('/tmp/cdhit_test2')
+
+ def test_cdhit_from_seqs(self):
+ """CD_HIT should return expected seqs"""
+ res = cdhit_from_seqs(protein_seqs, PROTEIN, {'-c':0.8})
+ self.assertEqual(res.toFasta(), protein_expected)
+
+class CD_HIT_EST_Tests(TestCase):
+ """Tests for the CD-HIT application controller"""
+
+ def test_base_command(self):
+ """CD_HIT_EST BaseCommand should return the correct BaseCommand"""
+ c = CD_HIT_EST()
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "',getcwd(),'/"; ','cd-hit-est']))
+ c.Parameters['-i'].on('seq.txt')
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "',getcwd(),'/"; ','cd-hit-est -i "seq.txt"']))
+ c.Parameters['-c'].on(0.8)
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "',getcwd(),'/"; ','cd-hit-est -c 0.8' +
+ ' -i "seq.txt"']))
+
+ def test_changing_working_dir(self):
+ """CD_HIT_EST BaseCommand should change according to WorkingDir"""
+ c = CD_HIT_EST(WorkingDir='/tmp/cdhitest_test')
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "','/tmp/cdhitest_test','/"; ','cd-hit-est']))
+ c = CD_HIT_EST()
+ c.WorkingDir = '/tmp/cdhitest_test2'
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "','/tmp/cdhitest_test2','/"; ','cd-hit-est']))
+
+        # removing the dirs is proof that they were created when WorkingDir
+        # was set; if the dirs are not there, rmdir raises an OSError
+ rmdir('/tmp/cdhitest_test')
+ rmdir('/tmp/cdhitest_test2')
+
+ def test_cdhit_from_seqs(self):
+ """CD_HIT should return expected seqs"""
+ res = cdhit_from_seqs(dna_seqs, DNA, {'-c':0.8})
+ self.assertEqual(res.toFasta(), dna_expected)
+
+ def test_cdhit_from_seqs_synonym(self):
+ """CD_HIT should return expected seqs with -c synonym"""
+ res = cdhit_from_seqs(dna_seqs, DNA, {'Similarity':0.8})
+ self.assertEqual(res.toFasta(), dna_expected)
+
+
+class CD_HIT_SupportMethodTests(TestCase):
+ """Tests for supporting methods"""
+ def test_clean_cluster_seq_id(self):
+ """clean_cluster_seq_id returns a cleaned sequence id"""
+ data = ">foobar..."
+ exp = "foobar"
+ obs = clean_cluster_seq_id(data)
+ self.assertEqual(obs, exp)
+
+ def test_parse_cdhit_clstr_file(self):
+ """parse_cdhit_clstr_file returns the correct clusters"""
+ data = cdhit_clstr_file.split('\n')
+ exp = [['seq0'],['seq1','seq10','seq3','seq23','seq145'],\
+ ['seq7','seq17','seq69','seq1231']]
+ obs = parse_cdhit_clstr_file(data)
+ self.assertEqual(obs, exp)
+
+ def test_cdhit_clusters_from_seqs(self):
+ """cdhit_clusters_from_seqs returns expected clusters"""
+ exp = [['cdhit_test_seqs_0'],['cdhit_test_seqs_1'],\
+ ['cdhit_test_seqs_2'],['cdhit_test_seqs_3'],\
+ ['cdhit_test_seqs_4'],['cdhit_test_seqs_5'],\
+ ['cdhit_test_seqs_6','cdhit_test_seqs_8'],\
+ ['cdhit_test_seqs_7'],['cdhit_test_seqs_9']]
+ obs = cdhit_clusters_from_seqs(dna_seqs, DNA)
+ self.assertEqual(obs, exp)
+
+dna_seqs = """>cdhit_test_seqs_0
+AACCCCCACGGTGGATGCCACACGCCCCATACAAAGGGTAGGATGCTTAAGACACATCGCGTCAGGTTTGTGTCAGGCCT
+>cdhit_test_seqs_1
+ACCCACACGGTGGATGCAACAGATCCCATACACCGAGTTGGATGCTTAAGACGCATCGCGTGAGTTTTGCGTCAAGGCT
+>cdhit_test_seqs_2
+CCCCCACGGTGGCAGCAACACGTCACATACAACGGGTTGGATTCTAAAGACAAACCGCGTCAAAGTTGTGTCAGAACT
+>cdhit_test_seqs_3
+CCCCACGGTAGCTGCAACACGTCCCATACCACGGGTAGGATGCTAAAGACACATCGGGTCTGTTTTGTGTCAGGGCT
+>cdhit_test_seqs_4
+GCCACGGTGGGTACAACACGTCCACTACATCGGCTTGGAAGGTAAAGACACGTCGCGTCAGTATTGCGTCAGGGCT
+>cdhit_test_seqs_5
+CCGCGGTAGGTGCAACACGTCCCATACAACGGGTTGGAAGGTTAAGACACAACGCGTTAATTTTGTGTCAGGGCA
+>cdhit_test_seqs_6
+CGCGGTGGCTGCAAGACGTCCCATACAACGGGTTGGATGCTTAAGACACATCGCAACAGTTTTGAGTCAGGGCT
+>cdhit_test_seqs_7
+ACGGTGGCTACAAGACGTCCCATCCAACGGGTTGGATACTTAAGGCACATCACGTCAGTTTTGTGTCAGAGCT
+>cdhit_test_seqs_8
+CGGTGGCTGCAACACGTGGCATACAACGGGTTGGATGCTTAAGACACATCGCCTCAGTTTTGTGTCAGGGCT
+>cdhit_test_seqs_9
+GGTGGCTGAAACACATCCCATACAACGGGTTGGATGCTTAAGACACATCGCATCAGTTTTATGTCAGGGGA"""
+
+dna_expected = """>cdhit_test_seqs_0
+AACCCCCACGGTGGATGCCACACGCCCCATACAAAGGGTAGGATGCTTAAGACACATCGCGTCAGGTTTGTGTCAGGCCT
+>cdhit_test_seqs_1
+ACCCACACGGTGGATGCAACAGATCCCATACACCGAGTTGGATGCTTAAGACGCATCGCGTGAGTTTTGCGTCAAGGCT
+>cdhit_test_seqs_2
+CCCCCACGGTGGCAGCAACACGTCACATACAACGGGTTGGATTCTAAAGACAAACCGCGTCAAAGTTGTGTCAGAACT
+>cdhit_test_seqs_4
+GCCACGGTGGGTACAACACGTCCACTACATCGGCTTGGAAGGTAAAGACACGTCGCGTCAGTATTGCGTCAGGGCT
+>cdhit_test_seqs_5
+CCGCGGTAGGTGCAACACGTCCCATACAACGGGTTGGAAGGTTAAGACACAACGCGTTAATTTTGTGTCAGGGCA
+>cdhit_test_seqs_7
+ACGGTGGCTACAAGACGTCCCATCCAACGGGTTGGATACTTAAGGCACATCACGTCAGTTTTGTGTCAGAGCT"""
+
+protein_seqs = """>seq1
+MGNKWSKSWPQVRDRMRRAAPAPAADGVGAVSQDLAKHGAITSSNTAATNDDCAWLEAQTEEEVGFPVRPQVPLRPMTYK
+>seq2
+MGGKWSKSSIVGWSTVRERMRKTPPAADGVGAVSQDLDKHGAVTSSNTAFNNPDCAWLEAQEDEDVGFPVRPQVPLRPT
+>seq3
+MGGKWSKSSIVGWPAIRERMRRARPAADRVGTQPAADGVGAVSQDLARHGAVTSSNTSHNNPDCAWLEAQEEEEVGVR
+>seq4
+MGKIWSKSSIVGWPEIRERMRRQRPHEPAVEPAVGVGAASQDLANRGALTTSNTRTNNPTVAWVEAQEEEGEVVRPQ
+>seq5
+MGKIWSKSSLVGWPEIRERMRRQTQEPAVEPAVGAGAASQDLANRGAITIRNTRDNNESIAWLEAQEEEFPVRPQV
+>seq6
+MGKIWSKSSLVGWPEIRERIRRQTPEPAVGVGAVSQDLANRGAITTSNTKDNNQTVAWLEAQEEPVRPQVPLRPM
+>seq7
+MGNALRKGKFEGWAAVRERMRRTRTFPESEPCAPGVGQISRELAARGGIPSSHTPQNNESHQEEEVGFPVAPQV
+>seq8
+MGNAWSKSKFAGWSEVRDRMRRSSSDPQQPCAPGVGAVSRELATRGGISSSALAFLDSHKDEDVGFPVRPQVP
+>seq9
+MGNVLGKDKFKGWAAVRERMRKTSSDPDPQPCAPGVGPVSRELSYTPQNNAALAFLESHEDEDVGFPVXPQV
+>seq10
+MGNVLGKDKFKGWSAVRERMRKTSPEPEPCAPGVRGGISNSHTPQNNAALAFLESHQDEDVGFPVRPQVPL"""
+
+protein_expected = """>seq1
+MGNKWSKSWPQVRDRMRRAAPAPAADGVGAVSQDLAKHGAITSSNTAATNDDCAWLEAQTEEEVGFPVRPQVPLRPMTYK
+>seq2
+MGGKWSKSSIVGWSTVRERMRKTPPAADGVGAVSQDLDKHGAVTSSNTAFNNPDCAWLEAQEDEDVGFPVRPQVPLRPT
+>seq3
+MGGKWSKSSIVGWPAIRERMRRARPAADRVGTQPAADGVGAVSQDLARHGAVTSSNTSHNNPDCAWLEAQEEEEVGVR
+>seq4
+MGKIWSKSSIVGWPEIRERMRRQRPHEPAVEPAVGVGAASQDLANRGALTTSNTRTNNPTVAWVEAQEEEGEVVRPQ
+>seq5
+MGKIWSKSSLVGWPEIRERMRRQTQEPAVEPAVGAGAASQDLANRGAITIRNTRDNNESIAWLEAQEEEFPVRPQV
+>seq7
+MGNALRKGKFEGWAAVRERMRRTRTFPESEPCAPGVGQISRELAARGGIPSSHTPQNNESHQEEEVGFPVAPQV
+>seq8
+MGNAWSKSKFAGWSEVRDRMRRSSSDPQQPCAPGVGAVSRELATRGGISSSALAFLDSHKDEDVGFPVRPQVP
+>seq9
+MGNVLGKDKFKGWAAVRERMRKTSSDPDPQPCAPGVGPVSRELSYTPQNNAALAFLESHEDEDVGFPVXPQV"""
+
+cdhit_clstr_file = """>Cluster 0
+0 2799aa, >seq0... *
+>Cluster 1
+0 2214aa, >seq1... at 80%
+1 2215aa, >seq10... at 84%
+2 2217aa, >seq3... *
+3 2216aa, >seq23... at 84%
+4 527aa, >seq145... at 63%
+>Cluster 2
+0 2202aa, >seq7... at 60%
+1 2208aa, >seq17... *
+2 2207aa, >seq69... at 73%
+3 2208aa, >seq1231... at 69%"""
+
+
+if __name__ == '__main__':
+ main()
diff --git a/bfillings/tests/test_clearcut.py b/bfillings/tests/test_clearcut.py
new file mode 100644
index 0000000..aff0ef5
--- /dev/null
+++ b/bfillings/tests/test_clearcut.py
@@ -0,0 +1,255 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+
+from os import getcwd, remove, rmdir, mkdir, path
+import tempfile
+import shutil
+from unittest import TestCase, main
+
+from cogent.core.moltype import DNA, RNA, PROTEIN
+from cogent.core.alignment import DataError
+from cogent.util.misc import flatten
+from cogent.util.dict2d import Dict2D
+
+from bfillings.clearcut import (Clearcut, build_tree_from_alignment,
+ _matrix_input_from_dict2d,
+ build_tree_from_distance_matrix)
+
+
+class GeneralSetUp(TestCase):
+
+ def setUp(self):
+ """Clearcut general setUp method for all tests"""
+ self.seqs1 = ['ACUGCUAGCUAGUAGCGUACGUA','GCUACGUAGCUAC',
+ 'GCGGCUAUUAGAUCGUA']
+
+ self.labels1 = ['>1','>2','>3']
+ self.lines1 = flatten(zip(self.labels1,self.seqs1))
+
+ self.seqs2=['UAGGCUCUGAUAUAAUAGCUCUC','UAUCGCUUCGACGAUUCUCUGAUAGAGA',
+ 'UGACUACGCAU']
+ self.labels2=['>a','>b','>c']
+ self.lines2 = flatten(zip(self.labels2,self.seqs2))
+
+ self.temp_dir = tempfile.mkdtemp()
+ #self.temp_dir_spaces = '/tmp/test for clearcut/'
+ #try:
+ # mkdir(self.temp_dir_spaces)
+ #except OSError:
+ # pass
+ try:
+ #create sequence files
+ f = open(path.join(self.temp_dir, 'seq1.txt'),'w')
+ f.write('\n'.join(self.lines1))
+ f.close()
+ g = open(path.join(self.temp_dir, 'seq2.txt'),'w')
+ g.write('\n'.join(self.lines2))
+ g.close()
+ except OSError:
+ pass
+
+
+class ClearcutTests(GeneralSetUp):
+ """Tests for the Clearcut application controller"""
+
+ def test_base_command(self):
+ """Clearcut BaseCommand should return the correct BaseCommand"""
+ c = Clearcut()
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "',getcwd(),'/"; ','clearcut -d -q']))
+ c.Parameters['--in'].on('seq.txt')
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "',getcwd(),'/"; ','clearcut -d --in="seq.txt" -q']))
+
+
+ def test_changing_working_dir(self):
+ """Clearcut BaseCommand should change according to WorkingDir"""
+ c = Clearcut(WorkingDir='/tmp/clearcut_test')
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "','/tmp/clearcut_test','/"; ','clearcut -d -q']))
+ c = Clearcut()
+ c.WorkingDir = '/tmp/clearcut_test2'
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "','/tmp/clearcut_test2','/"; ','clearcut -d -q']))
+
+        # removing the dirs is proof that they were created when WorkingDir
+        # was set; if the dirs are not there, rmdir raises an OSError
+ rmdir('/tmp/clearcut_test')
+ rmdir('/tmp/clearcut_test2')
+
+ def test_general_cleanUp(self):
+ """Last test executed: cleans up all files initially created"""
+ # remove the tempdir and contents
+ shutil.rmtree(self.temp_dir)
+ #shutil.rmtree(self.temp_dir_spaces)
+
+ def test_build_tree_from_alignment(self):
+ """Clearcut should return a tree built from the passed alignment"""
+ tree_short = build_tree_from_alignment(build_tree_seqs_short,\
+ moltype=DNA)
+ num_seqs = flatten(build_tree_seqs_short).count('>')
+ self.assertEqual(len(tree_short.tips()), num_seqs)
+
+ tree_long = build_tree_from_alignment(build_tree_seqs_long, moltype=DNA)
+ seq_names = []
+ for line in build_tree_seqs_long.split('\n'):
+ if line.startswith('>'):
+ seq_names.append(line[1:])
+
+ for node in tree_long.tips():
+ if node.Name not in seq_names:
+ self.fail()
+ #repeat with best_tree = True
+ tree_long = build_tree_from_alignment(build_tree_seqs_long,\
+ best_tree=True,\
+ moltype=DNA)
+ seq_names = []
+ for line in build_tree_seqs_long.split('\n'):
+ if line.startswith('>'):
+ seq_names.append(line[1:])
+
+ for node in tree_long.tips():
+ if node.Name not in seq_names:
+ self.fail()
+
+ #build_tree_from_alignment should raise DataError when constructing
+ # an Alignment from unaligned sequences. Clearcut only allows aligned
+ # or a distance matrix as input.
+ self.assertRaises(DataError,build_tree_from_alignment,\
+ build_tree_seqs_unaligned,DNA)
+
+ def test_matrix_input_from_dict2d(self):
+ """matrix_input_from_dict2d formats dict2d object into distance matrix
+ """
+        data = [('sample1aaaaaaa', 'sample2', 1.438),
+                ('sample2', 'sample1aaaaaaa', 1.438),
+                ('sample1aaaaaaa', 'sample3', 2.45678),
+                ('sample3', 'sample1aaaaaaa', 2.45678),
+                ('sample2', 'sample3', 2.7),
+                ('sample3', 'sample2', 2.7)]
+ data_dict2d = Dict2D(data, Pad=True, Default=0.0)
+ matrix, int_map = _matrix_input_from_dict2d(data_dict2d)
+ #of = open('temp.txt', 'w')
+ #of.write(matrix)
+ #of.close()
+ matrix = matrix.split('\n')
+ self.assertEqual(matrix[0], ' 3')
+ self.assertEqual(matrix[1], 'env_0 0.0 1.438 2.45678')
+ self.assertEqual(matrix[2], 'env_1 1.438 0.0 2.7')
+ self.assertEqual(matrix[3], 'env_2 2.45678 2.7 0.0')
+ self.assertEqual(int_map['env_1'], 'sample2')
+ self.assertEqual(int_map['env_0'], 'sample1aaaaaaa')
+ self.assertEqual(int_map['env_2'], 'sample3')
+
+ def test_build_tree_from_distance_matrix(self):
+ """build_tree_from_distance_matrix builds a tree from a dict2d
+ """
+        data = [('sample1aaaaaaa', 'sample2', 1.438),
+                ('sample2', 'sample1aaaaaaa', 1.438),
+                ('sample1aaaaaaa', 'sample3', 2.45678),
+                ('sample3', 'sample1aaaaaaa', 2.45678),
+                ('sample2', 'sample3', 2.7),
+                ('sample3', 'sample2', 2.7)]
+ data_dict2d = Dict2D(data, Pad=True, Default=0.0)
+ result = build_tree_from_distance_matrix(data_dict2d)
+ self.assertEqual(str(result), '((sample1aaaaaaa:0.59739,sample2:0.84061),sample3:1.85939);')
+
+
+align1 = ">seq_0\nACUGCUAGCUAGUAGCGUACGUA\n>seq_1\n---GCUACGUAGCUAC-------\n>seq_2\nGCGGCUAUUAGAUCGUA------"
+
+build_tree_seqs_short = """>clearcut_test_seqs_0
+AACCCCCACGGTGGATGCCACACGCCCCATACAAAGGGTAGGATGCTTAAGACACATCGCGTCAGGTTTGTGTCAGGCCT
+AGCTTTAAATCATGCCAGTG
+>clearcut_test_seqs_1
+GACCCACACGGTGGATGCAACAGATCCCATACACCGAGTTGGATGCTTAAGACGCATCGCGTGAGTTTTGCGTCAAGGCT
+TGCTTTCAATAATGCCAGTG
+>clearcut_test_seqs_2
+AACCCCCACGGTGGCAGCAACACGTCACATACAACGGGTTGGATTCTAAAGACAAACCGCGTCAAAGTTGTGTCAGAACT
+TGCTTTGAATCATGCCAGTA
+>clearcut_test_seqs_3
+AAACCCCACGGTAGCTGCAACACGTCCCATACCACGGGTAGGATGCTAAAGACACATCGGGTCTGTTTTGTGTCAGGGCT
+TGCTTTACATCATGCAAGTG
+>clearcut_test_seqs_4
+AACCGCCACGGTGGGTACAACACGTCCACTACATCGGCTTGGAAGGTAAAGACACGTCGCGTCAGTATTGCGTCAGGGCT
+TGCTTTAAATCATGCCAGTG
+>clearcut_test_seqs_5
+AACCCCCGCGGTAGGTGCAACACGTCCCATACAACGGGTTGGAAGGTTAAGACACAACGCGTTAATTTTGTGTCAGGGCA
+TGCTTTAAATCATGCCAGTT
+>clearcut_test_seqs_6
+GACCCCCGCGGTGGCTGCAAGACGTCCCATACAACGGGTTGGATGCTTAAGACACATCGCAACAGTTTTGAGTCAGGGCT
+TACTTTAGATCATGCCGGTG
+>clearcut_test_seqs_7
+AACCCCCACGGTGGCTACAAGACGTCCCATCCAACGGGTTGGATACTTAAGGCACATCACGTCAGTTTTGTGTCAGAGCT
+TGCTTTAAATCATGCCAGTG
+>clearcut_test_seqs_8
+AACCCCCACGGTGGCTGCAACACGTGGCATACAACGGGTTGGATGCTTAAGACACATCGCCTCAGTTTTGTGTCAGGGCT
+TGCATTAAATCATGCCAGTG
+>clearcut_test_seqs_9
+AAGCCCCACGGTGGCTGAAACACATCCCATACAACGGGTTGGATGCTTAAGACACATCGCATCAGTTTTATGTCAGGGGA
+TGCTTTAAATCCTGACAGCG
+"""
+
+build_tree_seqs_long = """>clearcut_test_seqs_0
+AACCCCCACGGTGGATGCCACACGCCCCATACAAAGGGTAGGATGCTTAAGACACATCGCGTCAGGTTTGTGTCAGGCCT
+AGCTTTAAATCATGCCAGTG
+>clearcut_test_seqsaaaaaaaa_1
+GACCCACACGGTGGATGCAACAGATCCCATACACCGAGTTGGATGCTTAAGACGCATCGCGTGAGTTTTGCGTCAAGGCT
+TGCTTTCAATAATGCCAGTG
+>clearcut_test_seqsaaaaaaaa_2
+AACCCCCACGGTGGCAGCAACACGTCACATACAACGGGTTGGATTCTAAAGACAAACCGCGTCAAAGTTGTGTCAGAACT
+TGCTTTGAATCATGCCAGTA
+>clearcut_test_seqsaaaaaaaa_3
+AAACCCCACGGTAGCTGCAACACGTCCCATACCACGGGTAGGATGCTAAAGACACATCGGGTCTGTTTTGTGTCAGGGCT
+TGCTTTACATCATGCAAGTG
+>clearcut_test_seqsaaaaaaaa_4
+AACCGCCACGGTGGGTACAACACGTCCACTACATCGGCTTGGAAGGTAAAGACACGTCGCGTCAGTATTGCGTCAGGGCT
+TGCTTTAAATCATGCCAGTG
+>clearcut_test_seqsaaaaaaaa_5
+AACCCCCGCGGTAGGTGCAACACGTCCCATACAACGGGTTGGAAGGTTAAGACACAACGCGTTAATTTTGTGTCAGGGCA
+TGCTTTAAATCATGCCAGTT
+>clearcut_test_seqsaaaaaaaa_6
+GACCCCCGCGGTGGCTGCAAGACGTCCCATACAACGGGTTGGATGCTTAAGACACATCGCAACAGTTTTGAGTCAGGGCT
+TACTTTAGATCATGCCGGTG
+>clearcut_test_seqsaaaaaaaa_7
+AACCCCCACGGTGGCTACAAGACGTCCCATCCAACGGGTTGGATACTTAAGGCACATCACGTCAGTTTTGTGTCAGAGCT
+TGCTTTAAATCATGCCAGTG
+>clearcut_test_seqsaaaaaaaa_8
+AACCCCCACGGTGGCTGCAACACGTGGCATACAACGGGTTGGATGCTTAAGACACATCGCCTCAGTTTTGTGTCAGGGCT
+TGCATTAAATCATGCCAGTG
+>clearcut_test_seqsaaaaaaaa_9
+AAGCCCCACGGTGGCTGAAACACATCCCATACAACGGGTTGGATGCTTAAGACACATCGCATCAGTTTTATGTCAGGGGA
+TGCTTTAAATCCTGACAGCG
+"""
+
+#Unaligned seqs. First two sequences are 3 nucleotides shorter.
+build_tree_seqs_unaligned = """>clearcut_test_seqs_0
+CCCCACGGTGGATGCCACACGCCCCATACAAAGGGTAGGATGCTTAAGACACATCGCGTCAGGTTTGTGTCAGGCCT
+AGCTTTAAATCATGCCAGTG
+>clearcut_test_seqs_1
+CCACACGGTGGATGCAACAGATCCCATACACCGAGTTGGATGCTTAAGACGCATCGCGTGAGTTTTGCGTCAAGGCT
+TGCTTTCAATAATGCCAGTG
+>clearcut_test_seqs_2
+AACCCCCACGGTGGCAGCAACACGTCACATACAACGGGTTGGATTCTAAAGACAAACCGCGTCAAAGTTGTGTCAGAACT
+TGCTTTGAATCATGCCAGTA
+>clearcut_test_seqs_3
+AAACCCCACGGTAGCTGCAACACGTCCCATACCACGGGTAGGATGCTAAAGACACATCGGGTCTGTTTTGTGTCAGGGCT
+TGCTTTACATCATGCAAGTG
+>clearcut_test_seqs_4
+AACCGCCACGGTGGGTACAACACGTCCACTACATCGGCTTGGAAGGTAAAGACACGTCGCGTCAGTATTGCGTCAGGGCT
+TGCTTTAAATCATGCCAGTG
+>clearcut_test_seqs_5
+AACCCCCGCGGTAGGTGCAACACGTCCCATACAACGGGTTGGAAGGTTAAGACACAACGCGTTAATTTTGTGTCAGGGCA
+TGCTTTAAATCATGCCAGTT
+>clearcut_test_seqs_6
+GACCCCCGCGGTGGCTGCAAGACGTCCCATACAACGGGTTGGATGCTTAAGACACATCGCAACAGTTTTGAGTCAGGGCT
+TACTTTAGATCATGCCGGTG
+>clearcut_test_seqs_7
+AACCCCCACGGTGGCTACAAGACGTCCCATCCAACGGGTTGGATACTTAAGGCACATCACGTCAGTTTTGTGTCAGAGCT
+TGCTTTAAATCATGCCAGTG
+>clearcut_test_seqs_8
+AACCCCCACGGTGGCTGCAACACGTGGCATACAACGGGTTGGATGCTTAAGACACATCGCCTCAGTTTTGTGTCAGGGCT
+TGCATTAAATCATGCCAGTG
+>clearcut_test_seqs_9
+AAGCCCCACGGTGGCTGAAACACATCCCATACAACGGGTTGGATGCTTAAGACACATCGCATCAGTTTTATGTCAGGGGA
+TGCTTTAAATCCTGACAGCG
+"""
+
+if __name__ == '__main__':
+ main()
diff --git a/bfillings/tests/test_clustalw.py b/bfillings/tests/test_clustalw.py
new file mode 100644
index 0000000..bdc57ee
--- /dev/null
+++ b/bfillings/tests/test_clustalw.py
@@ -0,0 +1,627 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+
+"""Tests for application controller for ClustalW v1.83"""
+import re
+from os import getcwd, remove, rmdir, mkdir, path
+import shutil
+from cogent.core.alignment import Alignment
+from cogent.core.moltype import RNA
+from cogent.util.unit_test import TestCase, main
+from cogent.util.misc import flatten
+from skbio.parse.sequences import parse_fasta
+from bfillings.clustalw import (Clustalw, alignUnalignedSeqsFromFile,
+ alignUnalignedSeqs, alignTwoAlignments,
+ addSeqsToAlignment, buildTreeFromAlignment,
+ build_tree_from_alignment,
+ bootstrap_tree_from_alignment,
+ align_unaligned_seqs, align_and_build_tree,
+ add_seqs_to_alignment, align_two_alignments)
+
+
+cw_vers = re.compile(r"CLUSTAL W [(]1\.8[1-3][.\d]*[)]")
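+# The version stamp in clustalw's output varies between releases; stripping
+# it with cw_vers lets the expected strings below match any 1.81-1.83 build.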
+
+class GeneralSetUp(TestCase):
+
+ def setUp(self):
+ """Clustalw general setUp method for all tests"""
+ self.seqs1 = ['ACUGCUAGCUAGUAGCGUACGUA','GCUACGUAGCUAC',
+ 'GCGGCUAUUAGAUCGUA']
+ self.aln1_fasta = ALIGN1_FASTA
+ self.labels1 = ['>1','>2','>3']
+ self.lines1 = flatten(zip(self.labels1,self.seqs1))
+ self.stdout1 = STDOUT1
+ self.aln1 = ALIGN1
+ self.dnd1 = DND1
+
+ self.multiline1 = '\n'.join(flatten(zip(self.labels1, self.seqs1)))
+
+ self.seqs2=['UAGGCUCUGAUAUAAUAGCUCUC','UAUCGCUUCGACGAUUCUCUGAUAGAGA',
+ 'UGACUACGCAU']
+ self.labels2=['>a','>b','>c']
+ self.lines2 = flatten(zip(self.labels2,self.seqs2))
+ self.aln2 = ALIGN2
+ self.dnd2 = DND2
+
+ self.twoalign = TWOALIGN
+ self.alignseqs = ALIGNSEQS
+ self.treeduringalignseqs = TREEDURINGALIGNSEQS
+ self.treefromalignseqs = TREEFROMALIGNSEQS
+
+ self.temp_dir_space = "/tmp/clustalw test"
+
+ self.build_tree_seqs_short = """>clustal_test_seqs_0
+AACCCCCACGGTGGATGCCACACGCCCCATACAAAGGGTAGGATGCTTAAGACACATCGCGTCAGGTTTGTGTCAGGCCT
+AGCTTTAAATCATGCCAGTG
+>clustal_test_seqs_1
+GACCCACACGGTGGATGCAACAGATCCCATACACCGAGTTGGATGCTTAAGACGCATCGCGTGAGTTTTGCGTCAAGGCT
+TGCTTTCAATAATGCCAGTG
+>clustal_test_seqs_2
+AACCCCCACGGTGGCAGCAACACGTCACATACAACGGGTTGGATTCTAAAGACAAACCGCGTCAAAGTTGTGTCAGAACT
+TGCTTTGAATCATGCCAGTA
+>clustal_test_seqs_3
+AAACCCCACGGTAGCTGCAACACGTCCCATACCACGGGTAGGATGCTAAAGACACATCGGGTCTGTTTTGTGTCAGGGCT
+TGCTTTACATCATGCAAGTG
+>clustal_test_seqs_4
+AACCGCCACGGTGGGTACAACACGTCCACTACATCGGCTTGGAAGGTAAAGACACGTCGCGTCAGTATTGCGTCAGGGCT
+TGCTTTAAATCATGCCAGTG
+>clustal_test_seqs_5
+AACCCCCGCGGTAGGTGCAACACGTCCCATACAACGGGTTGGAAGGTTAAGACACAACGCGTTAATTTTGTGTCAGGGCA
+TGCTTTAAATCATGCCAGTT
+>clustal_test_seqs_6
+GACCCCCGCGGTGGCTGCAAGACGTCCCATACAACGGGTTGGATGCTTAAGACACATCGCAACAGTTTTGAGTCAGGGCT
+TACTTTAGATCATGCCGGTG
+>clustal_test_seqs_7
+AACCCCCACGGTGGCTACAAGACGTCCCATCCAACGGGTTGGATACTTAAGGCACATCACGTCAGTTTTGTGTCAGAGCT
+TGCTTTAAATCATGCCAGTG
+>clustal_test_seqs_8
+AACCCCCACGGTGGCTGCAACACGTGGCATACAACGGGTTGGATGCTTAAGACACATCGCCTCAGTTTTGTGTCAGGGCT
+TGCATTAAATCATGCCAGTG
+>clustal_test_seqs_9
+AAGCCCCACGGTGGCTGAAACACATCCCATACAACGGGTTGGATGCTTAAGACACATCGCATCAGTTTTATGTCAGGGGA
+TGCTTTAAATCCTGACAGCG
+"""
+ self.build_tree_seqs_long = """>clustal_test_seqs_0
+AACCCCCACGGTGGATGCCACACGCCCCATACAAAGGGTAGGATGCTTAAGACACATCGCGTCAGGTTTGTGTCAGGCCT
+AGCTTTAAATCATGCCAGTG
+>clustal_test_seqsaaaaaaaa_1
+GACCCACACGGTGGATGCAACAGATCCCATACACCGAGTTGGATGCTTAAGACGCATCGCGTGAGTTTTGCGTCAAGGCT
+TGCTTTCAATAATGCCAGTG
+>clustal_test_seqsaaaaaaaa_2
+AACCCCCACGGTGGCAGCAACACGTCACATACAACGGGTTGGATTCTAAAGACAAACCGCGTCAAAGTTGTGTCAGAACT
+TGCTTTGAATCATGCCAGTA
+>clustal_test_seqsaaaaaaaa_3
+AAACCCCACGGTAGCTGCAACACGTCCCATACCACGGGTAGGATGCTAAAGACACATCGGGTCTGTTTTGTGTCAGGGCT
+TGCTTTACATCATGCAAGTG
+>clustal_test_seqsaaaaaaaa_4
+AACCGCCACGGTGGGTACAACACGTCCACTACATCGGCTTGGAAGGTAAAGACACGTCGCGTCAGTATTGCGTCAGGGCT
+TGCTTTAAATCATGCCAGTG
+>clustal_test_seqsaaaaaaaa_5
+AACCCCCGCGGTAGGTGCAACACGTCCCATACAACGGGTTGGAAGGTTAAGACACAACGCGTTAATTTTGTGTCAGGGCA
+TGCTTTAAATCATGCCAGTT
+>clustal_test_seqsaaaaaaaa_6
+GACCCCCGCGGTGGCTGCAAGACGTCCCATACAACGGGTTGGATGCTTAAGACACATCGCAACAGTTTTGAGTCAGGGCT
+TACTTTAGATCATGCCGGTG
+>clustal_test_seqsaaaaaaaa_7
+AACCCCCACGGTGGCTACAAGACGTCCCATCCAACGGGTTGGATACTTAAGGCACATCACGTCAGTTTTGTGTCAGAGCT
+TGCTTTAAATCATGCCAGTG
+>clustal_test_seqsaaaaaaaa_8
+AACCCCCACGGTGGCTGCAACACGTGGCATACAACGGGTTGGATGCTTAAGACACATCGCCTCAGTTTTGTGTCAGGGCT
+TGCATTAAATCATGCCAGTG
+>clustal_test_seqsaaaaaaaa_9
+AAGCCCCACGGTGGCTGAAACACATCCCATACAACGGGTTGGATGCTTAAGACACATCGCATCAGTTTTATGTCAGGGGA
+TGCTTTAAATCCTGACAGCG
+"""
+ try:
+ mkdir('/tmp/ct')
+ except OSError: #dir already exists
+ pass
+
+ try:
+ #create sequence files
+ f = open('/tmp/ct/seq1.txt','w')
+ f.write('\n'.join(self.lines1))
+ f.close()
+ g = open('/tmp/ct/seq2.txt','w')
+ g.write('\n'.join(self.lines2))
+ g.close()
+ #create alignment files
+ f = open('/tmp/ct/align1','w')
+ f.write(self.aln1)
+ f.close()
+ g = open('/tmp/ct/align2','w')
+ g.write(self.aln2)
+ g.close()
+ #create tree file
+ f = open('/tmp/ct/tree1','w')
+ f.write(DND1)
+ f.close()
+ except OSError:
+ pass
+
+
+
+class ClustalwTests(GeneralSetUp):
+ """Tests for the Clustalw application controller"""
+
+ def test_base_command(self):
+ """Clustalw BaseCommand should return the correct BaseCommand"""
+ c = Clustalw()
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "',getcwd(),'/"; ','clustalw -align']))
+ c.Parameters['-infile'].on('seq.txt')
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "',getcwd(),'/"; ',\
+ 'clustalw -infile="seq.txt" -align']))
+ c.Parameters['-align'].off()
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "',getcwd(),'/"; ','clustalw -infile="seq.txt"']))
+ c.Parameters['-nopgap'].on()
+ c.Parameters['-infile'].off()
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "',getcwd(),'/"; ','clustalw -nopgap']))
+
+ def test_changing_working_dir(self):
+ """Clustalw BaseCommand should change according to WorkingDir"""
+ c = Clustalw(WorkingDir='/tmp/clustaltest')
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "','/tmp/clustaltest','/"; ','clustalw -align']))
+ c = Clustalw(WorkingDir='/tmp/clustaltest/')
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "','/tmp/clustaltest/','/"; ','clustalw -align']))
+ c = Clustalw()
+ c.WorkingDir = '/tmp/clustaltest2/'
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "','/tmp/clustaltest2/','/"; ','clustalw -align']))
+
+        # removing the dirs is proof that they were created when WorkingDir
+        # was set; if the dirs are not there, rmdir raises an OSError
+ rmdir('/tmp/clustaltest')
+ rmdir('/tmp/clustaltest2')
+
+ def test_stdout_input_as_string(self):
+ """Clustalw input_as_string shoud function as expected"""
+ c = Clustalw(WorkingDir='/tmp/ct')
+ res = c('/tmp/ct/seq1.txt')
+ self.assertEqual(cw_vers.sub("", res['StdOut'].read()),
+ cw_vers.sub("", self.stdout1))
+ self.assertEqual(res['StdErr'].read(),'')
+ self.assertEqual(cw_vers.sub("", res['Align'].read()),
+ cw_vers.sub("", self.aln1))
+ self.assertEqual(res['Dendro'].read(),self.dnd1)
+ res.cleanUp()
+
+ def test_stdout_input_as_lines(self):
+ """Clustalw input_as_lines should function as expected"""
+ c = Clustalw(InputHandler='_input_as_lines',WorkingDir='/tmp/ct')
+ res = c(self.lines1)
+ #get info on input file name and change output accordingly
+ name = c.Parameters['-infile'].Value
+ out = self.stdout1.split('\n')
+ out[16] =\
+ 'Guide tree file created: ['+name.rsplit(".")[0]+'.dnd]'
+ out[23] =\
+ 'CLUSTAL-Alignment file created ['+name.rsplit(".")[0]+'.aln]'
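+        # (stdout lines 16 and 23 name the guide-tree and alignment files,
+        # which depend on the generated temp filename, so they are rewritten
+        # before comparison; the same patching recurs in the tests below)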
+
+ self.assertEqual(cw_vers.sub("", res['StdOut'].read()),
+ cw_vers.sub("", '\n'.join(out)))
+ self.assertEqual(res['StdErr'].read(),'')
+ self.assertEqual(cw_vers.sub("", res['Align'].read()),
+ cw_vers.sub("", self.aln1))
+ self.assertEqual(res['Dendro'].read(),self.dnd1)
+ res.cleanUp()
+
+ def test_stdout_input_as_lines_local(self):
+ """Clustalw input_as_lines should function as expected"""
+ c = Clustalw(InputHandler='_input_as_lines',WorkingDir=self.temp_dir_space)
+ res = c(self.lines1)
+ #get info on input file name and change output accordingly
+ name = c.Parameters['-infile'].Value
+ out = self.stdout1.split('\n')
+ out[16] =\
+ 'Guide tree file created: ['+name.rsplit(".")[0]+'.dnd]'
+ out[23] =\
+ 'CLUSTAL-Alignment file created ['+name.rsplit(".")[0]+'.aln]'
+
+ self.assertEqual(cw_vers.sub("", res['StdOut'].read()),
+ cw_vers.sub("", '\n'.join(out)))
+ self.assertEqual(res['StdErr'].read(),'')
+ self.assertEqual(cw_vers.sub("", res['Align'].read()),
+ cw_vers.sub("", self.aln1))
+ self.assertEqual(res['Dendro'].read(),self.dnd1)
+ res.cleanUp()
+
+ def test_stdout_input_as_seqs(self):
+ """Clustalw input_as_seqs should function as expected"""
+ c = Clustalw(InputHandler='_input_as_seqs',WorkingDir='/tmp/ct')
+ res = c(self.seqs1)
+ #get info on input file name and change output accordingly
+ name = c.Parameters['-infile'].Value
+ out = self.stdout1.split('\n')
+ out[16] =\
+ 'Guide tree file created: ['+name.rsplit(".")[0]+'.dnd]'
+ out[23] =\
+ 'CLUSTAL-Alignment file created ['+name.rsplit(".")[0]+'.aln]'
+
+ self.assertEqual(cw_vers.sub("", res['StdOut'].read()),
+ cw_vers.sub("", '\n'.join(out)))
+ self.assertEqual(res['StdErr'].read(),'')
+ self.assertEqual(cw_vers.sub("", res['Align'].read()),
+ cw_vers.sub("", self.aln1))
+ self.assertEqual(res['Dendro'].read(),self.dnd1)
+ res.cleanUp()
+
+ def test_stdout_input_as_multiline_string(self):
+ """Clustalw input_as_multiline_string should function as expected"""
+ c = Clustalw(InputHandler='_input_as_multiline_string',\
+ WorkingDir='/tmp/ct')
+ res = c(self.multiline1)
+ name = c.Parameters['-infile'].Value
+ out = self.stdout1.split('\n')
+ out[16] =\
+ 'Guide tree file created: ['+name.rsplit(".")[0]+'.dnd]'
+ out[23] =\
+ 'CLUSTAL-Alignment file created ['+name.rsplit(".")[0]+'.aln]'
+
+ self.assertEqual(cw_vers.sub("", res['StdOut'].read()),
+ cw_vers.sub("", '\n'.join(out)))
+ self.assertEqual(res['StdErr'].read(),'')
+ self.assertEqual(cw_vers.sub("", res['Align'].read()),
+ cw_vers.sub("", self.aln1))
+ self.assertEqual(res['Dendro'].read(),self.dnd1)
+ res.cleanUp()
+
+ def test_alignment_trees(self):
+ """Clustalw alignment should work correctly with new/usetree"""
+ c = Clustalw(params={'-quicktree':True,'-type':'DNA','-gapopen':10},\
+ WorkingDir='/tmp/ct')
+ res = c('/tmp/ct/seq1.txt')
+ self.assertEqual(res['Align'].name,'/tmp/ct/seq1.aln')
+ self.assertEqual(res['Dendro'].name,'/tmp/ct/seq1.dnd')
+ res.cleanUp()
+ c.Parameters['-usetree'].on('/tmp/ct/tree1')
+ c.Parameters['-output'].on('PHYLIP')
+ res = c('/tmp/ct/seq1.txt')
+ self.assertEqual(res['Align'].name,'/tmp/ct/seq1.phy')
+ self.assertEqual(res['Dendro'].name,'/tmp/ct/tree1')
+ res.cleanUp()
+ c.Parameters['-newtree'].on('newtree')
+ c.Parameters['-outfile'].on('outfile')
+ res = c('/tmp/ct/seq1.txt')
+ self.assertEqual(res['Align'].name, c.WorkingDir + 'outfile')
+ self.assertEqual(res['Dendro'].name, c.WorkingDir + 'newtree')
+ res.cleanUp()
+
+ def test_profile_newtree(self):
+ """Clustalw profile should work correctly with new/usetree"""
+ c = Clustalw(params={'-profile':None,'-profile1':'/tmp/ct/seq1.txt',\
+ '-profile2':'/tmp/ct/seq2.txt','-newtree1':'lala'},\
+ WorkingDir='/tmp/ct')
+ c.Parameters['-align'].off()
+ res = c()
+ self.assertEqual(res['Align'],None)
+ self.assertEqual(res['Dendro1'].name,'/tmp/ct/lala')
+ self.assertEqual(res['Dendro2'].name,'/tmp/ct/seq2.dnd')
+ res.cleanUp()
+
+ def test_sequences_newtree(self):
+ """Clustalw sequences should work correctly with new/usetree"""
+ c = Clustalw(params={'-sequences':None,'-newtree':'lala',\
+ '-profile1':'/tmp/ct/align1','-profile2':'/tmp/ct/seq2.txt'},\
+ WorkingDir='/tmp/ct')
+ c.Parameters['-align'].off()
+ res = c()
+ self.assertEqual(res['Align'],None)
+ self.assertEqual(res['Dendro'].name,'/tmp/ct/lala')
+ res.cleanUp()
+
+        # Is this a bug in clustalw? It creates an empty file 'seq2.aln'
+        # but doesn't report it in the stdout
+ remove('/tmp/ct/seq2.aln')
+
+ def test_tree_outputtree(self):
+ """Clustalw tree should work correctly with outputtree"""
+ c = Clustalw(params={'-tree':None,'-outputtree':'dist',\
+ '-infile':'/tmp/ct/align1'},WorkingDir='/tmp/ct/')
+ c.Parameters['-align'].off()
+ res = c()
+ self.assertEqual(res['Tree'].name,'/tmp/ct/align1.ph')
+ self.assertEqual(res['TreeInfo'].name,'/tmp/ct/align1.dst')
+ res.cleanUp()
+
+
+class clustalwTests(GeneralSetUp):
+ """Tests for module level functions in clustalw.py"""
+
+
+ def test_alignUnalignedSeqs(self):
+ """Clustalw alignUnalignedSeqs should work as expected"""
+ res = alignUnalignedSeqs(self.seqs1,WorkingDir='/tmp/ct')
+ self.assertNotEqual(res['StdErr'],None)
+ self.assertEqual(cw_vers.sub("", res['Align'].read()),
+ cw_vers.sub("", self.aln1))
+ self.assertEqual(res['Dendro'].read(),self.dnd1)
+ res.cleanUp()
+
+ #suppress stderr and stdout
+ res = alignUnalignedSeqs(self.seqs1,WorkingDir='/tmp/ct',\
+ SuppressStderr=True,SuppressStdout=True)
+ self.assertEqual(res['StdOut'],None)
+ self.assertEqual(res['StdErr'],None)
+ self.assertEqual(cw_vers.sub("", res['Align'].read()),
+ cw_vers.sub("", self.aln1))
+ self.assertEqual(res['Dendro'].read(),self.dnd1)
+ res.cleanUp()
+
+ def test_alignUnalignedSeqsFromFile(self):
+ """Clustalw alignUnalignedSeqsFromFile should work as expected"""
+ #make temp file
+ res = alignUnalignedSeqsFromFile('/tmp/ct/seq1.txt')
+ self.assertEqual(cw_vers.sub("", res['StdOut'].read()),
+ cw_vers.sub("", self.stdout1))
+ self.assertEqual(res['StdErr'].read(),'')
+ self.assertEqual(cw_vers.sub("", res['Align'].read()),
+ cw_vers.sub("", self.aln1))
+ self.assertEqual(res['Dendro'].read(),self.dnd1)
+ res.cleanUp()
+ #suppress stderr and stdout
+ res = alignUnalignedSeqsFromFile('/tmp/ct/seq1.txt',\
+ SuppressStderr=True, SuppressStdout=True)
+ self.assertEqual(res['StdOut'],None)
+ self.assertEqual(res['StdErr'],None)
+ self.assertEqual(cw_vers.sub("", res['Align'].read()),
+ cw_vers.sub("", self.aln1))
+ self.assertEqual(res['Dendro'].read(),self.dnd1)
+ res.cleanUp()
+
+ def test_alignTwoAlignments(self):
+ """Clustalw alignTwoAlignments should work as expected"""
+ res = alignTwoAlignments('/tmp/ct/align1','/tmp/ct/align2',\
+ 'twoalign.aln')
+ self.assertEqual(cw_vers.sub("", res['Align'].read()),
+ cw_vers.sub("", self.twoalign))
+ self.assertNotEqual(res['Dendro1'],None)
+ self.assertNotEqual(res['Dendro2'],None)
+        # Are new trees created during the profiling? The trees produced
+        # here differ from those produced when aligning individually
+ #self.assertEqual(res['Dendro1'].read(),self.dnd)
+ #self.assertEqual(res['Dendro2'].read(),self.dnd2)
+ res.cleanUp()
+
+ def test_addSeqsToAlignment(self):
+ """Clustalw addSeqsToAlignment shoudl work as expected"""
+ res = addSeqsToAlignment('/tmp/ct/align1','/tmp/ct/seq2.txt',\
+ 'alignseqs')
+ self.assertEqual(cw_vers.sub("", res['Align'].read()),
+ cw_vers.sub("", self.alignseqs))
+ self.assertEqual(res['Dendro'].read(),self.treeduringalignseqs)
+ res.cleanUp()
+
+ def test_buildTreeFromAlignment(self):
+ """Clustalw buildTreeFromAlignment shoudl work as expected"""
+ pre_res = addSeqsToAlignment('/tmp/ct/align1','/tmp/ct/seq2.txt',\
+ 'alignseqs',WorkingDir='/tmp/ct')
+ res = buildTreeFromAlignment('/tmp/ct/alignseqs',WorkingDir='/tmp/ct')
+ self.assertEqual(res['Tree'].read(),self.treefromalignseqs)
+
+ res.cleanUp()
+ pre_res.cleanUp()
+
+ def test_build_tree_from_alignment(self):
+ """Clustalw should return a tree built from the passed alignment"""
+ tree_short = build_tree_from_alignment(self.build_tree_seqs_short, \
+ RNA, best_tree=False)
+ num_seqs = flatten(self.build_tree_seqs_short).count('>')
+ self.assertEqual(len(tree_short.tips()), num_seqs)
+
+ tree_long = build_tree_from_alignment(self.build_tree_seqs_long, \
+ RNA, best_tree=False)
+ seq_names = []
+ for line in self.build_tree_seqs_long.split('\n'):
+ if line.startswith('>'):
+ seq_names.append(line[1:])
+
+ for node in tree_long.tips():
+ if node.Name not in seq_names:
+ self.fail()
+
+ tree_short = build_tree_from_alignment(self.build_tree_seqs_short, \
+ RNA, best_tree=True, params={'-bootstrap':3})
+ num_seqs = flatten(self.build_tree_seqs_short).count('>')
+ self.assertEqual(len(tree_short.tips()), num_seqs)
+
+ def test_align_unaligned_seqs(self):
+ """Clustalw align_unaligned_seqs should work as expected"""
+ res = align_unaligned_seqs(self.seqs1, RNA)
+ self.assertEqual(res.toFasta(), self.aln1_fasta)
+
+ def test_bootstrap_tree_from_alignment(self):
+ """Clustalw should return a bootstrapped tree from the passed aln"""
+ tree_short = bootstrap_tree_from_alignment(self.build_tree_seqs_short)
+ num_seqs = flatten(self.build_tree_seqs_short).count('>')
+ self.assertEqual(len(tree_short.tips()), num_seqs)
+
+ tree_long = bootstrap_tree_from_alignment(self.build_tree_seqs_long)
+ seq_names = []
+ for line in self.build_tree_seqs_long.split('\n'):
+ if line.startswith('>'):
+ seq_names.append(line[1:])
+
+ for node in tree_long.tips():
+ if node.Name not in seq_names:
+ self.fail()
+
+    def test_align_and_build_tree(self):
+ """Aligns and builds a tree for a set of sequences"""
+ res = align_and_build_tree(self.seqs1, RNA)
+ self.assertEqual(res['Align'].toFasta(), self.aln1_fasta)
+
+ tree = res['Tree']
+ seq_names = []
+ for line in self.aln1_fasta.split('\n'):
+ if line.startswith('>'):
+ seq_names.append(line[1:])
+
+ for node in tree.tips():
+ if node.Name not in seq_names:
+ self.fail()
+
+ def test_add_seqs_to_alignment(self):
+ """Clustalw add_seqs_to_alignment should work as expected."""
+ seq2 = dict(parse_fasta(self.lines2))
+ align1 = dict(parse_fasta(ALIGN1_FASTA.split('\n')))
+ res = add_seqs_to_alignment(seq2,align1,RNA)
+ self.assertEqual(res.toFasta(), SEQ_PROFILE_ALIGN)
+
+ def test_align_two_alignments(self):
+ """Clustalw align_two_alignments should work as expected."""
+ align1 = dict(parse_fasta(ALIGN1_FASTA.split('\n')))
+ align2 = dict(parse_fasta(ALIGN2_FASTA.split('\n')))
+ res = align_two_alignments(align1,align2,RNA)
+ self.assertEqual(res.toFasta(), PROFILE_PROFILE_ALIGN)
+
+ def test_zzz_general_cleanUp(self):
+ """Last test executed: cleans up all files initially created"""
+ remove('/tmp/ct/seq1.txt')
+ remove('/tmp/ct/seq2.txt')
+ remove('/tmp/ct/align1')
+ remove('/tmp/ct/align2')
+ remove('/tmp/ct/tree1')
+ rmdir('/tmp/ct')
+ shutil.rmtree(self.temp_dir_space)
+
+STDOUT1=\
+"""
+
+
+ CLUSTAL W (1.83) Multiple Sequence Alignments
+
+
+
+Sequence format is Pearson
+Sequence 1: 1 23 bp
+Sequence 2: 2 13 bp
+Sequence 3: 3 17 bp
+Start of Pairwise alignments
+Aligning...
+Sequences (1:2) Aligned. Score: 46
+Sequences (1:3) Aligned. Score: 41
+Sequences (2:3) Aligned. Score: 30
+Guide tree file created: [/tmp/ct/seq1.dnd]
+Start of Multiple Alignment
+There are 2 groups
+Aligning...
+Group 1: Sequences: 2 Score:171
+Group 2: Sequences: 3 Score:162
+Alignment Score 33
+CLUSTAL-Alignment file created [/tmp/ct/seq1.aln]
+"""
+
+ALIGN1=\
+"""CLUSTAL W (1.83) multiple sequence alignment
+
+
+1 ACUGCUAGCUAGUAGCGUACGUA
+2 ---GCUACGUAGCUAC-------
+3 GCGGCUAUUAGAUCGUA------
+ ****
+"""
+
+ALIGN1_FASTA = ">seq_0\nACUGCUAGCUAGUAGCGUACGUA\n>seq_1\n---GCUACGUAGCUAC-------\n>seq_2\nGCGGCUAUUAGAUCGUA------"
+
+DND1=\
+"""(
+1:0.21719,
+2:0.32127,
+3:0.37104);
+"""
+
+ALIGN2 =\
+"""CLUSTAL W (1.83) multiple sequence alignment
+
+
+a UAGGCUCUGAUAUAAUAGCUCUC---------
+b ----UAUCGCUUCGACGAUUCUCUGAUAGAGA
+c ------------UGACUACGCAU---------
+ * *
+"""
+
+ALIGN2_FASTA = ">a\nUAGGCUCUGAUAUAAUAGCUCUC---------\n>b\n----UAUCGCUUCGACGAUUCUCUGAUAGAGA\n>c\n------------UGACUACGCAU---------"
+
+DND2=\
+"""(
+a:0.30435,
+b:0.30435,
+c:0.33202);
+"""
+
+TWOALIGN=\
+"""CLUSTAL W (1.83) multiple sequence alignment
+
+
+1 ---ACUGCUAGCUAGUAGCGUACGUA------
+2 ------GCUACGUAGCUAC-------------
+3 ---GCGGCUAUUAGAUCGUA------------
+a UAGGCUCUGAUAUAAUAGCUCUC---------
+b ----UAUCGCUUCGACGAUUCUCUGAUAGAGA
+c ------------UGACUACGCAU---------
+
+"""
+
+ALIGNSEQS=\
+"""CLUSTAL W (1.83) multiple sequence alignment
+
+
+1 ----------ACUGCUAGCUAGUAGCGUACGUA
+2 -------------GCUACGUAGCUAC-------
+3 ----------GCGGCUAUUAGAUCGUA------
+a -------UAGGCUCUGAUAUAAUAGCUCUC---
+c -------------------UGACUACGCAU---
+b UAUCGCUUCGACGAUUCUCUGAUAGAGA-----
+
+"""
+
+TREEDURINGALIGNSEQS=\
+"""(
+1:0.34511,
+(
+2:0.25283,
+(
+(
+3:0.21486,
+a:0.19691)
+:0.11084,
+b:0.31115)
+:0.06785)
+:0.02780,
+c:0.20035);
+"""
+
+TREEFROMALIGNSEQS=\
+"""(
+(
+(
+1:0.17223,
+(
+2:0.14749,
+c:0.13822)
+:0.19541)
+:0.07161,
+a:0.25531)
+:0.03600,
+3:0.29438,
+b:0.23503);
+"""
+
+SEQ_PROFILE_ALIGN = """>a\n-------UAGGCUCUGAUAUAAUAGCUCUC---\n>b\nUAUCGCUUCGACGAUUCUCUGAUAGAGA-----\n>c\n-------------------UGACUACGCAU---\n>seq_0\n----------ACUGCUAGCUAGUAGCGUACGUA\n>seq_1\n-------------GCUACGUAGCUAC-------\n>seq_2\n----------GCGGCUAUUAGAUCGUA------"""
+
+PROFILE_PROFILE_ALIGN = """>a\nUAGGCUCUGAUAUAAUAGCUCUC---------\n>b\n----UAUCGCUUCGACGAUUCUCUGAUAGAGA\n>c\n------------UGACUACGCAU---------\n>seq_0\n---ACUGCUAGCUAGUAGCGUACGUA------\n>seq_1\n------GCUACGUAGCUAC-------------\n>seq_2\n---GCGGCUAUUAGAUCGUA------------"""
+
+if __name__ == '__main__':
+ main()
diff --git a/bfillings/tests/test_fasttree.py b/bfillings/tests/test_fasttree.py
new file mode 100644
index 0000000..e86cd56
--- /dev/null
+++ b/bfillings/tests/test_fasttree.py
@@ -0,0 +1,182 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+
+"""Tests for FastTree v1.1 application controller.
+Also works with v2.0.1, v2.1.0 and v2.1.3"""
+
+from shutil import rmtree
+from os import getcwd
+from unittest import TestCase, main
+
+from cogent.core.alignment import Alignment
+from cogent.parse.tree import DndParser
+from cogent.core.moltype import DNA
+
+from skbio.parse.sequences import parse_fasta
+
+from bfillings.fasttree import FastTree, build_tree_from_alignment
+
+
+class FastTreeTests(TestCase):
+ def setUp(self):
+ self.seqs = Alignment(dict(parse_fasta(test_seqs.split())))
+
+ def test_base_command(self):
+ app = FastTree()
+ self.assertEqual(app.BaseCommand, \
+ ''.join(['cd "',getcwd(),'/"; ','FastTree']))
+ app.Parameters['-nt'].on()
+ self.assertEqual(app.BaseCommand, \
+ ''.join(['cd "',getcwd(),'/"; ','FastTree -nt']))
+
+ def test_change_working_dir(self):
+ app = FastTree(WorkingDir='/tmp/FastTreeTest')
+ self.assertEqual(app.BaseCommand, \
+ ''.join(['cd "','/tmp/FastTreeTest','/"; ','FastTree']))
+ rmtree('/tmp/FastTreeTest')
+
+ def test_build_tree_from_alignment(self):
+ tree = build_tree_from_alignment(self.seqs, DNA)
+ # test expected output for fasttree 1.1 and 2.0.1
+ try:
+ for o,e in zip(tree.traverse(), DndParser(exp_tree).traverse()):
+ self.assertEqual(o.Name,e.Name)
+ self.assertAlmostEqual(o.Length,e.Length)
+ except AssertionError:
+ for o,e in zip(tree.traverse(), DndParser(exp_tree_201).traverse()):
+ self.assertEqual(o.Name,e.Name)
+ self.assertAlmostEqual(o.Length,e.Length)
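+        # (both outputs are supported because FastTree 2.x orders nodes and
+        # reports branch lengths slightly differently, so the 1.1 tree is
+        # tried first and the 2.0.1 tree on AssertionError)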
+test_seqs = """>test_set1_0
+GGTAGATGGGACTACCTCATGACATGAAACTGCAGTCTGTTCTTTTATAGAAGCTTCATACTTGGAGATGTATACTATTA
+CTTAGGACTATGGAGGTATA
+>test_set1_1
+GGTTGATGGGACTACGTAGTGACATGAAATTGCAGTCTGTGCTTTTATAGAAGTTTGATACTTGGAGCTCTCTACTATTA
+CTTAGGACTATGGAGGTATA
+>test_set1_2
+GGTTGATGGGCCTACCTCATGACAATAAACTGAAGTCTGTGCTTTTATAGAGGCTTGATACTTGGAGCTCTATACTATTA
+CTTAGGATTATGGAGGTCTA
+>test_set1_3
+GGTTGATGGGACTACCTCATGACATGAAACTGCAGTCTGTGCTTTTATAGAAGCTTGATACTTGGAGATCTATACTATTA
+CTTAGGACTATGGAGGTCAC
+>test_set1_4
+GGTTGGTGGGACTACCTCATGACATGAAGATGCAGTCTGTGCTTGTATAGAAGCTTGAAACTTGGATATCTATACTATTA
+CTTAAGACTATGGAGGTCTA
+>test_set1_5
+GGTTGATGCGACTACCTCATGACATGAGACTGCAGTCTGTGCTTTTACTGAAGCTTGATACTTGGAGATCTATACTATTA
+CTTAGGACTATGGAGGTTTA
+>test_set1_6
+GGTTGATGGGACTACCTCATGACATGAAAATGCAGTCTGTCCTTTTATAGAAGCTTGATACTTGTAGATCTATACTGTTA
+CTTAGGACTATGGAGGTCTA
+>test_set1_7
+GGTTGATGGGACTCCCTCATGACATAAAACTGCAGTCTGTGCTTTTACAGAAGCTTGATACTTGGAGATCTATACTATTA
+CATAGGACTATGGAGGTCTA
+>test_set1_8
+GGTTGATGGCACTACCTCATGAGATGAAACTGCAGTCTGTGCTTTTATAGAAGCTTGATACTTGGATATCTATACTATAA
+CTTAGTACTATGGAGGCCTA
+>test_set1_9
+GGTTTATGTTACTACCTCATGACATGAAACGGCAGCATGTGCTTTTATAGAAGCTTGATACTTGGAGATCTAAACTATTA
+CTTAGGACTATGGAGGTCTA
+>test_set2_0
+AGCGAATCATACTCTGGAAAGAAAAGGACGACTCCTTTGCTCGCGGTCTAGCTGCTACAGCTTCACCGAGTACATCTGAA
+TGATGGTTGAACCGGGTTCA
+>test_set2_1
+AGAGAATAGTACTCTGGAAAGACAAGGACGACTCCTTTGATCGCGGTCTAGCTGCTACAGCTTCACCGAGTACATCTGAA
+TGATGGTTGAACCGGATTCA
+>test_set2_2
+AGAGTATAATACTCTGGAAAGAAAAGGACGACTCCTTTGATCGCGGTCTAGCTGCTACAGCTTCACCGAGTACATCTTAA
+TGATGGTTGAACCGGGGTCA
+>test_set2_3
+AGAGAATCATACTCTGGAAAGAAATGGACGACTCCTTTGATCGCGGTCCAGCTGCTACAGCTTCACCGAGTACATCTGAA
+TGATGGTTGGACCGGGTTCA
+>test_set2_4
+AGAGAATAATAGTCTGGAAAGAAAAGGACGACTCCTTTGTTCCCGGTCTAGCTGCTACAGCTTCCCCGAGTACATCTGAA
+TGATGGTTGAACCGGGTTCA
+>test_set2_5
+ACAGAATACTACTCTGGAAAGAAAAGGCCGACTCCTTTGATCGCTGTCTAGCTGCGACAGCTGCACGGAGTCCATCCGAA
+TGATGGTTGAACCGGGTTCA
+>test_set2_6
+AGAGAATAATACTCTGGACAGAAATGGACGACTCCTTTGATCGCGGTCTAGCTGCTACAGCTTCACCGAGTACATCTGAA
+TGATGGCTGAACCGGGTTCA
+>test_set2_7
+AGAGAATATTACTCTGGAAAGAAAAGGACGACTCCTTGGATCGCGGTCTAGCTGCTACAGCTTCAGCGAGTACATCGGAA
+TGATGGTTTAACCGGGTTCA
+>test_set2_8
+AGTGAATAATACTCTGGAAAGAAAAGGACGACTCCTTTGATCGCGGTCTAGCTGCTAGAGCTTCACCGAGTACATCTGAA
+TGATGGTTGAACCGGGTTCA
+>test_set2_9
+AGAGATTAATACTCTGGATAGAAAATGACGACTCCTTTGATCGCGGTCTAGCTGCTACAGATTGACCTATTACATCTGAA
+TGATGGTTGAACCGGGTTCA
+>test_set3_0
+TTGTCTCCATTGAGCACTCTAATCTTGCCGTGTATTCAGGAAAGGAGGATAGAACTCGGACAGTATTCTGAACATTACAG
+AATCGCCGTATTTACGGTGT
+>test_set3_1
+TTGTCTCCATTGAGCACTCTAATCATGCCGTGTATTCAGGAACGGAGGAGAGGACTCGGTCAGTATTCGGAACATTACAG
+AATGGCGTTATTTACGGTGT
+>test_set3_2
+TTGTCTCCATTGAGCACTCTAATCTTGCCGTGTATTCAGGAACGGAGGATAGAACTCGGACAGAATCCTGAATATTACAA
+AATCGGGTTATTTACGGTGT
+>test_set3_3
+TTGTCTCCATTGAGCACTCTAATCTTGCCGTGTTTTCAGGAACGGAGGATAGAACTCGGACAGTAGCCTGAACATTACAG
+AATCCCGTTATTTACGGTGT
+>test_set3_4
+TTGTCTCCATCGAGCACTCTAATCTTGCCGTGTATTCAGGAACGGAGGATTGAACTCGGACAGTATCCTGAACATTACAG
+AATCGCGTTATTTACGGTGT
+>test_set3_5
+TTGTCTCCATTGAGCACGCTAAGCTTGCCGTGTATTCAGGAACGGAGGATAGAACTCGGACAGTATCCTGAACATTACAG
+AATCGCGTTATTTACGGTGT
+>test_set3_6
+TTGTCGTCATTGAGCACTCTAATCTTGCCGTGTATTCAGGAACGAAGGATAGAACTCGGACAGTATCCTGAACTTTGCAA
+AATCGCGTTATTTACGGTGT
+>test_set3_7
+TTGTCTCCATTGAGCACTCTAATCTAGCCGTGTAGTCAGGAACGGAGGATGGAACGCGCACAGTATCCTGAACATAACAG
+AATCGCGTTATTTACGGTGT
+>test_set3_8
+TTGTCTCCATTGAGCACTCTAATCTTGCCGTATATTCCCGAACGGAGGATAGAACTCGGACAGTAGCCTGAACAGTACAG
+AATCGCGTTATTTACGGTGT
+>test_set3_9
+TTGTCTCCCTTGAGCACTCTAATCTTGCCGTGTATTCAGGAACGGAGGATAGAACTCGGACAGTATCCTGAACATTACAG
+AATCGCGTTATTTACGGTGT
+>test_set4_0
+CTTTTACCGGGCTGCCCGAGAGCACTATCTGCGTCGTGCCCTGCTTCGATGCCCACACTACCATCATACTATTCGTGAAT
+TTGCGGCCGCTAAGATCCGA
+>test_set4_1
+CTTTTATCGGGGTGCCTGATAGCACCATCTGCGTCGTGCCCTGCTTCGATGCCTAAACCACCGTCATGCTATTTGTGAAT
+TTGAGGTCGCTAAGAGCCCA
+>test_set4_2
+CTTTTATCGGGGTGCCCGAGAGCACCATCTGCGTCGTGCCCTGCTTCGATGCCCAGGCCACCATCATACTATTTGTGGCT
+TAGGGGTCGCTAAGAGCCGA
+>test_set4_3
+CTTTTATCGGGGGGCCCGAGAGCACCACCTGCGTCGTGCCCTGCTTCGATGCCCAAACCACCATCATACTATTTGTGAAT
+TTGGGGTCGCTAAGAGCCGA
+>test_set4_4
+CTTTTATAGGGGTGCCCGAGAGCACCATCTGCGTCGTGCCCAGCTTCGATTTCCAAACCACCATCATACTATTTGTGAAC
+TTGGGGACGTTAAGAGCCGA
+>test_set4_5
+CTTTTCGCGGGGTGCCCGAGAGCACCATCTGCGTCGCGCCCTGCTTCGGTGCCCATACCACCATCATAATATTTGGGAAA
+TTGGGATCGCTAAGAGTCGA
+>test_set4_6
+CTTTTCTCGGGGTGCCCGAGAGCCCCATCTGCGTTGTGCCCTGCTACTATGCCCAAACCACCATCATACTATTTGTGAAT
+GTGGCGTCGCTCAGAGCCGA
+>test_set4_7
+CTTTTATCGGGGTGCCCGAGAGCACCATCTGCGTCGTGCCCTGCTTCGATGCCCACGTCACCATACTACTATTTGTGAAT
+TTGGGGTCGCTAATAGCCGA
+>test_set4_8
+CTTTTATCGGGGGGCCCGAGAGCATCATCTGCGTCGTGCCCTGCTTCGATGCCCAAACTACCATCATACTATTTGTGAAT
+TTGGGGTTTCTAAGAGCCGA
+>test_set4_9
+CTTTTACCGGGGTGACCGAGAGCACCATCTGCGCCGTGCCCTGCTTCGAGGCCCAAACCACCATCATACTGTTTGTGAAT
+CAGGGGTTGCTAAGAGCCGA"""
+
+exp_tree = """((test_set2_0:0.02121,(test_set2_8:-0.03148,(((test_set3_6:0.05123,(test_set3_5:0.01878,((test_set3_0:0.03155,test_set3_1:0.06432)0.664:0.01096,(((test_set3_3:0.02014,test_set3_8:0.04240)0.880:0.01129,(test_set3_7:0.05900,test_set3_4:0.01449)0.756:0.00571)0.514:0.00038,test_set3_9:0.00907)0.515:0.00020)0.834:0.00164)0.708:0.01349)0.754:0.19207,test_set3_2:-0.16026)0.999:1.34181,(test_set1_2:0.00324,((test_set1_0:0.04356,test_set1_1:0.07539)0.393:0.00223,((test_set1_3:0.0199 [...]
+# for FastTree version 2.0.1
+exp_tree_201 = """(((test_set2_8:0.00039,(((test_set3_6:0.05278,(test_set3_5:0.02030,(((test_set3_0:0.03166,test_set3_1:0.06412)0.783:0.00945,(test_set3_7:0.06330,test_set3_4:0.02026)0.896:0.00014)0.911:0.00014,((test_set3_3:0.02053,test_set3_8:0.04149)0.790:0.00995,test_set3_9:0.01011)0.927:0.00015)0.922:0.00015)0.780:0.00976)0.763:0.03112,test_set3_2:0.00014)0.881:1.40572,(((((test_set1_9:0.07378,(test_set1_7:0.03123,test_set1_5:0.04198)0.756:0.00995)0.883:0.00016,(test_set1_3:0.02027, [...]
+
+if __name__ == '__main__':
+ main()
diff --git a/bfillings/tests/test_fasttree_v1.py b/bfillings/tests/test_fasttree_v1.py
new file mode 100644
index 0000000..23a6890
--- /dev/null
+++ b/bfillings/tests/test_fasttree_v1.py
@@ -0,0 +1,174 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+
+"""Tests for FastTree v1.0.0 application controller"""
+
+from shutil import rmtree
+from os import getcwd
+from unittest import TestCase, main
+
+from skbio.parse.sequences import parse_fasta
+
+from cogent.core.alignment import Alignment
+from cogent.parse.tree import DndParser
+from cogent.core.moltype import DNA
+
+from bfillings.fasttree_v1 import FastTree, build_tree_from_alignment
+
+
+class FastTreeTests(TestCase):
+ def setUp(self):
+ self.seqs = Alignment(dict(parse_fasta(test_seqs.split())))
+
+ def test_base_command(self):
+ app = FastTree()
+ self.assertEqual(app.BaseCommand, \
+ ''.join(['cd "',getcwd(),'/"; ','FastTree']))
+ app.Parameters['-nt'].on()
+ self.assertEqual(app.BaseCommand, \
+ ''.join(['cd "',getcwd(),'/"; ','FastTree -nt']))
+
+ def test_change_working_dir(self):
+ app = FastTree(WorkingDir='/tmp/FastTreeTest')
+ self.assertEqual(app.BaseCommand, \
+ ''.join(['cd "','/tmp/FastTreeTest','/"; ','FastTree']))
+ rmtree('/tmp/FastTreeTest')
+
+ def test_build_tree_from_alignment(self):
+ tree = build_tree_from_alignment(self.seqs, DNA)
+ for o,e in zip(tree.traverse(), DndParser(exp_tree).traverse()):
+ self.assertEqual(o.Name,e.Name)
+ self.assertAlmostEqual(o.Length,e.Length)
+
+test_seqs = """>test_set1_0
+GGTAGATGGGACTACCTCATGACATGAAACTGCAGTCTGTTCTTTTATAGAAGCTTCATACTTGGAGATGTATACTATTA
+CTTAGGACTATGGAGGTATA
+>test_set1_1
+GGTTGATGGGACTACGTAGTGACATGAAATTGCAGTCTGTGCTTTTATAGAAGTTTGATACTTGGAGCTCTCTACTATTA
+CTTAGGACTATGGAGGTATA
+>test_set1_2
+GGTTGATGGGCCTACCTCATGACAATAAACTGAAGTCTGTGCTTTTATAGAGGCTTGATACTTGGAGCTCTATACTATTA
+CTTAGGATTATGGAGGTCTA
+>test_set1_3
+GGTTGATGGGACTACCTCATGACATGAAACTGCAGTCTGTGCTTTTATAGAAGCTTGATACTTGGAGATCTATACTATTA
+CTTAGGACTATGGAGGTCAC
+>test_set1_4
+GGTTGGTGGGACTACCTCATGACATGAAGATGCAGTCTGTGCTTGTATAGAAGCTTGAAACTTGGATATCTATACTATTA
+CTTAAGACTATGGAGGTCTA
+>test_set1_5
+GGTTGATGCGACTACCTCATGACATGAGACTGCAGTCTGTGCTTTTACTGAAGCTTGATACTTGGAGATCTATACTATTA
+CTTAGGACTATGGAGGTTTA
+>test_set1_6
+GGTTGATGGGACTACCTCATGACATGAAAATGCAGTCTGTCCTTTTATAGAAGCTTGATACTTGTAGATCTATACTGTTA
+CTTAGGACTATGGAGGTCTA
+>test_set1_7
+GGTTGATGGGACTCCCTCATGACATAAAACTGCAGTCTGTGCTTTTACAGAAGCTTGATACTTGGAGATCTATACTATTA
+CATAGGACTATGGAGGTCTA
+>test_set1_8
+GGTTGATGGCACTACCTCATGAGATGAAACTGCAGTCTGTGCTTTTATAGAAGCTTGATACTTGGATATCTATACTATAA
+CTTAGTACTATGGAGGCCTA
+>test_set1_9
+GGTTTATGTTACTACCTCATGACATGAAACGGCAGCATGTGCTTTTATAGAAGCTTGATACTTGGAGATCTAAACTATTA
+CTTAGGACTATGGAGGTCTA
+>test_set2_0
+AGCGAATCATACTCTGGAAAGAAAAGGACGACTCCTTTGCTCGCGGTCTAGCTGCTACAGCTTCACCGAGTACATCTGAA
+TGATGGTTGAACCGGGTTCA
+>test_set2_1
+AGAGAATAGTACTCTGGAAAGACAAGGACGACTCCTTTGATCGCGGTCTAGCTGCTACAGCTTCACCGAGTACATCTGAA
+TGATGGTTGAACCGGATTCA
+>test_set2_2
+AGAGTATAATACTCTGGAAAGAAAAGGACGACTCCTTTGATCGCGGTCTAGCTGCTACAGCTTCACCGAGTACATCTTAA
+TGATGGTTGAACCGGGGTCA
+>test_set2_3
+AGAGAATCATACTCTGGAAAGAAATGGACGACTCCTTTGATCGCGGTCCAGCTGCTACAGCTTCACCGAGTACATCTGAA
+TGATGGTTGGACCGGGTTCA
+>test_set2_4
+AGAGAATAATAGTCTGGAAAGAAAAGGACGACTCCTTTGTTCCCGGTCTAGCTGCTACAGCTTCCCCGAGTACATCTGAA
+TGATGGTTGAACCGGGTTCA
+>test_set2_5
+ACAGAATACTACTCTGGAAAGAAAAGGCCGACTCCTTTGATCGCTGTCTAGCTGCGACAGCTGCACGGAGTCCATCCGAA
+TGATGGTTGAACCGGGTTCA
+>test_set2_6
+AGAGAATAATACTCTGGACAGAAATGGACGACTCCTTTGATCGCGGTCTAGCTGCTACAGCTTCACCGAGTACATCTGAA
+TGATGGCTGAACCGGGTTCA
+>test_set2_7
+AGAGAATATTACTCTGGAAAGAAAAGGACGACTCCTTGGATCGCGGTCTAGCTGCTACAGCTTCAGCGAGTACATCGGAA
+TGATGGTTTAACCGGGTTCA
+>test_set2_8
+AGTGAATAATACTCTGGAAAGAAAAGGACGACTCCTTTGATCGCGGTCTAGCTGCTAGAGCTTCACCGAGTACATCTGAA
+TGATGGTTGAACCGGGTTCA
+>test_set2_9
+AGAGATTAATACTCTGGATAGAAAATGACGACTCCTTTGATCGCGGTCTAGCTGCTACAGATTGACCTATTACATCTGAA
+TGATGGTTGAACCGGGTTCA
+>test_set3_0
+TTGTCTCCATTGAGCACTCTAATCTTGCCGTGTATTCAGGAAAGGAGGATAGAACTCGGACAGTATTCTGAACATTACAG
+AATCGCCGTATTTACGGTGT
+>test_set3_1
+TTGTCTCCATTGAGCACTCTAATCATGCCGTGTATTCAGGAACGGAGGAGAGGACTCGGTCAGTATTCGGAACATTACAG
+AATGGCGTTATTTACGGTGT
+>test_set3_2
+TTGTCTCCATTGAGCACTCTAATCTTGCCGTGTATTCAGGAACGGAGGATAGAACTCGGACAGAATCCTGAATATTACAA
+AATCGGGTTATTTACGGTGT
+>test_set3_3
+TTGTCTCCATTGAGCACTCTAATCTTGCCGTGTTTTCAGGAACGGAGGATAGAACTCGGACAGTAGCCTGAACATTACAG
+AATCCCGTTATTTACGGTGT
+>test_set3_4
+TTGTCTCCATCGAGCACTCTAATCTTGCCGTGTATTCAGGAACGGAGGATTGAACTCGGACAGTATCCTGAACATTACAG
+AATCGCGTTATTTACGGTGT
+>test_set3_5
+TTGTCTCCATTGAGCACGCTAAGCTTGCCGTGTATTCAGGAACGGAGGATAGAACTCGGACAGTATCCTGAACATTACAG
+AATCGCGTTATTTACGGTGT
+>test_set3_6
+TTGTCGTCATTGAGCACTCTAATCTTGCCGTGTATTCAGGAACGAAGGATAGAACTCGGACAGTATCCTGAACTTTGCAA
+AATCGCGTTATTTACGGTGT
+>test_set3_7
+TTGTCTCCATTGAGCACTCTAATCTAGCCGTGTAGTCAGGAACGGAGGATGGAACGCGCACAGTATCCTGAACATAACAG
+AATCGCGTTATTTACGGTGT
+>test_set3_8
+TTGTCTCCATTGAGCACTCTAATCTTGCCGTATATTCCCGAACGGAGGATAGAACTCGGACAGTAGCCTGAACAGTACAG
+AATCGCGTTATTTACGGTGT
+>test_set3_9
+TTGTCTCCCTTGAGCACTCTAATCTTGCCGTGTATTCAGGAACGGAGGATAGAACTCGGACAGTATCCTGAACATTACAG
+AATCGCGTTATTTACGGTGT
+>test_set4_0
+CTTTTACCGGGCTGCCCGAGAGCACTATCTGCGTCGTGCCCTGCTTCGATGCCCACACTACCATCATACTATTCGTGAAT
+TTGCGGCCGCTAAGATCCGA
+>test_set4_1
+CTTTTATCGGGGTGCCTGATAGCACCATCTGCGTCGTGCCCTGCTTCGATGCCTAAACCACCGTCATGCTATTTGTGAAT
+TTGAGGTCGCTAAGAGCCCA
+>test_set4_2
+CTTTTATCGGGGTGCCCGAGAGCACCATCTGCGTCGTGCCCTGCTTCGATGCCCAGGCCACCATCATACTATTTGTGGCT
+TAGGGGTCGCTAAGAGCCGA
+>test_set4_3
+CTTTTATCGGGGGGCCCGAGAGCACCACCTGCGTCGTGCCCTGCTTCGATGCCCAAACCACCATCATACTATTTGTGAAT
+TTGGGGTCGCTAAGAGCCGA
+>test_set4_4
+CTTTTATAGGGGTGCCCGAGAGCACCATCTGCGTCGTGCCCAGCTTCGATTTCCAAACCACCATCATACTATTTGTGAAC
+TTGGGGACGTTAAGAGCCGA
+>test_set4_5
+CTTTTCGCGGGGTGCCCGAGAGCACCATCTGCGTCGCGCCCTGCTTCGGTGCCCATACCACCATCATAATATTTGGGAAA
+TTGGGATCGCTAAGAGTCGA
+>test_set4_6
+CTTTTCTCGGGGTGCCCGAGAGCCCCATCTGCGTTGTGCCCTGCTACTATGCCCAAACCACCATCATACTATTTGTGAAT
+GTGGCGTCGCTCAGAGCCGA
+>test_set4_7
+CTTTTATCGGGGTGCCCGAGAGCACCATCTGCGTCGTGCCCTGCTTCGATGCCCACGTCACCATACTACTATTTGTGAAT
+TTGGGGTCGCTAATAGCCGA
+>test_set4_8
+CTTTTATCGGGGGGCCCGAGAGCATCATCTGCGTCGTGCCCTGCTTCGATGCCCAAACTACCATCATACTATTTGTGAAT
+TTGGGGTTTCTAAGAGCCGA
+>test_set4_9
+CTTTTACCGGGGTGACCGAGAGCACCATCTGCGCCGTGCCCTGCTTCGAGGCCCAAACCACCATCATACTGTTTGTGAAT
+CAGGGGTTGCTAAGAGCCGA"""
+
+exp_tree = """(test_set1_3:0.02062,(test_set1_8:0.05983,test_set1_9:0.07093)0.652:0.00422,((test_set1_5:0.04140,test_set1_7:0.03208)0.634:0.00995,((test_set1_0:0.04748,(test_set1_1:0.07025,(test_set1_2:-0.00367,((((((test_set3_4:0.01485,test_set3_7:0.05863)0.862:0.00569,(test_set3_5:0.02048,(test_set3_3:0.02036,test_set3_8:0.04218)0.724:0.01088)0.397:0.00005)0.519:0.00018,((test_set3_0:0.03139,test_set3_1:0.06448)0.699:0.01095,test_set3_9:0.00940)0.505:0.00036)0.721:0.01080,test_set3_6:0 [...]
+
+if __name__ == '__main__':
+ main()
diff --git a/bfillings/tests/test_formatdb.py b/bfillings/tests/test_formatdb.py
new file mode 100755
index 0000000..168c8d4
--- /dev/null
+++ b/bfillings/tests/test_formatdb.py
@@ -0,0 +1,233 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+""" Description
+File created on 16 Sep 2009.
+
+"""
+from __future__ import division
+from os.path import split, exists
+from unittest import TestCase, main
+
+from skbio.util import remove_files
+
+from cogent import LoadSeqs
+from cogent.app.util import get_tmp_filename
+
+from bfillings.blast import blastn
+from bfillings.formatdb import (FormatDb, build_blast_db_from_seqs,
+ build_blast_db_from_fasta_path,
+ build_blast_db_from_fasta_file)
+
+
+class FormatDbTests(TestCase):
+
+ def setUp(self):
+ self.in_seqs1_fp =\
+ get_tmp_filename(prefix='FormatDbTests',suffix='.fasta')
+ self.in_seqs1_file = open(self.in_seqs1_fp,'w')
+ self.in_seqs1_file.write(in_seqs1)
+ self.in_seqs1_file.close()
+ self.in_seqs1 = LoadSeqs(self.in_seqs1_fp,aligned=False)
+ self.test_seq = test_seq
+
+ self.in_aln1_fp =\
+ get_tmp_filename(prefix='FormatDbTests',suffix='.fasta')
+ self.in_aln1_file = open(self.in_aln1_fp,'w')
+ self.in_aln1_file.write(in_aln1)
+ self.in_aln1_file.close()
+ self.in_aln1 = LoadSeqs(self.in_aln1_fp)
+
+
+ self.files_to_remove = [self.in_seqs1_fp,self.in_aln1_fp]
+
+ def tearDown(self):
+ remove_files(self.files_to_remove)
+
+ def test_call(self):
+ """FormatDb: Calling on a nucleotide data functions as expected
+ """
+ fdb = FormatDb(WorkingDir='/tmp')
+ result = fdb(self.in_seqs1_fp)
+
+ # test successful run
+ self.assertEqual(result['ExitStatus'],0)
+
+ expected_result_keys = set(\
+ ['log','nhr','nin','nsd','nsi','nsq','ExitStatus','StdOut','StdErr'])
+ self.assertEqual(set(result.keys()),expected_result_keys)
+
+ inputfile_basename = split(self.in_seqs1_fp)[1]
+ # got all the expected out files, and filepaths are as expected
+ outpaths = []
+ for ext in ['log','nhr','nin','nsd','nsi','nsq']:
+ outpath = '/tmp/%s.%s' % (inputfile_basename,ext)
+ outpaths.append(outpath)
+ self.assertEqual(result[ext].name,outpath)
+ result.cleanUp()
+
+ # all created files are cleaned up
+ for outpath in outpaths:
+ self.assertFalse(exists(outpath),\
+ "%s was not cleaned up." % outpath)
+
+ def test_blast_against_new_db(self):
+ """Formatdb: blastall against a newly created DB functions as expected
+ """
+ fdb = FormatDb(WorkingDir='/tmp')
+ result = fdb(self.in_seqs1_fp)
+ blast_res = blastn(self.test_seq,blast_db=self.in_seqs1_fp)
+ result.cleanUp()
+
+ # Test that a blast result was returned
+ self.assertTrue('s1' in blast_res,\
+ "Not getting any blast results.")
+ # Test that the sequence we expect was a good blast hit
+ subject_ids = [r['SUBJECT ID'] for r in blast_res['s1'][0]]
+ self.assertTrue('11472384' in subject_ids,\
+ "Not getting expected blast results.")
+
+ def test_build_blast_db_from_seqs(self):
+ """build_blast_db_from_seqs convenience function works as expected
+ """
+ blast_db, db_files = build_blast_db_from_seqs(self.in_seqs1,output_dir='/tmp')
+ self.assertTrue(blast_db.startswith('/tmp/Blast_tmp_db'))
+ self.assertTrue(blast_db.endswith('.fasta'))
+ expected_db_files = set([blast_db + ext\
+ for ext in ['.nhr','.nin','.nsq','.nsd','.nsi','.log']])
+ self.assertEqual(set(db_files),expected_db_files)
+
+ # result returned when blasting against new db
+ self.assertEqual(\
+ len(blastn(self.test_seq,blast_db=blast_db)),1)
+
+ # Make sure all db_files exist
+ for fp in db_files:
+ self.assertTrue(exists(fp))
+
+ # Remove all db_files
+ remove_files(db_files)
+
+ # Make sure nothing weird happened in the remove
+ for fp in db_files:
+ self.assertFalse(exists(fp))
+
+ def test_build_blast_db_from_fasta_path(self):
+ """build_blast_db_from_fasta_path convenience function works as expected
+ """
+ blast_db, db_files = \
+ build_blast_db_from_fasta_path(self.in_seqs1_fp)
+ self.assertEqual(blast_db,self.in_seqs1_fp)
+ expected_db_files = set([self.in_seqs1_fp + ext\
+ for ext in ['.nhr','.nin','.nsq','.nsd','.nsi','.log']])
+ self.assertEqual(set(db_files),expected_db_files)
+
+ # result returned when blasting against new db
+ self.assertEqual(\
+ len(blastn(self.test_seq,blast_db=blast_db)),1)
+
+ # Make sure all db_files exist
+ for fp in db_files:
+ self.assertTrue(exists(fp))
+
+ # Remove all db_files
+ remove_files(db_files)
+
+ # Make sure nothing weird happened in the remove
+ for fp in db_files:
+ self.assertFalse(exists(fp))
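+
+ # Contract shared by the build_blast_db_* helpers, as exercised in
+ # these tests: each returns (blast_db, db_files), where blast_db is the
+ # path to hand to blastn(..., blast_db=...) and db_files lists the
+ # formatdb artifacts (.nhr/.nin/.nsq/.nsd/.nsi/.log) the caller must
+ # remove (the file-based variant also includes the copied fasta itself).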
+
+ def test_build_blast_db_from_fasta_path_aln(self):
+ """build_blast_db_from_fasta_path works with alignment as input
+ """
+ blast_db, db_files = build_blast_db_from_fasta_path(self.in_aln1_fp)
+ self.assertEqual(blast_db,self.in_aln1_fp)
+ expected_db_files = set([blast_db + ext\
+ for ext in ['.nhr','.nin','.nsq','.nsd','.nsi','.log']])
+ self.assertEqual(set(db_files),expected_db_files)
+ # result returned when blasting against new db
+ self.assertEqual(\
+ len(blastn(self.test_seq,blast_db=blast_db,e_value=0.0)),1)
+
+ # Make sure all db_files exist
+ for fp in db_files:
+ self.assertTrue(exists(fp))
+
+ # Remove all db_files
+ remove_files(db_files)
+
+ # Make sure nothing weird happened in the remove
+ for fp in db_files:
+ self.assertFalse(exists(fp))
+
+ def test_build_blast_db_from_fasta_file(self):
+ """build_blast_db_from_fasta_file works with open files as input
+ """
+ blast_db, db_files = \
+ build_blast_db_from_fasta_file(open(self.in_aln1_fp),output_dir='/tmp/')
+ self.assertTrue(blast_db.startswith('/tmp/BLAST_temp_db'))
+ self.assertTrue(blast_db.endswith('.fasta'))
+ expected_db_files = set([blast_db] + [blast_db + ext\
+ for ext in ['.nhr','.nin','.nsq','.nsd','.nsi','.log']])
+ self.assertEqual(set(db_files),expected_db_files)
+ # result returned when blasting against new db
+ self.assertEqual(\
+ len(blastn(self.test_seq,blast_db=blast_db,e_value=0.0)),1)
+
+ # Make sure all db_files exist
+ for fp in db_files:
+ self.assertTrue(exists(fp))
+
+ # Remove all db_files
+ remove_files(db_files)
+
+ # Make sure nothing weird happened in the remove
+ for fp in db_files:
+ self.assertFalse(exists(fp))
+
+
+in_seqs1 = """>11472286
+GATGAACGCTGGCGGCATGCTTAACACATGCAAGTCGAACGGAACACTTTGTGTTTTGAGTTAATAGTTCGATAGTAGATAGTAAATAGTGAACACTATGAACTAGTAAACTATTTAACTAGAAACTCTTAAACGCAGAGCGTTTAGTGGCGAACGGGTGAGTAATACATTGGTATCTACCTCGGAGAAGGACATAGCCTGCCGAAAGGTGGGGTAATTTCCTATAGTCCCCGCACATATTTGTTCTTAAATCTGTTAAAATGATTATATGTTTTATGTTTATTTGATAAAAAGCAGCAAGACAAATGAGTTTTATATTGGTTATACAGCAGATTTAAAAAATAGAATTAGGTCTCATAATCAGGGAGAAAACAAATCAACTAAATCTAAAATACCTTGGGAATTGGTTTACTATGAAGCCTACAAAAACCAAACATCAGCAAGGGTTAGAGAATCAAAGTTGAAACATTATGGGCAATCATTAACTAGACT [...]
+>11472384
+AGAGTTTGATCCTGGCTCAGATTGAACGCTGGCGGCATGCCTTACACATGCAAGTCGAACGGCAGCACGGGGGCAACCCTGGTGGCGAGTGGCGAACGGGTGAGTAATACATCGGAACGTGTCCTGTAGTGGGGGATAGCCCGGCGAAAGCCGGATTAATACCGCATACGCTCTACGGAGGAAAGGGGGGGATCTTAGGACCTCCCGCTACAGGGGCGGCCGATGGCAGATTAGCTAGTTGGTGGGGTAAAGGCCTACCAAGGCGACGATCTGTAGCTGGTCTGAGAGGACGACCAGCCACACTGGGACTGAGACACGGCCCAGACTCCTACGGGAGGCAGCAGTGGGGAATTTTGGACAATGGGGGCAACCCTGATCCAGCAATGCCGCGTGTGTGAAGAAGGCCTTCGGGTTGTAAAGCACTTTTGTCCGGAAAGAAAACGCCGTGGTTAATACCCGTGGCGGATGACGGTACCGGAAGAATAAGCACCG [...]
+>11468680
+TAAACTGAAGAGTTTGATCCTGGCTCAGATTGAACGCTGGCGGCATGCCTTACACATGCAAGTCGAACGGCAGCACGGGTGCTTGCACCTGGTGGCGAGTGGCGAACGGGTGAGTAATACATCGGAACATGTCCTGTAGTGGGGGATAGCCCGGCGAAAGCCGGATTAATACCGCATACGATCTACGGATGAAAGCGGGGGACCTTCGGGCCTCGCGCTATAGGGTTGGCCGATGGCTGATTAGCTAGTTGGTGGGGTAAAGGCCTACCAAGGCGACGATCAGTAGCTGGTCTGAGAGGACGACCAGCCACACTGGGACTGAGACACGGCCCAGACTCCTACGGGAGGCAGCAGTGGGGAATTTTGGACAATGGGCGAAAGCCTGATCCAGCAATGCCGCGTGTGTGAAGAAGGCCTTCGGGTTGTAAAGCACTTTTGTCCGGAAAGAAATCCTTGGCTCTAATACAGTCGGGGGATGACGGTACCGGAAGA [...]
+>11458037
+GACGAACGCTGGCGGCGTGCCTAACACATGCAAGTCGAACGGTTTCGAAGATCGGACTTCGAATTTCGAATTTCGATCATCGAGATAGTGGCGGACGGGTGAGTAACGCGTGGGTAACCTACCCATAAAGCCGGGACAACCCTTGGAAACGAGGGCTAATACCGGATAAGCTTGAGAAGTGGCATCACTTTTTAAGGAAAGGTGGCCGATGAGAATGCTGCCGATTATGGATGGACCCGCGTCTGATTAGCTGGTTGGTGGGGTAAAGGCCTACCAAGGCGACGATCAGTAGCCGGCCTGAGAGGGTGAACGGCCACACTGGGACTGAGACACGGCCCAGACTCCTACGGGAGGCAGCAGTGGGGAATCTTCCGCAATGGACGAAAGTCTGACGGAGCAACGCCGCGTGTATGATGAAGGTTTTCGGATTGTAAAGTACTGTCTATGGGGAAGAATGGTGTGCTTGAGAATATTAAGTACAAATGACGGTAC [...]
+>11469739
+AGAGTTTGATCCTGGCTCAGGATGAACGCTGGCGGCGTGCCTAACACATGCAAGTCGAACGAGAAGCTAACTTCTGATTCCTTCGGGATGATGAGGTTAGCAGAAAGTGGCGAACGGGTGAGTAACGCGTGGGTAATCTACCCTGTAAGTGGGGGATAACCCTCCGAAAGGAGGGCTAATACCGCATAATATCTTTATCCCAAAAGAGGTAAAGATTAAAGATGGCCTCTATACTATGCTATCGCTTCAGGATGAGTCCGCGTCCTATTAGTTAGTTGGTGGGGTAATGGCCTACCAAGACGACAATGGGTAGCCGGTCTGAGAGGATGTACGGCCACACTGGGACTGAGATACGGCCCAGACTCCTACGGGAGACAGCAGTGGGGAATATTGCGCAATGGGGGAAACCCTGACGCAGCGACGCCGCGTGGATGATGAAGGCCCTTGGGTTGTAAAATCCTGTTCTGGGGGAAGAAAGCTTAAAGGTCCAAT [...]
+>11469752
+AGAGTTTGATCCTGGCTCAGGATGAACGCTGGCGGCGTGCCTAATACATGCAAGTCGAGCGGCAGCGAGTTCCTCACCGAGGTTCGGAACAGTTGACAGTAAACAGTTGACAGTAAACAGTAACTTCAGAAATGAAGCGGACTGTGAACTGTTTACTGTAACCTGTTAGCTATTATTTCGAGCTTTAGTGAGGAATGTCGGCGAGCGGCGGACGGCTGAGTAACGCGTAGGAACGTACCCCAAACTGAGGGATAAGCACCAGAAATGGTGTCTAATACCGCATATGGCCCAGCACCTTTTTTAATCAACCACGACCCTAAAATCGTGAATAATTGGTAGGAAAAGGTGTTGGGTTAAAGCTTCGGCGGTTTGGGAACGGCCTGCGTATGATTAGCTTGTTGGTGAGGTAAAAGCTCACCAAGGCGACGATCATTAGCTGGTCTGAGAGGATGATCAGCCAGACTGGGACTGAGACACGGCCCAGACTCCTAC [...]
+>11460523
+AGAGTTTGATCCTGGCTCAGAACGAACGCTGGCGGCGTGCTTAACACATGCAAGTCGAACGCGAAATCGGGCACTCAATTTTGCTTTTCAAACATTAACTGATGAAACGACCAGAGAGATTGTTCCAGTTTAAAGAGTGAAAAGCAGGCTTGAGTGCCTGAGAGTAGAGTGGCGCACGGGTGAGTAACGCGTAAATAATCTACCCCTGCATCTGGGATAACCCACCGAAAGGTGAGCTAATACCGGATACGTTCTTTTAACCGCGAGGTTTTAAGAAGAAAGGTGGCCTCTGATATAAGCTACTGTGCGGGGAGGAGTTTGCGTACCATTAGCTAGTTGGTAGGGTAATGGCCTACCAAGGCATCGATGGTTAGCGGGTCTGAGAGGATGATCCGCCACACTGGAACTGGAACACGGACCAGACTCCTACGGGAGGCAGCAGTGAGGAATATTGCGCAATGGGGGCAACCCTGACGCAGCGACGCCGCGTGG [...]
+>11460543
+TGGTTTGATCCTGGCTCAGGACAAACGCTGGCGGCGTGCCTAACACATGCAAGTCGAACGAGAAGCCAGCTTTTGATTCCTTCGGGATGAGAAAGCAGGTAGAAAGTGGCGAACGGGTGAGTAACGCGTGGGTAATCTACCCTGTAAGTAGGGGATAACCCTCTGAAAAGAGGGCTAATACCGCATAATATCTTTACCCCATAAGAAGTAAAGATTAAAGATGGCCTCTGTATATGCTATCGCTTCAGGATGAGCCCGCGTCCTATTAGTTAGTTGGTAAGGTAATGGCTTACCAAGACCACGATGGGTAGCCGGTCTGAGAGGATGTACGGCCACACTGGGACTGAGATACGGCCCAGACTCCTACGGGAGGCAGCAGTGGGGAATATTGCGCAATGGGGGAAACCCTGACGCAGCGACGCCGCGTGGATGATGAAGGCCTTCGGGTTGTAAAATCCTGTTTTGGGGGACGAAACCTTAAGGGTCCAATAA [...]
+>11480235
+TGGTTTGATCCTGGCTCAGGATTAACGCTGGCGGCGCGCCTTATACATGCAAGTCGAACGAGCCTTGTGCTTCGCACAAGGAAATTCCAAGCACCAAGCACCAAATCTCAAACAAATCCCAATGACCAAAATTCCAAAAACCTAAACATTTTAAATGTTTAGAATTTGGAAAATTGGAATTTGGAATTTATTTGTTATTTGGAATTTATGATTTGGGATTTTCTCGCGCGGAGANCNTNAGTGGCGAACGGGTGAGTAATACGTTGGTATCTACCCCAAAGTAGAGAATAAGCCCGAGAAATCGGGGTTAATACTCTATGTGTTCGAAAGAACAAAGACTTCGGTTGCTTTGGGAAGAACCTGCGGCCTATCAGCTTGTTGGTAAGGTAACGGCTTACCAAGGCTTTGACGGGTAGCTGGTCTGGGAAGACGACCAGCCACAATGGGACTTAGACACGGCCCATACTCCTACGGGAGGCAGCAGTAGGGAAT [...]
+>11480408
+AATTTAGCGGCCGCGAATTCGCCCTTGAGTTTGATCCTGGCTCAGGACGAACGCTGGCGGCGTGCTTAACACATGCAAGTCGAACGGGGATATCCGAGCGGAAGGTTTCGGCCGGAAGGTTGGGTATTCGAGTGGCGGACGGGTGAGTAACGCGTGAGCAATCTGTCCCGGACAGGGGGATAACACTTGGAAACAGGTGCTAATACCGCATAAGACCACAGCATCGCATGGTGCAGGGGTAAAAGGAGCGATCCGGTCTGGGGTGAGCTCGCGTCCGATTAGATAGTTGGTGAGGTAACGGCCCACCAAGTCAACGATCGGTAGCCGACCTGAGAGGGTGATCGGCCACATTGGAACTGAGAGACGGTCCAAACTCCTACGGGAGGCAGCAGTGGGGAATATTGGGCAATGGGCGAAAGCCTGACCCAGCAACGCCGCGTGAGTGAAGAAGGCCTTCGGGTTGTAAAGCTCTGTTATGCGAGACGAAGGAAG [...]
+"""
+
+test_seq = """>s1 (11472384)
+AGAGTTTGATCCTGGCTCAGATTGAACGCTGGCGGCATGCCTTACACATGCAAGTCGAACGGCAGCACGGGGGCAACCCTGGTGGCGAGTGGCGAACGGGTGAGTAATACATCGGAACGTGTCCTGTAGTGGGGGATAGCCCGGCGAAAGCCGGATTAATACCGCATACGCTCTACGGAGGAAAGGGGGGGATCTTAGGACCTCCCGCTACAGGGGCGGCCGATGGCAGATTAGCTAGTTGGTGGGGTAAAGGCCTACCAAGGCGACGATCTGTAGCTGGTCTGAGAGGACGACCAGCCACACTGGGACTGAGACACGGCCCAGACTCCTACGGGAGGCAGCAGTGGGGAATTTTGGACAATGGGGGCAACCCTGATCCAGCAATGCCGCGTGTGTGAAGAAGGCCTTCGGGTTGTAAAGCACTTTTGTCCGGAAAGAAAACGCCGTGGTTAATACCCGTGGCGGATGACGGTACCGGAAGAATAAGCACCG [...]
+"""
+
+in_aln1 = """>a1
+AAACCTTT----TTTTAAATTCCGAAGAGTTTGATCCTGGCTCAGATTGAACGCTGGCGGCATGCCTTACACATGCAAGTCGAACGGCAGCACGGGGGCAACCCTGGTGGCGAGTGGCGAACGGGTGAGTAATACATCGGAACGTGTCCTGTAGTGGGGGATAGCCCGGCGAAAGCCGGATTAATACCGCATACGCTCTACGGAGGAAAGGGGGGGATCTTAGGACCTCCCGCTACAGGGGCGGCCGATGGCAGATTAGCTAGTTGGTGGGGTAAAGGCCTACCAAGGCGACGATCTGTAGCTGGTCTGAGAGGACGACCAGCCACACTGGGACTGAGACACGGCCCAGACTCCTACGGGAGGCAGCAGTGGGGAATTTTGGACAATGGGGGCAACCCTGATCCAGCAATGCCGCGTGTGTGAAGAAGGCCTTC
+>a2
+AAACCTTT----TTTTAAATTCCGCAGAGTTTGATCCTGGCTCAGATTGAACGCTGGCGGCATGCCTTACACATGCAAGTCGAACGGCAGCACGGGGGCAACCCTGGTGGCGAGTGGCGAACGGGTGAGTAATACATCGGAACGTGTCCTGTAGTGGGGGATAGCCCGGCGAAAGCCGGATTAATACCGCATACGCTCTACGGAGGAAAGGGGGGGATCTTAGGACCTCCCGCTACAGGGGCGGCCGATGGCAGATTAGCTAGTTGGTGGGGTAAAGGCCTACCAAGGCGACGATCTGTAGCTGGTCTGAGAGGACGACCAGCCACACTGGGACTGAGACACGGCCCAGACTCCTACGGGAGGCAGCAGTGGGGAATTTTGGACAATGGGGGCAACCCTGATCCAGCAATGCCGCGTGTGTGAAGAAGGCCTTC
+>a3
+AAACCTTT----TTTTAAATTCCGGAGAGTTTGATCCTGGCTCAGATTGAACGCTGGCGGCATGCCTTACACATGCAAGTCGAACGGCAGCACGGGGGCAACCCTGGTGGCGAGTGGCGAACGGGTGAGTAATACATCGGAACGTGTCCTGTAGTGGGGGATAGCCCGGCGAAAGCCGGATTAATACCGCATACGCTCTACGGAGGAAAGGGGGGGATCTTAGGACCTCCCGCTACAGGGGCGGCCGATGGCAGATTAGCTAGTTGGTGGGGTAAAGGCCTACCAAGGCGACGATCTGTAGCTGGTCTGAGAGGACGACCAGCCACACTGGGACTGAGACACGGCCCAGACTCCTACGGGAGGCAGCAGTGGGGAATTTTGGACAATGGGGGCAACCCTGATCCAGCAATGCCGCGTGTGTGAAGAAGGCCTTC
+>a4
+AAACCTTT----TTTTAAATTCCGTAGAGTTTGATCCTGGCTCAGATTGAACGCTGGCGGCATGCCTTACACATGCAAGTCGAACGGCAGCACGGGGGCAACCCTGGTGGCGAGTGGCGAACGGGTGAGTAATACATCGGAACGTGTCCTGTAGTGGGGGATAGCCCGGCGAAAGCCGGATTAATACCGCATACGCTCTACGGAGGAAAGGGGGGGATCTTAGGACCTCCCGCTACAGGGGCGGCCGATGGCAGATTAGCTAGTTGGTGGGGTAAAGGCCTACCAAGGCGACGATCTGTAGCTGGTCTGAGAGGACGACCAGCCACACTGGGACTGAGACACGGCCCAGACTCCTACGGGAGGCAGCAGTGGGGAATTTTGGACAATGGGGGCAACCCTGATCCAGCAATGCCGCGTGTGTGAAGAAGGCCTTC
+"""
+
+
+if __name__ == "__main__":
+ main()
diff --git a/bfillings/tests/test_infernal.py b/bfillings/tests/test_infernal.py
new file mode 100644
index 0000000..bc8d849
--- /dev/null
+++ b/bfillings/tests/test_infernal.py
@@ -0,0 +1,620 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+
+from os import getcwd, remove, rmdir, mkdir, path
+import tempfile
+import shutil
+from unittest import TestCase, main
+
+from cogent.util.misc import flatten
+from cogent.core.moltype import DNA, RNA, PROTEIN
+from cogent.core.alignment import DataError
+from cogent.parse.rfam import (MinimalRfamParser, ChangedRnaSequence,
+ ChangedSequence)
+from cogent.format.stockholm import stockholm_from_alignment
+from cogent.struct.rna2d import ViennaStructure, wuss_to_vienna
+
+from bfillings.infernal import (Cmalign, Cmbuild, Cmcalibrate, Cmemit, Cmscore,
+ Cmsearch, Cmstat, cmbuild_from_alignment,
+ cmbuild_from_file, cmalign_from_alignment,
+ cmalign_from_file, cmsearch_from_alignment,
+ cmsearch_from_file)
+
+
+class GeneralSetUp(TestCase):
+
+ def setUp(self):
+ """Infernal general setUp method for all tests"""
+ self.seqs1_unaligned = {'1':'ACUGCUAGCUAGUAGCGUACGUA',\
+ '2':'GCUACGUAGCUAC',\
+ '3':'GCGGCUAUUAGAUCGUA'}
+ self.struct1_unaligned_string = '....(((...)))....'
+ self.seqs1_unaligned_gaps = {'1':'ACUGCUAGCUAGU-AGCGUAC--GUA',\
+ '2':'--GCUACGUAGCUAC',\
+ '3':'GCGGCUAUUAGAUCGUA--'}
+
+
+
+ self.seqs2_aligned = {'a': 'UAGGCUCUGAUAUAAUAGCUCUC---------',\
+ 'c': '------------UGACUACGCAU---------',\
+ 'b': '----UAUCGCUUCGACGAUUCUCUGAUAGAGA'}
+
+ self.seqs2_unaligned = {'a': 'UAGGCUCUGAUAUAAUAGCUCUC',\
+ 'c': 'UGACUACGCAU',\
+ 'b': 'UAUCGCUUCGACGAUUCUCUGAUAGAGA'}
+
+ self.struct2_aligned_string = '............((.(...)))..........'
+ self.struct2_aligned_dict = {'SS_cons':self.struct2_aligned_string}
+
+ self.lines2 = stockholm_from_alignment(aln=self.seqs2_aligned,\
+ GC_annotation=self.struct2_aligned_dict)
+
+ #self.seqs1 aligned to self.seqs2 with self.seqs2 included.
+ self.seqs1_and_seqs2_aligned = \
+ {'a': 'UAGGCUCUGAUAUAAUAGC-UCUC---------',\
+ 'b': '----UAUCGCUUCGACGAU-UCUCUGAUAGAGA',\
+ 'c': '------------UGACUAC-GCAU---------',\
+ '1': '-ACUGCUAGCUAGUAGCGUACGUA---------',\
+ '2': '----------GCUACGUAG-CUAC---------',\
+ '3': '-----GCGGCUAUUAG-AU-CGUA---------',\
+ }
+
+ self.seqs1_and_seqs2_aligned_struct_string = \
+ '............((.(....)))..........'
+
+ #self.seqs1 aligned to self.seqs2 without self.seqs2 included.
+ self.seqs1_aligned = \
+ {'1': 'ACUGCUAGCUAGUAGCGUACGUA',\
+ '2': '---------GCUACGUAG-CUAC',\
+ '3': '----GCGGCUAUUAG-AU-CGUA',\
+ }
+
+ self.seqs1_aligned_struct_string = \
+ '...........((.(....))).'
+
+ self.temp_dir = tempfile.mkdtemp()
+ self.temp_dir_spaces = '/tmp/test for infernal/'
+ try:
+ mkdir(self.temp_dir_spaces)
+ except OSError:
+ pass
+ try:
+ #create sequence files
+ f = open(path.join(self.temp_dir, 'seqs1.sto'),'w')
+ f.write(self.lines2)
+ f.close()
+ #create cm file.
+ self.cmfile = path.join(self.temp_dir, 'aln2.cm')
+ cm = open(self.cmfile,'w')
+ cm.write(ALN1_CM)
+ cm.close()
+ #create alignment file used to create cm file.
+ self.aln2_file = path.join(self.temp_dir, 'aln2.sto')
+ af = open(self.aln2_file,'w')
+ af.write(self.lines2)
+ af.close()
+ except OSError:
+ pass
+
+
+class CmalignTests(GeneralSetUp):
+ """Tests for the Cmalign application controller"""
+
+ def test_base_command(self):
+ """Infernal BaseCommand should return the correct BaseCommand"""
+ c = Cmalign()
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "',getcwd(),'/"; ','cmalign']))
+ c.Parameters['-l'].on()
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "',getcwd(),'/"; ','cmalign -l']))
+
+
+ def test_changing_working_dir(self):
+ """Infernal BaseCommand should change according to WorkingDir"""
+ c = Cmalign(WorkingDir='/tmp/cmalign_test')
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "','/tmp/cmalign_test','/"; ','cmalign']))
+ c = Cmalign()
+ c.WorkingDir = '/tmp/cmalign_test2'
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "','/tmp/cmalign_test2','/"; ','cmalign']))
+
+ #removing the dirs is proof that they were created by setting WorkingDir;
+ #if the dirs are not there, an OSError will be raised
+ rmdir('/tmp/cmalign_test')
+ rmdir('/tmp/cmalign_test2')
+
+ def test_general_cleanUp(self):
+ """Last test executed: cleans up all files initially created"""
+ # remove the tempdir and contents
+ shutil.rmtree(self.temp_dir)
+ shutil.rmtree(self.temp_dir_spaces)
+
+ def test_cmalign_from_alignment(self):
+ """cmalign_from_alignment should work as expected.
+ """
+ #Align with cmalign_from_alignment without original alignment.
+ aln, struct = cmalign_from_alignment(aln=self.seqs2_aligned,\
+ structure_string=self.struct2_aligned_string,\
+ seqs=self.seqs1_unaligned_gaps,moltype=RNA,include_aln=False)
+ #Check correct alignment
+ self.assertEqual(aln.todict(),self.seqs1_aligned)
+ #Check correct struct
+ self.assertEqual(wuss_to_vienna(str(struct)),\
+ self.seqs1_aligned_struct_string)
+
+ #should work with gapped seqs. Need to test that this case is handled,
+ # since cmalign segfaults when there are gaps in the seqs to be aligned.
+ aln, struct = cmalign_from_alignment(aln=self.seqs2_aligned,\
+ structure_string=self.struct2_aligned_string,\
+ seqs=self.seqs1_unaligned_gaps,moltype=RNA)
+ #alignment should be correct
+ self.assertEqual(aln.todict(),self.seqs1_and_seqs2_aligned)
+ #structure should be correct
+ self.assertEqual(wuss_to_vienna(str(struct)),\
+ self.seqs1_and_seqs2_aligned_struct_string)
+
+ #should work with ungapped seqs.
+ aln, struct = cmalign_from_alignment(aln=self.seqs2_aligned,\
+ structure_string=self.struct2_aligned_string,\
+ seqs=self.seqs1_unaligned,moltype=RNA)
+ #alignment should be correct
+ self.assertEqual(aln.todict(),self.seqs1_and_seqs2_aligned)
+ #structure should be correct
+ self.assertEqual(wuss_to_vienna(str(struct)),\
+ self.seqs1_and_seqs2_aligned_struct_string)
+
+ #should return standard out
+ aln, struct,stdout = cmalign_from_alignment(aln=self.seqs2_aligned,\
+ structure_string=self.struct2_aligned_string,\
+ seqs=self.seqs1_unaligned_gaps,moltype=RNA,\
+ return_stdout=True)
+ #Test that standard out is same length as expected
+ self.assertEqual(len(stdout.split('\n')),\
+ len(CMALIGN_STDOUT.split('\n')))
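+
+ # Keyword summary for cmalign_from_alignment as used above (names taken
+ # from these calls; an illustration, not the full API): aln and
+ # structure_string give the template, seqs the sequences to align,
+ # include_aln=False drops the template from the output, and
+ # return_stdout=True additionally returns cmalign's stdout.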
+
+ def test_cmalign_from_file(self):
+ """cmalign_from_file should work as expected.
+ """
+ #Align with cmalign_from_file without original alignment.
+ aln,struct = cmalign_from_file(cm_file_path=self.cmfile,\
+ seqs=self.seqs1_unaligned,\
+ moltype=RNA)
+ #Check correct alignment
+ self.assertEqual(aln.todict(),self.seqs1_aligned)
+ #Check correct struct
+ self.assertEqual(wuss_to_vienna(str(struct)),\
+ self.seqs1_aligned_struct_string)
+
+ #Align with cmalign_from_file using original alignment.
+ aln,struct = cmalign_from_file(cm_file_path=self.cmfile,\
+ seqs=self.seqs1_unaligned,\
+ moltype=RNA,\
+ alignment_file_path=self.aln2_file,\
+ include_aln=True)
+ #alignment should be correct
+ self.assertEqual(aln.todict(),self.seqs1_and_seqs2_aligned)
+ #structure should be correct
+ self.assertEqual(wuss_to_vienna(str(struct)),\
+ self.seqs1_and_seqs2_aligned_struct_string)
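+
+ # cmalign_from_file mirrors the alignment variant but starts from a CM
+ # file on disk; passing alignment_file_path together with
+ # include_aln=True merges the original alignment into the result, as
+ # the second call above shows.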
+
+
+class CmbuildTests(GeneralSetUp):
+ """Tests for the Cmbuild application controller"""
+
+ def test_base_command(self):
+ """Infernal BaseCommand should return the correct BaseCommand"""
+ c = Cmbuild()
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "',getcwd(),'/"; ','cmbuild']))
+ c.Parameters['-A'].on()
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "',getcwd(),'/"; ','cmbuild -A']))
+
+ def test_changing_working_dir(self):
+ """Infernal BaseCommand should change according to WorkingDir"""
+ c = Cmbuild(WorkingDir='/tmp/cmbuild_test')
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "','/tmp/cmbuild_test','/"; ','cmbuild']))
+ c = Cmbuild()
+ c.WorkingDir = '/tmp/cmbuild_test2'
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "','/tmp/cmbuild_test2','/"; ','cmbuild']))
+
+ #removing the dirs is proof that they were created by setting WorkingDir;
+ #if the dirs are not there, an OSError will be raised
+ rmdir('/tmp/cmbuild_test')
+ rmdir('/tmp/cmbuild_test2')
+
+ def test_general_cleanUp(self):
+ """Last test executed: cleans up all files initially created"""
+ # remove the tempdir and contents
+ shutil.rmtree(self.temp_dir)
+ shutil.rmtree(self.temp_dir_spaces)
+
+ def test_cmbuild_from_alignment(self):
+ """cmbuild_from_alignment should work as expected.
+ """
+ #Test unaligned seqs and unaligned struct fail.
+ #DataError should be raised when the Alignment is constructed
+ self.assertRaises(DataError,cmbuild_from_alignment,\
+ self.seqs1_unaligned,self.struct1_unaligned_string)
+
+ #Test aligned seqs and unaligned struct fail.
+ self.assertRaises(ValueError,cmbuild_from_alignment,\
+ self.seqs2_aligned,self.struct1_unaligned_string)
+
+ #Test get cm back without alignment.
+ cm_res = cmbuild_from_alignment(self.seqs2_aligned,\
+ self.struct2_aligned_string)
+ cm_lines = cm_res.split('\n')
+ ALN1_CM_lines = ALN1_CM.split('\n')
+ #Check that the same number of lines are in both CMs
+ self.assertEqual(len(cm_lines),len(ALN1_CM_lines))
+
+ #The first 13 lines are unique to the specific run. The rest of the
+ # CM should be the same, since it was built from the same data.
+ self.assertEqual(cm_lines[13:],ALN1_CM_lines[13:])
+
+ #Make sure same alignment is returned if return_alignment=True
+ cm_res, cm_aln = cmbuild_from_alignment(self.seqs2_aligned,\
+ self.struct2_aligned_string,return_alignment=True)
+ self.assertEqual(cm_aln,self.lines2)
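+
+ # cmbuild_from_alignment returns the CM file contents as a string;
+ # with return_alignment=True it returns (cm_string, stockholm_aln)
+ # instead, which is what the final assertion checks.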
+
+ def test_cmbuild_from_file(self):
+ """cmbuild_from_file should work as expected.
+ """
+ cm_res = cmbuild_from_file(self.temp_dir+'/seqs1.sto')
+ cm_lines = cm_res.split('\n')
+ ALN1_CM_lines = ALN1_CM.split('\n')
+ #Check that the same number of lines are in both CMs
+ self.assertEqual(len(cm_lines),len(ALN1_CM_lines))
+
+ #The first 13 lines are unique to the specific run. The rest of the
+ # CM should be the same, since it was built from the same data.
+ self.assertEqual(cm_lines[13:],ALN1_CM_lines[13:])
+
+ #Make sure same alignment is returned if return_alignment=True
+ cm_res, cm_aln = cmbuild_from_alignment(self.seqs2_aligned,\
+ self.struct2_aligned_string,return_alignment=True)
+ self.assertEqual(cm_aln,self.lines2)
+
+class CmcalibrateTests(GeneralSetUp):
+ """Tests for the Cmcalibrate application controller"""
+
+ def test_base_command(self):
+ """Infernal BaseCommand should return the correct BaseCommand"""
+ c = Cmcalibrate()
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "',getcwd(),'/"; ','cmcalibrate']))
+ c.Parameters['--mpi'].on()
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "',getcwd(),'/"; ','cmcalibrate --mpi']))
+
+
+ def test_changing_working_dir(self):
+ """Infernal BaseCommand should change according to WorkingDir"""
+ c = Cmcalibrate(WorkingDir='/tmp/cmcalibrate_test')
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "','/tmp/cmcalibrate_test','/"; ','cmcalibrate']))
+ c = Cmcalibrate()
+ c.WorkingDir = '/tmp/cmcalibrate_test2'
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "','/tmp/cmcalibrate_test2','/"; ','cmcalibrate']))
+
+ #removing the dirs is proof that they were created by setting WorkingDir;
+ #if the dirs are not there, an OSError will be raised
+ rmdir('/tmp/cmcalibrate_test')
+ rmdir('/tmp/cmcalibrate_test2')
+
+ def test_general_cleanUp(self):
+ """Last test executed: cleans up all files initially created"""
+ # remove the tempdir and contents
+ shutil.rmtree(self.temp_dir)
+ shutil.rmtree(self.temp_dir_spaces)
+
+class CmemitTests(GeneralSetUp):
+ """Tests for the Cmemit application controller"""
+
+ def test_base_command(self):
+ """Infernal BaseCommand should return the correct BaseCommand"""
+ c = Cmemit()
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "',getcwd(),'/"; ','cmemit']))
+ c.Parameters['-u'].on()
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "',getcwd(),'/"; ','cmemit -u']))
+
+
+ def test_changing_working_dir(self):
+ """Infernal BaseCommand should change according to WorkingDir"""
+ c = Cmemit(WorkingDir='/tmp/cmemit_test')
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "','/tmp/cmemit_test','/"; ','cmemit']))
+ c = Cmemit()
+ c.WorkingDir = '/tmp/cmemit_test2'
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "','/tmp/cmemit_test2','/"; ','cmemit']))
+
+ #removing the dirs is proof that they were created by setting WorkingDir;
+ #if the dirs are not there, an OSError will be raised
+ rmdir('/tmp/cmemit_test')
+ rmdir('/tmp/cmemit_test2')
+
+ def test_general_cleanUp(self):
+ """Last test executed: cleans up all files initially created"""
+ # remove the tempdir and contents
+ shutil.rmtree(self.temp_dir)
+ shutil.rmtree(self.temp_dir_spaces)
+
+class CmscoreTests(GeneralSetUp):
+ """Tests for the Cmscore application controller"""
+
+ def test_base_command(self):
+ """Infernal BaseCommand should return the correct BaseCommand"""
+ c = Cmscore()
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "',getcwd(),'/"; ','cmscore']))
+ c.Parameters['-l'].on()
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "',getcwd(),'/"; ','cmscore -l']))
+
+
+ def test_changing_working_dir(self):
+ """Infernal BaseCommand should change according to WorkingDir"""
+ c = Cmscore(WorkingDir='/tmp/cmscore_test')
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "','/tmp/cmscore_test','/"; ','cmscore']))
+ c = Cmscore()
+ c.WorkingDir = '/tmp/cmscore_test2'
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "','/tmp/cmscore_test2','/"; ','cmscore']))
+
+ #removing the dirs is proof that they were created by setting WorkingDir;
+ #if the dirs are not there, an OSError will be raised
+ rmdir('/tmp/cmscore_test')
+ rmdir('/tmp/cmscore_test2')
+
+ def test_general_cleanUp(self):
+ """Last test executed: cleans up all files initially created"""
+ # remove the tempdir and contents
+ shutil.rmtree(self.temp_dir)
+ shutil.rmtree(self.temp_dir_spaces)
+
+
+class CmsearchTests(GeneralSetUp):
+ """Tests for the Cmsearch application controller"""
+
+ def test_base_command(self):
+ """Infernal BaseCommand should return the correct BaseCommand"""
+ c = Cmsearch()
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "',getcwd(),'/"; ','cmsearch']))
+ c.Parameters['-p'].on()
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "',getcwd(),'/"; ','cmsearch -p']))
+
+
+ def test_changing_working_dir(self):
+ """Infernal BaseCommand should change according to WorkingDir"""
+ c = Cmsearch(WorkingDir='/tmp/cmsearch_test')
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "','/tmp/cmsearch_test','/"; ','cmsearch']))
+ c = Cmsearch()
+ c.WorkingDir = '/tmp/cmsearch_test2'
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "','/tmp/cmsearch_test2','/"; ','cmsearch']))
+
+ #removing the dirs is proof that they were created by setting WorkingDir;
+ #if the dirs are not there, an OSError will be raised
+ rmdir('/tmp/cmsearch_test')
+ rmdir('/tmp/cmsearch_test2')
+
+ def test_general_cleanUp(self):
+ """Last test executed: cleans up all files initially created"""
+ # remove the tempdir and contents
+ shutil.rmtree(self.temp_dir)
+ shutil.rmtree(self.temp_dir_spaces)
+
+ def test_cmsearch_from_alignment_no_hits(self):
+ """cmsearch_from_alignment should work as expected
+ """
+ search_res = cmsearch_from_alignment(aln=self.seqs2_aligned,\
+ structure_string=self.struct2_aligned_string,\
+ seqs=self.seqs1_unaligned,moltype=RNA)
+ self.assertEqual(search_res,[])
+
+ def test_cmsearch_from_alignment(self):
+ """cmsearch_from_alignment should work as expected
+ """
+ exp_search_res = [['a', 5, 23, 1, 19, 12.85, '-', 37],\
+ ['b', 1, 19, 1, 19, 14.359999999999999, '-', 47]]
+ search_res = cmsearch_from_alignment(aln=self.seqs2_aligned,\
+ structure_string=self.struct2_aligned_string,\
+ seqs=self.seqs2_unaligned,moltype=RNA)
+ for search, exp in zip(search_res, exp_search_res):
+ self.assertEqual(search[1:],exp)
+
+ def test_cmsearch_from_file_no_hits(self):
+ """cmsearch_from_file should work as expected
+ """
+ search_res = cmsearch_from_file(cm_file_path=self.cmfile,\
+ seqs=self.seqs1_unaligned,moltype=RNA)
+ self.assertEqual(search_res,[])
+
+ def test_cmsearch_from_file(self):
+ """cmsearch_from_file should work as expected
+ """
+ exp_search_res = [['a', 5, 23, 1, 19, 12.85, '-', 37],\
+ ['b', 1, 19, 1, 19, 14.359999999999999, '-', 47]]
+ search_res = cmsearch_from_file(cm_file_path=self.cmfile,\
+ seqs=self.seqs2_unaligned,moltype=RNA)
+ for search, exp in zip(search_res, exp_search_res):
+ self.assertEqual(search[1:],exp)
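+
+ # Layout of a cmsearch hit row after the leading field is sliced off,
+ # read off the expected values above (a best-effort interpretation):
+ # target name, target start/stop, query start/stop, bit score, an
+ # E-value placeholder ('-' for an uncalibrated CM) and GC percent.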
+
+class CmstatTests(GeneralSetUp):
+ """Tests for the Cmstat application controller"""
+
+ def test_base_command(self):
+ """Infernal BaseCommand should return the correct BaseCommand"""
+ c = Cmstat()
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "',getcwd(),'/"; ','cmstat']))
+ c.Parameters['-g'].on()
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "',getcwd(),'/"; ','cmstat -g']))
+
+
+ def test_changing_working_dir(self):
+ """Infernal BaseCommand should change according to WorkingDir"""
+ c = Cmstat(WorkingDir='/tmp/cmstat_test')
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "','/tmp/cmstat_test','/"; ','cmstat']))
+ c = Cmstat()
+ c.WorkingDir = '/tmp/cmstat_test2'
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "','/tmp/cmstat_test2','/"; ','cmstat']))
+
+ #removing the dirs is proof that they were created by setting WorkingDir;
+ #if the dirs are not there, an OSError will be raised
+ rmdir('/tmp/cmstat_test')
+ rmdir('/tmp/cmstat_test2')
+
+ def test_general_cleanUp(self):
+ """Last test executed: cleans up all files initially created"""
+ # remove the tempdir and contents
+ shutil.rmtree(self.temp_dir)
+ shutil.rmtree(self.temp_dir_spaces)
+
+ALN1_CM = """INFERNAL-1 [1.0rc1]
+NAME aln1-1
+STATES 61
+NODES 18
+ALPHABET 1
+ELSELF -0.08926734
+WBETA 1e-07
+NSEQ 3
+EFFNSEQ 3.000
+CLEN 19
+BCOM cmbuild aln1.cm aln1.sto
+BDATE Sun Oct 5 18:45:35 2008
+NULL 0.000 0.000 0.000 0.000
+MODEL:
+ [ ROOT 0 ]
+ S 0 -1 0 1 4 -2.071 -2.210 -1.649 -2.140
+ IL 1 1 2 1 4 -0.556 -5.022 -1.818 -7.508 0.000 0.000 0.000 0.000
+ IR 2 2 3 2 3 -0.310 -2.439 -6.805 0.000 0.000 0.000 0.000
+ [ MATL 1 ]
+ ML 3 2 3 5 3 -8.003 -0.020 -6.657 -0.389 0.377 -1.236 0.597
+ D 4 2 3 5 3 -7.923 -3.436 -0.146
+ IL 5 5 3 5 3 -1.442 -0.798 -4.142 0.000 0.000 0.000 0.000
+ [ MATL 2 ]
+ ML 6 5 3 8 3 -8.003 -0.020 -6.657 0.711 -1.015 -1.162 0.507
+ D 7 5 3 8 3 -7.923 -3.436 -0.146
+ IL 8 8 3 8 3 -1.442 -0.798 -4.142 0.000 0.000 0.000 0.000
+ [ MATL 3 ]
+ ML 9 8 3 11 3 -8.003 -0.020 -6.657 -0.389 0.377 -1.236 0.597
+ D 10 8 3 11 3 -7.923 -3.436 -0.146
+ IL 11 11 3 11 3 -1.442 -0.798 -4.142 0.000 0.000 0.000 0.000
+ [ MATL 4 ]
+ ML 12 11 3 14 3 -8.003 -0.020 -6.657 -0.392 0.246 -1.238 0.703
+ D 13 11 3 14 3 -7.923 -3.436 -0.146
+ IL 14 14 3 14 3 -1.442 -0.798 -4.142 0.000 0.000 0.000 0.000
+ [ MATL 5 ]
+ ML 15 14 3 17 3 -8.003 -0.020 -6.657 -1.340 -2.411 1.644 -1.777
+ D 16 14 3 17 3 -7.923 -3.436 -0.146
+ IL 17 17 3 17 3 -1.442 -0.798 -4.142 0.000 0.000 0.000 0.000
+ [ MATL 6 ]
+ ML 18 17 3 20 3 -8.003 -0.020 -6.657 0.830 0.106 -1.204 -0.492
+ D 19 17 3 20 3 -7.923 -3.436 -0.146
+ IL 20 20 3 20 3 -1.442 -0.798 -4.142 0.000 0.000 0.000 0.000
+ [ MATL 7 ]
+ ML 21 20 3 23 3 -8.003 -0.020 -6.657 -1.143 -1.575 -1.925 1.560
+ D 22 20 3 23 3 -7.923 -3.436 -0.146
+ IL 23 23 3 23 3 -1.442 -0.798 -4.142 0.000 0.000 0.000 0.000
+ [ MATL 8 ]
+ ML 24 23 3 26 3 -8.391 -0.018 -6.709 0.821 -1.044 -1.178 0.385
+ D 25 23 3 26 3 -6.905 -0.258 -2.688
+ IL 26 26 3 26 3 -1.925 -0.554 -4.164 0.000 0.000 0.000 0.000
+ [ MATR 9 ]
+ MR 27 26 3 29 5 -7.411 -0.031 -7.227 -7.439 -8.330 -0.726 0.967 -1.567 0.142
+ D 28 26 3 29 5 -5.352 -0.707 -2.978 -4.409 -2.404
+ IR 29 29 3 29 5 -2.408 -0.496 -5.920 -4.087 -5.193 0.000 0.000 0.000 0.000
+ [ MATP 10 ]
+ MP 30 29 3 34 6 -9.266 -9.205 -0.019 -7.982 -8.261 -8.656 -1.570 -1.865 -1.898 0.327 -1.331 -2.318 0.651 0.994 -1.872 0.282 -2.224 -0.666 1.972 -1.608 -0.242 1.187
+ ML 31 29 3 34 6 -6.250 -6.596 -1.310 -1.005 -6.446 -3.975 0.660 -0.612 -0.293 -0.076
+ MR 32 29 3 34 6 -6.988 -5.717 -1.625 -5.695 -0.829 -3.908 0.660 -0.612 -0.293 -0.076
+ D 33 29 3 34 6 -9.049 -7.747 -3.544 -4.226 -4.244 -0.319
+ IL 34 34 5 34 6 -2.579 -2.842 -0.760 -4.497 -5.274 -4.934 0.000 0.000 0.000 0.000
+ IR 35 35 6 35 5 -2.408 -0.496 -5.920 -4.087 -5.193 0.000 0.000 0.000 0.000
+ [ MATP 11 ]
+ MP 36 35 6 40 4 -7.331 -7.538 -0.041 -5.952 -4.114 0.397 -4.664 0.815 -4.665 -4.015 -0.462 -4.315 -3.939 3.331 -3.732 -0.830 -0.398 -3.640 -1.958 -3.517
+ ML 37 35 6 40 4 -3.758 -3.940 -0.507 -2.670 0.660 -0.612 -0.293 -0.076
+ MR 38 35 6 40 4 -4.809 -3.838 -1.706 -0.766 0.660 -0.612 -0.293 -0.076
+ D 39 35 6 40 4 -4.568 -4.250 -2.265 -0.520
+ IL 40 40 5 40 4 -1.686 -2.369 -1.117 -4.855 0.000 0.000 0.000 0.000
+ IR 41 41 6 41 3 -1.442 -0.798 -4.142 0.000 0.000 0.000 0.000
+ [ MATL 12 ]
+ ML 42 41 6 44 5 -7.411 -0.031 -7.227 -7.439 -8.330 1.826 -2.947 -2.856 -2.413
+ D 43 41 6 44 5 -4.959 -0.803 -4.221 -2.596 -2.508
+ IL 44 44 3 44 5 -2.408 -0.496 -4.087 -5.920 -5.193 0.000 0.000 0.000 0.000
+ [ MATP 13 ]
+ MP 45 44 3 49 4 -7.331 -7.538 -0.041 -5.952 -1.592 -1.722 -1.807 0.471 -1.387 -2.146 1.822 0.774 -1.836 0.505 -2.076 -0.521 1.055 -1.515 -0.260 0.958
+ ML 46 44 3 49 4 -3.758 -3.940 -0.507 -2.670 0.660 -0.612 -0.293 -0.076
+ MR 47 44 3 49 4 -4.809 -3.838 -1.706 -0.766 0.660 -0.612 -0.293 -0.076
+ D 48 44 3 49 4 -4.568 -4.250 -2.265 -0.520
+ IL 49 49 5 49 4 -1.686 -2.369 -1.117 -4.855 0.000 0.000 0.000 0.000
+ IR 50 50 6 50 3 -1.442 -0.798 -4.142 0.000 0.000 0.000 0.000
+ [ MATL 14 ]
+ ML 51 50 6 53 3 -8.323 -0.016 -6.977 0.481 -1.091 -0.011 0.192
+ D 52 50 6 53 3 -6.174 -1.687 -0.566
+ IL 53 53 3 53 3 -1.442 -0.798 -4.142 0.000 0.000 0.000 0.000
+ [ MATL 15 ]
+ ML 54 53 3 56 3 -8.323 -0.016 -6.977 1.148 -1.570 -0.075 -1.007
+ D 55 53 3 56 3 -6.174 -1.687 -0.566
+ IL 56 56 3 56 3 -1.442 -0.798 -4.142 0.000 0.000 0.000 0.000
+ [ MATL 16 ]
+ ML 57 56 3 59 2 * 0.000 -0.726 0.967 -1.567 0.142
+ D 58 56 3 59 2 * 0.000
+ IL 59 59 3 59 2 -1.823 -0.479 0.000 0.000 0.000 0.000
+ [ END 17 ]
+ E 60 59 3 -1 0
+//
+"""
+
+CMALIGN_STDOUT = """# cmalign :: align sequences to an RNA CM
+# INFERNAL 1.0rc1 (June 2008)
+# Copyright 2007-2009 (C) 2008 HHMI Janelia Farm Research Campus
+# Freely distributed under the GNU General Public License (GPL)
+# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+# command: cmalign --withali aln1.sto -o all_aligned.sto aln1.cm seqs1.fasta
+# date: Sun Oct 5 22:04:30 2008
+#
+# cm name algorithm config sub bands tau
+# ------------------------- --------- ------ --- ----- ------
+# aln1-1 opt acc global no hmm 1e-07
+#
+# bit scores
+# ------------------
+# seq idx seq name len total struct avg prob elapsed
+# ------- -------- ----- -------- -------- -------- -----------
+ 1 1 23 -9.98 5.71 0.260 00:00:00.01
+ 2 2 13 -6.79 6.73 0.710 00:00:00.00
+ 3 3 17 -7.43 5.86 0.754 00:00:00.01
+
+# Alignment saved in file all_aligned.sto.
+#
+# CPU time: 0.02u 0.00s 00:00:00.02 Elapsed: 00:00:00
+"""
+
+
+if __name__ == '__main__':
+ main()
diff --git a/bfillings/tests/test_mafft.py b/bfillings/tests/test_mafft.py
new file mode 100644
index 0000000..d4ca8db
--- /dev/null
+++ b/bfillings/tests/test_mafft.py
@@ -0,0 +1,132 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+
+from os import getcwd, remove, rmdir, mkdir, path
+import tempfile
+import shutil
+from unittest import TestCase, main
+
+from cogent.core.moltype import RNA
+from cogent.util.misc import flatten
+from bfillings.mafft import (Mafft, align_unaligned_seqs, add_seqs_to_alignment,
+ align_two_alignments)
+
+
+class GeneralSetUp(TestCase):
+
+ def setUp(self):
+ """Mafft general setUp method for all tests"""
+ self.seqs1 = ['ACUGCUAGCUAGUAGCGUACGUA','GCUACGUAGCUAC',
+ 'GCGGCUAUUAGAUCGUA']
+
+ self.labels1 = ['>1','>2','>3']
+ self.lines1 = flatten(zip(self.labels1,self.seqs1))
+
+ self.aligned1 = {'1': 'acugcuagcuaguagcguacgua',\
+ '2': 'gcuacguagcuac----------',\
+ '3': 'gcggcuauuagau------cgua',\
+ }
+
+
+ self.seqs2=['UAGGCUCUGAUAUAAUAGCUCUC','UAUCGCUUCGACGAUUCUCUGAUAGAGA',
+ 'UGACUACGCAU']
+ self.labels2=['>a','>b','>c']
+ self.lines2 = flatten(zip(self.labels2,self.seqs2))
+
+ self.aligned2 = {'a': 'UAGGCUCUGAUAUAAUAGCUCUC---------',\
+ 'b': 'UA----UCGCUUCGACGAUUCUCUGAUAGAGA',\
+ 'c': 'UG------------ACUACGCAU---------',\
+ }
+
+
+ self.temp_dir = tempfile.mkdtemp()
+ self.temp_dir_spaces = '/tmp/test for mafft/'
+ try:
+ mkdir(self.temp_dir_spaces)
+ except OSError:
+ pass
+ try:
+ #create sequence files
+ f = open(path.join(self.temp_dir, 'seq1.txt'),'w')
+ f.write('\n'.join(self.lines1))
+ f.close()
+ g = open(path.join(self.temp_dir, 'seq2.txt'),'w')
+ g.write('\n'.join(self.lines2))
+ g.close()
+ except OSError:
+ pass
+
+
+class MafftTests(GeneralSetUp):
+ """Tests for the Mafft application controller"""
+
+ def test_base_command(self):
+ """Mafft BaseCommand should return the correct BaseCommand"""
+ c = Mafft()
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "',getcwd(),'/"; ','mafft']))
+ c.Parameters['--quiet'].on()
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "',getcwd(),'/"; ','mafft --quiet']))
+ c.Parameters['--globalpair'].on()
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "',getcwd(),'/"; ','mafft --globalpair --quiet']))
+ c.Parameters['--maxiterate'].on(1000)
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "',getcwd(),'/"; ',"""mafft --maxiterate 1000 --globalpair --quiet"""]))
+
+ def test_changing_working_dir(self):
+ """Mafft BaseCommand should change according to WorkingDir"""
+ c = Mafft(WorkingDir='/tmp/mafft_test')
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "','/tmp/mafft_test','/"; ','mafft']))
+ c = Mafft()
+ c.WorkingDir = '/tmp/mafft_test2'
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "','/tmp/mafft_test2','/"; ','mafft']))
+
+ #removing the dirs is proof that they were created by setting WorkingDir;
+ #if the dirs are not there, an OSError will be raised
+ rmdir('/tmp/mafft_test')
+ rmdir('/tmp/mafft_test2')
+
+ def test_general_cleanUp(self):
+ """Last test executed: cleans up all files initially created"""
+ # remove the tempdir and contents
+ shutil.rmtree(self.temp_dir)
+ shutil.rmtree(self.temp_dir_spaces)
+
+ def test_align_unaligned_seqs(self):
+ """align_unaligned_seqs should work as expected"""
+ res = align_unaligned_seqs(self.seqs1, RNA)
+ self.assertEqual(res.toFasta(), align1)
+ res = align_unaligned_seqs(self.lines2, RNA)
+ self.assertEqual(res.toFasta(), align2)
+
+ def test_add_seqs_to_alignment(self):
+ """add_seqs_to_alignment should work as expected."""
+ res = add_seqs_to_alignment(self.lines1,self.aligned2, RNA)
+ self.assertEqual(res.toFasta(), add_seqs_align)
+
+ def test_align_two_alignments(self):
+ """align_two_alignments should work as expected."""
+ res = align_two_alignments(self.aligned1, self.aligned2, RNA)
+ self.assertEqual(res.toFasta(), align_two_align)
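+
+ # The three convenience functions exercised above share one pattern
+ # (a sketch; mafft must be on the PATH):
+ #
+ # aln = align_unaligned_seqs(seqs, RNA)         # de novo alignment
+ # aln = add_seqs_to_alignment(seqs, aln2, RNA)  # add seqs to existing aln
+ # aln = align_two_alignments(aln1, aln2, RNA)   # align two alignments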
+
+align1 = ">seq_0\nACUGCUAGCUAGUAGCGUACGUA\n>seq_1\nGCUACGUAGCUAC----------\n>seq_2\nGCGGCUAUUAGAU------CGUA"
+
+align2 = ">a\nUAGGCUCUGAUAUAAUAGCUCUC---------\n>b\nUA----UCGCUUCGACGAUUCUCUGAUAGAGA\n>c\nUG------------ACUACGCAU---------"
+
+add_seqs_align = """>1\nACUGC-UAGCUAGUAGCGUACGUA--------\n>2\nGCUACGUAGCUA-----------C--------\n>3\nGCGGCUAUUAGAUCGUA---------------\n>a\nUAGGCUCUGAUAUAAUAGCUCUC---------\n>b\nUA----UCGCUUCGACGAUUCUCUGAUAGAGA\n>c\nUG------------ACUACGCAU---------"""
+
+align_two_align = """>1\nACUGCUAGCUAGUAGCGUACGUA---------\n>2\nGCUACGUAGCUAC-------------------\n>3\nGCGGCUAUUAGAU------CGUA---------\n>a\nUAGGCUCUGAUAUAAUAGCUCUC---------\n>b\nUA----UCGCUUCGACGAUUCUCUGAUAGAGA\n>c\nUG------------ACUACGCAU---------"""
+
+if __name__ == '__main__':
+ main()
diff --git a/bfillings/tests/test_mothur.py b/bfillings/tests/test_mothur.py
new file mode 100644
index 0000000..7a1cd1e
--- /dev/null
+++ b/bfillings/tests/test_mothur.py
@@ -0,0 +1,315 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+
+from __future__ import with_statement
+from cStringIO import StringIO
+from os import remove, rmdir
+import os.path
+from shutil import rmtree
+from tempfile import mkdtemp, mkstemp, NamedTemporaryFile
+from unittest import TestCase, main
+
+from bfillings.mothur import (Mothur, mothur_from_file, MothurClassifySeqs,
+ mothur_classify_file)
+
+
+__author__ = "Kyle Bittinger"
+__copyright__ = "Copyright 2007-2012, The Cogent Project"
+__credits__ = ["Kyle Bittinger", "Jose Carlos Clemente Litran"]
+__license__ = "GPL"
+__version__ = "1.5.3-dev"
+__maintainer__ = "Kyle Bittinger"
+__email__ = "kylebittinger at gmail.com"
+__status__ = "Development"
+
+
+class MothurTests(TestCase):
+ def setUp(self):
+ self.small_fasta = (
+ '>aaaaaa\nTAGGCTCTGATATAATAGCTCTC---------\n'
+ '>cccccc\n------------TGACTACGCAT---------\n'
+ '>bbbbbb\n----TATCGCTTCGACGATTCTCTGATAGAGA\n'
+ )
+ self.small_otus = (
+ 'unique\t3\taaaaaa\tcccccc\tbbbbbb\t\n'
+ '0.62\t2\taaaaaa\tbbbbbb,cccccc\t\n'
+ '0.67\t1\tbbbbbb,cccccc,aaaaaa\t\n'
+ )
+ self.small_otus_parsed = [
+ (float('0'), [['aaaaaa'], ['cccccc'], ['bbbbbb']]),
+ (float('0.62'), [['aaaaaa'], ['bbbbbb', 'cccccc']]),
+ (float('0.67'), [['bbbbbb', 'cccccc', 'aaaaaa']]),
+ ]
+ self.complement_fasta = (
+ '>a\n--AGGGGTAATAA--\n'
+ '>b\n--TTATTACCCCT--\n'
+ '>c\n-------AAAAAA--\n'
+ )
+ self.complement_otus = (
+ 'unique\t3\ta\tb\tc\t\n'
+ '0.43\t2\tc,a\tb\t\n'
+ '1.00\t1\tb,c,a\t\n'
+ )
+ self.work_dir = mkdtemp()
+
+ def tearDown(self):
+ rmtree(self.work_dir)
+
+ def test_get_help(self):
+ """Mothur.getHelp() should return help string"""
+ expected_help = (
+ 'See manual, available on the MOTHUR wiki:\n'
+ 'http://schloss.micro.umass.edu/mothur/'
+ )
+ self.assertEqual(Mothur.getHelp(), expected_help)
+
+ def test_compile_mothur_script(self):
+ """Mothur._compile_mothur_script() should return valid Mothur script"""
+ app = Mothur()
+ app._input_filename = 'test.fasta'
+ observed_script = app._compile_mothur_script()
+ expected_script = (
+ '#unique.seqs(fasta=test.fasta); '
+ 'dist.seqs(fasta=test.unique.fasta); '
+ 'read.dist(column=test.unique.dist, name=test.names); '
+ 'cluster(method=furthest)')
+ self.assertEqual(observed_script, expected_script)
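+
+ # As the expected string shows, a mothur batch script is a single
+ # '#'-prefixed line of semicolon-separated commands; the controller
+ # chains unique.seqs, dist.seqs, read.dist and cluster over filenames
+ # derived from the input fasta.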
+
+ def test_get_result_paths(self):
+ """Mothur._get_result_paths() should guess correct output paths"""
+ app = Mothur()
+ app._input_filename = 'test.fasta'
+ observed_paths = {
+ 'distance matrix': app._derive_dist_path(),
+ 'otu list': app._derive_list_path(),
+ 'rank abundance': app._derive_rank_abundance_path(),
+ 'species abundance': app._derive_species_abundance_path(),
+ 'unique names': app._derive_names_path(),
+ 'unique seqs': app._derive_unique_path(),
+ }
+ expected_paths = {
+ 'distance matrix': 'test.unique.dist',
+ 'otu list': 'test.unique.fn.list',
+ 'rank abundance': 'test.unique.fn.rabund',
+ 'species abundance': 'test.unique.fn.sabund',
+ 'unique names': 'test.names',
+ 'unique seqs': 'test.unique.fasta',
+ }
+ self.assertEqual(observed_paths, expected_paths)
+
+ def test_working_directory(self):
+ """Mothur.WorkingDir attribute should not be cast to FilePath object"""
+ app = Mothur(WorkingDir='/tmp')
+ self.assertEquals(str(app.WorkingDir), '/tmp')
+
+ def test_working_directory_used(self):
+ """Mothur input file should be created in the working dir."""
+ app = Mothur(WorkingDir=self.work_dir)
+ result = app(self.small_fasta, remove_tmp=False)
+ input_dir, _ = os.path.split(app._input_filename)
+ self.assertEqual(input_dir, self.work_dir)
+ result.cleanUp()
+
+ def test_call_with_multiline_string(self):
+ """Mothur.__call__() should return correct otu's for input as single string"""
+ app = Mothur()
+ result = app(self.small_fasta)
+ observed_otus = result['otu list'].read()
+ self.assertEquals(observed_otus, self.small_otus)
+ result.cleanUp()
+
+ def test_call_with_lines(self):
+ """Mothur.__call__() should return correct otu's for input as lines"""
+ lines = self.small_fasta.split('\n')
+ app = Mothur(InputHandler='_input_as_lines')
+ result = app(lines)
+ observed_otus = result['otu list'].read()
+ self.assertEquals(observed_otus, self.small_otus)
+ result.cleanUp()
+
+ def test_call_with_path(self):
+ """Mothur.__call__() should return correct otu's for input as path"""
+ working_dir = mkdtemp()
+ _, filename = mkstemp(dir=working_dir, suffix='.fasta')
+ with open(filename, 'w') as f:
+ f.write(self.small_fasta)
+ app = Mothur(InputHandler='_input_as_path', WorkingDir=working_dir)
+ result = app(filename)
+ observed_otus = result['otu list'].read()
+ self.assertEquals(observed_otus, self.small_otus)
+ remove(filename)
+ result.cleanUp()
+ rmdir(working_dir)
+
+ def test_call_with_working_dir(self):
+ """Mothur.__call__() should return correct otu's when input dir is changed"""
+ working_dir = mkdtemp()
+ app = Mothur(WorkingDir=working_dir)
+ result = app(self.small_fasta)
+ observed_otus = result['otu list'].read()
+ self.assertEquals(observed_otus, self.small_otus)
+ result.cleanUp()
+ rmdir(working_dir)
+
+ def test_call_with_complement(self):
+ """Mothur.__call__() should return correct otu's for input sequences which are reverse complements"""
+ app = Mothur()
+ result = app(self.complement_fasta)
+ observed_otus = result['otu list'].read()
+ self.assertEquals(observed_otus, self.complement_otus)
+ result.cleanUp()
+
+ def test_mothur_from_file(self):
+ """mothur_from_file() should return parsed otus"""
+ f = StringIO(self.small_fasta)
+ f.seek(0)
+ parsed_otus = mothur_from_file(f)
+ self.assertEquals(parsed_otus, self.small_otus_parsed)
+
+
+class TestMothurClassifySeqs(TestCase):
+ def setUp(self):
+ self.ref_file = NamedTemporaryFile()
+ self.ref_file.write(mothur_ref_seqs)
+ self.ref_file.seek(0)
+
+ self.tax_file = NamedTemporaryFile()
+ self.tax_file.write(mothur_taxonomy)
+ self.tax_file.seek(0)
+
+ self.work_dir = mkdtemp()
+
+ def tearDown(self):
+ rmtree(self.work_dir)
+
+ def test_app(self):
+ app = MothurClassifySeqs({
+ 'reference': self.ref_file.name,
+ 'taxonomy': self.tax_file.name,
+ }, WorkingDir=self.work_dir)
+ res = app(mothur_seqs)
+ assignments = res['assignments'].read()
+ self.assertEqual(assignments, mothur_assignments)
+ summary = res['summary'].read()
+ # Later versions of mothur add a tab before the newline. We
+ # do not care about trailing whitespace as long as content is
+ # the same.
+ summary = summary.replace("\t\n", "\n")
+ self.assertEqual(summary, mothur_summary)
+ res.cleanUp()
+
+ def test_format_function_arguments(self):
+ app = MothurClassifySeqs({
+ 'reference': '/home/myuser/ref-seqs.fasta',
+ 'taxonomy': '/home/MyUser/data/tax.txt',
+ 'cutoff': 80,
+ })
+ obs_args = app._format_function_arguments(
+ ['reference', 'taxonomy', 'cutoff', 'iters'])
+ exp_args = (
+ "reference=/home/myuser/ref\\-seqs.fasta, "
+ "taxonomy=/home/MyUser/data/tax.txt, cutoff=80")
+ self.assertEqual(obs_args, exp_args)
+
+ def test_compile_mothur_script(self):
+ app = MothurClassifySeqs({
+ 'reference': '/home/myuser/ref-seqs.fasta',
+ 'taxonomy': '/home/MyUser/data/tax.txt',
+ 'cutoff': 80,
+ })
+ app._input_filename = "/my/input.fasta"
+ exp_script = (
+ "#classify.seqs(fasta=/my/input.fasta, "
+ "reference=/home/myuser/ref\-seqs.fasta, "
+ "taxonomy=/home/MyUser/data/tax.txt, "
+ "cutoff=80)")
+ self.assertEqual(app._compile_mothur_script(), exp_script)
+
+ def test_mothur_classify_file(self):
+ query_file = StringIO(mothur_seqs)
+ res = mothur_classify_file(
+ query_file, self.ref_file.name, self.tax_file.name)
+ exp_res = {
+ 'A': (['k__Bacteria', 'p__Firmicutes', 'c__Clostridia',
+ 'o__Clostridale', 'f__Eubacteriaceae', 'g__Eubacterium',
+ 's__Eubacteriumfoedans'], 1.0),
+ 'Very': (['k__Bacteria', 'p__Bacteriodetes'], 1.0),
+ '01': (['k__Bacteria', 'p__Firmicutes'], 1.0),
+ }
+ self.assertEqual(res, exp_res)
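+
+ # Return shape of mothur_classify_file, per the expectations above:
+ # {seq_id: ([taxon, taxon, ...], confidence)}, with (['Unknown'], 0.0)
+ # entries for unclassifiable sequences (see the next test).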
+
+ def test_unclassifiable_sequence(self):
+ query_file = StringIO(
+ ">MostlyTs\nTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT"
+ "TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTATTTTTTTTTTTTTTTTTTTTTTTTTTTTTT"
+ "TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT\n")
+ res = mothur_classify_file(
+ query_file, self.ref_file.name, self.tax_file.name)
+ exp_res = {
+ 'MostlyTs': (['Unknown'], 0.0),
+ }
+ self.assertEqual(res, exp_res)
+
+
+mothur_assignments = """\
+01 k__Bacteria(100);p__Firmicutes(100);unclassified;unclassified;unclassified;unclassified;unclassified;
+A k__Bacteria(100);p__Firmicutes(100);c__Clostridia(100);o__Clostridale(100);f__Eubacteriaceae(100);g__Eubacterium(100);s__Eubacteriumfoedans(100);
+Very k__Bacteria(100);p__Bacteriodetes(100);unclassified;unclassified;unclassified;unclassified;unclassified;
+"""
+
+mothur_summary = """\
+taxlevel rankID taxon daughterlevels total
+0 0 Root 1 3
+1 0.1 k__Bacteria 2 3
+2 0.1.1 p__Bacteriodetes 1 1
+3 0.1.1.1 unclassified 1 1
+4 0.1.1.1.1 unclassified 1 1
+5 0.1.1.1.1.1 unclassified 1 1
+6 0.1.1.1.1.1.1 unclassified 1 1
+7 0.1.1.1.1.1.1.1 unclassified 0 1
+2 0.1.2 p__Firmicutes 2 2
+3 0.1.2.1 c__Clostridia 1 1
+4 0.1.2.1.1 o__Clostridale 1 1
+5 0.1.2.1.1.1 f__Eubacteriaceae 1 1
+6 0.1.2.1.1.1.1 g__Eubacterium 1 1
+7 0.1.2.1.1.1.1.1 s__Eubacteriumfoedans 0 1
+3 0.1.2.2 unclassified 1 1
+4 0.1.2.2.1 unclassified 1 1
+5 0.1.2.2.1.1 unclassified 1 1
+6 0.1.2.2.1.1.1 unclassified 1 1
+7 0.1.2.2.1.1.1.1 unclassified 0 1
+"""
+
+mothur_seqs = """\
+>01
+GGAGTCTGGGCCGGTGTCGTCAAGGTCCCAATCTGGCTGGTCGGTCTCTCAACCCAGCTACCCATCATTGCCTTGGTAGGCCGTTACCCACCAACAAGCTAACAGGCCGCGGGCCCATCCCTCTCCGCCGGAGCTTTCTCGAGTCTTCCATGCGGAAGTCCCGAAGTATTCGGTATTATCCACGGTTTCCCGTGGCTATCCCAATGAGAGGGGCAGGTTGCCCACGTGTTACTCAGCCGTTCGCCACTTTATACACACCCGAAGGTGCTTTAATCGTTCGACTTGCATGTGTTAGGCGCGCCGCCAGCGTTCATC
+>A
+GGAGTCTGGGCCGTGTCTCAGTCCCAATGTGGCCGATCACCCTCTCAGGTCGGCTATGCATCACGGCCTTGGTGAGCCGTTACCTCACCAACTAGCTAATGCACCGCGGGTCCATCCATCAGCAGAAGCTTGCGCCTCTTTTCCTCTTCAAACCATGCGGTTCGAAGACCTATGCGGTTTTAGCATCCGTTTCCGAATGTTATCCCCCTCTGATGGGCAGGTTACCCACGTGTTACTCACCCGTTCGCCACTAGATTGACCAGTGCAAGCACCGGTCGCTCTCGTTCGACTTGCATGTATTAGGCACGCCGCCAGCGTTCGTC
+>Very long seq name with many spaces!
+GGAGTCTGGACCGTGTCTCAGTTCCAGTGTGACTGATCATCCTCTCAGACCAGTTATGCGTCATAGCCTTGGTGAGCCATTACCTCACCAACTAGCTGATACAATATAGCCTCATCCTACACCGAAAAACTTTCCCTATCTAACTTATGTTAGAGAGGAGTATAGAGTATTAGCAGTCGTTTCCAACTGTTGTCCTCTAGTGTAGGGCAGATTAGCTACACATTACTCACCCGTGCGCCACTAACTCATAAGAGCAAGCTCTTACTTGTCCGTTCGACTTGCATGTATTAGGCACGCCGCCAGCGTTCACT
+"""
+
+mothur_ref_seqs = """\
+>ref1
+GGAGTCTGGGCCGGTGTCGTCAAGGTCCCAATCTGGCTGGTCGGTCTCTCTGGTAGGCCGTTACCCACCAACAAGCTAACAGGCCGCGGGCCCATCCCTCTCCGCCGGAGCTTTCTCGAGTCTTCCATGCGGAAGTCCCTGCGGAAGTCCCGAAGTATTCGGTATTATCCACGGTTTCCCGTGGCTATCCCAATGAGAGGGGCAGGTTGCCCACGTGTTACTCAGCCGTTCGCCACTTTATACACACCCGAAGGTGCTTTAATCGTTCGACTTGCATGTGTTAGGCGCGCCGCCAGCGTTCATC
+>ref2
+GGAGTCTGGGCCGTGTCTCAGTCCCAATGTGGCCGATCACCCTCTCAGGTCGGCTATGCATCACGGCCTTGGTGAGCCGTTACCTCACCAACTAGCTACTCTTTTCCTCTTCAAACCATGCGGTTCGAAGACCTATGCGGTTTTAGCATCCGTAAACTTTCCCTATCTAACTTATGTTAGAGAGGAGTATAGAGTATTAGCAGTCGTTTCCAACTTCCGAATGTTATCCCCCTCTGATGGGCAGGTTACCCACGTGTTACTCACCCGTTCGCCACTAGATTGACCAGTGCAAGCACCGGTCGCTCTCGTTCGACTTGCATGTATTAGGCACGCCGCCAGCGTTCGTC
+>3333
+GGAGTCTGGACCGTGTCTCAGTTCCAGTGTGACTGATCATCCTCTCAGACAGTTATGCGTCATAGCCTTGGTGAGCCATTACCTCACCAACTAGCTGATACAATATAGCCTCATCCTACACCGAAAAACTTTCCCTATCTCTTATGTTAGAGAGGAGTATAGAGTATTAGCAGTCGTTTCCAACTGTTGTCCTCTAGTGTAGGGCAGATTAGCACACATTACTCACCCGTGCGCCACTAACTCATAAGAGCAAGCTCTTACTTGTCCGTTCGACTTGCATGTATTAGGCACGCCGCCAGCGTTCACT
+"""
+
+mothur_taxonomy = """\
+ref1 k__Bacteria;p__Firmicutes;
+ref2 k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridale;f__Eubacteriaceae;g__Eubacterium;s__Eubacteriumfoedans;
+3333 k__Bacteria;p__Bacteriodetes;
+"""
+
+if __name__ == '__main__':
+ main()
diff --git a/bfillings/tests/test_muscle_v38.py b/bfillings/tests/test_muscle_v38.py
new file mode 100644
index 0000000..39fd5e0
--- /dev/null
+++ b/bfillings/tests/test_muscle_v38.py
@@ -0,0 +1,286 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+
+from os import getcwd, remove, rmdir, mkdir, path
+from subprocess import Popen, PIPE, STDOUT
+import tempfile
+import shutil
+from unittest import TestCase, main
+
+from cogent.core.moltype import RNA, DNA
+from cogent.util.misc import flatten
+
+from bfillings.muscle_v38 import (Muscle, muscle_seqs, aln_tree_seqs,
+ align_unaligned_seqs, build_tree_from_alignment,
+ align_and_build_tree, add_seqs_to_alignment,
+ align_two_alignments)
+
+
+class GeneralSetUp(TestCase):
+
+ def setUp(self):
+ """Muscle general setUp method for all tests"""
+ # Check if muscle version is supported for this test
+ acceptable_version = (3,8)
+ command = "muscle -version"
+ proc = Popen(command,shell=True,universal_newlines=True,\
+ stdout=PIPE,stderr=STDOUT)
+ stdout = proc.stdout.read()
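+ # stdout looks like e.g. "MUSCLE v3.8.31 by ...": take the second token and drop the leading "v"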
+ version_string = stdout.strip().split(' ')[1].strip()[1:]
+ try:
+ version = tuple(map(int,version_string.split('.')))
+ pass_test = version[:2] == acceptable_version
+ except ValueError:
+ pass_test = False
+ version_string = stdout
+ self.assertTrue(pass_test,\
+ "Unsupported muscle version. %s is required, but running %s." \
+ % ('.'.join(map(str,acceptable_version)), version_string))
+
+ self.seqs1 = ['ACUGCUAGCUAGUAGCGUACGUA','GCUACGUAGCUAC',
+ 'GCGGCUAUUAGAUCGUA']
+
+ self.labels1 = ['>1','>2','>3']
+ self.lines1 = flatten(zip(self.labels1,self.seqs1))
+
+ self.seqs2=['UAGGCUCUGAUAUAAUAGCUCUC','UAUCGCUUCGACGAUUCUCUGAUAGAGA',
+ 'UGACUACGCAU']
+ self.labels2=['>a','>b','>c']
+ self.lines2 = flatten(zip(self.labels2,self.seqs2))
+
+ self.temp_dir = tempfile.mkdtemp()
+ self.temp_dir_spaces = '/tmp/test for muscle/'
+ try:
+ mkdir(self.temp_dir_spaces)
+ except OSError:
+ pass
+ try:
+ #create sequence files
+ f = open(path.join(self.temp_dir, 'seq1.txt'),'w')
+ f.write('\n'.join(self.lines1))
+ f.close()
+ g = open(path.join(self.temp_dir, 'seq2.txt'),'w')
+ g.write('\n'.join(self.lines2))
+ g.close()
+ except OSError:
+ pass
+
+ def tearDown(self):
+ """cleans up all files initially created"""
+ # remove the tempdir and contents
+ shutil.rmtree(self.temp_dir)
+ shutil.rmtree(self.temp_dir_spaces)
+
+class MuscleTests(GeneralSetUp):
+ """Tests for the Muscle application controller"""
+
+ def test_base_command(self):
+ """Muscle BaseCommand should return the correct BaseCommand"""
+ c = Muscle()
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "',getcwd(),'/"; ','muscle']))
+ c.Parameters['-in'].on('seq.txt')
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "',getcwd(),'/"; ','muscle -in "seq.txt"']))
+ c.Parameters['-cluster2'].on('neighborjoining')
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "',getcwd(),'/"; ','muscle -cluster2 neighborjoining' +
+ ' -in "seq.txt"']))
+
+ def test_maxmb(self):
+ """maxmb option should not break Muscle"""
+ app = Muscle()
+ app.Parameters['-maxmb'].on('250')
+ outfile = tempfile.NamedTemporaryFile()
+ app.Parameters['-out'].on(outfile.name)
+
+ infile = tempfile.NamedTemporaryFile()
+ infile.write(
+ ">Seq1\nAAAGGGTTTCCCCT\n"
+ ">Seq2\nAAAGGGGGTTTCCACT\n")
+ infile.flush()
+ result = app(infile.name)
+
+ observed = result['MuscleOut'].read()
+ expected = (
+ ">Seq1\nAAA--GGGTTTCCCCT\n"
+ ">Seq2\nAAAGGGGGTTTCCACT\n"
+ )
+ self.assertEqual(observed, expected)
+
+ def test_changing_working_dir(self):
+ """Muscle BaseCommand should change according to WorkingDir"""
+ c = Muscle(WorkingDir='/tmp/muscle_test')
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "','/tmp/muscle_test','/"; ','muscle']))
+ c = Muscle()
+ c.WorkingDir = '/tmp/muscle_test2'
+ self.assertEqual(c.BaseCommand,\
+ ''.join(['cd "','/tmp/muscle_test2','/"; ','muscle']))
+
+ # removing the dirs is proof that they were created when the Muscle
+ # objects were instantiated; if a dir is missing, rmdir raises an OSError
+ rmdir('/tmp/muscle_test')
+ rmdir('/tmp/muscle_test2')
+
+ def test_aln_tree_seqs(self):
+ "aln_tree_seqs returns the muscle alignment and tree from iteration2"
+ tree, aln = aln_tree_seqs(path.join(self.temp_dir, 'seq1.txt'),
+ tree_type="neighborjoining",
+ WorkingDir=self.temp_dir,
+ clean_up=True)
+ self.assertEqual(str(tree), '((1:1.125,2:1.125):0.375,3:1.5);')
+ self.assertEqual(len(aln), 6)
+ self.assertEqual(aln[-2], '>3\n')
+ self.assertEqual(aln[-1], 'GCGGCUAUUAGAUCGUA------\n')
+
+ def test_aln_tree_seqs_spaces(self):
+ "aln_tree_seqs should work on filename with spaces"
+ try:
+ #create sequence files
+ f = open(path.join(self.temp_dir_spaces, 'muscle_test_seq1.txt'),'w')
+ f.write('\n'.join(self.lines1))
+ f.close()
+ except OSError:
+ pass
+ tree, aln = aln_tree_seqs(path.join(self.temp_dir_spaces,\
+ 'muscle_test_seq1.txt'),
+ tree_type="neighborjoining",
+ WorkingDir=getcwd(),
+ clean_up=True)
+ self.assertEqual(str(tree), '((1:1.125,2:1.125):0.375,3:1.5);')
+ self.assertEqual(len(aln), 6)
+ self.assertEqual(aln[-2], '>3\n')
+ self.assertEqual(aln[-1], 'GCGGCUAUUAGAUCGUA------\n')
+ remove(self.temp_dir_spaces+'/muscle_test_seq1.txt')
+
+ def test_align_unaligned_seqs(self):
+ """align_unaligned_seqs should work as expected"""
+ res = align_unaligned_seqs(self.seqs1, RNA)
+ self.assertEqual(res.toFasta(), align1)
+
+ def test_build_tree_from_alignment(self):
+ """Muscle should return a tree built from the passed alignment"""
+ tree_short = build_tree_from_alignment(build_tree_seqs_short, DNA)
+ num_seqs = flatten(build_tree_seqs_short).count('>')
+ self.assertEqual(len(tree_short.tips()), num_seqs)
+
+ tree_long = build_tree_from_alignment(build_tree_seqs_long, DNA)
+ seq_names = []
+ for line in build_tree_seqs_long.split('\n'):
+ if line.startswith('>'):
+ seq_names.append(line[1:])
+
+ for node in tree_long.tips():
+ if node.Name not in seq_names:
+ self.fail()
+
+ def test_align_and_build_tree(self):
+ """Should align and build a tree from a set of sequences"""
+ res = align_and_build_tree(self.seqs1, RNA)
+ self.assertEqual(res['Align'].toFasta(), align1)
+
+ tree = res['Tree']
+ seq_names = []
+ for line in align1.split('\n'):
+ if line.startswith('>'):
+ seq_names.append(line[1:])
+
+ for node in tree.tips():
+ if node.Name not in seq_names:
+ self.fail()
+
+ def test_add_seqs_to_alignment(self):
+ """Should add sequences to an alignment"""
+ res = add_seqs_to_alignment(seqs_to_add, align1)
+ self.assertEqual(res.toFasta(), added_align_result)
+
+ def test_align_two_alignments(self):
+ """Should align to multiple sequence alignments"""
+ res = align_two_alignments(align1, aln_to_merge)
+ self.assertEqual(res.toFasta(), merged_align_result)
+
+align1 = ">seq_0\nACUGCUAGCUAGUAGCGUACGUA\n>seq_1\n---GCUACGUAGCUAC-------\n>seq_2\nGCGGCUAUUAGAUCGUA------"
+
+# for use in test_add_seqs_to_alignment()
+seqs_to_add = ">foo\nGCUACGUAGCU\n>bar\nGCUACGUAGCC"
+added_align_result = ">bar\n---GCUACGUAGCC---------\n>foo\n---GCUACGUAGCU---------\n>seq_0\nACUGCUAGCUAGUAGCGUACGUA\n>seq_1\n---GCUACGUAGCUAC-------\n>seq_2\nGCGGCUAUUAGAUCGUA------"
+
+# for use in test_align_two_alignments()
+aln_to_merge = ">foo\nGCUACGUAGCU\n>bar\n--UACGUAGCC"
+merged_align_result = ">bar\n-----UACGUAGCC---------\n>foo\n---GCUACGUAGCU---------\n>seq_0\nACUGCUAGCUAGUAGCGUACGUA\n>seq_1\n---GCUACGUAGCUAC-------\n>seq_2\nGCGGCUAUUAGAUCGUA------"
+
+build_tree_seqs_short = """>muscle_test_seqs_0
+AACCCCCACGGTGGATGCCACACGCCCCATACAAAGGGTAGGATGCTTAAGACACATCGCGTCAGGTTTGTGTCAGGCCT
+AGCTTTAAATCATGCCAGTG
+>muscle_test_seqs_1
+GACCCACACGGTGGATGCAACAGATCCCATACACCGAGTTGGATGCTTAAGACGCATCGCGTGAGTTTTGCGTCAAGGCT
+TGCTTTCAATAATGCCAGTG
+>muscle_test_seqs_2
+AACCCCCACGGTGGCAGCAACACGTCACATACAACGGGTTGGATTCTAAAGACAAACCGCGTCAAAGTTGTGTCAGAACT
+TGCTTTGAATCATGCCAGTA
+>muscle_test_seqs_3
+AAACCCCACGGTAGCTGCAACACGTCCCATACCACGGGTAGGATGCTAAAGACACATCGGGTCTGTTTTGTGTCAGGGCT
+TGCTTTACATCATGCAAGTG
+>muscle_test_seqs_4
+AACCGCCACGGTGGGTACAACACGTCCACTACATCGGCTTGGAAGGTAAAGACACGTCGCGTCAGTATTGCGTCAGGGCT
+TGCTTTAAATCATGCCAGTG
+>muscle_test_seqs_5
+AACCCCCGCGGTAGGTGCAACACGTCCCATACAACGGGTTGGAAGGTTAAGACACAACGCGTTAATTTTGTGTCAGGGCA
+TGCTTTAAATCATGCCAGTT
+>muscle_test_seqs_6
+GACCCCCGCGGTGGCTGCAAGACGTCCCATACAACGGGTTGGATGCTTAAGACACATCGCAACAGTTTTGAGTCAGGGCT
+TACTTTAGATCATGCCGGTG
+>muscle_test_seqs_7
+AACCCCCACGGTGGCTACAAGACGTCCCATCCAACGGGTTGGATACTTAAGGCACATCACGTCAGTTTTGTGTCAGAGCT
+TGCTTTAAATCATGCCAGTG
+>muscle_test_seqs_8
+AACCCCCACGGTGGCTGCAACACGTGGCATACAACGGGTTGGATGCTTAAGACACATCGCCTCAGTTTTGTGTCAGGGCT
+TGCATTAAATCATGCCAGTG
+>muscle_test_seqs_9
+AAGCCCCACGGTGGCTGAAACACATCCCATACAACGGGTTGGATGCTTAAGACACATCGCATCAGTTTTATGTCAGGGGA
+TGCTTTAAATCCTGACAGCG
+"""
+
+build_tree_seqs_long = """>muscle_test_seqs_0
+AACCCCCACGGTGGATGCCACACGCCCCATACAAAGGGTAGGATGCTTAAGACACATCGCGTCAGGTTTGTGTCAGGCCT
+AGCTTTAAATCATGCCAGTG
+>muscle_test_seqsaaaaaaaa_1
+GACCCACACGGTGGATGCAACAGATCCCATACACCGAGTTGGATGCTTAAGACGCATCGCGTGAGTTTTGCGTCAAGGCT
+TGCTTTCAATAATGCCAGTG
+>muscle_test_seqsaaaaaaaa_2
+AACCCCCACGGTGGCAGCAACACGTCACATACAACGGGTTGGATTCTAAAGACAAACCGCGTCAAAGTTGTGTCAGAACT
+TGCTTTGAATCATGCCAGTA
+>muscle_test_seqsaaaaaaaa_3
+AAACCCCACGGTAGCTGCAACACGTCCCATACCACGGGTAGGATGCTAAAGACACATCGGGTCTGTTTTGTGTCAGGGCT
+TGCTTTACATCATGCAAGTG
+>muscle_test_seqsaaaaaaaa_4
+AACCGCCACGGTGGGTACAACACGTCCACTACATCGGCTTGGAAGGTAAAGACACGTCGCGTCAGTATTGCGTCAGGGCT
+TGCTTTAAATCATGCCAGTG
+>muscle_test_seqsaaaaaaaa_5
+AACCCCCGCGGTAGGTGCAACACGTCCCATACAACGGGTTGGAAGGTTAAGACACAACGCGTTAATTTTGTGTCAGGGCA
+TGCTTTAAATCATGCCAGTT
+>muscle_test_seqsaaaaaaaa_6
+GACCCCCGCGGTGGCTGCAAGACGTCCCATACAACGGGTTGGATGCTTAAGACACATCGCAACAGTTTTGAGTCAGGGCT
+TACTTTAGATCATGCCGGTG
+>muscle_test_seqsaaaaaaaa_7
+AACCCCCACGGTGGCTACAAGACGTCCCATCCAACGGGTTGGATACTTAAGGCACATCACGTCAGTTTTGTGTCAGAGCT
+TGCTTTAAATCATGCCAGTG
+>muscle_test_seqsaaaaaaaa_8
+AACCCCCACGGTGGCTGCAACACGTGGCATACAACGGGTTGGATGCTTAAGACACATCGCCTCAGTTTTGTGTCAGGGCT
+TGCATTAAATCATGCCAGTG
+>muscle_test_seqsaaaaaaaa_9
+AAGCCCCACGGTGGCTGAAACACATCCCATACAACGGGTTGGATGCTTAAGACACATCGCATCAGTTTTATGTCAGGGGA
+TGCTTTAAATCCTGACAGCG
+"""
+
+
+if __name__ == '__main__':
+ main()
diff --git a/bfillings/tests/test_parsinsert.py b/bfillings/tests/test_parsinsert.py
new file mode 100644
index 0000000..40ac95a
--- /dev/null
+++ b/bfillings/tests/test_parsinsert.py
@@ -0,0 +1,138 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+"""Tests for ParsInsert v1.03 application controller."""
+
+
+from shutil import rmtree
+from os.path import splitext
+from os import getcwd, remove, rmdir, mkdir
+from unittest import TestCase, main
+
+from cogent.core.alignment import Alignment
+from cogent.parse.tree import DndParser
+from cogent.core.moltype import DNA
+
+from skbio.parse.sequences import parse_fasta
+from burrito.util import get_tmp_filename
+
+from bfillings.parsinsert import ParsInsert, insert_sequences_into_tree
+
+
+class ParsInsertTests(TestCase):
+ def setUp(self):
+
+ # create a list of files to cleanup
+ self._paths_to_clean_up = []
+ self._dirs_to_clean_up = []
+
+ # load query seqs
+ self.seqs = Alignment(parse_fasta(QUERY_SEQS.split()))
+
+ # generate temp filename
+ tmp_dir='/tmp'
+ self.outfile = get_tmp_filename(tmp_dir)
+
+ # create and write out reference sequence file
+ self.outfasta=splitext(self.outfile)[0]+'.fasta'
+ fastaout=open(self.outfasta,'w')
+ fastaout.write(REF_SEQS)
+ fastaout.close()
+ self._paths_to_clean_up.append(self.outfasta)
+
+ # create and write out starting tree file
+ self.outtree=splitext(self.outfile)[0]+'.tree'
+ treeout=open(self.outtree,'w')
+ treeout.write(REF_TREE)
+ treeout.close()
+ self._paths_to_clean_up.append(self.outtree)
+
+ def tearDown(self):
+ """cleans up all files initially created"""
+ # remove the tempdir and contents
+ map(remove,self._paths_to_clean_up)
+ map(rmdir,self._dirs_to_clean_up)
+
+ def test_base_command(self):
+ """Base command-calls"""
+
+ app = ParsInsert()
+ self.assertEqual(app.BaseCommand, \
+ ''.join(['cd "',getcwd(),'/"; ','ParsInsert']))
+
+ def test_change_working_dir(self):
+ """Change working dir"""
+
+ app = ParsInsert(WorkingDir='/tmp/ParsInsertTest')
+ self.assertEqual(app.BaseCommand, \
+ ''.join(['cd "','/tmp/ParsInsertTest',\
+ '/"; ','ParsInsert']))
+
+ rmtree('/tmp/ParsInsertTest')
+
+ def test_insert_sequences_into_tree(self):
+ """Inserts sequences into Tree"""
+
+ # define log fp
+ log_fp='/tmp/parsinsert.log'
+ self._paths_to_clean_up.append(log_fp)
+
+ # define tax assignment values fp
+ tax_assign_fp='/tmp/tax_assignments.log'
+ self._paths_to_clean_up.append(tax_assign_fp)
+
+ # set the reference alignment and starting tree
+ param={
+ '-t':self.outtree,
+ '-s':self.outfasta,
+ '-l':log_fp,
+ '-o':tax_assign_fp
+ }
+
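+ # toPhylip() renames sequences to phylip-safe ids; align_map maps those ids back to the originals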
+ seqs, align_map = self.seqs.toPhylip()
+
+ # insert sequences into tree
+ tree = insert_sequences_into_tree(seqs, DNA, params=param)
+
+ # rename tips back to query names
+ for node in tree.tips():
+ if node.Name in align_map:
+ node.Name = align_map[node.Name]
+
+ self.assertEqual(tree.getNewick(with_distances=True),exp_tree)
+
+
+
+QUERY_SEQS= """\
+>6
+TGCATGTCAGTATAGCTTTGGTGAAACTGCGAATGGCTCATTAAATCAGT
+>7
+TGCATGTCAGTATAACTTTGGTGAAACTGCGAATGGCTCATTAAATCAGT
+"""
+
+REF_SEQS= """\
+>seq0000011
+TGCATGTCAGTATAGCTTTAGTGAAACTGCGAATGGCTCATTAAATCAGT
+>seq0000012
+TGCATGTCAGTATAGCTTTAGTGAAACTGCGAATGGCTNNTTAAATCAGT
+>seq0000013
+TGCATGTCAGTATAGCATTAGTGAAACTGCGAATGGCTCATTAAATCAGT
+>seq0000014
+TCCATGTCAGTATAACTTTGGTGAAACTGCGAATGGCTCATTAAATCAGG
+>seq0000015
+NNNNNNNNNNTATATCTTATGTGAAACTTCGAATGCCTCATTAAATCAGT
+"""
+
+REF_TREE="""((seq0000014:0.08408,seq0000015:0.13713)0.609:0.00215,seq0000013:0.02032,(seq0000011:0.00014,seq0000012:0.00014)0.766:0.00015);
+"""
+
+exp_tree = """((seq0000014:0.08408,seq0000015:0.13713,7:0.02027):0.00215,seq0000013:0.02032,(seq0000011:0.00014,seq0000012:0.00014,6:0.02027):0.00015):0.0;"""
+
+if __name__ == '__main__':
+ main()
diff --git a/bfillings/tests/test_pplacer.py b/bfillings/tests/test_pplacer.py
new file mode 100644
index 0000000..064003f
--- /dev/null
+++ b/bfillings/tests/test_pplacer.py
@@ -0,0 +1,254 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+
+from os import getcwd, remove, rmdir, mkdir
+from os.path import splitext
+from random import randint
+from StringIO import StringIO
+from unittest import TestCase, main
+
+from skbio.parse.sequences import parse_fasta
+from burrito.util import ApplicationError, get_tmp_filename
+
+from cogent.util.misc import flatten
+from cogent.core.tree import PhyloNode
+from cogent.core.moltype import RNA, DNA
+from cogent.core.alignment import Alignment
+
+from bfillings.pplacer import Pplacer, insert_sequences_into_tree
+
+
+class Genericpplacer(TestCase):
+
+ def setUp(self):
+ '''setup the files for testing pplacer'''
+
+ # create a list of files to cleanup
+ self._paths_to_clean_up = []
+ self._dirs_to_clean_up = []
+
+ # get a tmp filename to use
+ basename=splitext(get_tmp_filename())[0]
+
+ # create and write out RAxML stats file
+ self.stats_fname=basename+'.stats'
+ stats_out=open(self.stats_fname,'w')
+ stats_out.write(RAXML_STATS)
+ stats_out.close()
+ self._paths_to_clean_up.append(self.stats_fname)
+
+ # create and write out reference sequence file
+ self.refseq_fname=basename+'_refseqs.fasta'
+ refseq_out=open(self.refseq_fname,'w')
+ refseq_out.write(REF_SEQS)
+ refseq_out.close()
+ self._paths_to_clean_up.append(self.refseq_fname)
+
+ # create and write out query sequence file
+ self.query_fname=basename+'_queryseqs.fasta'
+ query_out=open(self.query_fname,'w')
+ query_out.write(QUERY_SEQS)
+ query_out.close()
+ self._paths_to_clean_up.append(self.query_fname)
+
+ # create and write out starting tree file
+ self.tree_fname=basename+'.tre'
+ tree_out=open(self.tree_fname,'w')
+ tree_out.write(REF_TREE)
+ tree_out.close()
+ self._paths_to_clean_up.append(self.tree_fname)
+
+ def writeTmp(self, outname):
+ """Write data to temp file"""
+ t = open(outname, "w+")
+ t.write(PHYLIP_FILE)
+ t.close()
+
+ def tearDown(self):
+ """cleans up all files initially created"""
+ # remove the tempdir and contents
+ map(remove,self._paths_to_clean_up)
+ map(rmdir,self._dirs_to_clean_up)
+
+class pplacerTests(Genericpplacer):
+ """Tests for the pplacer application controller"""
+
+ def test_pplacer(self):
+ """Base command-calls"""
+
+ app=Pplacer()
+
+ self.assertEqual(app.BaseCommand, \
+ ''.join(['cd "',getcwd(),'/"; ','pplacer']))
+
+ app.Parameters['--help'].on()
+ self.assertEqual(app.BaseCommand, \
+ ''.join(['cd "',getcwd(),'/"; ','pplacer --help']))
+
+ def test_change_working_dir(self):
+ """Change working dir"""
+
+ working_dir='/tmp/Pplacer'
+ self._dirs_to_clean_up.append(working_dir)
+
+ # define working directory for output
+ app = Pplacer(WorkingDir=working_dir)
+
+ self.assertEqual(app.BaseCommand, \
+ ''.join(['cd "','/tmp/Pplacer','/"; ','pplacer']))
+
+
+ def test_insert_sequences_into_tree(self):
+ """Inserts sequences into Tree"""
+
+ params={}
+ # generate temp filename for output
+ params["-r"] = self.refseq_fname
+ params["-t"] = self.tree_fname
+ params["-s"] = self.stats_fname
+ params["--out-dir"] = "/tmp"
+
+ aln_ref_query=parse_fasta(StringIO(QUERY_SEQS))
+ aln = Alignment(aln_ref_query)
+ seqs, align_map = aln.toPhylip()
+ tree = insert_sequences_into_tree(seqs, DNA, params=params,
+ write_log=False)
+
+ # rename tips back to query names
+ for node in tree.tips():
+ if node.Name in align_map:
+ node.Name = align_map[node.Name]
+
+ self.assertEqual(tree.getNewick(with_distances=True), RESULT_TREE)
+
+
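+ # expected pplacer placement file ("version": 1 JSON); whitespace is stripped after the literal so it can stay readable here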
+JSON_RESULT="""\
+{"tree":
+ "((seq0000004:0.08408[0],seq0000005:0.13713[1])0.609:0.00215[2],seq0000003:0.02032[3],(seq0000001:0.00014[4],seq0000002:0.00014[5])0.766:0.00015[6]):0[7];",
+ "placements":
+ [
+ {"p":
+ [[0, -113.210938, 0.713818, 0.064504, 0.000006],
+ [1, -114.929894, 0.127954, 0.137122, 0.000007],
+ [2, -114.932766, 0.127587, 0.000008, 0.000006],
+ [6, -117.743534, 0.007675, 0.000141, 0.027211],
+ [3, -117.743759, 0.007674, 0.020310, 0.027207],
+ [4, -117.747386, 0.007646, 0.000131, 0.027266],
+ [5, -117.747396, 0.007646, 0.000131, 0.027266]
+ ], "n": ["seq0000006"]
+ },
+ {"p": [[0, -113.476305, 1.000000, 0.035395, 0.000006]], "n":
+ ["seq0000007"]
+ }
+ ], "metadata":
+ {"invocation":
+ "pplacer -t %s -r %s -s %s --out-dir \/tmp %s"
+ }, "version": 1, "fields":
+ ["edge_num", "likelihood", "like_weight_ratio", "distal_length",
+ "pendant_length"
+ ]
+}
+""".replace('\n','').replace(' ','')
+
+
+QUERY_SEQS= """\
+>6
+TGCATGTCAGTATAGCTTTGGTGAAACTGCGAATGGCTCATTAAATCAGT
+>7
+TGCATGTCAGTATAACTTTGGTGAAACTGCGAATGGCTCATTAAATCAGT
+"""
+
+
+REF_SEQS= """\
+>seq0000011
+TGCATGTCAGTATAGCTTTAGTGAAACTGCGAATGGCTCATTAAATCAGT
+>seq0000012
+TGCATGTCAGTATAGCTTTAGTGAAACTGCGAATGGCTNNTTAAATCAGT
+>seq0000013
+TGCATGTCAGTATAGCATTAGTGAAACTGCGAATGGCTCATTAAATCAGT
+>seq0000014
+TCCATGTCAGTATAACTTTGGTGAAACTGCGAATGGCTCATTAAATCAGG
+>seq0000015
+NNNNNNNNNNTATATCTTATGTGAAACTTCGAATGCCTCATTAAATCAGT
+"""
+
+REF_TREE="""((seq0000014:0.08408,seq0000015:0.13713)0.609:0.00215,seq0000013:0.02032,(seq0000011:0.00014,seq0000012:0.00014)0.766:0.00015);
+"""
+
+RESULT_TREE="""((((seq0000014:0.0353946,7:6.11352e-06):0.0291093,6:6.11352e-06):0.019576,seq0000015:0.13713)0.609:0.00215,seq0000013:0.02032,(seq0000011:0.00014,seq0000012:0.00014)0.766:0.00015);"""
+
+RAXML_STATS="""
+
+
+This is RAxML version 7.2.6 released by Alexandros Stamatakis in February 2010.
+
+With greatly appreciated code contributions by:
+Andre Aberer (TUM)
+Simon Berger (TUM)
+John Cazes (TACC)
+Michael Ott (TUM)
+Nick Pattengale (UNM)
+Wayne Pfeiffer (SDSC)
+
+
+Alignment has 18 distinct alignment patterns
+
+Proportion of gaps and completely undetermined characters in this alignment: 4.80%
+
+RAxML rapid hill-climbing mode
+
+Using 1 distinct models/data partitions with joint branch length optimization
+
+
+Executing 1 inferences on the original alignment using 1 distinct randomized MP trees
+
+All free model parameters will be estimated by RAxML
+ML estimate of 25 per site rate categories
+
+Likelihood of final tree will be evaluated and optimized under GAMMA
+
+GAMMA Model parameters will be estimated up to an accuracy of 0.1000000000 Log Likelihood units
+
+Partition: 0
+Alignment Patterns: 18
+Name: No Name Provided
+DataType: DNA
+Substitution Matrix: GTR
+
+
+
+
+RAxML was called as follows:
+
+raxmlHPC -m GTRCAT -s test_raxml.phy -n results
+
+
+Inference[0]: Time 0.072128 CAT-based likelihood -85.425107, best rearrangement setting 2
+alpha[0]: 1.000000 rates[0] ac ag at cg ct gt: 0.000017 0.037400 0.859448 1.304301 0.000017 1.000000
+
+
+Conducting final model optimizations on all 1 trees under GAMMA-based models ....
+
+Inference[0] final GAMMA-based Likelihood: -107.575676 tree written to file /home/RAxML_result.results
+
+
+Starting final GAMMA-based thorough Optimization on tree 0 likelihood -107.575676 ....
+
+Final GAMMA-based Score of best tree -107.575676
+
+Program execution info written to /home/RAxML_info.results
+Best-scoring ML tree written to: /home/RAxML_bestTree.results
+
+Overall execution time: 0.078965 secs or 0.000022 hours or 0.000001 days
+"""
+
+if __name__ == '__main__':
+ main()
diff --git a/bfillings/tests/test_raxml_v730.py b/bfillings/tests/test_raxml_v730.py
new file mode 100644
index 0000000..84738fd
--- /dev/null
+++ b/bfillings/tests/test_raxml_v730.py
@@ -0,0 +1,236 @@
+#!/usr/bin/env python
+
+from os import getcwd, remove, rmdir, mkdir
+from os.path import splitext
+import re
+from random import choice, randint
+from StringIO import StringIO
+from subprocess import Popen, PIPE, STDOUT
+from unittest import TestCase, main
+
+from cogent.util.misc import flatten
+from cogent.parse.phylip import get_align_for_phylip
+from cogent.core.tree import PhyloNode
+from cogent.core.moltype import RNA, DNA
+from cogent.util.misc import app_path
+from cogent.core.alignment import Alignment
+
+from burrito.util import ApplicationError, get_tmp_filename
+
+from bfillings.raxml_v730 import (Raxml, raxml_alignment,
+ build_tree_from_alignment,
+ insert_sequences_into_tree)
+
+
+class GenericRaxml(TestCase):
+
+ def setUp(self):
+ """Check if Raxml version is supported for this test"""
+ acceptable_version = (7,3,0)
+ self.assertTrue(app_path('raxmlHPC'),
+ "raxmlHPC not found. This may or may not be a problem depending on "+\
+ "which components of QIIME you plan to use.")
+ command = "raxmlHPC -v | grep version"
+ proc = Popen(command,shell=True,universal_newlines=True,\
+ stdout=PIPE,stderr=STDOUT)
+ stdout = proc.stdout.read()
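+ # the version line reads e.g. "This is RAxML version 7.3.0 released by ...": token 4 is the version number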
+ version_string = stdout.strip().split(' ')[4].strip()
+ try:
+ version = tuple(map(int,version_string.split('.')))
+ pass_test = version == acceptable_version
+ except ValueError:
+ pass_test = False
+ version_string = stdout
+ self.assertTrue(pass_test,\
+ "Unsupported raxmlHPC version. %s is required, but running %s." \
+ % ('.'.join(map(str,acceptable_version)), version_string))
+
+
+ """Setup data for raxml tests"""
+ self.seqs1 = ['ACUGCUAGCUAGUAGCGUACGUA','GCUACGUAGCUAC',
+ 'GCGGCUAUUAGAUCGUA']
+ self.labels1 = ['>1','>2','>3']
+ self.lines1 = flatten(zip(self.labels1,self.seqs1))
+
+ self.test_model = "GTRCAT"
+
+ self.align1 = get_align_for_phylip(StringIO(PHYLIP_FILE))
+
+ self.test_fn1 = "/tmp/raxml_test1.txt"
+ self.test_fn2 = "raxml_test1.txt"
+ self.test_fn1_space = "/tmp/raxml test1.txt"
+
+ def writeTmp(self, outname):
+ """Write data to temp file"""
+ t = open(outname, "w+")
+ t.write(PHYLIP_FILE)
+ t.close()
+
+
+class RaxmlTests(GenericRaxml):
+ """Tests for the Raxml application controller"""
+
+ def test_raxml(self):
+ """raxml BaseCommand should return the correct BaseCommand"""
+ r = Raxml()
+ self.assertEqual(r.BaseCommand, \
+ ''.join(['cd \"',getcwd(),'/\"; ','raxmlHPC -f d -# 1']))
+ r.Parameters['-s'].on('seq.nexus')
+ self.assertEqual(r.BaseCommand,\
+ ''.join(['cd \"',getcwd(),'/\"; ',\
+ 'raxmlHPC -f d -s seq.nexus -# 1']))
+
+
+ def test_raxml_params(self):
+ """raxml should raise exception if missing required params"""
+
+ r = Raxml(WorkingDir="/tmp")
+
+ r.SuppressStdout = True
+ r.SuppressStderr = True
+ # raise error by default
+ self.assertRaises(ValueError, r)
+
+ # specify output name
+ r.Parameters['-n'].on("test_name")
+ r.Parameters["-p"].on(randint(1,100000))
+ self.assertRaises(ApplicationError, r)
+
+ # specify model
+ r.Parameters['-m'].on("GTRCAT")
+ self.assertRaises(ApplicationError, r)
+
+ r.Parameters['-s'].on(self.test_fn1)
+ self.assertRaises(ApplicationError, r)
+
+
+ self.writeTmp(self.test_fn1)
+
+ o = r()
+ o.cleanUp()
+
+ remove(self.test_fn1)
+
+
+ def test_raxml_from_file(self):
+ """raxml should run correctly using filename"""
+ r = Raxml(WorkingDir="/tmp")
+
+ r.Parameters['-s'].on(self.test_fn1)
+ r.Parameters['-m'].on("GTRCAT")
+ r.Parameters['-n'].on("test_me")
+ r.Parameters["-p"].on(randint(1,100000))
+
+ # test with abs filename
+ cur_out = self.test_fn1
+ self.writeTmp(cur_out)
+ out = r()
+ out.cleanUp()
+ remove(cur_out)
+
+ # test with rel + working dir
+ r.Parameters['-s'].on(self.test_fn2)
+ r.Parameters['-n'].on("test_me2")
+ r.Parameters['-w'].on("/tmp/")
+ r.Parameters["-p"].on(randint(1,100000))
+ self.writeTmp(self.test_fn1)
+ out = r()
+ out.cleanUp()
+ remove(self.test_fn1)
+
+ r.Parameters['-s'].on("\"%s\"" % self.test_fn1_space)
+ r.Parameters['-n'].on("test_me3")
+ r.Parameters['-w'].on("/tmp/")
+ r.Parameters["-p"].on(randint(1,100000))
+ #print r.BaseCommand
+ self.writeTmp(self.test_fn1_space)
+ out = r()
+ out.cleanUp()
+ remove(self.test_fn1_space)
+
+ def test_raxml_alignment(self):
+ """raxml_alignment should work as expected"""
+ phy_node, parsimony_phy_node, log_likelihood, total_exec \
+ = raxml_alignment(self.align1)
+
+ def test_build_tree_from_alignment(self):
+ """Builds a tree from an alignment"""
+
+ tree = build_tree_from_alignment(self.align1, RNA, False)
+
+ self.assertTrue(isinstance(tree, PhyloNode))
+ self.assertEqual(len(tree.tips()), 7)
+ self.assertRaises(NotImplementedError, build_tree_from_alignment, \
+ self.align1, RNA, True)
+
+ def test_insert_sequences_into_tree(self):
+ """Inserts sequences into Tree using params - test handles tree-insertion"""
+
+ # generate temp filename for output
+ outfname=splitext(get_tmp_filename('/tmp/'))[0]
+
+ # create starting tree
+ outtreefname=outfname+'.tre'
+ outtree=open(outtreefname,'w')
+ outtree.write(REF_TREE)
+ outtree.close()
+
+ # set params for tree-insertion
+ params={}
+ params["-w"]="/tmp/"
+ params["-n"] = get_tmp_filename().split("/")[-1]
+ params["-f"] = 'v'
+ #params["-G"] = '0.25'
+ params["-t"] = outtreefname
+ params["-m"] = 'GTRGAMMA'
+
+ aln_ref_query=get_align_for_phylip(StringIO(PHYLIP_FILE_DNA_REF_QUERY))
+ aln = Alignment(aln_ref_query)
+ seqs, align_map = aln.toPhylip()
+
+ tree = insert_sequences_into_tree(seqs, DNA, params=params,
+ write_log=False)
+
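+ # RAxML's EPA labels inserted query tips as "QUERY___<name>___<n>"; strip that decoration before mapping back to the original names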
+ for node in tree.tips():
+ removed_query_str=re.sub('QUERY___','',str(node.Name))
+ new_node_name=re.sub('___\d+','',str(removed_query_str))
+ if new_node_name in align_map:
+ node.Name = align_map[new_node_name]
+
+ self.assertTrue(isinstance(tree, PhyloNode))
+ self.assertEqual(tree.getNewick(with_distances=True),RESULT_TREE)
+ self.assertEqual(len(tree.tips()), 7)
+ self.assertRaises(NotImplementedError, build_tree_from_alignment, \
+ self.align1, RNA, True)
+
+ remove(outtreefname)
+
+PHYLIP_FILE= """ 7 50
+Species001 UGCAUGUCAG UAUAGCUUUA GUGAAACUGC GAAUGGCUCA UUAAAUCAGU
+Species002 UGCAUGUCAG UAUAGCUUUA GUGAAACUGC GAAUGGCUNN UUAAAUCAGU
+Species003 UGCAUGUCAG UAUAGCAUUA GUGAAACUGC GAAUGGCUCA UUAAAUCAGU
+Species004 UCCAUGUCAG UAUAACUUUG GUGAAACUGC GAAUGGCUCA UUAAAUCAGG
+Species005 NNNNNNNNNN UAUAUCUUAU GUGAAACUUC GAAUGCCUCA UUAAAUCAGU
+Species006 UGCAUGUCAG UAUAGCUUUG GUGAAACUGC GAAUGGCUCA UUAAAUCAGU
+Species007 UGCAUGUCAG UAUAACUUUG GUGAAACUGC GAAUGGCUCA UUAAAUCAGU
+"""
+
+
+PHYLIP_FILE_DNA_REF_QUERY= """ 7 50
+Species001 TGCATGTCAG TATAGCTTTA GTGAAACTGC GAATGGCTCA TTAAATCAGT
+Species002 TGCATGTCAG TATAGCTTTA GTGAAACTGC GAATGGCTNN TTAAATCAGT
+Species003 TGCATGTCAG TATAGCATTA GTGAAACTGC GAATGGCTCA TTAAATCAGT
+Species004 TCCATGTCAG TATAACTTTG GTGAAACTGC GAATGGCTCA TTAAATCAGG
+Species005 NNNNNNNNNN TATATCTTAT GTGAAACTTC GAATGCCTCA TTAAATCAGT
+Species006 TGCATGTCAG TATAGCTTTG GTGAAACTGC GAATGGCTCA TTAAATCAGT
+Species007 TGCATGTCAG TATAACTTTG GTGAAACTGC GAATGGCTCA TTAAATCAGT
+"""
+
+REF_TREE="""((seq0000004:0.08408,seq0000005:0.13713)0.609:0.00215,seq0000003:0.02032,(seq0000001:0.00014,seq0000002:0.00014)0.766:0.00015);
+"""
+
+RESULT_TREE="""(Species003:0.0194919169324,(Species001:4.34281710439e-07,Species002:4.34281710439e-07):4.34281710439e-07,(((Species006:0.0,Species007:0.0):0.0,Species004:0.0438017433031):0.0438017433031,Species005:0.171345128781):0.00331197405878);"""
+
+if __name__ == '__main__':
+ main()
diff --git a/bfillings/tests/test_rdp_classifier.py b/bfillings/tests/test_rdp_classifier.py
new file mode 100644
index 0000000..5efed73
--- /dev/null
+++ b/bfillings/tests/test_rdp_classifier.py
@@ -0,0 +1,398 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+"""Tests for the rdp_classifier_2.0.1 application controller"""
+
+from cStringIO import StringIO
+from os import getcwd, environ, remove, listdir
+from shutil import rmtree
+import tempfile
+from unittest import TestCase, main
+
+from bfillings.rdp_classifier import (RdpClassifier, RdpTrainer, assign_taxonomy,
+ train_rdp_classifier,
+ train_rdp_classifier_and_assign_taxonomy,
+ parse_rdp_assignment)
+
+
+class RdpClassifierTests(TestCase):
+ def setUp(self):
+ # fetch user's RDP_JAR_PATH
+ if 'RDP_JAR_PATH' in environ:
+ self.user_rdp_jar_path = environ['RDP_JAR_PATH']
+ else:
+ self.user_rdp_jar_path = 'rdp_classifier-2.2.jar'
+ self.output_file = tempfile.NamedTemporaryFile()
+
+ def test_default_java_vm_parameters(self):
+ """RdpClassifier should store default arguments to Java VM."""
+ a = RdpClassifier()
+ self.assertTrue('-Xmx' in a.Parameters)
+ self.assertEqual(a.Parameters['-Xmx'].Value, '1000m')
+
+ def test_parameters_list(self):
+ a = RdpClassifier()
+ parameters = a.Parameters.keys()
+ parameters.sort()
+ self.assertEqual(parameters, ['-Xmx', '-f', '-o', '-t'])
+
+ def test_assign_jvm_parameters(self):
+ """RdpCalssifier should pass alternate parameters to Java VM."""
+ app = RdpClassifier()
+ app.Parameters['-Xmx'].on('75M')
+ exp = ''.join([
+ 'cd "', getcwd(), '/"; java -Xmx75M -jar "',
+ self.user_rdp_jar_path, '" -q'])
+ self.assertEqual(app.BaseCommand, exp)
+
+ def test_basecommand_property(self):
+ """RdpClassifier BaseCommand property should use overridden method."""
+ app = RdpClassifier()
+ self.assertEqual(app.BaseCommand, app._get_base_command())
+
+ def test_base_command(self):
+ """RdpClassifier should return expected shell command."""
+ app = RdpClassifier()
+ exp = ''.join([
+ 'cd "', getcwd(), '/"; java -Xmx1000m -jar "',
+ self.user_rdp_jar_path, '" -q'])
+ self.assertEqual(app.BaseCommand, exp)
+
+ def test_change_working_dir(self):
+ """RdpClassifier should run program in expected working directory."""
+ test_dir = '/tmp/RdpTest'
+
+ app = RdpClassifier(WorkingDir=test_dir)
+ exp = ''.join([
+ 'cd "', test_dir, '/"; java -Xmx1000m -jar "',
+ self.user_rdp_jar_path, '" -q'])
+ self.assertEqual(app.BaseCommand, exp)
+
+ rmtree(test_dir)
+
+ def test_sample_fasta(self):
+ """RdpClassifier should classify its own sample data correctly"""
+ test_dir = '/tmp/RdpTest'
+ app = RdpClassifier(WorkingDir=test_dir)
+ _, output_fp = tempfile.mkstemp(dir=test_dir)
+ app.Parameters['-o'].on(output_fp)
+
+ results = app(StringIO(rdp_sample_fasta))
+
+ assignment_toks = results['Assignments'].readline().split('\t')
+
+ self.assertEqual(assignment_toks[0], 'X67228')
+ lineage = [x.strip('"') for x in assignment_toks[2::3]]
+ self.assertEqual(lineage, [
+ 'Root', 'Bacteria', 'Proteobacteria', 'Alphaproteobacteria',
+ 'Rhizobiales', 'Rhizobiaceae', 'Rhizobium'])
+ rmtree(test_dir)
+
+
+class RdpTrainerTests(TestCase):
+ """Tests of the trainer for the RdpClassifier app
+ """
+
+ def setUp(self):
+ self.reference_file = StringIO(rdp_training_sequences)
+ self.reference_file.seek(0)
+
+ self.taxonomy_file = tempfile.NamedTemporaryFile(
+ prefix="RdpTaxonomy", suffix=".txt")
+ self.taxonomy_file.write(rdp_training_taxonomy)
+ self.taxonomy_file.seek(0)
+
+ self.training_dir = tempfile.mkdtemp(prefix='RdpTrainer_output_')
+
+ def tearDown(self):
+ rmtree(self.training_dir)
+
+ def test_call(self):
+ app = RdpTrainer()
+ app.Parameters['taxonomy_file'].on(self.taxonomy_file.name)
+ app.Parameters['model_output_dir'].on(self.training_dir)
+ results = app(self.reference_file)
+
+ exp_file_list = [
+ 'bergeyTrainingTree.xml', 'genus_wordConditionalProbList.txt',
+ 'logWordPrior.txt', 'RdpClassifier.properties',
+ 'wordConditionalProbIndexArr.txt',
+ ]
+ obs_file_list = listdir(self.training_dir)
+ exp_file_list.sort()
+ obs_file_list.sort()
+ self.assertEqual(obs_file_list, exp_file_list)
+
+ autogenerated_headers = {
+ 'bergeyTree': 'bergeyTrainingTree',
+ 'probabilityList': 'genus_wordConditionalProbList',
+ 'wordPrior': 'logWordPrior',
+ 'probabilityIndex': 'wordConditionalProbIndexArr',
+ }
+ for id, basename in autogenerated_headers.iteritems():
+ obs_header = results[id].readline()
+ exp_header = exp_training_header_template % basename
+ self.assertEqual(exp_header, obs_header)
+
+
+class RdpWrapperTests(TestCase):
+ """ Tests of RDP classifier wrapper functions
+ """
+ def setUp(self):
+ self.num_trials = 10
+
+ self.test_input1 = rdp_test_fasta.split('\n')
+ self.expected_assignments1 = rdp_expected_out
+
+ # Files for training
+ self.reference_file = StringIO(rdp_training_sequences)
+ self.reference_file.seek(0)
+
+ self.taxonomy_file = StringIO(rdp_training_taxonomy)
+ self.taxonomy_file.seek(0)
+
+ self.training_dir = tempfile.mkdtemp(prefix='RdpTrainer_output_')
+
+ # Sequences for trained classifier
+ self.test_trained_input = rdp_trained_fasta.split("\n")
+
+ def tearDown(self):
+ rmtree(self.training_dir)
+
+ def test_parse_rdp_assignment(self):
+ seqid, direction, assignments = parse_rdp_assignment(
+ "X67228\t\t"
+ "Root\tnorank\t1.0\t"
+ "Bacteria\tdomain\t1.0\t"
+ "\"Proteobacteria\"\tphylum\t1.0\t"
+ "Alphaproteobacteria\tclass\t0.9\t"
+ "Rhizobiales\torder\t0.9\t"
+ "Rhizobiaceae\tfamily\t0.47\t"
+ "Rhizobium\tgenus\t0.46")
+ self.assertEqual(seqid, "X67228")
+
+ def test_assign_taxonomy_short_sequence(self):
+ """assign_taxonomy should return Unclassifiable if sequence is too short
+ """
+ assignments = assign_taxonomy([
+ '>MySeq 1',
+ 'TTCCGGTTGATCCTGCCGGACCCGACTGCTATCCGGA',
+ ])
+ self.assertEqual(assignments, {'MySeq 1': ('Unassignable', 1.0)})
+
+ def test_assign_taxonomy(self):
+ """assign_taxonomy wrapper functions as expected
+
+ This test may fail periodically, but failure should be rare.
+ """
+ unverified_seq_ids = set(self.expected_assignments1.keys())
+ for i in range(self.num_trials):
+ obs_assignments = assign_taxonomy(self.test_input1)
+ for seq_id in list(unverified_seq_ids):
+ obs_lineage, obs_confidence = obs_assignments[seq_id]
+ exp_lineage = self.expected_assignments1[seq_id]
+ if (obs_lineage == exp_lineage):
+ unverified_seq_ids.remove(seq_id)
+ if not unverified_seq_ids:
+ break
+
+ messages = []
+ for seq_id in unverified_seq_ids:
+ messages.append("Unable to verify %s trials" % self.num_trials)
+ messages.append(" Sequence ID: %s" % seq_id)
+ messages.append(" Expected: %s" % self.expected_assignments1[seq_id])
+ messages.append(" Observed: %s" % obs_assignments[seq_id][0])
+ messages.append(" Confidence: %s" % obs_assignments[seq_id][1])
+
+ # make sure all taxonomic results were correct at least once
+ self.assertFalse(unverified_seq_ids, msg='\n'.join(messages))
+
+ def test_assign_taxonomy_alt_confidence(self):
+ """assign_taxonomy wrapper functions as expected with alt confidence
+ """
+ obs_assignments = assign_taxonomy(
+ self.test_input1, min_confidence=0.95)
+
+ for seq_id, assignment in obs_assignments.items():
+ obs_lineage, obs_confidence = assignment
+ exp_lineage = self.expected_assignments1[seq_id]
+ message = "Sequence ID: %s, assignment: %s" % (seq_id, assignment)
+ self.assertTrue(
+ exp_lineage.startswith(obs_lineage) or \
+ (obs_lineage == "Unclassified"),
+ msg=message,
+ )
+ self.assertTrue(obs_confidence >= 0.95, msg=message)
+
+ def test_assign_taxonomy_file_output(self):
+ """ assign_taxonomy wrapper writes correct file output when requested
+
+ This test checks for successful completion of assign_taxonomy
+ when writing to a file, that roughly the right number of lines are
+ written (verified by zipping with the expected headers), and that
+ each line starts with the correct seq id. Actual testing of
+ taxonomy data is performed elsewhere.
+
+ """
+ _, output_fp = tempfile.mkstemp(prefix='RDPAssignTaxonomyTests',
+ suffix='.txt')
+ # convert the expected dict to a list of lines to match
+ # file output
+ expected_file_headers = self.expected_assignments1.keys()
+ expected_file_headers.sort()
+
+ actual_return_value = assign_taxonomy(\
+ self.test_input1,min_confidence=0.95,output_fp=output_fp)
+
+ actual_file_output = list(open(output_fp))
+ actual_file_output.sort()
+
+ # remove the output_fp before running the tests, so if they
+ # fail the output file is still cleaned-up
+ remove(output_fp)
+
+ # None return value on write to file
+ self.assertEqual(actual_return_value,None)
+
+ # check that each line starts with the correct seq_id -- not
+ # checking the taxonomies or confidences here as these are variable and
+ # tested elsewhere
+ for a,e in zip(actual_file_output,expected_file_headers):
+ self.assertTrue(a.startswith(e))
+
+ def test_train_rdp_classifier(self):
+ results = train_rdp_classifier(
+ self.reference_file, self.taxonomy_file, self.training_dir)
+
+ exp_file_list = [
+ 'bergeyTrainingTree.xml', 'genus_wordConditionalProbList.txt',
+ 'logWordPrior.txt', 'RdpClassifier.properties',
+ 'wordConditionalProbIndexArr.txt',
+ ]
+ obs_file_list = listdir(self.training_dir)
+ exp_file_list.sort()
+ obs_file_list.sort()
+ self.assertEqual(obs_file_list, exp_file_list)
+
+ autogenerated_headers = {
+ 'bergeyTree': 'bergeyTrainingTree',
+ 'probabilityList': 'genus_wordConditionalProbList',
+ 'wordPrior': 'logWordPrior',
+ 'probabilityIndex': 'wordConditionalProbIndexArr',
+ }
+ for id, basename in autogenerated_headers.iteritems():
+ obs_header = results[id].readline()
+ exp_header = exp_training_header_template % basename
+ self.assertEqual(exp_header, obs_header)
+
+ def test_train_rdp_classifier_and_assign_taxonomy(self):
+ obs = train_rdp_classifier_and_assign_taxonomy(self.reference_file,
+ self.taxonomy_file, self.test_trained_input, min_confidence=0.80,
+ model_output_dir=self.training_dir)
+ exp = {'X67228': (
+ 'Bacteria;Proteobacteria;Alphaproteobacteria;Rhizobiales;'
+ 'Rhizobiaceae;Rhizobium', 1.0
+ )}
+ self.assertEqual(obs, exp)
+
+ def test_train_rdp_classifier_and_assign_taxonomy_no_model_output(self):
+ obs = train_rdp_classifier_and_assign_taxonomy(
+ self.reference_file, self.taxonomy_file, self.test_trained_input)
+ exp = {'X67228': (
+ 'Bacteria;Proteobacteria;Alphaproteobacteria;Rhizobiales;'
+ 'Rhizobiaceae;Rhizobium', 1.0
+ )}
+ self.assertEqual(obs, exp)
+
+# Sample data copied from rdp_classifier-2.0, which is licensed under
+# the GPL 2.0 and Copyright 2008 Michigan State University Board of
+# Trustees
+
+rdp_training_sequences = """>X67228 Bacteria;Proteobacteria;Alphaproteobacteria;Rhizobiales;Rhizobiaceae;Rhizobium
+aacgaacgctggcggcaggcttaacacatgcaagtcgaacgctccgcaaggagagtggcagacgggtgagtaacgcgtgggaatctacccaaccctgcggaatagctctgggaaactggaattaataccgcatacgccctacgggggaaagatttatcggggatggatgagcccgcgttggattagctagttggtggggtaaaggcctaccaaggcgacgatccatagctggtctgagaggatgatcagccacattgggactgagacacggcccaaa
+>X73443 Bacteria;Firmicutes;Clostridia;Clostridiales;Clostridiaceae;Clostridium
+nnnnnnngagatttgatcctggctcaggatgaacgctggccggccgtgcttacacatgcagtcgaacgaagcgcttaaactggatttcttcggattgaagtttttgctgactgagtggcggacgggtgagtaacgcgtgggtaacctgcctcatacagggggataacagttagaaatgactgctaataccnnataagcgcacagtgctgcatggcacagtgtaaaaactccggtggtatgagatggacccgcgtctgattagctagttggtggggt
+>AB004750 Bacteria;Proteobacteria;Gammaproteobacteria;Enterobacteriales;Enterobacteriaceae;Enterobacter
+acgctggcggcaggcctaacacatgcaagtcgaacggtagcagaaagaagcttgcttctttgctgacgagtggcggacgggtgagtaatgtctgggaaactgcccgatggagggggataactactggaaacggtagctaataccgcataacgtcttcggaccaaagagggggaccttcgggcctcttgccatcggatgtgcccagatgggattagctagtaggtggggtaacggctcacctaggcgacgatccctagctggtctgagaggatgaccagccacactggaactgagacacggtccagactcctacgggaggcagcagtggggaatattgca
+>xxxxxx Bacteria;Proteobacteria;Gammaproteobacteria;Pseudomonadales;Pseudomonadaceae;Pseudomonas
+ttgaacgctggcggcaggcctaacacatgcaagtcgagcggcagcannnncttcgggaggctggcgagcggcggacgggtgagtaacgcatgggaacttacccagtagtgggggatagcccggggaaacccggattaataccgcatacgccctgagggggaaagcgggctccggtcgcgctattggatgggcccatgtcggattagttagttggtggggtaatggcctaccaaggcgacgatccgtagctggtctgagaggatgatcagccacaccgggactgagacacggcccggactcctacgggaggcagcagtggggaatattggacaatgggggcaaccctgatccagccatgccg
+>AB004748 Bacteria;Proteobacteria;Gammaproteobacteria;Enterobacteriales;Enterobacteriaceae;Enterobacter
+acgctggcggcaggcctaacacatgcaagtcgaacggtagcagaaagaagcttgcttctttgctgacgagtggcggacgggtgagtaatgtctgggaaactgcccgatggagggggataactactggaaacggtagctaataccgcataacgtcttcggaccaaagagggggaccttcgggcctcttgccatcggatgtgcccagatgggattagctagtaggtggggtaacggctcacctaggcgacgatccctagctggtctgagaggatgaccagccacactggaactgagacacggtccagactcctacgggaggcagcagtggggaatattgcacaatgggcgcaagcctgatgcagccatgccgcgtgtatgaagaaggccttcgggttg
+>AB000278 Bacteria;Proteobacteria;Gammaproteobacteria;Vibrionales;Vibrionaceae;Photobacterium
+caggcctaacacatgcaagtcgaacggtaanagattgatagcttgctatcaatgctgacgancggcggacgggtgagtaatgcctgggaatataccctgatgtgggggataactattggaaacgatagctaataccgcataatctcttcggagcaaagagggggaccttcgggcctctcgcgtcaggattagcccaggtgggattagctagttggtggggtaatggctcaccaaggcgacgatccctagctggtctgagaggatgatcagccacactggaactgagacacggtccagactcctacgggaggcagcagtggggaatattgcacaatgggggaaaccctgatgcagccatgccgcgtgta
+>AB000390 Bacteria;Proteobacteria;Gammaproteobacteria;Vibrionales;Vibrionaceae;Vibrio
+tggctcagattgaacgctggcggcaggcctaacacatgcaagtcgagcggaaacgantnntntgaaccttcggggnacgatnacggcgtcgagcggcggacgggtgagtaatgcctgggaaattgccctgatgtgggggataactattggaaacgatagctaataccgcataatgtctacggaccaaagagggggaccttcgggcctctcgcttcaggatatgcccaggtgggattagctagttggtgaggtaatggctcaccaaggcgacgatccctagctggtctgagaggatgatcagccacactggaactgag
+"""
+
+rdp_training_taxonomy = """\
+1*Bacteria*0*0*domain
+765*Firmicutes*1*1*phylum
+766*Clostridia*765*2*class
+767*Clostridiales*766*3*order
+768*Clostridiaceae*767*4*family
+769*Clostridium*768*5*genus
+160*Proteobacteria*1*1*phylum
+433*Gammaproteobacteria*160*2*class
+586*Vibrionales*433*3*order
+587*Vibrionaceae*586*4*family
+588*Vibrio*587*5*genus
+592*Photobacterium*587*5*genus
+552*Pseudomonadales*433*3*order
+553*Pseudomonadaceae*552*4*family
+554*Pseudomonas*553*5*genus
+604*Enterobacteriales*433*3*order
+605*Enterobacteriaceae*604*4*family
+617*Enterobacter*605*5*genus
+161*Alphaproteobacteria*160*2*class
+260*Rhizobiales*161*3*order
+261*Rhizobiaceae*260*4*family
+262*Rhizobium*261*5*genus"""
+
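+ # every training file written by RDP begins with this one-line header; %s is the file's basename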
+exp_training_header_template = "<trainsetNo>1</trainsetNo><version>version1</version><modversion>cogent</modversion><file>%s</file>\n"
+
+rdp_trained_fasta = """>X67228
+aacgaacgctggcggcaggcttaacacatgcaagtcgaacgctccgcaaggagagtggcagacgggtgagtaacgcgtgggaatctacccaaccctgcggaatagctctgggaaactggaattaataccgcatacgccctacgggggaaagatttatcggggatggatgagcccgcgttggattagctagttggtggggtaaaggcctaccaaggcgacgatccatagctggtctgagaggatgatcagccacattgggactgagacacggcccaaa
+"""
+
+rdp_sample_fasta = """>X67228 Bacteria;Proteobacteria;Alphaproteobacteria;Rhizobiales;Rhizobiaceae;Rhizobium
+aacgaacgctggcggcaggcttaacacatgcaagtcgaacgctccgcaaggagagtggcagacgggtgagtaacgcgtgggaatctacccaaccctgcggaatagctctgggaaactggaattaataccgcatacgccctacgggggaaagatttatcggggatggatgagcccgcgttggattagctagttggtggggtaaaggcctaccaaggcgacgatccatagctggtctgagaggatgatcagccacattgggactgagacacggcccaaa
+"""
+
+rdp_sample_classification = """>X67228 reverse=false
+Root; 1.0; Bacteria; 1.0; Proteobacteria; 1.0; Alphaproteobacteria; 1.0; Rhizobiales; 1.0; Rhizobiaceae; 1.0; Rhizobium; 0.95;
+"""
+
+rdp_test_fasta = """>AY800210 description field
+TTCCGGTTGATCCTGCCGGACCCGACTGCTATCCGGATGCGACTAAGCCATGCTAGTCTAACGGATCTTCGGATCCGTGGCATACCGCTCTGTAACACGTAGATAACCTACCCTGAGGTCGGGGAAACTCCCGGGAAACTGGGCCTAATCCCCGATAGATAATTTGTACTGGAATGTCTTTTTATTGAAACCTCCGAGGCCTCAGGATGGGTCTGCGCCAGATTATGGTCGTAGGTGGGGTAACGGCCCACCTAGCCTTTGATCTGTACCGGACATGAGAGTGTGTGCCGGGAGATGGCCACTGAGACAAGGGGCCAGGCCCTACGGGGCGCAGCAGGCGCGAAAACTTCACAATGCCCGCAAGGGTGATGAGGGTATCCGAGTGCTACCTTAGCCGGTAGCTTTTATTCAGTGTAAATAGCTAGATGAATAAGGGGAGGGCAAGGCTGGTGCCAGCCGCCGCGGTAAAACCAGCTCCCGAGTGGTCGGGAT [...]
+>EU883771
+TGGCGTACGGCTCAGTAACACGTGGATAACTTACCCTTAGGACTGGGATAACTCTGGGAAACTGGGGATAATACTGGATATTAGGCTATGCCTGGAATGGTTTGCCTTTGAAATGTTTTTTTTCGCCTAAGGATAGGTCTGCGGCTGATTAGGTCGTTGGTGGGGTAATGGCCCACCAAGCCGATGATCGGTACGGGTTGTGAGAGCAAGGGCCCGGAGATGGAACCTGAGACAAGGTTCCAGACCCTACGGGGTGCAGCAGGCGCGAAACCTCCGCAATGTACGAAAGTGCGACGGGGGGATCCCAAGTGTTATGCTTTTTTGTATGACTTTTCATTAGTGTAAAAAGCTTTTAGAATAAGAGCTGGGCAAGACCGGTGCCAGCCGCCGCGGTAACACCGGCAGCTCGAGTGGTGACCACTTTTATTGGGCTTAAAGCGTTCGTAGCTTGATTTTTAAGTCTCTTGGGAAATCTCACGGCTTAACTGTGAG [...]
+>EF503699
+AAGAATGGGGATAGCATGCGAGTCACGCCGCAATGTGTGGCATACGGCTCAGTAACACGTAGTCAACATGCCCAGAGGACGTGGACACCTCGGGAAACTGAGGATAAACCGCGATAGGCCACTACTTCTGGAATGAGCCATGACCCAAATCTATATGGCCTTTGGATTGGACTGCGGCCGATCAGGCTGTTGGTGAGGTAATGGCCCACCAAACCTGTAACCGGTACGGGCTTTGAGAGAAGGAGCCCGGAGATGGGCACTGAGACAAGGGCCCAGGCCCTATGGGGCGCAGCAGGCACGAAACCTCTGCAATAGGCGAAAGCTTGACAGGGTTACTCTGAGTGATGCCCGCTAAGGGTATCTTTTGGCACCTCTAAAAATGGTGCAGAATAAGGGGTGGGCAAGTCTGGTGTCAGCCGCCGCGGTAATACCAGCACCCCGAGTTGTCGGGACGATTATTGGGCCTAAAGCATCCGTAGCCTGTTCTGCAAG [...]
+>random_seq
+AAGCTCCGTCGCGTGAGCTAAAAACCATGCTGACTTATGAGACCTAAAAGCGATGCGCCGACCTGACGATGCTCTGTTCAGTTTCATCACGATCACCGGTAGTCAGGGTACCCTCCAGACCGCGCATAGTGACTATGTTCCCGCACCTGTATATGTAATTCCCATTATACGTCTACGTTATGTAGTAAAGTTGCTCACGCCAGGCACAGTTTGTCTTGATACATAGGGTAGCTTAAGTCCCGTCCATTTCACCGCGATTGTAATAGACGAATCAGCAGTGGTGCAATCAAGTCCCAACAGTTATATTTCAAAAATCTTCCGATAGTCGTGGGCGAAGTTGTCAACCTACCTACCATGGCTATAAGGCCCAGTTTACTTCAGTTGAACGTGACGGTAACCCTACTGAGTGCACGATACCTGCTCAACAACGGCCCAAAACCCGTGCGACACATTGGGCACTACAATAATCTTAGAGGACCATGGATCTGGTGG [...]
+>DQ260310
+GATACCCCCGGAAACTGGGGATTATACCGGATATGTGGGGCTGCCTGGAATGGTACCTCATTGAAATGCTCCCGCGCCTAAAGATGGATCTGCCGCAGAATAAGTAGTTTGCGGGGTAAATGGCCACCCAGCCAGTAATCCGTACCGGTTGTGAAAACCAGAACCCCGAGATGGAAACTGAAACAAAGGTTCAAGGCCTACCGGGCACAACAAGCGCCAAAACTCCGCCATGCGAGCCATCGCGACGGGGGAAAACCAAGTACCACTCCTAACGGGGTGGTTTTTCCGAAGTGGAAAAAGCCTCCAGGAATAAGAACCTGGGCCAGAACCGTGGCCAGCCGCCGCCGTTACACCCGCCAGCTCGAGTTGTTGGCCGGTTTTATTGGGGCCTAAAGCCGGTCCGTAGCCCGTTTTGATAAGGTCTCTCTGGTGAAATTCTACAGCTTAACCTGTGGGAATTGCTGGAGGATACTATTCAAGCTTGAAGCCGGG [...]
+>EF503697
+TAAAATGACTAGCCTGCGAGTCACGCCGTAAGGCGTGGCATACAGGCTCAGTAACACGTAGTCAACATGCCCAAAGGACGTGGATAACCTCGGGAAACTGAGGATAAACCGCGATAGGCCAAGGTTTCTGGAATGAGCTATGGCCGAAATCTATATGGCCTTTGGATTGGACTGCGGCCGATCAGGCTGTTGGTGAGGTAATGGCCCACCAAACCTGTAACCGGTACGGGCTTTGAGAGAAGTAGCCCGGAGATGGGCACTGAGACAAGGGCCCAGGCCCTATGGGGCGCAGCAGGCGCGAAACCTCTGCAATAGGCGAAAGCCTGACAGGGTTACTCTGAGTGATGCCCGCTAAGGGTATCTTTTGGCACCTCTAAAAATGGTGCAGAATAAGGGGTGGGCAAGTCTGGTGTCAGCCGCCGCGGTAATACCAGCACCCCGAGTTGTCGGGACGATTATTGGGCCTAAAGCATCCGTAGCCTGTTCTGCAAG [...]
+>short_seq
+TAAAATGACTAGCCTGCGAGTCAC
+"""
+
+rdp_expected_out = {
+ 'AY800210 description field': 'Archaea;Euryarchaeota',
+ 'EU883771': 'Archaea;Euryarchaeota;Methanomicrobia;Methanomicrobiales;Methanomicrobiaceae;Methanomicrobium',
+ 'EF503699': 'Archaea;Crenarchaeota;Thermoprotei',
+ 'random_seq': 'Bacteria',
+ 'DQ260310': 'Archaea;Euryarchaeota;Methanobacteria;Methanobacteriales;Methanobacteriaceae;Methanosphaera',
+ 'EF503697': 'Archaea;Crenarchaeota;Thermoprotei',
+ 'short_seq': 'Unassignable',
+ }
+
+if __name__ == '__main__':
+ main()
diff --git a/bfillings/tests/test_rtax.py b/bfillings/tests/test_rtax.py
new file mode 100644
index 0000000..572caac
--- /dev/null
+++ b/bfillings/tests/test_rtax.py
@@ -0,0 +1,228 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+from unittest import TestCase, main
+from tempfile import mkstemp
+
+from skbio.util import remove_files
+
+from bfillings.rtax import Rtax, assign_taxonomy
+
+
+class RtaxClassifierTests(TestCase):
+ """ Tests of the RTAX classifier module """
+
+ def setUp(self):
+ self.maxDiff = None
+
+ _, self.id_to_taxonomy_fp = mkstemp(prefix='RtaxTaxonAssignerTests_',
+ suffix='.txt')
+ _, self.input_seqs_fp = mkstemp(prefix='RtaxTaxonAssignerTests_',
+ suffix='.fasta')
+ _, self.reference_seqs_fp = mkstemp(prefix='RtaxTaxonAssignerTests_',
+ suffix='.fasta')
+ _, self.read_1_seqs_fp = mkstemp(prefix='RtaxTaxonAssignerTests_',
+ suffix='.fasta')
+ _, self.read_2_seqs_fp = mkstemp(prefix='RtaxTaxonAssignerTests_',
+ suffix='.fasta')
+
+ self._paths_to_clean_up = [self.id_to_taxonomy_fp,self.input_seqs_fp,self.reference_seqs_fp, self.read_1_seqs_fp,self.read_2_seqs_fp]
+
+ a = open(self.id_to_taxonomy_fp,'w')
+ a.write(rtax_reference_taxonomy)
+ a.close()
+ b = open(self.reference_seqs_fp,'w')
+ b.write(rtax_reference_fasta)
+ b.close()
+ c = open(self.input_seqs_fp,'w')
+ c.write(rtax_test_repset_fasta)
+ c.close()
+ d = open(self.read_1_seqs_fp,'w')
+ d.write(rtax_test_read1_fasta)
+ d.close()
+ e = open(self.read_2_seqs_fp,'w')
+ e.write(rtax_test_read2_fasta)
+ e.close()
+
+ def tearDown(self):
+ remove_files(set(self._paths_to_clean_up),error_on_missing=False)
+
+ def test_paired_end_classification(self):
+ self._paths_to_clean_up += cleanAll(self.read_1_seqs_fp)
+ self._paths_to_clean_up += cleanAll(self.read_2_seqs_fp)
+ result = assign_taxonomy(self.input_seqs_fp, self.reference_seqs_fp, self.id_to_taxonomy_fp, self.read_1_seqs_fp, self.read_2_seqs_fp,single_ok=False,header_id_regex="\\S+\\s+(\\S+?)\/")
+ self.assertEqual(result, rtax_expected_result_paired)
+
+ def test_paired_end_classification_with_fallback(self):
+ self._paths_to_clean_up += cleanAll(self.read_1_seqs_fp)
+ self._paths_to_clean_up += cleanAll(self.read_2_seqs_fp)
+ result = assign_taxonomy(self.input_seqs_fp, self.reference_seqs_fp, self.id_to_taxonomy_fp, self.read_1_seqs_fp, self.read_2_seqs_fp,single_ok=True,header_id_regex="\\S+\\s+(\\S+?)\/")
+ self.assertEqual(result, rtax_expected_result_paired_with_fallback)
+
+ def test_single_end_classification(self):
+ self._paths_to_clean_up += cleanAll(self.read_1_seqs_fp)
+ result = assign_taxonomy(self.input_seqs_fp, self.reference_seqs_fp, self.id_to_taxonomy_fp, self.read_1_seqs_fp, None ,header_id_regex="\\S+\\s+(\\S+?)\/")
+ self.assertEqual(result, rtax_expected_result_single)
+
+ # I'd like to add tests here that involve the TOOMANYHITS case. However, that requires either a reference
+ # database with >16,000 sequences, which we don't have handy for tests, or adjusting the maxMaxAccepts
+ # parameter to rtaxSearch.pl. The "rtax" wrapper shell script currently doesn't allow setting that option,
+ # and I'd prefer to leave it as is unless someone actually wants to use it. Thus the TOOMANYHITS situation
+ # is not easily testable at the moment.
+
+
+def cleanAll(path):
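+ # rtax leaves DBM-style index files (.pos.*, .lines.*) beside each read file; list them all so they can be removed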
+ return [path, path + ".pos.db", path + ".pos.dir", path + ".pos.pag", path + ".lines.db", path + ".lines.dir", path + ".lines.pag"]
+
+
+# sample data copied from GreenGenes
+
+
+rtax_reference_taxonomy = """508720 99.0 k__Bacteria p__Actinobacteria c__Actinobacteria o__Actinomycetales f__Propionibacteriaceae g__Propionibacterium s__Propionibacterium acnes
+508050 99.0 k__Bacteria p__Proteobacteria c__Betaproteobacteria o__Burkholderiales f__Comamonadaceae g__Diaphorobacter s__
+502492 99.0 k__Bacteria p__Proteobacteria c__Betaproteobacteria o__Burkholderiales f__ g__Aquabacterium s__
+"""
+
+rtax_reference_fasta = """>508720
+GACGAACGCTGGCGGCGTGCTTAACACATGCAAGTCGAACGGAAAGGCCCTGCTTTTGTGGGGTGCTCGAGTGGCGAACG
+GGTGAGTAACACGTGAGTAACCTGCCCTTGACTTTGGGATAACTTCAGGAAACTGGGGCTAATACCGGATAGGAGCTCCT
+GCTGCATGGTGGGGGTTGGAAAGTTTCGGCGGTTGGGGATGGACTCGCGGCTTATCAGCTTGTTGGTGGGGTAGTGGCTT
+ACCAAGGCTTTGACGGGTAGCCGGCCTGAGAGGGTGACCGGCCACATTGGGACTGAGATACGGCCCAGACTCCTACGGGA
+GGCAGCAGTGGGGAATATTGCACAATGGGCGGAAGCCTGATGCAGCAACGCCGCGTGCGGGATGACGGCCTTCGGGTTGT
+AAACCGCTTTCGCCTGTGACGAAGCGTGAGTGACGGTAATGGGTAAAGAAGCACCGGCTAACTACGTGCCAGCAGCCGCG
+GTGATACGTAGGGTGCGAGCGTTGTCCGGATTTATTGGGCGTAAAGGGCTCGTAGGTGGTTGATCGCGTCGGAAGTGTAA
+TCTTGGGGCTTAACCCTGAGCGTGCTTTCGATACGGGTTGACTTGAGGAAGGTAGGGGAGAATGGAATTCCTGGTGGAGC
+GGTGGAATGCGCAGATATCAGGAGGAACACCAGTGGCGAAGGCGGTTCTCTGGGCCTTTCCTGACGCTGAGGAGCGAAAG
+CGTGGGGAGCGAACAGGCTTAGATACCCTGGTAGTCCACGCTGTAAACGGTGGGTACTAGGTGTGGGGTCCATTCCACGG
+GTTCCGTGCCGTAGCTAACGCTTTAAGTACCCCGCCTGGGGAGTACGGCCGCAAGGCTAAAACTCAAAGGAATTGACGGG
+GCCCCGCACAAGCGGCGGAGCATGCGGATTAATTCGATGCAACGCGTAGAACCTTACCTGGGTTTGACATGGATCGGGAG
+TGCTCAGAGATGGGTGTGCCTCTTTTGGGGTCGGTTCACAGGTGGTGCATGGCTGTCGTCAGCTCGTGTCGTGAGATGTT
+GGGTTAAGTCCCGCAACGAGCGCAACCCTTGTTCACTGTTGCCAGCACGTTATGGTGGGGACTCAGTGGAGACCGCCGGG
+GTCAACTCGGAGGAAGGTGGGGATGACGTCAAGTCATCATGCCCCTTATGTCCAGGGCTTCACGCATGCTACAATGGCTG
+GTACAGAGAGTGGCGAGCCTGTGAGGGTGAGCGAATCTCGGAAAGCCGGTCTCAGTTCGGATTGGGGTCTGCAACTCGAC
+CTCATGAAGTCGGAGTCGCTAGTAATCGCAGATCAGCAACGCTGCGGTGAATACGTTCCCGGGGCT
+>508050
+ATTGAACGCTGGCGGCATGCCTTACACATGCAAGTCGAACGGTAACAGGTCTTCGGATGCTGACGAGTGGCGAACGGGTG
+AGTAATACATCGGAACGTGCCCGATCGTGGGGGATAACGAGGCGAAAGCTTTGCTAATACCGCATACGATCTACGGATGA
+AAGCGGGGGATCTTCGGACCTCGCGCGGACGGAGCGGCCGATGGCAGATTAGGTAGTTGGTGGGATAAAAGCTTACCAAG
+CCGACGATCTGTAGCTGGTCTGAGAGGATGATCAGCCACACTGGGACTGAGACACGGCCCAGACTCCTACGGGAGGCAGC
+AGTGGGGAATTTTGGACAATGGGCGAAAGCCTGATCCAGCCATGCCGCGTGCAGGATGAAGGCCTTCGGGTTGTAAACTG
+CTTTTGTACGGAACGAAAAGCCTCTTTCTAATAAAGAGGGGTCATGACGGTACCGTAAGAATAAGCACCGGCTAACTACG
+TGCCAGCAGCCGCGGTAATACGTAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGTGCGCAGGCGGTTTTGTA
+AGACAGAGGTGAAATCCCCGGGCTCAACCTGGGAACTGCCTTTGTGACTGCAAGGCTGGAGTGCGGCAGAGGGGGATGGA
+ATTCCGCGTGTAGCAGTGAAATGCGTAGATATGCGGAGGAACACCGATGGCGAAGGCAATCCCCTGGGCCTGCACTGACG
+CTCATGCACGAAAGCGTGGGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCCTAAACGATGTCAACTGGTTGTTG
+GGTCTTCACTGACTCAGTAACGAAGCTAACGCGTGAAGTTGACCGCCTGGGGAGTACGGCCGCAAGGTTGAAACTCAAAG
+GAATTGACGGGGACCCGCACAAGCGGTGGATGATGTGGTTTAATTCGATGCAACGCGAAAAACCTTACCCACCTTTGACA
+TGGCAGGAAGTTTCCAGAGATGGATTCGTGCCCGAAAGGGAACCTGCACACAGGTGCTGCATGGCTGTCGTCAGCTCGTG
+TCGTGAGATGTTGGGTTAAGTCCCGCAACGAGCGCAACCCTTGCCATTAGTTGCTACGAAAGGGCACTCTAATGGGACTG
+CCGGTGACAAACCGGAGGAAGGTGGGGATGACGTCAAGTCCTCATGGCCCTTATAGGTGGGGCTACACACGTCATACAAT
+GGCTGGTACAGAGGGTTGCCAACCCGCGAGGGGGAGCTAATCCCATAAAGCCAGTCGTAGTCCGGATCGCAGTCTGCAAC
+TCGACTGCGTGAAGTCGGAATCGCTAGTAATCGCGGATCAGAATGTCGCGGTGAATACGTTCCCGGGTCT
+>502492
+ATTGAACGCTGGCGGCATGCCTTACACATGCAAGTCGAACGGTAACGGGTCCTTCGGGATGCCGACGAGTGGCGAACGGG
+TGAGTAATATATCGGAACGTGCCCAGTAGTGGGGGATAACTGCTCGAAAGAGCAGCTAATACCGCATACGACCTGAGGGT
+GAAAGGGGGGGATCGCAAGACCTCTCGCTATTGGAGCGGCCGATATCAGATTAGCTAGTTGGTGGGGTAAAGGCCTACCA
+AGGCAACGATCTGTAGTTGGTCTGAGAGGACGACCAGCCACACTGGGACTGAGACACGGCCCAGACTCCTACGGGAGGCA
+GCAGTGGGGAATTTTGGACAATGGGCGCAAGCCTGATCCAGCAATGCCGCGTGCAGGAAGAAGGCCTTCGGGTTGTAAAC
+TGCTTTTGTCAGGGAAGAAATCTTCTGGGCTAATACCCCGGGAGGATGACGGTACCTGAAGAATAAGCACCGGCTAACTA
+CGTGCCAGCAGCCGCGGTAATACGTAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGTGCGCAGGCGGCTTTG
+CAAGACAGATGTGAAATCCCCGGGCTCAACCTGGGAACTGCATTTGTGACTGCAAGGCTAGAGTACGGCAGAGGGGGATG
+GAATTCCGCGTGTAGCAGTGAAATGCGTAGATATGCGGAGGAACACCAATGGCGAAGGCAATCCCCTGGGCCTGTACTGA
+CGCTCATGCACGAAAGCGTGGGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCCTAAACGATGTCAACTGGTTGT
+TGGACGGCTTGCTGTTCAGTAACGAAGCTAACGCGTGAAGTTGACCGCCTGGGGAGTACGGCCGCAAGGTTGAAACTCAA
+AGGAATTGACGGGGACCCGCACAAGCGGTGGATGATGTGGTTTAATTCGATGCAACGCGAAAAACCTTACCTACCCTTGA
+CATGTCAAGAATTCTGCAGAGATGTGGAAGTGCTCGAAAGAGAACTTGAACACAGGTGCTGCATGGCCGTCGTCAGCTCG
+TGTCGTGAGATGTTGGGTTAAGTCCCGCAACGAGCGCAACCCTTGTCATTAGTTGCTACGCAAGAGCACTCTAATGAGAC
+TGCCGGTGACAAACCGGAGGAAGGTGGGGATGACGTCAGGTCCTCATGGCCCTTATGGGTAGGGCTACACACGTCATACA
+ATGGCCGGTACAGAGGGCTGCCAACCCGCGAGGGGGAGCCAATCCCAGAAAACCGGTCGTAGTCCGGATCGTAGTCTGCA
+ACTCGACTGCGTGAAGTCGGAATCGCTAGTAATCGCGGATCAGCTTGCCGCGGTGAATACGTTCCCGGGTCT
+"""
+
+
+rtax_test_repset_fasta = """>clusterIdA splitRead1IdA
+ACCAAGGCTTTGACGGGTAGCCGGCCTGAGTGGGTGACCGGCCACATTGGGACTGAGATACGGCCCAGACTCCTACGGGA
+>clusterIdB splitRead1IdB
+CCGACGATCTGTAGCTGGTCTGAGAGGATGTTCAGCCACACTGGGACTGAGACACGGCCCAGACTCCTACGGGAGGCAGC
+>clusterIdC splitRead1IdC
+AGGCAACGATCTGTAGTTGGTCTGAGAGGAGGACCAGCCACACTGGGACTGAGACACGGCCCAGACTCCTACGGGAGGCA
+>clusterIdD splitRead1IdD
+AGGCAACGATCTGTAGTTGGTCTGAGAGGAGGACCAGCCACACTGGGACGGGGGGGGGGCCCAGACTCCTACGGGAGGCA
+"""
+
+
+# These reads are the 4th and 14th lines from the reference seqs, with one
+# nucleotide changed in each, except D and E, which are unique to one read
+# file or the other, and F and G, which are just decoys.
+
+rtax_test_read1_fasta = """>splitRead1IdA ampliconId_34563456/1
+ACCAAGGCTTTGACGGGTAGCCGGCCTGAGTGGGTGACCGGCCACATTGGGACTGAGATACGGCCCAGACTCCTACGGGA
+>splitRead1IdB ampliconId_12341234/1
+CCGACGATCTGTAGCTGGTCTGAGAGGATGTTCAGCCACACTGGGACTGAGACACGGCCCAGACTCCTACGGGAGGCAGC
+>splitRead1IdC ampliconId_23452345/1
+AGGCAACGATCTGTAGTTGGTCTGAGAGGAGGACCAGCCACACTGGGACTGAGACACGGCCCAGACTCCTACGGGAGGCA
+>splitRead1IdD ampliconId_45674567/1
+AGGCAACGATCTGTAGTTGGTCTGAGAGGAGGACCAAAAAAAAAAAGACTGAGACACGGCCCAGACTCCTACGGGAGGCA
+>splitRead1IdF ampliconId_56785678/1
+AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+"""
+
+rtax_test_read2_fasta = """>splitRead2IdA ampliconId_34563456/3
+GGGTTAAGTCCCGCAACGAGCGCAACCCTTATTCACTGTTGCCAGCACGTTATGGTGGGGACTCAGTGGAGACCGCCGGG
+>splitRead2IdB ampliconId_12341234/3
+TCGTGAGATGTTGGGTTAAGTCCCGCAACGTGCGCAACCCTTGCCATTAGTTGCTACGAAAGGGCACTCTAATGGGACTG
+>splitRead2IdC ampliconId_23452345/3
+TGTCGTGAGATGTTGGGTTAAGTCCCGCAAAGAGCGCAACCCTTGTCATTAGTTGCTACGCAAGAGCACTCTAATGAGAC
+>splitRead2IdE ampliconId_67896789/3
+TGTCGTGAGATGTTGGGTTAAAAAAAAAAAAAAACGCAACCCTTGTCATTAGTTGCTACGCAAGAGCACTCTAATGAGAC
+>splitRead2IdG ampliconId_78907890/3
+TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT
+"""
+
+
+rtax_expected_result_paired = {
+ 'clusterIdA splitRead1IdA': ('k__Bacteria; p__Actinobacteria; c__Actinobacteria; o__Actinomycetales; f__Propionibacteriaceae; g__Propionibacterium; s__Propionibacterium acnes', 1.0),
+ 'clusterIdB splitRead1IdB': ('k__Bacteria; p__Proteobacteria; c__Betaproteobacteria; o__Burkholderiales; f__Comamonadaceae; g__Diaphorobacter; s__', 1.0),
+ 'clusterIdC splitRead1IdC': ('k__Bacteria; p__Proteobacteria; c__Betaproteobacteria; o__Burkholderiales; f__; g__Aquabacterium; s__', 1.0),
+ 'clusterIdD splitRead1IdD': ('NOMATEPAIR', 1.0),
+ }
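+# clusterIdD maps to NOMATEPAIR above because splitRead1IdD's amplicon ID
+# (45674567) never occurs in the read 2 file; with single_ok=True the same
+# cluster falls back to single-read classification, as in the next dict.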
+
+rtax_expected_result_paired_with_fallback = {
+ 'clusterIdA splitRead1IdA': ('k__Bacteria; p__Actinobacteria; c__Actinobacteria; o__Actinomycetales; f__Propionibacteriaceae; g__Propionibacterium; s__Propionibacterium acnes', 1.0),
+ 'clusterIdB splitRead1IdB': ('k__Bacteria; p__Proteobacteria; c__Betaproteobacteria; o__Burkholderiales; f__Comamonadaceae; g__Diaphorobacter; s__', 1.0),
+ 'clusterIdC splitRead1IdC': ('k__Bacteria; p__Proteobacteria; c__Betaproteobacteria; o__Burkholderiales; f__; g__Aquabacterium; s__', 1.0),
+ 'clusterIdD splitRead1IdD': ('k__Bacteria; p__Proteobacteria; c__Betaproteobacteria; o__Burkholderiales; f__; g__Aquabacterium; s__', 1.0),
+ }
+
+rtax_expected_result_single = {
+ 'clusterIdA splitRead1IdA': ('k__Bacteria; p__Actinobacteria; c__Actinobacteria; o__Actinomycetales; f__Propionibacteriaceae; g__Propionibacterium; s__Propionibacterium acnes', 1.0),
+ 'clusterIdB splitRead1IdB': ('k__Bacteria; p__Proteobacteria; c__Betaproteobacteria; o__Burkholderiales; f__Comamonadaceae; g__Diaphorobacter; s__', 1.0),
+ 'clusterIdC splitRead1IdC': ('k__Bacteria; p__Proteobacteria; c__Betaproteobacteria; o__Burkholderiales; f__; g__Aquabacterium; s__', 1.0),
+ 'clusterIdD splitRead1IdD': ('k__Bacteria; p__Proteobacteria; c__Betaproteobacteria; o__Burkholderiales; f__; g__Aquabacterium; s__', 1.0),
+ }
+
+if __name__ == "__main__":
+ main()
diff --git a/bfillings/tests/test_sortmerna_v2.py b/bfillings/tests/test_sortmerna_v2.py
new file mode 100644
index 0000000..1945ebe
--- /dev/null
+++ b/bfillings/tests/test_sortmerna_v2.py
@@ -0,0 +1,855 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+"""
+Unit tests for the SortMeRNA version 2.0 Application controller
+===============================================================
+"""
+
+
+from unittest import TestCase, main
+import re
+from os import close
+from os.path import abspath, exists, join, dirname
+from tempfile import mkstemp, mkdtemp
+from shutil import rmtree
+
+from skbio.util import remove_files
+from skbio.parse.sequences import parse_fasta
+
+from bfillings.sortmerna_v2 import (build_database_sortmerna,
+ sortmerna_ref_cluster,
+ sortmerna_map)
+
+
+# Test class and cases
+class SortmernaV2Tests(TestCase):
+ """ Tests for SortMeRNA version 2.0 functionality """
+
+ def setUp(self):
+ self.output_dir = mkdtemp()
+ self.reference_seq_fp = reference_seqs_fp
+ self.read_seqs_fp = read_seqs_fp
+
+ # create temporary file with reference sequences defined
+ # in reference_seqs_fp
+ f, self.file_reference_seq_fp = mkstemp(prefix='temp_references_',
+ suffix='.fasta')
+ close(f)
+
+ # write _reference_ sequences to tmp file
+        with open(self.file_reference_seq_fp, 'w') as tmp:
+            tmp.write(self.reference_seq_fp)
+
+ # create temporary file with read sequences defined in read_seqs_fp
+ f, self.file_read_seqs_fp = mkstemp(prefix='temp_reads_',
+ suffix='.fasta')
+ close(f)
+
+ # write _read_ sequences to tmp file
+        with open(self.file_read_seqs_fp, 'w') as tmp:
+            tmp.write(self.read_seqs_fp)
+
+ # list of files to remove
+ self.files_to_remove = [self.file_reference_seq_fp,
+ self.file_read_seqs_fp]
+
+ def tearDown(self):
+ remove_files(self.files_to_remove)
+ rmtree(self.output_dir)
+
+ def test_indexdb_default_param(self):
+ """ Test indexing a database using SortMeRNA
+ """
+ sortmerna_db, db_files_to_remove = build_database_sortmerna(
+ abspath(self.file_reference_seq_fp),
+ max_pos=250,
+ output_dir=self.output_dir)
+
+ expected_db_files = set(sortmerna_db + ext
+ for ext in ['.bursttrie_0.dat', '.kmer_0.dat',
+ '.pos_0.dat', '.stats'])
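+        # indexdb_rna is expected to write four index parts per database:
+        # a burst trie, a k-mer table, k-mer positions and a stats file
+        # (hence the four extensions above).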
+
+ # Make sure all db_files exist
+ for fp in expected_db_files:
+ self.assertTrue(exists(fp))
+
+        # Add the database files to be removed
+ self.files_to_remove.extend(db_files_to_remove)
+
+ def test_empty_fasta_path(self):
+ """ Indexdb should fail with an empty fasta path
+ """
+ self.assertRaises(ValueError,
+ build_database_sortmerna,
+ fasta_path=None,
+ max_pos=250,
+ output_dir=self.output_dir)
+
+ def test_empty_inputs(self):
+ """ (1) Indexdb should set output_dir to the same directory
+ as where the input FASTA file is located;
+ (2) SortMeRNA should fail if an empty result path is
+ passed;
+ (3) SortMeRNA should fail if an empty seq path is passed
+ """
+ sortmerna_db, db_files_to_remove = build_database_sortmerna(
+ abspath(self.file_reference_seq_fp),
+ max_pos=250,
+ output_dir=None)
+
+ self.files_to_remove.extend(db_files_to_remove)
+
+ fasta_dir = dirname(abspath(self.file_reference_seq_fp))
+ out_dir = dirname(sortmerna_db)
+
+ self.assertEqual(fasta_dir, out_dir)
+
+ self.assertRaises(ValueError,
+ sortmerna_ref_cluster,
+ seq_path=self.file_read_seqs_fp,
+ sortmerna_db=sortmerna_db,
+ refseqs_fp=self.file_reference_seq_fp,
+ result_path=None)
+
+ self.assertRaises(ValueError,
+ sortmerna_ref_cluster,
+ seq_path=None,
+ sortmerna_db=sortmerna_db,
+ refseqs_fp=self.file_reference_seq_fp,
+ result_path=join(self.output_dir,
+ "sortmerna_otus.txt"))
+
+ def test_tabular_output(self):
+ """ SortMeRNA should output a BLAST tabular output
+ """
+ sortmerna_db, db_files_to_remove = build_database_sortmerna(
+ abspath(self.file_reference_seq_fp),
+ max_pos=250,
+ output_dir=self.output_dir)
+
+ self.files_to_remove.extend(db_files_to_remove)
+
+ # Run SortMeRNA
+ clusters, failures, smr_files_to_remove = sortmerna_ref_cluster(
+ seq_path=self.file_read_seqs_fp,
+ sortmerna_db=sortmerna_db,
+ refseqs_fp=self.file_reference_seq_fp,
+ result_path=join(self.output_dir, "sortmerna_otus.txt"),
+ tabular=True)
+
+ self.assertTrue(exists(join(self.output_dir,
+ "sortmerna_otus.blast")))
+
+    def test_empty_sortmerna_db(self):
+        """ SortMeRNA should fail with an empty indexed database
+ """
+ self.assertRaises(ValueError,
+ sortmerna_ref_cluster,
+ seq_path=self.file_read_seqs_fp,
+ sortmerna_db=None,
+ refseqs_fp=self.file_reference_seq_fp,
+ result_path=join(self.output_dir,
+ "sortmerna_otus.txt")
+ )
+
+ def test_sortmerna_default_param(self):
+ """ SortMeRNA version 2.0 reference OTU picking works with default settings
+ """
+ # rebuild the index
+ sortmerna_db, db_files_to_remove = build_database_sortmerna(
+ abspath(self.file_reference_seq_fp),
+ max_pos=250,
+ output_dir=self.output_dir)
+
+ # Files created by indexdb_rna to be deleted
+ self.files_to_remove.extend(db_files_to_remove)
+
+ # Run SortMeRNA
+ cluster_map, failures, smr_files_to_remove = sortmerna_ref_cluster(
+ seq_path=self.file_read_seqs_fp,
+ sortmerna_db=sortmerna_db,
+ refseqs_fp=self.file_reference_seq_fp,
+ result_path=join(self.output_dir, "sortmerna_otus.txt"))
+
+ # Check all sortmerna output files exist
+ output_files = [join(self.output_dir, ext)
+ for ext in ['sortmerna_otus_otus.txt',
+ 'sortmerna_otus.log',
+ 'sortmerna_otus_denovo.fasta',
+ 'sortmerna_otus.fasta']]
+
+ # Check output files exist
+ for fp in output_files:
+ self.assertTrue(exists(fp))
+
+        # Files created by sortmerna to be deleted (stderr and stdout
+        # were already removed in sortmerna_ref_cluster)
+ self.files_to_remove.extend(output_files)
+
+ # Random reads that should not appear in any output file
+ random_reads = ['simulated_random_reads.fa.000000000',
+ 'simulated_random_reads.fa.000000001',
+ 'simulated_random_reads.fa.000000002',
+ 'simulated_random_reads.fa.000000003',
+ 'simulated_random_reads.fa.000000004',
+ 'simulated_random_reads.fa.000000005',
+ 'simulated_random_reads.fa.000000006',
+ 'simulated_random_reads.fa.000000007',
+ 'simulated_random_reads.fa.000000008',
+ 'simulated_random_reads.fa.000000009']
+
+ # Reads passing E-value threshold and with similarity/coverage >=97%
+ otu_reads = ['HMPMockV1.2.Staggered2.673827_47',
+ 'HMPMockV1.2.Staggered2.673827_115',
+ 'HMPMockV1.2.Staggered2.673827_122',
+ 'HMPMockV1.2.Staggered2.673827_161',
+ 'HMPMockV1.2.Staggered2.673827_180',
+ 'HMPMockV1.2.Staggered2.673827_203',
+ 'HMPMockV1.2.Staggered2.673827_207',
+ 'HMPMockV1.2.Staggered2.673827_215',
+ 'HMPMockV1.2.Staggered2.673827_218',
+ 'HMPMockV1.2.Staggered2.673827_220']
+
+ # Reads passing E-value threshold and with similarity/coverage <97%
+ denovo_reads = ['HMPMockV1.2.Staggered2.673827_0',
+ 'HMPMockV1.2.Staggered2.673827_1',
+ 'HMPMockV1.2.Staggered2.673827_2',
+ 'HMPMockV1.2.Staggered2.673827_3',
+ 'HMPMockV1.2.Staggered2.673827_4',
+ 'HMPMockV1.2.Staggered2.673827_5',
+ 'HMPMockV1.2.Staggered2.673827_6',
+ 'HMPMockV1.2.Staggered2.673827_7',
+ 'HMPMockV1.2.Staggered2.673827_8',
+ 'HMPMockV1.2.Staggered2.673827_9']
+
+ # Check correct number of OTU clusters in file
+ otu_clusters = ['295053']
+
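+        # output_files order: [0] OTU map, [1] log, [2] de novo FASTA,
+        # [3] aligned FASTA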
+ f_aligned = open(output_files[3], "U")
+ f_otumap = open(output_files[0], "U")
+ f_denovo = open(output_files[2], "U")
+
+ # Verify the aligned FASTA file
+ for label, seq in parse_fasta(f_aligned):
+            seq_id = label.split()[0]
+            # Read is not random
+            self.assertNotIn(seq_id, random_reads)
+            # Read is either in otu_reads or denovo_reads
+            self.assertIn(seq_id, otu_reads + denovo_reads)
+ f_aligned.close()
+
+ # Verify the de novo reads FASTA file
+ for label, seq in parse_fasta(f_denovo):
+            seq_id = label.split()[0]
+            # Read is not random
+            self.assertNotIn(seq_id, random_reads)
+            # Read is not an OTU read
+            self.assertNotIn(seq_id, otu_reads)
+            # Read is a de novo read
+            self.assertIn(seq_id, denovo_reads)
+ f_denovo.close()
+
+ # Check the OTU map
+ for line in f_otumap:
+ otu_entry = line.split()
+ # Cluster ID is correct
+ self.assertIn(otu_entry[0], otu_clusters)
+ # Each read in the cluster must exclusively be an OTU read
+ for read in otu_entry[1:]:
+ self.assertNotIn(read, random_reads)
+ self.assertNotIn(read, denovo_reads)
+ self.assertIn(read, otu_reads)
+ f_otumap.close()
+
+ # Check returned list of lists of clusters
+ expected_cluster = ['HMPMockV1.2.Staggered2.673827_47',
+ 'HMPMockV1.2.Staggered2.673827_115',
+ 'HMPMockV1.2.Staggered2.673827_122',
+ 'HMPMockV1.2.Staggered2.673827_161',
+ 'HMPMockV1.2.Staggered2.673827_180',
+ 'HMPMockV1.2.Staggered2.673827_203',
+ 'HMPMockV1.2.Staggered2.673827_207',
+ 'HMPMockV1.2.Staggered2.673827_215',
+ 'HMPMockV1.2.Staggered2.673827_218',
+ 'HMPMockV1.2.Staggered2.673827_220']
+
+ # Should only have 1 cluster
+ self.assertEqual(1, len(cluster_map))
+ for actual_cluster in cluster_map.itervalues():
+ actual_cluster.sort()
+ expected_cluster.sort()
+ self.assertEqual(actual_cluster, expected_cluster)
+
+ # Check log file number of clusters and failures corresponds to
+ # the results in the output files
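+        # (the log is assumed to contain summary lines of the form
+        # "    Total OTUs = 1" and "    Total reads for de novo
+        # clustering = 10"; the value is whatever follows " = ")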
+ f_log = open(output_files[1], "U")
+ num_clusters = 0
+ num_failures = 0
+ for line in f_log:
+ if line.startswith(" Total OTUs"):
+ num_clusters = (re.split(' = ', line)[1]).strip()
+ elif line.startswith(" Total reads for de novo clustering"):
+ num_failures = (re.split(' = ', line)[1]).strip()
+ f_log.close()
+
+ self.assertEqual(int(num_clusters), len(otu_clusters))
+ self.assertEqual(int(num_failures), len(denovo_reads))
+
+ def test_sortmerna_map_default(self):
+ """ SortMeRNA version 2.0 for mapping sequences onto a reference
+ using default parameters
+ """
+
+ # Rebuild the index
+ sortmerna_db, db_files_to_remove = build_database_sortmerna(
+ abspath(self.file_reference_seq_fp),
+ max_pos=250,
+ output_dir=self.output_dir)
+
+ # Files created by indexdb_rna to be deleted
+ self.files_to_remove.extend(db_files_to_remove)
+
+ # Run SortMeRNA mapper
+ app_result = sortmerna_map(seq_path=self.file_read_seqs_fp,
+ output_dir=self.output_dir,
+ refseqs_fp=self.file_reference_seq_fp,
+ sortmerna_db=sortmerna_db)
+
+ # Check all sortmerna output files exist
+ output_files = [join(self.output_dir, ext)
+ for ext in ['sortmerna_map.blast',
+ 'sortmerna_map.log']]
+
+ # Check output files exist
+ for fp in output_files:
+ self.assertTrue(exists(fp))
+
+ blast_alignments_fp = app_result['BlastAlignments'].name
+
+ # Check there are 30 alignments (1 per read)
+ with open(blast_alignments_fp, 'U') as blast_actual:
+ entries = (line.strip().split('\t') for line in blast_actual)
+ actual_alignments = {r[0]: r[1:] for r in entries}
+
+ self.assertEqual(30, len(actual_alignments))
+
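+        # Each dict value holds the remaining tab-separated columns: index
+        # 1 is the percent identity and index 12 is assumed to be
+        # SortMeRNA's appended query-coverage column in its extended
+        # BLAST-like tabular format.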
+ # Check this alignment exists
+ self.assertTrue("HMPMockV1.2.Staggered2.673827_47"
+ in actual_alignments)
+ self.assertEqual("97.3", actual_alignments[
+ "HMPMockV1.2.Staggered2.673827_47"][1])
+ self.assertEqual("100", actual_alignments[
+ "HMPMockV1.2.Staggered2.673827_47"][12])
+
+ # Check alignment for random read is NULL
+ self.assertTrue("simulated_random_reads.fa.000000000"
+ in actual_alignments)
+ self.assertEqual("*", actual_alignments[
+ "simulated_random_reads.fa.000000000"][0])
+
+ def test_sortmerna_map_sam_alignments(self):
+ """ SortMeRNA version 2.0 for mapping sequences onto a reference
+ outputting Blast and SAM alignments
+ """
+
+ # Rebuild the index
+ sortmerna_db, db_files_to_remove = build_database_sortmerna(
+ abspath(self.file_reference_seq_fp),
+ max_pos=250,
+ output_dir=self.output_dir)
+
+ # Files created by indexdb_rna to be deleted
+ self.files_to_remove.extend(db_files_to_remove)
+
+ # Run SortMeRNA mapper
+ app_result = sortmerna_map(seq_path=self.file_read_seqs_fp,
+ output_dir=self.output_dir,
+ refseqs_fp=self.file_reference_seq_fp,
+ sortmerna_db=sortmerna_db,
+ output_sam=True)
+
+ # Check all sortmerna output files exist
+ output_files = [join(self.output_dir, ext)
+ for ext in ['sortmerna_map.blast',
+ 'sortmerna_map.sam',
+ 'sortmerna_map.log']]
+
+ # Check output files exist
+ for fp in output_files:
+ self.assertTrue(exists(fp))
+
+ sam_alignments_fp = app_result['SAMAlignments'].name
+
+        # Check the SAM output holds 30 alignments (1 per read) plus headers
+ with open(sam_alignments_fp, 'U') as sam_actual:
+ entries = (line.strip().split('\t') for line in sam_actual)
+ actual_alignments = {r[0]: r[1:] for r in entries}
+
+ # 30 alignments expected + 2 lines for @HD and @PG fields
+ self.assertEqual(32, len(actual_alignments))
+
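+        # After the QNAME key, index 1 is the SAM RNAME column (the
+        # reference hit) and index 10 lands on the first optional tag,
+        # here the AS:i alignment score.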
+ # Check this alignment exists
+ self.assertTrue("HMPMockV1.2.Staggered2.673827_47"
+ in actual_alignments)
+ self.assertEqual("295053", actual_alignments[
+ "HMPMockV1.2.Staggered2.673827_47"][1])
+ self.assertEqual("AS:i:418", actual_alignments[
+ "HMPMockV1.2.Staggered2.673827_47"][10])
+
+ # Check alignment for random read is NULL
+ self.assertTrue("simulated_random_reads.fa.000000000"
+ in actual_alignments)
+ self.assertEqual("*", actual_alignments[
+ "simulated_random_reads.fa.000000000"][1])
+
+ def test_sortmerna_map_sam_alignments_with_tags(self):
+ """ SortMeRNA version 2.0 for mapping sequences onto a reference
+ outputting SAM alignments with @SQ tags
+ """
+
+ # Rebuild the index
+ sortmerna_db, db_files_to_remove = build_database_sortmerna(
+ abspath(self.file_reference_seq_fp),
+ max_pos=250,
+ output_dir=self.output_dir)
+
+ # Files created by indexdb_rna to be deleted
+ self.files_to_remove.extend(db_files_to_remove)
+
+ # Run SortMeRNA mapper
+ app_result = sortmerna_map(seq_path=self.file_read_seqs_fp,
+ output_dir=self.output_dir,
+ refseqs_fp=self.file_reference_seq_fp,
+ sortmerna_db=sortmerna_db,
+ output_sam=True,
+ sam_SQ_tags=True,
+ blast_format=None)
+
+ # Check all sortmerna output files exist
+ output_files = [join(self.output_dir, ext)
+ for ext in ['sortmerna_map.sam',
+ 'sortmerna_map.log']]
+
+ # Check output files exist
+ for fp in output_files:
+ self.assertTrue(exists(fp))
+
+ sam_alignments_fp = app_result['SAMAlignments'].name
+
+        # Check the SAM output holds 30 alignments (1 per read) plus headers
+ with open(sam_alignments_fp, 'U') as sam_actual:
+ actual_entries = [line.strip().split('\t') for line in sam_actual]
+
+ # 30 alignments expected + 2 lines for @HD and @PG fields + 5 lines
+ # for the @SQ tags
+ self.assertEqual(37, len(actual_entries))
+
+ # Check all expected @SQ tags have been included
+ SQ_array = [['@SQ', 'SN:42684', 'LN:1501'],
+ ['@SQ', 'SN:342684', 'LN:1486'],
+ ['@SQ', 'SN:426848', 'LN:1486'],
+ ['@SQ', 'SN:295053', 'LN:1389'],
+ ['@SQ', 'SN:879972', 'LN:1371']]
+ for entry in SQ_array:
+ self.assertTrue(entry in actual_entries)
+
+ def test_sortmerna_map_blast_no_null_alignments(self):
+ """ SortMeRNA version 2.0 for mapping sequences onto a reference
+ using Blast with --print_all_reads option set to False
+ (no NULL alignments output)
+ """
+
+ # Rebuild the index
+ sortmerna_db, db_files_to_remove = build_database_sortmerna(
+ abspath(self.file_reference_seq_fp),
+ max_pos=250,
+ output_dir=self.output_dir)
+
+ # Files created by indexdb_rna to be deleted
+ self.files_to_remove.extend(db_files_to_remove)
+
+ # Run SortMeRNA mapper
+ app_result = sortmerna_map(seq_path=self.file_read_seqs_fp,
+ output_dir=self.output_dir,
+ refseqs_fp=self.file_reference_seq_fp,
+ sortmerna_db=sortmerna_db,
+ print_all_reads=False)
+
+ # Check all sortmerna output files exist
+ output_files = [join(self.output_dir, ext)
+ for ext in ['sortmerna_map.blast',
+ 'sortmerna_map.log']]
+
+ # Check output files exist
+ for fp in output_files:
+ self.assertTrue(exists(fp))
+
+ blast_alignments_fp = app_result['BlastAlignments'].name
+
+        # Check there are 20 alignments (the random reads produce none)
+ with open(blast_alignments_fp, 'U') as blast_actual:
+ entries = (line.strip().split('\t') for line in blast_actual)
+ actual_alignments = {r[0]: r[1:] for r in entries}
+
+ self.assertEqual(20, len(actual_alignments))
+
+ # Check this alignment exists
+ self.assertTrue("HMPMockV1.2.Staggered2.673827_47"
+ in actual_alignments)
+ self.assertEqual("97.3", actual_alignments[
+ "HMPMockV1.2.Staggered2.673827_47"][1])
+ self.assertEqual("100", actual_alignments[
+ "HMPMockV1.2.Staggered2.673827_47"][12])
+
+ # Check alignment for random read does not exist
+ self.assertFalse("simulated_random_reads.fa.000000000"
+ in actual_alignments)
+
+ def test_sortmerna_map_num_alignments(self):
+ """ SortMeRNA version 2.0 for mapping sequences onto a reference
+ outputting first INT num_alignments passing the E-value threshold
+ (rather than first INT best alignments)
+ """
+
+ # Rebuild the index
+ sortmerna_db, db_files_to_remove = build_database_sortmerna(
+ abspath(self.file_reference_seq_fp),
+ max_pos=250,
+ output_dir=self.output_dir)
+
+ # Files created by indexdb_rna to be deleted
+ self.files_to_remove.extend(db_files_to_remove)
+
+ # Run SortMeRNA mapper
+ app_result = sortmerna_map(seq_path=self.file_read_seqs_fp,
+ output_dir=self.output_dir,
+ refseqs_fp=self.file_reference_seq_fp,
+ sortmerna_db=sortmerna_db,
+ num_alignments=1)
+
+ # Check all sortmerna output files exist
+ output_files = [join(self.output_dir, ext)
+ for ext in ['sortmerna_map.blast',
+ 'sortmerna_map.log']]
+
+ # Check output files exist
+ for fp in output_files:
+ self.assertTrue(exists(fp))
+
+ blast_alignments_fp = app_result['BlastAlignments'].name
+
+ # Check there are 30 alignments (1 per read)
+ with open(blast_alignments_fp, 'U') as blast_actual:
+ entries = (line.strip().split('\t') for line in blast_actual)
+ actual_alignments = {r[0]: r[1:] for r in entries}
+
+ self.assertEqual(30, len(actual_alignments))
+
+ # Check this alignment exists
+ self.assertTrue("HMPMockV1.2.Staggered2.673827_47"
+ in actual_alignments)
+ self.assertEqual("97.3", actual_alignments[
+ "HMPMockV1.2.Staggered2.673827_47"][1])
+ self.assertEqual("100", actual_alignments[
+ "HMPMockV1.2.Staggered2.673827_47"][12])
+
+ # Check alignment for random read is NULL
+ self.assertTrue("simulated_random_reads.fa.000000000"
+ in actual_alignments)
+ self.assertEqual("*", actual_alignments[
+ "simulated_random_reads.fa.000000000"][0])
+
+ def test_blast_or_sam(self):
+ """ SortMeRNA should fail with output_sam and blast_format both
+ set to False
+ """
+ # Rebuild the index
+ sortmerna_db, db_files_to_remove = build_database_sortmerna(
+ abspath(self.file_reference_seq_fp),
+ max_pos=250,
+ output_dir=self.output_dir)
+
+ # Files created by indexdb_rna to be deleted
+ self.files_to_remove.extend(db_files_to_remove)
+
+ self.assertRaises(ValueError,
+ sortmerna_map,
+ seq_path=self.file_read_seqs_fp,
+ output_dir=self.output_dir,
+ refseqs_fp=self.file_reference_seq_fp,
+ sortmerna_db=sortmerna_db,
+ output_sam=False,
+ blast_format=None)
+
+ def test_best_or_num_alignments(self):
+ """ SortMeRNA should fail with "best" and "num_alignments" both
+ set to True
+ """
+ # Rebuild the index
+ sortmerna_db, db_files_to_remove = build_database_sortmerna(
+ abspath(self.file_reference_seq_fp),
+ max_pos=250,
+ output_dir=self.output_dir)
+
+ # Files created by indexdb_rna to be deleted
+ self.files_to_remove.extend(db_files_to_remove)
+
+ self.assertRaises(ValueError,
+ sortmerna_map,
+ seq_path=self.file_read_seqs_fp,
+ output_dir=self.output_dir,
+ refseqs_fp=self.file_reference_seq_fp,
+ sortmerna_db=sortmerna_db,
+ best=1,
+ num_alignments=1)
+
+
+# Reference sequence database
+reference_seqs_fp = """>426848
+AGAGTTTGATCCTGGCTCAGGATGAACGCTAGCGGCAGGCTTAATACATGCAAGTCGAGGGGCAGCACTGGTAGCAATAC
+CTGGTGGCGACCGGCGGACGGGTGCGTAACACGTATGCAACCTACCCTGTACAGGGGGATAGCCCGAGGAAATTCGGATT
+AATACCCCATACGATAAGAATCGGCATCGATTTTTATTGAAAGCTCCGGCGGTACAGGATGGGCATGCGCCCCATTAGCT
+AGTTGGTGAGGTAACGGCTCACCAAGGCTACGATGGGTAGGGGGCCTGAGAGGGTGATCCCCCACACTGGAACTGAGACA
+CGGTCCAGACTCCTACGGGAGGCAGCAGTAAGGAATATTGGTCAATGGGCGCAAGCCTGAACCAGCCATGCCGCGTGCAG
+GAAGACTGCCATTATGGTTGTAAACTGCTTTTATATGGGAAGAAACCTCCGGACGTGTCCGGAGCTGACGGTACCATGTG
+AATAAGGATCGGCTAACTCCGTGCCAGCAGCCGCGGTAATACGGAGGATCCAAGCGTTATCCGGATTTATTGGGTTTAAA
+GGGTGCGTAGGCGGCGTGTTAAGTCAGAGGTGAAATTCGGCAGCTCAACTGTCAAATTGCCTTTGATACTGGCACACTTG
+AATGCGATTGAGGTAGGCGGAATGTGACATGTAGCGGTGAAATGCTTAGACATGTGACAGAACACCGATTGCGAAGGCAG
+CTTACCAAGTCGTTATTGACGCTGAGGCACGAAAGCGTGGGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTA
+AACGATGATAACTCGACGTTAGCGATACACTGTTAGCGTCCAAGCGAAAGCGTTAAGTTATCCACCTGGGAAGTACGATC
+GCAAGGTTGAAACTCAAAGGAATTGACGGGGGCCCGCACAAGCGGTGGAGCATGTGGTTTAATTCGATGATACGCGAGGA
+ACCTTACCAGGGCTTAAATGGGGAACGACCTTCTGGGAAACCAGAATTTCTTTTAGACGGTCCTCAAGGTGCTGCATGGT
+TGTCGTCAGCTCGTGCCGTGAGGTGTTGGGTTAAGTCCCGCAACGAGCGCAACCCCTACTGTTAGTTGCCAGCGGATAAT
+GCCGGGGACTCTAGCGGAACTGCCTGTGCAAACAGAGAGGAAGGTGGGGATGACGTCAAATCATCACGGCCCTTACGTCC
+TGGGCTACACACGTGCTACAATGGCCGGTACAGAGGGCAGCCACTTCGTGAGAAGGAGCGAATCCTTAAAGCCGGTCTCA
+GTTCGGATTGTAGTCTGCAACTCGACTACATGAAGCTGGAATCGCTAGTAATCGCGTATCAGCCATGACGCGGTGAATAC
+GTTCCCGGGCCTTGTACACACCGCCCGTCAAGCCATGGGAATTGGGAGTACCTAAAGTCGGTAACCGCAAGGAGCCGCCT
+AAGGTAATACCAGTGACTGGGGCTAAGTCGTAACAAGGTAGCCGTA
+>42684
+AGAGTTTGATCCTGGCTCAGATTGAACGCTGGCGGCATGCTTTACACATGCAAGTCGGACGGCAGCACAGAGGAGCTTGC
+TTCTTGGGTGGCGAGTGGCGAACGGGTGAGTGACGCATCGGAACGTACCGAGTAATGGGGGATAACTGTCCGAAAGGACA
+GCTAATACCGCATACGCCCTGAGGGGGAAAGCGGGGGATCTTAGGACCTCGCGTTATTCGAGCGGCCGATGTCTGATTAG
+CTGGTTGGCGGGGTAAAGGCCCACCAAGGCGACGATCAGTAGCGGGTCTGAGAGGATGATCCGCCACACTGGGACTGAGA
+CACGGCCCAGACTCCTACGGGAGGCAGCAGTGGGGAATTTTGGACAATGGGCGCAAGCCTGATCCAGCCATGCCGCGTGT
+CTGAAGAAGGCCTTCGGGTTGTAAAGGACTTTTGTCAGGGAAGAAAAGGAACGTGTTAATACCATGTTCTGATGACGGTA
+CCTGAAGAATAAGCACCGGCTAACTACGTGCCAGCAGCCGCGGTAATACGTAGGGTGCGAGCGTTAATCGGAATTACTGG
+GCGTAAAGCGGGCGCAGACGGTTACTTAAGCGGGATGTGAAATCCCCGGGCTCAACCCGGGAACTGCGTTCCGAACTGGG
+TGGCTAGAGTGTGTCAGAGGGGGGTAGAATTCCACGTGTAGCAGTGAAATGCGTAGAGATGTGGAGGAATACCGATGGCG
+AAGGCAGCCCCCTGGGATAACACTGACGTTCATGCCCGAAAGCGTGGGTAGCAAACAGGGTTAGATACCCTGGTAGTCCA
+CGCCCTAAACGATGTCGATTAGCTGTTGGGGCACTTGATGCCTTAGTAGCGTAGCTAACGCGTGAAATCGACCGCCTGGG
+GAGTACGGTCGCAAGATTAAAACTCAAAGGAATTGACGGGGACCCGCACAAGCGGTGGATGATGTGGATTAATTCGATGC
+AACGCGAAGAACCTTACCTGGTCTTGACATGTACGGAATCTTCCAGAGACGGAAGGGTGCCTTCGGGAGCCGTAACACAG
+GTGCTGCATGGCTGTCGTCAGCTCGTGTCGTGAGATGTTGGGTTAAGTCCCGCAACGAGCGCAACCCTTGTCATTAGTTG
+CCATCACTTGGTTGGGCACTCTAATGAGACTGCCGGTGACAAACCGGAGGAAGGTGGGGATGACGTCAAGTCCTCATGGC
+CCTTATGACCAGGGCTTCACACGTCATACAATGGTCGGTACAGAGGGTAGCCAAGCCGCGAGGCGGAGCCAATCCCAGAA
+AACCGATCGTAGTCCGGATTGCACTCTGCAACTCGAGTGCATGAAGTCGGAATCGCTAGTAATCGCAGGTCAGCATACTG
+CGGTGAATACGTTCCCGGGTCTTGTACACACCGCCCGTCACACCATGGGAGTGGGGGATACCAGAAGCAGGTAGGCTAAC
+CGCAAGGAGGCCGCTTGCCACGGTATGCTTCATGACTGGGGTGAAGTCGTAACAAGGTAAC
+>342684
+AGAGTTTGATCCTGGCTCAGGATGAACGCTAGCGGCAGGCTTAACACATGCAAGTCGAGGGGCATCGCGGGTAGCAATAC
+CTGGCGGCGACCGGCGGAAGGGTGCGTAACGCGTGAGCGACATACCCGTGACAGGGGGATAACAGATGGAAACGTCTCCT
+AATACCCCATAAGATCATATATCGCATGGTATGTGATTGAAAGGTGAGAACCGGTCACGGATTGGCTCGCGTCCCATCAG
+GTAGACGGCGGGGCAGCGGCCCGCCGTGCCGACGACGGGTAGGGGCTCTGAGAGGAGTGACCCCCACAATGGAACTGAGA
+CACGGTCCATACTCCTACGGGAGGCAGCAGTGAGGAATATTGGTCAATGGGCGGAAGCCTGAACCAGCCATGCCGCGTGC
+GGGAGGACGGCCCTATGGGTTGTAAACCGCTTTTGAGTGAGAGCAATAAGGTTCACGTGTGGACCGATGAGAGTATCATT
+CGAATAAGCATCGGCTAACTCCGTGCCAGCAGCCGCGGTAATACGGAGGATGCGAGCGTTATCCGGATTCATTGGGTTTA
+AAGGGTGCGTAGGCGGACATGTAAGTCCGAGGTGAAAGACCGGGGCCCAACCCCGGGGTTGCCTCGGATACTGTGTGTCT
+GGAGTGGACGTGCCGCCGGGGGAATGAGTGGTGTAGCGGTGAAATGCATAGATGTCACTCAGAACACCGATTGCGAAGGC
+ACCTGGCGAATGTCTTACTGACGCTGAGGCACGAAAGCGTGGGGATCGAACAGGATTAGATACCCTGGTAGTCCACGCAG
+TAAACGATGATGGCTGTCCGTTCGCTCCGATAGGAGTGAGTAGACAAGCGAAAGCGCTAAGCCATCCACCTGGGGAGTAC
+GGCCGCAAGGCTGAAACTCAAAGGAATTGACGGGGGCCCGCACAAGCGGAGGAACATGTGGTTTAATTCGATGATACGCG
+AGGAACCTTACCCGGGCTCGAACGGCAGGTGAACGATGCAGAGATGCAAAGGCCCTTCGGGGCGTCTGTCGAGGTGCTGC
+ATGGTTGTCGTCAGCTCGTGCCGTGAGGTGTCGGCTCAAGTGCCATAACGAGCGCAACCCTTGCCTGCAGTTGCCATCGG
+GTAAAGCCGGGGACTCTGCAGGGACTGCCACCGCAAGGTGAGAGGAGGGGGGGGATGACGTCAAATCAGCACGGCCCTTA
+CGTCCGGGGCGACACACGTGTTACAATGGCGGCCACAGCGGGAAGCCACCCAGTGATGGGGCGCGGATCCCAAAAAAGCC
+GCCTCAGTTCGGATCGGAGTCTGCAACCCGACTCCGTGAAGCTGGATTCGCTAGTAATCGCGCATCAGCCATGGCGCGGT
+GAATACGTTCCCGGGCCTTGTACACACCGCCCGTCAAGCCATGGGAGTCGTGGGCGCCTGAAGGCCGTGACCGCGAGGAG
+CGGCCTAGGGCGAACGCGGTGACTGGGGCTAAGTCGTAACAAGGTA
+>295053
+AGAGTTTGATCCTGGCTCAGGACGAACGCTGGCGGCGTGCCTAACACATGCAAGTCGAACGGAGATGCTCCTTCGGGAGT
+ATCTTAGTGGCGAACGGGTGAGTAACGCGTGAGCAACCTGACCTTCACAGGGGGATAACCGCTGGAAACAGCAGCTAATA
+CCGCATAACGTCGCAAGACCAAAGAGGGGGACCTTCGGGCCTCTTGCCATCGGATGTGCCCAGATGGGATTAGCTTGTTG
+GTGGGGTAACGGCTCACCAAGGCGACGATCCCTAGCTGGTCTGAGAGGATGACCAGCCACACTGGAACTGAGACACGGTC
+CAGACTCCTACGGGAGGCAGCAGTGGGGAATATTGCACAATGGGCGCAAGCCTGATGCAGCCATGCCGCGTGTATGAAGA
+AGGCCTTCGGGTTGTAAAGTACTTTCAGCGGGGAGGAAGGGAGTAAAGTTAATACCTTTGCTCATTGACGTTACCCGCAG
+AAGAAGCACCGGCTAACTCCGTGCCAGCAGCCGCGGTAATACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAA
+GCGCACGCAGGCGGTTTGTTAAGTCAGATGTGAAATCCCCGGGCTCAACCTGGGAACTGCATCTGATACTGGCAAGCTTG
+AGTCTCGTAGAGGGGGGTAGAATTCCAGGTGTAGCGGTGAAATGCGTAGAGATCTGGAGGAATACCGGTGGCGAAGGCGG
+CCCCCTGGACGAAGACTGACGCTCAGGTGCGAAAGCGTGGGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTA
+AACGATGTCGACTTGGAGGTTGTGCCCTTGAGGCGTGGCTTCCGGAGCTAACGCGTTAAGTCGACCGCCTGGGGAGTACG
+GCCGCAAGGTTAAAACTCAAATGAATTGACGGGGGCCCGCACAAGCGGTGGAGCATGTGGTTTAATTCGATGCAACGCGA
+AGAACCTTACCTGGTCTTGACATCCACAGAACTTTCCAGAGATGGATTGGTGCCTTCGGGAACTGTGAGACAGGTGCTGC
+ATGGCTGTCGTCAGCTCGTGTTGTGAAATGTTGGGTTAAGTCCCGCAACGAGCGCAACCCTTGTCCTTTGTTGCCAGCGG
+TCCGGCCGGGAACTCAAAGGAGACTGCCAGTGATAAACTGGAGGAAGGTGGGGATGACGTCAAGTCATCATGGCCCTTAC
+GACCAGGGCTACACACGTGCTACAATGGCGCATACAAAGAGAAGCGACCTCGCGAGAGCAAGCGGACCTCATAAAGTGCG
+TCGTAGTCCGGATTGGAGTCTGCAACTCGACTCCATGAAGTCGGAATCGCTAGTAATCGTGGATCAGAATGCCACGGTGA
+ATACGTTCCCGGGCCTTGCACACACCGCC
+>879972
+GACGAACGCTGGCGGCGTGCCTAATACATGCAAGTCGAACGAGATTGACCGGTGCTTGCACTGGTCAATCTAGTGGCGAA
+CGGGTGAGTAACACGTGGGTAACCTGCCCATCAGAGGGGGATAACATTCGGAAACGGATGCTAAAACCGCATAGGTCTTC
+GAACCGCATGGTTTGAAGAGGAAAAGAGGCGCAAGCTTCTGCTGATGGATGGACCCGCGGTGTATTAGCTAGTTGGTGGG
+GTAACGGCTCACCAAGGCGACGATACATAGCCGACCTGAGAGGGTGATCGGCCACACTGGGACTGAGACACGGCCCAGAC
+TCCTACGGGAGGCAGCAGTAGGGAATCTTCGGCAATGGACGGAAGTCTGACCGAGCAACGCCGCGTGAGTGAAGAAGGTT
+TTCGGATCGTAAAGCTCTGTTGTAAGAGAAGAACGAGTGTGAGAGTGGAAAGTTCACACTGTGACGGTATCTTACCAGAA
+AGGGACGGCTAACTACGTGCCAGCAGCCGCGGTAATACGTAGGTCCCGAGCGTTGTCCGGATTTATTGGGCGTAAAGCGA
+GCGCAGGCGGTTAGATAAGTCTGAAGTTAAAGGCTGTGGCTTAACCATAGTACGCTTTGGAAACTGTTTAACTTGAGTGC
+AAGAGGGGAGAGTGGAATTCCATGTGTAGCGGTGAAATGCGTAGATATATGGAGGAACACCGGTGGCGAAAGCGGCTCTC
+TGGCTTGTAACTGACGCTGAGGCTCGAAAGCGTGGGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGA
+TGAGTGCTAGGTGTTAGACCCTTTCCGGGGTTTAGTGCCGCAGCTAACGCATTAAGCACTCCGCCTGGGGAGTACGACCG
+CAGGGTTGAAACTCAAAGGAATTGACGGGGGCCCGCACAAGCGGTGGAGCATGTGGTTTAATTCGAAGCAACGCGAAGAA
+CCTTACCAGGTCTTGACATCCCTCTGACCGCTCTAGAGATAGAGCTTTCCTTCGGGACAGAGGTGACAGGTGGTGCATGG
+TTGTCGTCAGCTCGTGTCGTGAGATGTTGGGTTAAGTCCCGCAACGAGCGCAACCCCTATTGTTAGTTGCCATCATTCAG
+TTGGGCACTCTAGCGAGACTGCCGGTAATAAACCGGAGGAAGGTGGGGATGACGTCAAATCATCATGCCCCTTATGACCT
+GGGCTACACACGTGCTACAATGGCTGGTACAACGAGTCGCAAGCCGGTGACGGCAAGCTAATCTCTTAAAGCCAGTCTCA
+GTTCGGATTGTAGGCTGCAACTCGCCTACATGAAGTCGGAATCGCTAGTAATCGCGGATCAGCACGCCGCGGTGAATACG
+TTCCCGGGCCT
+"""
+
+# Reads to search against the database
+# - 10 rRNA reads: amplicon reads were taken from Qiime study 1685
+# - 10 random reads: simulated using mason with the following command:
+# mason illumina -N 10 -snN -o simulated_random_reads.fa -n
+# 150 random.fasta
+# - 10 rRNA reads with id < 97: amplicon reads were taken from
+# Qiime study 1685
+read_seqs_fp = """>HMPMockV1.2.Staggered2.673827_47 M141:79:749142:1:1101:16169:1589
+TACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGCAAGCAGGCGGTTTGTTAAGTCAGATGTGAAATCCC
+CGGGCTCAACCTGGGAACTGCATTTGATACTGGCAAGCTTGAGTCTCGTAGAGGAGGGTAGAATTCCAGGTGTAGCGGGG
+AAATGCGTAGAGATCTGGAGGAATACCGGTGGCGAAGGCGGCTCCATGGACGAAGACTGACGCT
+>HMPMockV1.2.Staggered2.673827_115 M141:79:749142:1:1101:14141:1729
+TACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGCACGCAGGCGGTTTGTTAAGTCAGATGTGAAATCCC
+CCGGCTCAACCTTGGAACTGCATCTGATACGGGCAAGCTTGAGTCTCGTAGAGGGGGGTAGAATTCCAGGTGTAGCGGTG
+AAATGCGTAGAGATCTGGAGGAATACCGGTGGCGAAGGCGGCCCTCTGGACGAAGACTGACGCTCAGGTGCGAAAGCGTG
+GGGAGCAAACA
+>HMPMockV1.2.Staggered2.673827_122 M141:79:749142:1:1101:16032:1739
+TACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGCACGCAGGCGGTTTGTTAAGTCAGATGTGAAATCCC
+CGGGCTCAACCTGGGAACTGCATCTGATACTGGCAAGCTTGAGTCTCGTAGAGGGGGGTAGAATTCCAGGTGTAGCGGTG
+AAATGCGTAGAGATCTGGAGGAATACCGGTGGCGAAGGCGGCCCCCTGGACGAAGACTGACGCTCAGGTGCGAAAGCGTG
+GTGATCAAACA
+>HMPMockV1.2.Staggered2.673827_161 M141:79:749142:1:1101:17917:1787
+TACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGCACGCAGGCGGTTTGTTAAGTCAGATGTGAAATCCC
+CGGGCTCAACCTGGGAACTGCATCTGATACTGGCAAGCTTGAGTCTCGTAGAGGGGGGTAGAATTCCAGGTGTAGCGGTG
+AAATGCGTAGAGATCTGGAGGAATACCGGTGGCGAAGGCGGCTCCCTGGACGAAGACTGACGCTCAGGTGCGAAAGCGTG
+GGGAGCAAACA
+>HMPMockV1.2.Staggered2.673827_180 M141:79:749142:1:1101:16014:1819
+TACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGCACGCAGGTGGTTTGTTAAGTCAGATGTGAAATCCC
+CGGGCTCAACCTGGGAACTGCATCTGATACTGGCAAGCTTGAGTCTCGTAGAGGGGGGTAGAATTCCAGGTGTAGCGGTG
+AAATGCGTAGAGATCTGGAGGAATACCGGTGGCGAAGGCGGCCCCCTGGACGAAGACTGACGCTCAGGTGCGAAAGCGTG
+>HMPMockV1.2.Staggered2.673827_203 M141:79:749142:1:1101:17274:1859
+TACGGAGGTTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGCACGCAGGCGGTTTGTTAAGTCAGATGTGAAATCCC
+CCGGCTCAACCTGGGAACTGCATCTGATACTGGCAAGCTTGAGTCTCGTAGAGGGGGGTAGAATTCCAGGTGTAGCGGTG
+AAATGCGTAGAGATCTGGAGGAATACCGGTGGCGAAGGCGGCCTCCTGGACGAAGACTGACGCTCAGGTGCGAAAGCGTG
+GGGATCAAACA
+>HMPMockV1.2.Staggered2.673827_207 M141:79:749142:1:1101:17460:1866
+TACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGCACGCAGGCGGTTTGTTAAGTCAGATGTGAAATCCC
+CGGGCTCAACCTGGGAACTGCATCTGATACTGGCAAGCTTGAGTCTCGTAGAGGGGGGTAGAATTCCAGGTGTAGCGGTG
+AAATGCGTAGAGATCTGGAGGAATACCGGTGGCGAAGGCGGCCCCCTGGACGAAGACTGACGCTCAGGTGCGAAAGCGTG
+GGGAGCAAACA
+>HMPMockV1.2.Staggered2.673827_215 M141:79:749142:1:1101:18390:1876
+TACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGCACGCAGGCGGTTTGTTAAGTCAGATGTGAAATCCC
+CGGGCTCAACCTGGGAACTGCATCTGATACTGGCAAGCTTGAGTCTCGTAGAGGGGGGTAGAATTCCAGGTGTAGCGGTG
+AAATGCGTAGAGATCTGGAGGAATACCGGTGGCGAAGGCGGCCCCCTGGACGAAGACTGACG
+>HMPMockV1.2.Staggered2.673827_218 M141:79:749142:1:1101:18249:1879
+TACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGCACGCAGGCGGTTTGTTAAGTCAGATGTGAAATCCC
+CGGGCTCAACCTGGGAACTTCATCTGATACTGGCAAGCTTGAGTCTCGTAGAGGGGGGTAGAATTCCAGGTGTAGCGGTG
+AAATGCGTAGAGATCTGGAGGAATACCGGTGGCGAAGGCGGCCCCCTGGACGAAGACTGACGCTCAGGTGCGAAAGCGTG
+GGGAGCACACA
+>HMPMockV1.2.Staggered2.673827_220 M141:79:749142:1:1101:15057:1880
+TACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGCACGCAGGCGGTTTGTTAAGTCAGATGTGAAATCCC
+CGGGCTCAACCTGGGAACTGCATCTGATACTGGCAAGCTTGAGTCTCGTAGAGGGGGGTAGAATTCCAGGTGTAGCGGTG
+AAATGCGTAGAGATCTGGAGGAATACCGGTGGCGAAGGCGGCCTCCTGGACGAAGACTGACGCTC
+>simulated_random_reads.fa.000000000
+AGCCGGGTGTCTACGGTCAGGTGTGTTCTGACTACGTAGTTTGACAGCACGTGTCCTTTCCCCTTCCCAAGGTAACGAAT
+TGTCGTTATCAACGTTTCGATCCGTAATTTCACGGAACGACATAAAGGCATCAATACTATCGCCAACAGA
+>simulated_random_reads.fa.000000001
+GTGGACGTCGTGGCGGCGTACTAACTTCCTACAGGCATATCCGGAATAACATTCTGCCGCTTGTCGACATAAGCTGTTCC
+CTACATAGACGACGACGGTTGAAGGGTGTATGTATTCTTTGGGTACGGCTCCTCTGGGCGCATGGTAGCA
+>simulated_random_reads.fa.000000002
+CATTCTTTATAGGCCTACAACACTAATCATCGTTAAGCATAAGGGGAGGAGTGTGCGTGGCATCAAGTCCTGGTTCTTCG
+CCTAGTACCACACCGTCTCACACGCAGCCGCCGACGACCAGTGAGGGCGCGTGGGACACCCATTCGGTCC
+>simulated_random_reads.fa.000000003
+TCGCCTTGGTACAAACAGTCGCGGCACGCTGTATGGAGGACCATAGAGGCACAGGCTGAGGACAGGGGCATGGAAGGTTC
+AATCGCCCCCCACAGCTTTAGGTAGGAAGTACTGTTCTAGTGCCAATTTGATTTTAACGGCAGTTACTCG
+>simulated_random_reads.fa.000000004
+CATATTCTAATATCCTACTTCTGATACCCGATTATACACGACACCACCCCAGGACTGTCGTCACATCCTTATCTGGATAA
+ACATCCGGTTCCGTTTGGCCGTGCTCCGCAAGTGATGCGTCTGTGGAATGTACGTGGAGCGTTGACAGTT
+>simulated_random_reads.fa.000000005
+CCGGATTAGGCATGTTTATAGTACAACGGATTCGCAAAAAGGTCAGGGTAACAATTTTGAAATGCTTTCATACTGCGGTC
+TAAATGGACCACCCTTTAGGTGCAGCCAACTATAGTTGGTCGATTCTCTGAACACGTACCGAAGGCAATT
+>simulated_random_reads.fa.000000006
+AACCCATCGGAATAATCTACTGCTTCGTATGGAACGGTCCTACATTTAAATAAACGTGTCCAGTGCCACCCGATACCTCT
+CGTCAATCAGGGGCTCTCCCTGAATCAGCAGTAAACAAACCCAGTACACTGTCGAACACTACTGAGACCG
+>simulated_random_reads.fa.000000007
+CCGAAGGCAAGTCTGTCGTAGAATGGTTTTTGTCGTTGTAACAACCCCGCTCTAGACCCTGAAAACCATAAAGTCAAGCC
+CAACTAATATTAGAGGCATTCTGGCTACTCCCGCTCACCGCAATCTTCACATACTGTGATACCCTCAGCC
+>simulated_random_reads.fa.000000008
+ATATCCGTTAAACCCCGGATTTGACAATTCATCATCAACGCTACTAACGGCTTTCTCAATTTGGGGCTGTGGCCTATCCG
+CATACGGCTACCTGCGCAAGAAGAGAGTACTGTTAGATGTCACGCTGCACTTGCGAAGACCGGTGGGCGT
+>simulated_random_reads.fa.000000009
+AGCGATGAGTACACAAGATGAGTGAAGGGATTAAACTTCAAACCTTGAAGTGTTACCCGATTTCCTACCATTGGGGATTC
+GTTAATGCTTCGAATGGATCTATATCCGGTGTTTAGCTGACTGTTAAAATACTCTCGTTGTACGAAAGTA
+>HMPMockV1.2.Staggered2.673827_0 M141:79:749142:1:1101:17530:1438
+TACGTAGGTGGCAAGCGTTATCCGGAATTATTGGGCGCAAAGCGCGCGTAGGCGGTTTTTTAAGTCTGATGTGAAAGCCC
+ACGGCTCAACCGTGGAGGGTCATTGGAAACTGGAAAACTTGAGTGCAGAAGAGGAAAGTGGAATTCCATGTGTAGCGGTG
+AAATGCGCAGAGATATGGAGGAACACCAGTGGCGAAGGCGACCTTCTGGTCTGTAACTGACGCTGATGTGCGAAAGCGTG
+>HMPMockV1.2.Staggered2.673827_1 M141:79:749142:1:1101:17007:1451
+TACGTAGGTGGCAAGCGTTATCCGGAATTATTGGGCGTAAAGCGCGCGTAGGCGGTTTTTTAAGTCTGATGTGAAAGCCC
+ACGGCTCAACCGTGGAGGGTCATTGGAAACTGGAAAACTTGAGTGCAGAAGAGGAAAGTGGAATTCCATGTGTAGCGGTG
+AAATGCGCAGAGATATGGAGGAACACCAGTGGCGAAGGCGACTTTCTGGTCTGTAACTTACGCTG
+>HMPMockV1.2.Staggered2.673827_2 M141:79:749142:1:1101:16695:1471
+TACGTAGGTGGCAAGCGTTATCCGGAATTATTGGGCGTAAAGCGCGCGTAGGCGGTTTTTTAAGTCTGATGTGAAAGCCC
+ACGGCTCAACCGTGGAGGGTCATTGGAAACTGGAAAACTTGAGTGCAGAAGAGGAAAGTGGAATTCCATGTGTAGCGGTG
+AAATGCGCAGAGATATGGAGGAACACCAGTGGCGAAGGCGACTTTCTGGTCTGTAACTGACGCTGATGTGCGAAAGCGTG
+GGGA
+>HMPMockV1.2.Staggered2.673827_3 M141:79:749142:1:1101:17203:1479
+TACGTAGGTGGCAAGCGTTATCCGGAATTATTGGGCGTAAAGCGCGCGTAGGCGGTTTTTTAAGTCTGATGTGAAAGCCC
+ACGGCTCAACCGTGGAGGGTCATTGGAAACTGGAAAACTTGAGTGCAGAAGAGGAAAGTGGAATTCCATGTGTAGCGGTG
+AAATGCGTAGAGATATGGAGGAACACCAGTGGCGAAGGCGACGTTCTGGTCTGTAACTGACGCTGATGTGCGAAAGCGTG
+G
+>HMPMockV1.2.Staggered2.673827_4 M141:79:749142:1:1101:14557:1490
+TACGTAGGTGGCAAGCGTTATCCGGAATTATTGGGCGTAAAGCGCGCGTAGGCGGTTTTTTAAGTCTGATGTGAAAGCCC
+ACGGCTCAACCGTGGAGGGTCATTGGAAACTGGAAAACTTGAGTGCAGAAGAGGAAAGTGGAATTCCATGTGTAGCGGTG
+AAATGCGCAGAGATATGGAGGAACACCAGTGGCGAAGGCGACTTTCTGGGCTGTAACTGACGCTGATGTGCGCAAGCGTG
+GTGATCAAACA
+>HMPMockV1.2.Staggered2.673827_5 M141:79:749142:1:1101:16104:1491
+TACGTAGGTGGCAAGCGTTATCCGGAATTATTGGGCGTAAAGCGCGCGTAGGCGGTTTTTTAAGTCTGATGTGAAAGCCC
+ACGGCTCAACCGTGGAGGGTCATTGGAAACTGGAAAACTTGAGTGCAGAAGAGGAAAGTGGAATTCCATGTGTAGCGGTG
+AAATGCGCAGAGATATGGAGGAACACCAGTGGCGAAGGCGACTTTCTGGTCTGTAACTGACGC
+>HMPMockV1.2.Staggered2.673827_6 M141:79:749142:1:1101:16372:1491
+TACGTAGGTGGCAAGCGTTATCCGGAATTATTGGGCGTAAAGCGCGCGTAGGCGGTTTTTTAAGTCTGATGTGAAAGCCC
+ACGGCTCAACCGTGGAGGGTCATTGGAAACTGGAAAACTTGAGTGCAGAAGAGGAAAGTGGAATTCCATGTGTAGCGGTG
+AAATGCGCAGAGATATGGAGGAACAACAGTGGCGAAGGCGACTTTCTGGTCTGTAACTGACGCTGATGTGCGTAAG
+>HMPMockV1.2.Staggered2.673827_7 M141:79:749142:1:1101:17334:1499
+TACGTAGGTGGCAAGCGTTATCCGGAATTATTGGGCGTAAAGCGCGCGTAGGCGGTTTTTTAAGTCTGATGTGAAAGCCC
+ACGGCTCAACCGTGGAGGGTCATTGGAAACTGGAAAACTTGAGTGCAGAAGAGGAAAGTGGAATTCCATGTGTAGCGGTG
+AAATGCGCAGAGATATGGAGGAACACCAGTGGCGAAGGCGACTTTCTGGTCTGTAACTGACGCTGATGT
+>HMPMockV1.2.Staggered2.673827_8 M141:79:749142:1:1101:17273:1504
+TACGTAGGTGGCAAGCGTTATCCGGAATTATTGGGCGTAAAGCGCGCGTAGGCGGTTTTTTAAGTCTGATGTGAAAGCCC
+ACGGCTCAACCGTGGAGGGTCATTGGAAACTGGAAAACTTGAGTGCAGAAGAGGAAAGTGGAATTCCATGTGTAGCGGTG
+AAATGCACAGAGATATGGAGGAACACCAGTGGCGAAGGCGACTTTCTGGTCTGTAACTGACGCTGA
+>HMPMockV1.2.Staggered2.673827_9 M141:79:749142:1:1101:16835:1505
+TACGTAGGTGGCAAGCGTTATCCGGAATTATTGGGCGTAAAGCGCGCGTAGGCGGTTTTTTAAGTCTGATGTGAAAGCCC
+ACGGCTCAACCGTGGAGGGTCATTGGAAACTGGAAAACTTGAGTGCAGAAGAGGAAAGTGGAATTCCATGTGTAGCGGTG
+ACATGCGCAGAGATATGGAGGAACACCAGTGGCGAAGGCGACTTTCTGGTCTGTAACTGACGCTGATGTGCGAAAGCGTG
+GGGAT
+"""
+
+if __name__ == '__main__':
+ main()
diff --git a/bfillings/tests/test_sumaclust_v1.py b/bfillings/tests/test_sumaclust_v1.py
new file mode 100644
index 0000000..816ef23
--- /dev/null
+++ b/bfillings/tests/test_sumaclust_v1.py
@@ -0,0 +1,259 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+"""
+Unit tests for the SumaClust version 1.0 Application controller
+===============================================================
+"""
+
+
+from unittest import TestCase, main
+from tempfile import mkstemp, mkdtemp
+from os import close
+from os.path import exists, getsize, join
+from shutil import rmtree
+
+from skbio.util import remove_files
+
+from bfillings.sumaclust_v1 import sumaclust_denovo_cluster
+
+
+class SumaclustV1Tests(TestCase):
+ """ Tests for Sumaclust version 2.0 functionality """
+
+ def setUp(self):
+
+ self.output_dir = mkdtemp()
+ self.read_seqs = reads_seqs
+
+ # create temporary file with read sequences defined in read_seqs
+ f, self.file_read_seqs = mkstemp(prefix='temp_reads_',
+ suffix='.fasta')
+ close(f)
+
+ # write read sequences to tmp file
+ with open(self.file_read_seqs, 'w') as tmp:
+ tmp.write(self.read_seqs)
+
+ # list of files to remove
+ self.files_to_remove = [self.file_read_seqs]
+
+ def tearDown(self):
+ remove_files(self.files_to_remove)
+ rmtree(self.output_dir)
+
+ def check_clusters(self,
+ clusters,
+ result_path):
+
+ # Check the OTU map file exists
+ self.assertTrue(exists(result_path))
+
+        # Check the output file has the correct size
+        size = getsize(result_path)
+        self.assertEqual(size, 270)
+
+ with open(result_path, "U") as f_otumap:
+ otu_map = [line.strip().split('\t') for line in f_otumap]
+
+        self.assertEqual(len(otu_map), 3)
+
+ # Check the returned clusters list of lists is as expected
+ expected_clusters = [['s1_844', 's1_1886', 's1_5347', 's1_5737',
+ 's1_7014', 's1_7881', 's1_7040', 's1_6200',
+ 's1_1271', 's1_8615'],
+ ['s1_8977', 's1_10439', 's1_12366', 's1_15985',
+ 's1_21935', 's1_11650', 's1_11001', 's1_8592',
+ 's1_14735', 's1_4677'],
+ ['s1_630', 's1_4572', 's1_5748', 's1_13961',
+ 's1_2369', 's1_3750', 's1_7634', 's1_8623',
+ 's1_8744', 's1_6846']]
+
+ # Should be 3 clusters
+ self.assertEqual(len(clusters), 3)
+
+ # List of actual clusters matches list of expected clusters
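+        # (member order within a cluster is not guaranteed, so both sides
+        # are sorted before comparison)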
+ for actual_cluster, expected_cluster in zip(clusters,
+ expected_clusters):
+ actual_cluster.sort()
+ expected_cluster.sort()
+ self.assertEqual(actual_cluster, expected_cluster)
+
+ def test_empty_seq_path(self):
+ """ SumaClust should return a ValueError
+ if empty sequence path is passed
+ """
+ result_path = join(self.output_dir, "sumaclust_otus.txt")
+
+ self.assertRaises(ValueError,
+ sumaclust_denovo_cluster,
+ seq_path=None,
+ result_path=result_path)
+
+ def test_empty_result_path(self):
+ """ SumaClust should return a ValueError
+ if empty result path is passed
+ """
+ self.assertRaises(ValueError,
+ sumaclust_denovo_cluster,
+ seq_path=self.file_read_seqs,
+ result_path=None)
+
+ def test_negative_threads(self):
+ """ SumaClust should raise ValueError
+ on negative number of threads
+ """
+ result_path = join(self.output_dir, "sumaclust_otus.txt")
+
+ self.assertRaises(ValueError,
+ sumaclust_denovo_cluster,
+ seq_path=self.file_read_seqs,
+ result_path=result_path,
+ shortest_len=True,
+ similarity=0.97,
+ threads=-2)
+
+ def test_positive_threads(self):
+ """ SumaClust's actual clusters should match
+ the exact clusters when using multithreading
+ """
+ result_path = join(self.output_dir, "sumaclust_otus_exact.txt")
+ clusters = sumaclust_denovo_cluster(seq_path=self.file_read_seqs,
+ result_path=result_path,
+ shortest_len=True,
+ similarity=0.97,
+ threads=3,
+ exact=True)
+
+ self.files_to_remove.append(result_path)
+
+ self.check_clusters(clusters, result_path)
+
+ def test_exact_clustering(self):
+ """ SumaClust's actual clusters should match
+ the exact clusters when using the exact option
+ """
+ result_path = join(self.output_dir, "sumaclust_otus_exact.txt")
+ clusters = sumaclust_denovo_cluster(seq_path=self.file_read_seqs,
+ result_path=result_path,
+ shortest_len=True,
+ similarity=0.97,
+ threads=1,
+ exact=True)
+
+ self.files_to_remove.append(result_path)
+
+ self.check_clusters(clusters, result_path)
+
+ def test_shortest_len_clustering(self):
+ """ SumaClust's actual clusters should match
+ the exact clusters when not using the
+ shortest len option
+ """
+ result_path = join(self.output_dir, "sumaclust_otus_exact.txt")
+ clusters = sumaclust_denovo_cluster(seq_path=self.file_read_seqs,
+ result_path=result_path,
+ shortest_len=False,
+ similarity=0.97,
+ threads=1,
+ exact=True)
+
+ self.files_to_remove.append(result_path)
+
+ self.check_clusters(clusters, result_path)
+
+ def test_sumaclust_denovo_cluster(self):
+ """ Test de novo clustering with SumaClust """
+
+ result_path = join(self.output_dir, "sumaclust_otus.txt")
+
+ clusters = sumaclust_denovo_cluster(seq_path=self.file_read_seqs,
+ result_path=result_path)
+
+ self.files_to_remove.append(result_path)
+
+ self.check_clusters(clusters, result_path)
+
+
+# Reads to cluster
+# there are 30 reads representing 3 species (gives 3 clusters)
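+# The "errors=" field in each header encodes the simulated difference from
+# the reference amplicon: "73%A" is a substitution to A at position 73,
+# "17-" a deletion at position 17 and "95+A" an insertion of A around
+# position 95 (notation inferred from the sequences themselves).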
+reads_seqs = """>s1_630 reference=1049393 amplicon=complement(497..788)
+GTGCCAGCAGCCGCGGTAATACAGAGGTCTCAAGCGTTGTTCGGATTCATTGGGCGTAAAGGGTGCGTAGGTGGCGGGGTAAGTCAGGTGTGAAATCTCG
+>s1_2369 reference=1049393 amplicon=complement(497..788) errors=73%A
+GTGCCAGCAGCCGCGGTAATACAGAGGTCTCAAGCGTTGTTCGGATTCATTGGGCGTAAAGGGTGCGTAGGTAGCGGGGTAAGTCAGGTGTGAAATCTCG
+>s1_3750 reference=1049393 amplicon=complement(497..788) errors=100%A
+GTGCCAGCAGCCGCGGTAATACAGAGGTCTCAAGCGTTGTTCGGATTCATTGGGCGTAAAGGGTGCGTAGGTGGCGGGGTAAGTCAGGTGTGAAATCTCA
+>s1_4572 reference=1049393 amplicon=complement(497..788)
+GTGCCAGCAGCCGCGGTAATACAGAGGTCTCAAGCGTTGTTCGGATTCATTGGGCGTAAAGGGTGCGTAGGTGGCGGGGTAAGTCAGGTGTGAAATCTCG
+>s1_5748 reference=1049393 amplicon=complement(497..788)
+GTGCCAGCAGCCGCGGTAATACAGAGGTCTCAAGCGTTGTTCGGATTCATTGGGCGTAAAGGGTGCGTAGGTGGCGGGGTAAGTCAGGTGTGAAATCTCG
+>s1_6846 reference=1049393 amplicon=complement(497..788) errors=67%A
+GTGCCAGCAGCCGCGGTAATACAGAGGTCTCAAGCGTTGTTCGGATTCATTGGGCGTAAAGGGTGCATAGGTGGCGGGGTAAGTCAGGTGTGAAATCTCG
+>s1_7634 reference=1049393 amplicon=complement(497..788) errors=99%T
+GTGCCAGCAGCCGCGGTAATACAGAGGTCTCAAGCGTTGTTCGGATTCATTGGGCGTAAAGGGTGCGTAGGTGGCGGGGTAAGTCAGGTGTGAAATCTTG
+>s1_8623 reference=1049393 amplicon=complement(497..788) errors=17-
+GTGCCAGCAGCCGCGGAATACAGAGGTCTCAAGCGTTGTTCGGATTCATTGGGCGTAAAGGGTGCGTAGGTGGCGGGGTAAGTCAGGTGTGAAATCTCG
+>s1_8744 reference=1049393 amplicon=complement(497..788) errors=62%A
+GTGCCAGCAGCCGCGGTAATACAGAGGTCTCAAGCGTTGTTCGGATTCATTGGGCGTAAAGAGTGCGTAGGTGGCGGGGTAAGTCAGGTGTGAAATCTCG
+>s1_13961 reference=1049393 amplicon=complement(497..788)
+GTGCCAGCAGCCGCGGTAATACAGAGGTCTCAAGCGTTGTTCGGATTCATTGGGCGTAAAGGGTGCGTAGGTGGCGGGGTAAGTCAGGTGTGAAATCTCG
+>s1_4677 reference=4382408 amplicon=complement(487..778) errors=74%T
+GTGCCAGCAGCCGCGGTAATACGGAGGGTCCAAGCGTTGTCCGGAATCACTGGGTGTAAAGGGTGCGTAGGCGTGTCTGTAAGTCAGAGGTGAAAGCCCA
+>s1_8592 reference=4382408 amplicon=complement(487..778) errors=95+A
+GTGCCAGCAGCCGCGGTAATACGGAGGGTCCAAGCGTTGTCCGGAATCACTGGGTGTAAAGGGTGCGTAGGCGGGTCTGTAAGTCAGAGGTGAAAAGCCCA
+>s1_8977 reference=4382408 amplicon=complement(487..778)
+GTGCCAGCAGCCGCGGTAATACGGAGGGTCCAAGCGTTGTCCGGAATCACTGGGTGTAAAGGGTGCGTAGGCGGGTCTGTAAGTCAGAGGTGAAAGCCCA
+>s1_10439 reference=4382408 amplicon=complement(487..778)
+GTGCCAGCAGCCGCGGTAATACGGAGGGTCCAAGCGTTGTCCGGAATCACTGGGTGTAAAGGGTGCGTAGGCGGGTCTGTAAGTCAGAGGTGAAAGCCCA
+>s1_11001 reference=4382408 amplicon=complement(487..778) errors=91%G
+GTGCCAGCAGCCGCGGTAATACGGAGGGTCCAAGCGTTGTCCGGAATCACTGGGTGTAAAGGGTGCGTAGGCGGGTCTGTAAGTCAGAGGGGAAAGCCCA
+>s1_11650 reference=4382408 amplicon=complement(487..778) errors=78-
+GTGCCAGCAGCCGCGGTAATACGGAGGGTCCAAGCGTTGTCCGGAATCACTGGGTGTAAAGGGTGCGTAGGCGGGTCGTAAGTCAGAGGTGAAAGCCCA
+>s1_12366 reference=4382408 amplicon=complement(487..778)
+GTGCCAGCAGCCGCGGTAATACGGAGGGTCCAAGCGTTGTCCGGAATCACTGGGTGTAAAGGGTGCGTAGGCGGGTCTGTAAGTCAGAGGTGAAAGCCCA
+>s1_14735 reference=4382408 amplicon=complement(487..778) errors=94%C
+GTGCCAGCAGCCGCGGTAATACGGAGGGTCCAAGCGTTGTCCGGAATCACTGGGTGTAAAGGGTGCGTAGGCGGGTCTGTAAGTCAGAGGTGACAGCCCA
+>s1_15985 reference=4382408 amplicon=complement(487..778)
+GTGCCAGCAGCCGCGGTAATACGGAGGGTCCAAGCGTTGTCCGGAATCACTGGGTGTAAAGGGTGCGTAGGCGGGTCTGTAAGTCAGAGGTGAAAGCCCA
+>s1_21935 reference=4382408 amplicon=complement(487..778)
+GTGCCAGCAGCCGCGGTAATACGGAGGGTCCAAGCGTTGTCCGGAATCACTGGGTGTAAAGGGTGCGTAGGCGGGTCTGTAAGTCAGAGGTGAAAGCCCA
+>s1_844 reference=129416 amplicon=complement(522..813)
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCAAGCGTTATTCGGAATTACTGGGCGTAAAGGGCGTGTAGGCGGCTTTGTAAGTCAGATGTGAAAGCCCA
+>s1_1271 reference=129416 amplicon=complement(522..813) errors=94%C
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCAAGCGTTATTCGGAATTACTGGGCGTAAAGGGCGTGTAGGCGGCTTTGTAAGTCAGATGTGACAGCCCA
+>s1_1886 reference=129416 amplicon=complement(522..813)
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCAAGCGTTATTCGGAATTACTGGGCGTAAAGGGCGTGTAGGCGGCTTTGTAAGTCAGATGTGAAAGCCCA
+>s1_5347 reference=129416 amplicon=complement(522..813)
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCAAGCGTTATTCGGAATTACTGGGCGTAAAGGGCGTGTAGGCGGCTTTGTAAGTCAGATGTGAAAGCCCA
+>s1_5737 reference=129416 amplicon=complement(522..813)
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCAAGCGTTATTCGGAATTACTGGGCGTAAAGGGCGTGTAGGCGGCTTTGTAAGTCAGATGTGAAAGCCCA
+>s1_6200 reference=129416 amplicon=complement(522..813) errors=92%C
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCAAGCGTTATTCGGAATTACTGGGCGTAAAGGGCGTGTAGGCGGCTTTGTAAGTCAGATGTCAAAGCCCA
+>s1_7014 reference=129416 amplicon=complement(522..813)
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCAAGCGTTATTCGGAATTACTGGGCGTAAAGGGCGTGTAGGCGGCTTTGTAAGTCAGATGTGAAAGCCCA
+>s1_7040 reference=129416 amplicon=complement(522..813) errors=40%G
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCAAGCGTTAGTCGGAATTACTGGGCGTAAAGGGCGTGTAGGCGGCTTTGTAAGTCAGATGTGAAAGCCCA
+>s1_7881 reference=129416 amplicon=complement(522..813)
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCAAGCGTTATTCGGAATTACTGGGCGTAAAGGGCGTGTAGGCGGCTTTGTAAGTCAGATGTGAAAGCCCA
+>s1_8615 reference=129416 amplicon=complement(522..813) errors=81%G
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCAAGCGTTATTCGGAATTACTGGGCGTAAAGGGCGTGTAGGCGGCTTTGTGAGTCAGATGTGAAAGCCCA
+"""
+
+if __name__ == '__main__':
+ main()
diff --git a/bfillings/tests/test_swarm_v127.py b/bfillings/tests/test_swarm_v127.py
new file mode 100644
index 0000000..4a79b73
--- /dev/null
+++ b/bfillings/tests/test_swarm_v127.py
@@ -0,0 +1,190 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+"""
+Unit tests for the Swarm version 1.2.7 Application controller
+=============================================================
+"""
+
+
+from unittest import TestCase, main
+from tempfile import mkstemp
+from os import close
+
+from skbio.util import remove_files
+
+from bfillings.swarm_v127 import swarm_denovo_cluster
+
+
+class SwarmTests(TestCase):
+ """ Tests for Swarm version 1.2.7 functionality """
+
+ def setUp(self):
+ self.read_seqs = reads_seqs
+
+ # create temporary file with read sequences defined in read_seqs
+ f, self.file_read_seqs = mkstemp(prefix='temp_reads_',
+ suffix='.fasta')
+ close(f)
+
+ # write read sequences to tmp file
+ with open(self.file_read_seqs, 'w') as tmp:
+ tmp.write(self.read_seqs)
+
+ # list of files to remove
+ self.files_to_remove = [self.file_read_seqs]
+
+ def tearDown(self):
+ remove_files(self.files_to_remove)
+
+ def test_default_param(self):
+ """ Swarm should return the correct clusters using
+ default inputs
+ """
+ clusters = swarm_denovo_cluster(seq_path=self.file_read_seqs,
+ d=1,
+ threads=1)
+
+ # Check the returned clusters list of lists is as expected
+ expected_clusters = [['s1_630', 's1_4572', 's1_5748',
+ 's1_13961', 's1_8744', 's1_8623',
+ 's1_7634', 's1_6846', 's1_3750',
+ 's1_2369'],
+ ['s1_8977', 's1_10439', 's1_12366',
+ 's1_15985', 's1_21935', 's1_8592',
+ 's1_4677', 's1_14735', 's1_11650',
+ 's1_11001'],
+ ['s1_844', 's1_1886', 's1_5347',
+ 's1_5737', 's1_7014', 's1_7881',
+ 's1_8615', 's1_7040', 's1_6200',
+ 's1_1271']]
+
+ # Should be 3 clusters
+ self.assertEqual(len(clusters), 3)
+
+ # List of actual clusters matches list of expected clusters
+ for actual_cluster, expected_cluster in zip(clusters,
+ expected_clusters):
+ actual_cluster.sort()
+ expected_cluster.sort()
+ self.assertEqual(actual_cluster, expected_cluster)
+
+ def test_seq_path(self):
+ """ Swarm should raise a ValueError if the sequences
+ filepath does not exist
+ """
+
+ f, tmp_file = mkstemp(prefix='temp_reads_',
+ suffix='.fasta')
+ close(f)
+ remove_files([tmp_file])
+
+ self.assertRaises(ValueError,
+ swarm_denovo_cluster,
+ seq_path=tmp_file,
+ d=1,
+ threads=1)
+
+ def test_negative_resolution(self):
+ """ Swarm should raise a ValueError if the resolution
+ is negative
+ """
+
+ self.assertRaises(ValueError,
+ swarm_denovo_cluster,
+ seq_path=self.file_read_seqs,
+ d=-2,
+ threads=1)
+
+ def test_negative_threads(self):
+ """ Swarm should raise a ValueError if number of threads
+ is negative
+ """
+
+ self.assertRaises(ValueError,
+ swarm_denovo_cluster,
+ seq_path=self.file_read_seqs,
+ d=1,
+ threads=-2)
+
+# Reads to cluster
+# 30 reads representing 3 species (10 reads per species), giving 3 clusters
+reads_seqs = """>s1_630 reference=1049393 amplicon=complement(497..788)
+GTGCCAGCAGCCGCGGTAATACAGAGGTCTCAAGCGTTGTTCGGATTCATTGGGCGTAAAGGGTGCGTAGGTGGCGGGGTAAGTCAGGTGTGAAATCTCG
+>s1_2369 reference=1049393 amplicon=complement(497..788) errors=73%A
+GTGCCAGCAGCCGCGGTAATACAGAGGTCTCAAGCGTTGTTCGGATTCATTGGGCGTAAAGGGTGCGTAGGTAGCGGGGTAAGTCAGGTGTGAAATCTCG
+>s1_3750 reference=1049393 amplicon=complement(497..788) errors=100%A
+GTGCCAGCAGCCGCGGTAATACAGAGGTCTCAAGCGTTGTTCGGATTCATTGGGCGTAAAGGGTGCGTAGGTGGCGGGGTAAGTCAGGTGTGAAATCTCA
+>s1_4572 reference=1049393 amplicon=complement(497..788)
+GTGCCAGCAGCCGCGGTAATACAGAGGTCTCAAGCGTTGTTCGGATTCATTGGGCGTAAAGGGTGCGTAGGTGGCGGGGTAAGTCAGGTGTGAAATCTCG
+>s1_5748 reference=1049393 amplicon=complement(497..788)
+GTGCCAGCAGCCGCGGTAATACAGAGGTCTCAAGCGTTGTTCGGATTCATTGGGCGTAAAGGGTGCGTAGGTGGCGGGGTAAGTCAGGTGTGAAATCTCG
+>s1_6846 reference=1049393 amplicon=complement(497..788) errors=67%A
+GTGCCAGCAGCCGCGGTAATACAGAGGTCTCAAGCGTTGTTCGGATTCATTGGGCGTAAAGGGTGCATAGGTGGCGGGGTAAGTCAGGTGTGAAATCTCG
+>s1_7634 reference=1049393 amplicon=complement(497..788) errors=99%T
+GTGCCAGCAGCCGCGGTAATACAGAGGTCTCAAGCGTTGTTCGGATTCATTGGGCGTAAAGGGTGCGTAGGTGGCGGGGTAAGTCAGGTGTGAAATCTTG
+>s1_8623 reference=1049393 amplicon=complement(497..788) errors=17-
+GTGCCAGCAGCCGCGGAATACAGAGGTCTCAAGCGTTGTTCGGATTCATTGGGCGTAAAGGGTGCGTAGGTGGCGGGGTAAGTCAGGTGTGAAATCTCG
+>s1_8744 reference=1049393 amplicon=complement(497..788) errors=62%A
+GTGCCAGCAGCCGCGGTAATACAGAGGTCTCAAGCGTTGTTCGGATTCATTGGGCGTAAAGAGTGCGTAGGTGGCGGGGTAAGTCAGGTGTGAAATCTCG
+>s1_13961 reference=1049393 amplicon=complement(497..788)
+GTGCCAGCAGCCGCGGTAATACAGAGGTCTCAAGCGTTGTTCGGATTCATTGGGCGTAAAGGGTGCGTAGGTGGCGGGGTAAGTCAGGTGTGAAATCTCG
+>s1_4677 reference=4382408 amplicon=complement(487..778) errors=74%T
+GTGCCAGCAGCCGCGGTAATACGGAGGGTCCAAGCGTTGTCCGGAATCACTGGGTGTAAAGGGTGCGTAGGCGTGTCTGTAAGTCAGAGGTGAAAGCCCA
+>s1_8592 reference=4382408 amplicon=complement(487..778) errors=95+A
+GTGCCAGCAGCCGCGGTAATACGGAGGGTCCAAGCGTTGTCCGGAATCACTGGGTGTAAAGGGTGCGTAGGCGGGTCTGTAAGTCAGAGGTGAAAAGCCCA
+>s1_8977 reference=4382408 amplicon=complement(487..778)
+GTGCCAGCAGCCGCGGTAATACGGAGGGTCCAAGCGTTGTCCGGAATCACTGGGTGTAAAGGGTGCGTAGGCGGGTCTGTAAGTCAGAGGTGAAAGCCCA
+>s1_10439 reference=4382408 amplicon=complement(487..778)
+GTGCCAGCAGCCGCGGTAATACGGAGGGTCCAAGCGTTGTCCGGAATCACTGGGTGTAAAGGGTGCGTAGGCGGGTCTGTAAGTCAGAGGTGAAAGCCCA
+>s1_11001 reference=4382408 amplicon=complement(487..778) errors=91%G
+GTGCCAGCAGCCGCGGTAATACGGAGGGTCCAAGCGTTGTCCGGAATCACTGGGTGTAAAGGGTGCGTAGGCGGGTCTGTAAGTCAGAGGGGAAAGCCCA
+>s1_11650 reference=4382408 amplicon=complement(487..778) errors=78-
+GTGCCAGCAGCCGCGGTAATACGGAGGGTCCAAGCGTTGTCCGGAATCACTGGGTGTAAAGGGTGCGTAGGCGGGTCGTAAGTCAGAGGTGAAAGCCCA
+>s1_12366 reference=4382408 amplicon=complement(487..778)
+GTGCCAGCAGCCGCGGTAATACGGAGGGTCCAAGCGTTGTCCGGAATCACTGGGTGTAAAGGGTGCGTAGGCGGGTCTGTAAGTCAGAGGTGAAAGCCCA
+>s1_14735 reference=4382408 amplicon=complement(487..778) errors=94%C
+GTGCCAGCAGCCGCGGTAATACGGAGGGTCCAAGCGTTGTCCGGAATCACTGGGTGTAAAGGGTGCGTAGGCGGGTCTGTAAGTCAGAGGTGACAGCCCA
+>s1_15985 reference=4382408 amplicon=complement(487..778)
+GTGCCAGCAGCCGCGGTAATACGGAGGGTCCAAGCGTTGTCCGGAATCACTGGGTGTAAAGGGTGCGTAGGCGGGTCTGTAAGTCAGAGGTGAAAGCCCA
+>s1_21935 reference=4382408 amplicon=complement(487..778)
+GTGCCAGCAGCCGCGGTAATACGGAGGGTCCAAGCGTTGTCCGGAATCACTGGGTGTAAAGGGTGCGTAGGCGGGTCTGTAAGTCAGAGGTGAAAGCCCA
+>s1_844 reference=129416 amplicon=complement(522..813)
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCAAGCGTTATTCGGAATTACTGGGCGTAAAGGGCGTGTAGGCGGCTTTGTAAGTCAGATGTGAAAGCCCA
+>s1_1271 reference=129416 amplicon=complement(522..813) errors=94%C
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCAAGCGTTATTCGGAATTACTGGGCGTAAAGGGCGTGTAGGCGGCTTTGTAAGTCAGATGTGACAGCCCA
+>s1_1886 reference=129416 amplicon=complement(522..813)
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCAAGCGTTATTCGGAATTACTGGGCGTAAAGGGCGTGTAGGCGGCTTTGTAAGTCAGATGTGAAAGCCCA
+>s1_5347 reference=129416 amplicon=complement(522..813)
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCAAGCGTTATTCGGAATTACTGGGCGTAAAGGGCGTGTAGGCGGCTTTGTAAGTCAGATGTGAAAGCCCA
+>s1_5737 reference=129416 amplicon=complement(522..813)
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCAAGCGTTATTCGGAATTACTGGGCGTAAAGGGCGTGTAGGCGGCTTTGTAAGTCAGATGTGAAAGCCCA
+>s1_6200 reference=129416 amplicon=complement(522..813) errors=92%C
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCAAGCGTTATTCGGAATTACTGGGCGTAAAGGGCGTGTAGGCGGCTTTGTAAGTCAGATGTCAAAGCCCA
+>s1_7014 reference=129416 amplicon=complement(522..813)
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCAAGCGTTATTCGGAATTACTGGGCGTAAAGGGCGTGTAGGCGGCTTTGTAAGTCAGATGTGAAAGCCCA
+>s1_7040 reference=129416 amplicon=complement(522..813) errors=40%G
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCAAGCGTTAGTCGGAATTACTGGGCGTAAAGGGCGTGTAGGCGGCTTTGTAAGTCAGATGTGAAAGCCCA
+>s1_7881 reference=129416 amplicon=complement(522..813)
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCAAGCGTTATTCGGAATTACTGGGCGTAAAGGGCGTGTAGGCGGCTTTGTAAGTCAGATGTGAAAGCCCA
+>s1_8615 reference=129416 amplicon=complement(522..813) errors=81%G
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCAAGCGTTATTCGGAATTACTGGGCGTAAAGGGCGTGTAGGCGGCTTTGTGAGTCAGATGTGAAAGCCCA
+"""
+
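+# (Illustrative sketch only, not part of the module under test.) The
+# 'errors=' field in the read headers above appears to encode the simulated
+# mutation relative to the error-free reads of the same reference: '73%A' is
+# a substitution to A at 1-based position 73, '17-' a deletion at position
+# 17, and '95+A' an insertion of A at position 95. Assuming that convention
+# holds, a minimal parser looks like:
+import re
+
+def parse_error_annotation(annotation):
+ """Split e.g. '73%A', '17-' or '95+A' into (position, operation, base)."""
+ match = re.match(r'^(\d+)([%+-])([ACGT]?)$', annotation)
+ if match is None:
+ raise ValueError("unrecognised error annotation: %r" % annotation)
+ pos, op, base = match.groups()
+ ops = {'%': 'substitution', '-': 'deletion', '+': 'insertion'}
+ return int(pos), ops[op], base or None
+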
+if __name__ == '__main__':
+ main()
diff --git a/bfillings/tests/test_uclust.py b/bfillings/tests/test_uclust.py
new file mode 100644
index 0000000..0db9856
--- /dev/null
+++ b/bfillings/tests/test_uclust.py
@@ -0,0 +1,758 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+
+"""
+ : provides unit tests for the uclust.py module
+
+Modified from Daniel McDonald's test_cd_hit.py code on Feb-4-2010 """
+
+from subprocess import Popen, PIPE, STDOUT
+from tempfile import mkstemp, gettempdir
+from os.path import join
+
+from unittest import TestCase, main
+
+from skbio.util import remove_files
+
+from bfillings.uclust import (Uclust,
+ uclust_fasta_sort_from_filepath,
+ uclust_cluster_from_sorted_fasta_filepath,
+ get_output_filepaths, clusters_from_uc_file,
+ get_clusters_from_fasta_filepath,
+ uclust_search_and_align_from_fasta_filepath,
+ process_uclust_pw_alignment_results,
+ UclustParseError)
+
+__author__ = "William Walters"
+__copyright__ = "Copyright 2007-2012, The Cogent Project"
+__credits__ = ["Daniel McDonald", "William Walters", "Greg Caporaso",
+ "Jai Ram Rideout"]
+__license__ = "GPL"
+__version__ = "1.5.3-dev"
+__maintainer__ = "William Walters"
+__email__ = "William.A.Walters at colorado.edu"
+__status__ = "Production"
+
+
+class UclustTests(TestCase):
+
+ def setUp(self):
+
+ _, self.tmp_unsorted_fasta_filepath = mkstemp(prefix="uclust_test",
+ suffix=".fasta")
+ tmp_unsorted_fasta = open(self.tmp_unsorted_fasta_filepath, "w")
+ tmp_unsorted_fasta.write('\n'.join(raw_dna_seqs))
+ tmp_unsorted_fasta.close()
+
+ _, self.tmp_sorted_fasta_filepath = mkstemp(prefix="uclust_test",
+ suffix=".fasta")
+ tmp_sorted_fasta = open(self.tmp_sorted_fasta_filepath, "w")
+ tmp_sorted_fasta.write('\n'.join(sorted_dna_seqs))
+ tmp_sorted_fasta.close()
+
+ _, self.tmp_uc_filepath = mkstemp(prefix="uclust_test", suffix=".uc")
+ tmp_uc = open(self.tmp_uc_filepath, "w")
+ tmp_uc.write('\n'.join(uc_dna_clusters))
+ tmp_uc.close()
+
+ _, self.tmp_clstr_filepath = mkstemp(prefix="uclust_test",
+ suffix=".clstr")
+
+ self.tmpdir = gettempdir()
+ self.WorkingDir = join(self.tmpdir, 'uclust_test')
+
+ self.files_to_remove = [self.tmp_unsorted_fasta_filepath,
+ self.tmp_sorted_fasta_filepath,
+ self.tmp_uc_filepath,
+ self.tmp_clstr_filepath]
+
+ def tearDown(self):
+ remove_files(self.files_to_remove, error_on_missing=False)
+
+ def test_fasta_sorting(self):
+ """ Should sort fasta seqs from largest to smallest in outfile
+
+ Since a fasta file has to be passed to the app controller for uclust,
+ a temporary fasta file is created, and the raw fasta seqs supplied
+ in this module are written to it. This file is sent to the app
+ controller, and the resulting sorted file is compared to the expected
+ results to ensure proper function of uclust as called by this app
+ controller."""
+
+ test_app = Uclust({'--tmpdir': self.tmpdir})
+
+ test_app_res = test_app(data=
+ {'--mergesort': self.tmp_unsorted_fasta_filepath,
+ '--output': self.tmp_sorted_fasta_filepath})
+
+ sorted_fasta_actual = [l.strip()
+ for l in open(test_app_res['Output'].name, "U")]
+ sorted_fasta_expected = [l.strip() for l in sorted_dna_seqs if l]
+
+ self.assertEqual(sorted_fasta_actual, sorted_fasta_expected)
+
+ test_app_res.cleanUp()
+
+ def test_parameter_availability(self):
+ """ Often used parameters are accessible
+
+ This is just some basic sanity checking.
+
+ """
+ a = Uclust()
+ # if a parameter is not accessible, trying to turn it on will
+ # raise a KeyError
+ a.Parameters['--allhits'].on()
+ a.Parameters['--libonly'].on()
+ a.Parameters['--maxaccepts'].on(42)
+ a.Parameters['--maxrejects'].on(42)
+ a.Parameters['--rev'].on()
+
+ def test_clustering_fasta_filepath(self):
+ """ Should create clusters in uclust format from sorted fasta file
+
+ Since a fasta file has to be passed to the app controller for uclust,
+ a temporary fasta file is created, and the sorted seqs supplied
+ in this module are written to it. This file is sent to the app
+ controller, and the resulting uclust file is compared to the expected
+ results to ensure proper function of uclust as called by this app
+ controller."""
+
+ test_app = Uclust({'--id': 0.9}, HALT_EXEC=False)
+ test_app_res = test_app(data=
+ {'--input': self.tmp_sorted_fasta_filepath,
+ '--uc': self.tmp_uc_filepath})
+
+ uc_file = open(test_app_res['ClusterFile'].name, "U")
+ # compare the actual and expected uc files, ignoring comment lines
+ uc_file_actual = [l.strip() for l in uc_file
+ if not l.startswith('#')]
+ uc_file_expected = [l.strip() for l in uc_dna_clusters
+ if not l.startswith('#')]
+
+ self.assertEqual(uc_file_actual, uc_file_expected)
+
+ test_app_res.cleanUp()
+
+
+class UclustConvenienceWrappers(TestCase):
+
+ """ Unit tests for uclust convenience wrappers """
+
+ def setUp(self):
+
+ _, self.tmp_unsorted_fasta_filepath = mkstemp(prefix="uclust_test",
+ suffix=".fasta")
+ tmp_unsorted_fasta = open(self.tmp_unsorted_fasta_filepath, "w")
+ tmp_unsorted_fasta.write('\n'.join(raw_dna_seqs))
+ tmp_unsorted_fasta.close()
+
+ _, self.tmp_raw_dna_seqs_rc_filepath = mkstemp(prefix="uclust_test",
+ suffix=".fasta")
+ tmp_rc_fasta = open(self.tmp_raw_dna_seqs_rc_filepath, "w")
+ tmp_rc_fasta.write('\n'.join(raw_dna_seqs_rc))
+ tmp_rc_fasta.close()
+
+ _, self.tmp_sorted_fasta_filepath = mkstemp(prefix="uclust_test",
+ suffix=".fasta")
+ tmp_sorted_fasta = open(self.tmp_sorted_fasta_filepath, "w")
+ tmp_sorted_fasta.write('\n'.join(sorted_dna_seqs))
+ tmp_sorted_fasta.close()
+
+ _, self.tmp_uc_filepath = mkstemp(prefix="uclust_test", suffix=".uc")
+ tmp_uc = open(self.tmp_uc_filepath, "w")
+ tmp_uc.write('\n'.join(uc_dna_clusters))
+ tmp_uc.close()
+
+ _, self.tmp_clstr_filepath = mkstemp(prefix="uclust_test",
+ suffix=".clstr")
+
+ self.search_align_out1_expected = search_align_out1_expected
+ self.search_align_out_fasta_pairs1 = search_align_out_fasta_pairs1
+ self.search_align_out_uc1 = search_align_out_uc1
+ _, self.search_align_query1_fp = mkstemp(prefix="uclust_test",
+ suffix=".clstr")
+ open(self.search_align_query1_fp, 'w').write(search_align_query1)
+ _, self.search_align_template1_fp = mkstemp(prefix="uclust_test",
+ suffix=".clstr")
+ open(self.search_align_template1_fp, 'w').write(search_align_template1)
+
+ self.search_align_out2_expected = search_align_out2_expected
+ _, self.search_align_query2_fp = mkstemp(prefix="uclust_test",
+ suffix=".clstr")
+ open(self.search_align_query2_fp, 'w').write(search_align_query2)
+ _, self.search_align_template2_fp = mkstemp(prefix="uclust_test",
+ suffix=".clstr")
+ open(self.search_align_template2_fp, 'w').write(search_align_template2)
+
+ _, self.ref_dna_seqs_fp = mkstemp(prefix="uclust_test",
+ suffix=".fasta")
+ open(self.ref_dna_seqs_fp, 'w').write(ref_dna_seqs)
+
+ self.files_to_remove = [self.tmp_unsorted_fasta_filepath,
+ self.tmp_raw_dna_seqs_rc_filepath,
+ self.tmp_sorted_fasta_filepath,
+ self.tmp_uc_filepath,
+ self.tmp_clstr_filepath,
+ self.search_align_query1_fp,
+ self.search_align_template1_fp,
+ self.search_align_query2_fp,
+ self.search_align_template2_fp,
+ self.ref_dna_seqs_fp]
+
+ self.ref_test_clusters1 = ref_test_clusters1
+ self.ref_test_failures1 = ref_test_failures1
+ self.ref_test_new_seeds1 = ref_test_new_seeds1
+ self.ref_test_clusters2 = ref_test_clusters2
+ self.ref_test_failures2 = ref_test_failures2
+ self.ref_test_new_seeds2 = ref_test_new_seeds2
+ self.uc_dna_clusters = uc_dna_clusters
+ self.uc_lines1 = uc_lines1
+ self.uc_lines_w_multiple_hits_per_query = \
+ uc_lines_w_multiple_hits_per_query
+ self.uc_lines_overlapping_lib_input_seq_ids = \
+ uc_lines_overlapping_lib_input_seq_ids
+
+ self.tmpdir = gettempdir()
+
+ def tearDown(self):
+ remove_files(self.files_to_remove, error_on_missing=False)
+
+ def test_uclust_fasta_sort_from_filepath(self):
+ """ Given an unsorted fasta filepath, will return sorted file """
+
+ app_res = \
+ uclust_fasta_sort_from_filepath(self.tmp_unsorted_fasta_filepath)
+
+ sorted_fasta_actual = [l.strip()
+ for l in open(app_res['Output'].name, "U")]
+ sorted_fasta_expected = [l.strip() for l in sorted_dna_seqs if l]
+
+ self.assertEqual(sorted_fasta_actual, sorted_fasta_expected)
+
+ app_res.cleanUp()
+
+ def test_clusters_from_uc_file(self):
+ """ clusters_from_uc_file functions as expected """
+
+ expected_clusters = {'s2': ['s2', 's3']}
+ expected_failures = ['s1']
+ expected_new_seeds = ['s2']
+ self.assertEqual(clusters_from_uc_file(self.uc_lines1),
+ (expected_clusters, expected_failures, expected_new_seeds))
+
+ def test_clusters_from_uc_file_multiple_hits(self):
+ """ clusters_from_uc_file handles error_on_multiple_hits correctly
+ """
+ # when a query matches multiple hits and error_on_multiple_hits=True,
+ # an error should be raised
+ self.assertRaises(UclustParseError,
+ clusters_from_uc_file,
+ self.uc_lines_w_multiple_hits_per_query,
+ error_on_multiple_hits=True)
+
+ # when a query matches multiple hits and error_on_multiple_hits=False,
+ # the query should show up in multiple clusters
+ actual = clusters_from_uc_file(self.uc_lines_w_multiple_hits_per_query,
+ error_on_multiple_hits=False)
+ expected_clusters = {'s2': ['s2', 's3'],
+ 's4': ['s4', 's3']}
+ expected_failures = ['s1']
+ expected_new_seeds = ['s2', 's4']
+ self.assertEqual(actual,
+ (expected_clusters, expected_failures, expected_new_seeds))
+
+ def test_clusters_from_uc_file_error(self):
+ """ clusters_from_uc_file raises error when lib/input seq ids overlap
+ """
+ self.assertRaises(UclustParseError,
+ clusters_from_uc_file,
+ self.uc_lines_overlapping_lib_input_seq_ids)
+
+ def test_uclust_cluster_from_sorted_fasta_filepath(self):
+ """ Given a sorted fasta filepath, will return uclust (.uc) file """
+
+ app_res = \
+ uclust_cluster_from_sorted_fasta_filepath(
+ self.tmp_sorted_fasta_filepath,
+ percent_ID=0.90, HALT_EXEC=False)
+
+ uc_file = open(app_res['ClusterFile'].name, "U")
+ # compare the actual and expected uc files, ignoring comment lines
+ uc_file_actual = [l.strip() for l in uc_file
+ if not l.startswith('#')]
+ uc_file_expected = [l.strip() for l in uc_dna_clusters
+ if not l.startswith('#')]
+
+ self.assertEqual(uc_file_actual, uc_file_expected)
+ app_res.cleanUp()
+
+ def test_get_output_filepaths(self):
+ """ Properly generates output filepath names """
+
+ uc_res = \
+ get_output_filepaths(self.tmpdir, "test_seqs.fasta")
+
+ self.assertEqual(uc_res, join(self.tmpdir, "test_seqs_clusters.uc"))
+
+ def test_get_output_filepaths_multiple_dots(self):
+ """Generates filepath names from names with more than one dot"""
+ obs = get_output_filepaths(self.tmpdir, "test_seqs.filtered.fasta")
+ self.assertEqual(obs, join(self.tmpdir, "test_seqs.filtered_clusters.uc"))
+
+ def test_get_clusters_from_fasta_filepath(self):
+ """ Tests for return of lists of OTUs from given fasta filepath """
+
+ clusters_res = \
+ get_clusters_from_fasta_filepath(self.tmp_unsorted_fasta_filepath,
+ original_fasta_path=None, percent_ID=0.90, save_uc_files=False)
+ expected_cluster_list.sort()
+ expected_failure_list.sort()
+ expected_new_seed_list.sort()
+ clusters_res[0].sort()
+ clusters_res[1].sort()
+ clusters_res[2].sort()
+ self.assertEqual(clusters_res, (expected_cluster_list,
+ expected_failure_list,
+ expected_new_seed_list))
+
+ def test_get_clusters_from_fasta_filepath_reference_db_only(self):
+ """ Correct clusters returned when clustering against a database only
+ """
+ clusters_res = get_clusters_from_fasta_filepath(
+ self.tmp_unsorted_fasta_filepath,
+ original_fasta_path=None,
+ save_uc_files=False,
+ max_accepts=7, max_rejects=12,
+ percent_ID=0.90,
+ subject_fasta_filepath=self.ref_dna_seqs_fp,
+ suppress_new_clusters=True,
+ HALT_EXEC=False)
+
+ self.ref_test_clusters1.sort()
+ self.ref_test_failures1.sort()
+ self.ref_test_new_seeds1.sort()
+
+ clusters_res[0].sort()
+ clusters_res[1].sort()
+ clusters_res[2].sort()
+ self.assertEqual(clusters_res, (self.ref_test_clusters1,
+ self.ref_test_failures1,
+ self.ref_test_new_seeds1))
+
+ def test_get_clusters_from_fasta_filepath_extending_reference_db(self):
+ """ Correct clusters when clustering against db and adding new clusters
+ """
+ clusters_res = get_clusters_from_fasta_filepath(
+ self.tmp_unsorted_fasta_filepath,
+ original_fasta_path=None,
+ max_accepts=7, max_rejects=12,
+ percent_ID=0.90,
+ subject_fasta_filepath=self.ref_dna_seqs_fp,
+ suppress_new_clusters=False, enable_rev_strand_matching=True,
+ HALT_EXEC=False,
+ save_uc_files=False)
+
+ self.ref_test_clusters2.sort()
+ self.ref_test_failures2.sort()
+ self.ref_test_new_seeds2.sort()
+
+ clusters_res[0].sort()
+ clusters_res[1].sort()
+ clusters_res[2].sort()
+ self.assertEqual(clusters_res, (self.ref_test_clusters2,
+ self.ref_test_failures2,
+ self.ref_test_new_seeds2))
+
+ def test_get_clusters_from_fasta_filepath_optimal(self):
+ """ Test OTUs from filepath functions with optimal
+ """
+ # TODO: build a small test where optimal actually has an effect;
+ # currently this only checks that running with optimal=True does
+ # not fail
+ clusters_res = \
+ get_clusters_from_fasta_filepath(self.tmp_unsorted_fasta_filepath,
+ original_fasta_path=None, save_uc_files=False,
+ percent_ID=0.90, optimal=True)
+ expected_cluster_list.sort()
+ expected_failure_list.sort()
+ expected_new_seed_list.sort()
+ clusters_res[0].sort()
+ clusters_res[1].sort()
+ clusters_res[2].sort()
+
+ self.assertEqual(clusters_res, (expected_cluster_list,
+ expected_failure_list,
+ expected_new_seed_list))
+
+ def test_get_clusters_from_fasta_filepath_suppress_sort(self):
+ """ Test OTUs from filepath functions with suppress sort
+ """
+ expected = [['uclust_test_seqs_0'], ['uclust_test_seqs_1'],
+ ['uclust_test_seqs_2'], ['uclust_test_seqs_3'],
+ ['uclust_test_seqs_4'], ['uclust_test_seqs_5'],
+ ['uclust_test_seqs_6', 'uclust_test_seqs_8'],
+ ['uclust_test_seqs_7'], ['uclust_test_seqs_9']]
+ clusters_res = \
+ get_clusters_from_fasta_filepath(self.tmp_unsorted_fasta_filepath,
+ original_fasta_path=None,
+ percent_ID=0.90, suppress_sort=True, save_uc_files=False)
+ expected_cluster_list.sort()
+ expected_failure_list.sort()
+ expected_new_seed_list.sort()
+ clusters_res[0].sort()
+ clusters_res[1].sort()
+ clusters_res[2].sort()
+
+ self.assertEqual(clusters_res, (expected_cluster_list,
+ expected_failure_list,
+ expected_new_seed_list))
+
+ def test_get_clusters_from_fasta_filepath_rev_strand_match(self):
+ """ Test OTUs from filepath functions with rev strand match
+ """
+ # seq and its rc don't cluster when enable_rev_strand_matching = False
+ expected_cluster_list = [['uclust_test_seqs_0'],
+ ['uclust_test_seqs_0_rc']]
+ expected_failure_list = []
+ expected_new_seed_list = [
+ 'uclust_test_seqs_0',
+ 'uclust_test_seqs_0_rc']
+ clusters_res = \
+ get_clusters_from_fasta_filepath(self.tmp_raw_dna_seqs_rc_filepath,
+ original_fasta_path=None, save_uc_files=False,
+ percent_ID=0.90, enable_rev_strand_matching=False)
+
+ expected_cluster_list.sort()
+ expected_failure_list.sort()
+ expected_new_seed_list.sort()
+ clusters_res[0].sort()
+ clusters_res[1].sort()
+ clusters_res[2].sort()
+ self.assertEqual(clusters_res, (expected_cluster_list,
+ expected_failure_list,
+ expected_new_seed_list))
+
+ # seq and its rc cluster when enable_rev_strand_matching = True
+ expected_cluster_list = [
+ ['uclust_test_seqs_0', 'uclust_test_seqs_0_rc']]
+ expected_failure_list = []
+ expected_new_seed_list = ['uclust_test_seqs_0']
+ clusters_res = \
+ get_clusters_from_fasta_filepath(self.tmp_raw_dna_seqs_rc_filepath,
+ original_fasta_path=None, save_uc_files=False,
+ percent_ID=0.90, enable_rev_strand_matching=True)
+
+ expected_cluster_list.sort()
+ expected_failure_list.sort()
+ expected_new_seed_list.sort()
+ clusters_res[0].sort()
+ clusters_res[1].sort()
+ clusters_res[2].sort()
+ self.assertEqual(clusters_res, (expected_cluster_list,
+ expected_failure_list,
+ expected_new_seed_list))
+
+ def test_process_uclust_pw_alignment_results(self):
+ """parsing of pairwise alignment fasta pairs file functions as expected
+ """
+ actual = list(process_uclust_pw_alignment_results(
+ self.search_align_out_fasta_pairs1, self.search_align_out_uc1))
+ expected = self.search_align_out1_expected
+
+ # iterate over results so error output will highlight the bad match
+ for a, e in zip(actual, expected):
+ self.assertEqual(a, e)
+
+ # make sure the full result objects are the same
+ self.assertEqual(actual, expected)
+
+ def test_uclust_search_and_align_from_fasta_filepath(self):
+ """ uclust_search_and_align_from_fasta_filepath functions as expected """
+ # rev comp matches allowed (default)
+ actual = list(uclust_search_and_align_from_fasta_filepath(
+ self.search_align_query1_fp, self.search_align_template1_fp))
+ self.assertEqual(actual, self.search_align_out1_expected)
+
+ # rev comp matches not allowed
+ actual = list(uclust_search_and_align_from_fasta_filepath(
+ self.search_align_query1_fp, self.search_align_template1_fp,
+ enable_rev_strand_matching=False))
+ self.assertEqual(actual, self.search_align_out1_expected[:2])
+
+ def test_uclust_search_and_align_from_fasta_filepath_protein(self):
+ """ uclust_search_and_align_from_fasta_filepath functions with protein """
+ # rev comp matches allowed (default)
+ actual = list(uclust_search_and_align_from_fasta_filepath(
+ self.search_align_query2_fp, self.search_align_template2_fp))
+ self.assertEqual(actual, self.search_align_out2_expected)
+
+ def test_uclust_supported_version(self):
+ """uclust version is supported """
+ command = 'uclust --version'
+ proc = Popen(command, shell=True, universal_newlines=True,
+ stdout=PIPE, stderr=STDOUT)
+ stdout = proc.stdout.read()
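+ # the reported version looks like e.g. 'uclust v1.2.22q'; take the
+ # text after 'v' and drop a trailing 'q' before comparing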
+ version_string = stdout.strip().split('v')[-1].strip('q')
+ try:
+ version = tuple(map(int, version_string.split('.')))
+ acceptable_version = version >= (1, 2, 22)
+ except ValueError:
+ acceptable_version = False
+
+ self.assertTrue(acceptable_version,
+ "Unsupported uclust version. 1.2.22 or later " +
+ "is required, but running %s." % version_string)
+
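+# A minimal sketch of the naming rule asserted by the get_output_filepaths
+# tests above (an illustration under a splitext-based assumption, not the
+# bfillings.uclust implementation): only the final extension is replaced
+# when deriving the .uc filepath.
+from os.path import splitext
+
+def expected_uc_filepath(output_dir, fasta_name):
+ """e.g. 'a.filtered.fasta' -> '<output_dir>/a.filtered_clusters.uc'"""
+ base = splitext(fasta_name)[0] # strips only the last extension
+ return join(output_dir, base + '_clusters.uc')
+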
+raw_dna_seqs = """>uclust_test_seqs_0
+ACGGTGGCTACAAGACGTCCCATCCAACGGGTTGGATACTTAAGGCACATCACGTCAGTTTTGTGTCAGAGCT
+>uclust_test_seqs_1
+GCCACGGTGGGTACAACACGTCCACTACATCGGCTTGGAAGGTAAAGACACGTCGCGTCAGTATTGCGTCAGGGCT
+>uclust_test_seqs_2
+CCCCCACGGTGGCAGCAACACGTCACATACAACGGGTTGGATTCTAAAGACAAACCGCGTCAAAGTTGTGTCAGAACT
+>uclust_test_seqs_3
+CCCCACGGTAGCTGCAACACGTCCCATACCACGGGTAGGATGCTAAAGACACATCGGGTCTGTTTTGTGTCAGGGCT
+>uclust_test_seqs_4
+ACCCACACGGTGGATGCAACAGATCCCATACACCGAGTTGGATGCTTAAGACGCATCGCGTGAGTTTTGCGTCAAGGCT
+>uclust_test_seqs_5
+CCGCGGTAGGTGCAACACGTCCCATACAACGGGTTGGAAGGTTAAGACACAACGCGTTAATTTTGTGTCAGGGCA
+>uclust_test_seqs_6
+CGCGGTGGCTGCAAGACGTCCCATACAACGGGTTGGATGCTTAAGACACATCGCAACAGTTTTGAGTCAGGGCT
+>uclust_test_seqs_7
+AACCCCCACGGTGGATGCCACACGCCCCATACAAAGGGTAGGATGCTTAAGACACATCGCGTCAGGTTTGTGTCAGGCCT
+>uclust_test_seqs_8
+CGGTGGCTGCAACACGTGGCATACAACGGGTTGGATGCTTAAGACACATCGCCTCAGTTTTGTGTCAGGGCT
+>uclust_test_seqs_9
+GGTGGCTGAAACACATCCCATACAACGGGTTGGATGCTTAAGACACATCGCATCAGTTTTATGTCAGGGGA
+""".split('\n')
+
+ref_dna_seqs = """>ref1 25 random bases appended to uclust_test_seqs_0 and one mismatch
+ACGGTGGCTACAAGACGTCCCATCCAACGGGTTGGATATTTAAGGCACATCACGTCAGTTTTGTGTCAGAGCTATAGCAGCCCCAGCGTTTACTTCTA
+>ref2 15 random bases prepended to uclust_test_seqs_1 and one mismatch
+GCTGCGGCGTCCTGCGCCACGGTGGGTACAACACGTCCACTACATCTGCTTGGAAGGTAAAGACACGTCGCGTCAGTATTGCGTCAGGGCT
+>ref3 5 random bases prepended and 10 random bases appended to uclust_test_seqs_2
+ATAGGCCCCCACGGTGGCAGCAACACGTCACATACAACGGGTTGGATTCTAAAGACAAACCGCGTCAAAGTTGTGTCAGAACTGCCTGATTCA
+>ref4 exact match to uclust_test_seqs_3
+CCCCACGGTAGCTGCAACACGTCCCATACCACGGGTAGGATGCTAAAGACACATCGGGTCTGTTTTGTGTCAGGGCT
+"""
+
+ref_test_clusters1 = [['uclust_test_seqs_0'], ['uclust_test_seqs_1'],
+ ['uclust_test_seqs_2'], ['uclust_test_seqs_3']]
+ref_test_failures1 = ['uclust_test_seqs_4', 'uclust_test_seqs_5',
+ 'uclust_test_seqs_6', 'uclust_test_seqs_7',
+ 'uclust_test_seqs_8', 'uclust_test_seqs_9']
+ref_test_new_seeds1 = []
+
+ref_test_clusters2 = [['uclust_test_seqs_0'], ['uclust_test_seqs_1'],
+ ['uclust_test_seqs_2'], ['uclust_test_seqs_3'],
+ ['uclust_test_seqs_4'], ['uclust_test_seqs_5'],
+ ['uclust_test_seqs_6', 'uclust_test_seqs_8'],
+ ['uclust_test_seqs_7'], ['uclust_test_seqs_9']]
+ref_test_failures2 = []
+ref_test_new_seeds2 = [
+ 'uclust_test_seqs_4', 'uclust_test_seqs_5', 'uclust_test_seqs_6',
+ 'uclust_test_seqs_7', 'uclust_test_seqs_9']
+
+
+raw_dna_seqs_rc = """>uclust_test_seqs_0
+ACGGTGGCTACAAGACGTCCCATCCAACGGGTTGGATACTTAAGGCACATCACGTCAGTTTTGTGTCAGAGCT
+>uclust_test_seqs_0_rc
+AGCTCTGACACAAAACTGACGTGATGTGCCTTAAGTATCCAACCCGTTGGATGGGACGTCTTGTAGCCACCGT
+""".split('\n')
+
+sorted_dna_seqs = """>uclust_test_seqs_7
+AACCCCCACGGTGGATGCCACACGCCCCATACAAAGGGTAGGATGCTTAAGACACATCGCGTCAGGTTTGTGTCAGGCCT
+>uclust_test_seqs_4
+ACCCACACGGTGGATGCAACAGATCCCATACACCGAGTTGGATGCTTAAGACGCATCGCGTGAGTTTTGCGTCAAGGCT
+>uclust_test_seqs_2
+CCCCCACGGTGGCAGCAACACGTCACATACAACGGGTTGGATTCTAAAGACAAACCGCGTCAAAGTTGTGTCAGAACT
+>uclust_test_seqs_3
+CCCCACGGTAGCTGCAACACGTCCCATACCACGGGTAGGATGCTAAAGACACATCGGGTCTGTTTTGTGTCAGGGCT
+>uclust_test_seqs_1
+GCCACGGTGGGTACAACACGTCCACTACATCGGCTTGGAAGGTAAAGACACGTCGCGTCAGTATTGCGTCAGGGCT
+>uclust_test_seqs_5
+CCGCGGTAGGTGCAACACGTCCCATACAACGGGTTGGAAGGTTAAGACACAACGCGTTAATTTTGTGTCAGGGCA
+>uclust_test_seqs_6
+CGCGGTGGCTGCAAGACGTCCCATACAACGGGTTGGATGCTTAAGACACATCGCAACAGTTTTGAGTCAGGGCT
+>uclust_test_seqs_0
+ACGGTGGCTACAAGACGTCCCATCCAACGGGTTGGATACTTAAGGCACATCACGTCAGTTTTGTGTCAGAGCT
+>uclust_test_seqs_8
+CGGTGGCTGCAACACGTGGCATACAACGGGTTGGATGCTTAAGACACATCGCCTCAGTTTTGTGTCAGGGCT
+>uclust_test_seqs_9
+GGTGGCTGAAACACATCCCATACAACGGGTTGGATGCTTAAGACACATCGCATCAGTTTTATGTCAGGGGA
+""".split('\n')
+
+# Clusters are created at 90% identity (percent_ID=0.90)
+uc_dna_clusters = """# uclust --input /tmp/uclust_testBGwZvcikrbNefYGRTk0u.fasta --id 0.9 --uc /tmp/uclust_testrbcO0CyBVpV9AwH3OIK1.uc
+# version=1.1.577
+# Tab-separated fields:
+# 1=Type, 2=ClusterNr, 3=SeqLength or ClusterSize, 4=PctId, 5=Strand, 6=QueryStart, 7=SeedStart, 8=Alignment, 9=QueryLabel, 10=TargetLabel
+# Record types (field 1): L=LibSeed, S=NewSeed, H=Hit, R=Reject, D=LibCluster, C=NewCluster, N=NoHit
+# For C and D types, PctId is average id with seed.
+# QueryStart and SeedStart are zero-based relative to start of sequence.
+# If minus strand, SeedStart is relative to reverse-complemented seed.
+S 0 80 * * * * * uclust_test_seqs_7 *
+S 1 79 * * * * * uclust_test_seqs_4 *
+S 2 78 * * * * * uclust_test_seqs_2 *
+S 3 77 * * * * * uclust_test_seqs_3 *
+S 4 76 * * * * * uclust_test_seqs_1 *
+S 5 75 * * * * * uclust_test_seqs_5 *
+S 6 74 * * * * * uclust_test_seqs_6 *
+S 7 73 * * * * * uclust_test_seqs_0 *
+H 6 72 91.7 + 0 0 2I72M uclust_test_seqs_8 uclust_test_seqs_6
+S 8 71 * * * * * uclust_test_seqs_9 *
+C 0 1 * * * * * uclust_test_seqs_7 *
+C 1 1 * * * * * uclust_test_seqs_4 *
+C 2 1 * * * * * uclust_test_seqs_2 *
+C 3 1 * * * * * uclust_test_seqs_3 *
+C 4 1 * * * * * uclust_test_seqs_1 *
+C 5 1 * * * * * uclust_test_seqs_5 *
+C 6 2 91.7 * * * * uclust_test_seqs_6 *
+C 7 1 * * * * * uclust_test_seqs_0 *
+C 8 1 * * * * * uclust_test_seqs_9 *""".split('\n')
+
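+# The .uc records above are tab-separated with the field layout described in
+# their header comments (field 1 = record type, 9 = query label, 10 = target
+# label). A minimal sketch of reading the H (hit) records into a
+# seed -> members map -- an illustration only, not the clusters_from_uc_file
+# implementation:
+def hits_by_seed(uc_lines):
+ """Map seed label -> list of hit labels from the H records of a .uc file."""
+ hits = {}
+ for line in uc_lines:
+ if not line or line.startswith('#'):
+ continue
+ fields = line.split('\t')
+ if fields[0] == 'H':
+ hits.setdefault(fields[9], []).append(fields[8])
+ return hits
+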
+expected_cluster_list = [['uclust_test_seqs_7'],
+ ['uclust_test_seqs_4'],
+ ['uclust_test_seqs_2'],
+ ['uclust_test_seqs_3'],
+ ['uclust_test_seqs_1'],
+ ['uclust_test_seqs_5'],
+ ['uclust_test_seqs_6',
+ 'uclust_test_seqs_8'],
+ ['uclust_test_seqs_0'],
+ ['uclust_test_seqs_9']]
+expected_failure_list = []
+expected_new_seed_list = [
+ 'uclust_test_seqs_7', 'uclust_test_seqs_4', 'uclust_test_seqs_2',
+ 'uclust_test_seqs_3', 'uclust_test_seqs_1', 'uclust_test_seqs_5', 'uclust_test_seqs_6',
+ 'uclust_test_seqs_0', 'uclust_test_seqs_9']
+
+search_align_query1 = """>1_like
+TACGGCTACCTTGTTACGACTTCATCCCAATCATTTGTTCCACCTTCGACGGCTA
+>2_like
+ATGATGATTTGACGTCATCCCCACCTTCCTCCGGTTTGTCACCGGGATGGCAACTAAG
+>2_like_rc
+CTTAGTTGCCATCCCGGTGACAAACCGGAGGAAGGTGGGGATGACGTCAAATCATCAT
+>rand
+TTGCGACGAGCGGACGGCCGGGTGTATGTCGTCATATATATGTGTCTGCCTATCGTTACGTACACTCGTCGTCT
+"""
+
+search_align_template1 = """>1
+AGAAAGGAGGTGATCCAGCCGCACCTTCCGATACGGCTACCTTGTTACGACTTCACCCCAATCATTTGTTCCACCTTCGACGGCTAGCTCCAAATGGTTACTCCACCGGCTTCGGGTGTTACAAACTC
+>2
+AGCCCAAATCATAAGGGGCATGATGATTTGACGTCATCCCCACCTTCCTCCGGTTTGTCACCGGGATGGCAACTAAGCTTAAGGGTTGCGCT
+"""
+
+search_align_query2 = """>1_like
+PRTEINACYYPL
+>2_like
+AGGYTPPLVN
+>rand
+GGTYPARREE
+"""
+
+search_align_template2 = """>1
+PRTELNACYYPL
+>2
+AGGYTRPPLVN
+"""
+
+search_align_out2_expected = [
+ ('1_like', '1', 'PRTEINACYYPL', 'PRTELNACYYPL', 91.70000),
+ ('2_like', '2', 'AGGYT-PPLVN', 'AGGYTRPPLVN', 100.0)]
+
+search_align_out_fasta_pairs1 = """>1_like
+-------------------------------TACGGCTACCTTGTTACGACTTCATCCCAATCATTTGTTCCACCTTCGACGGCTA------------------------------------------
+>1+
+AGAAAGGAGGTGATCCAGCCGCACCTTCCGATACGGCTACCTTGTTACGACTTCACCCCAATCATTTGTTCCACCTTCGACGGCTAGCTCCAAATGGTTACTCCACCGGCTTCGGGTGTTACAAACTC
+
+>2_like
+-------------------ATGATGATTTGACGTCATCCCCACCTTCCTCCGGTTTGTCACCGGGATGGCAACTAAG---------------
+>2+
+AGCCCAAATCATAAGGGGCATGATGATTTGACGTCATCCCCACCTTCCTCCGGTTTGTCACCGGGATGGCAACTAAGCTTAAGGGTTGCGCT
+
+>2_like_rc
+---------------CTTAGTTGCCATCCCGGTGACAAACCGGAGGAAGGTGGGGATGACGTCAAATCATCAT-------------------
+>2-
+AGCGCAACCCTTAAGCTTAGTTGCCATCCCGGTGACAAACCGGAGGAAGGTGGGGATGACGTCAAATCATCATGCCCCTTATGATTTGGGCT
+""".split('\n')
+
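+# The --fastapairs output above interleaves records two at a time: the
+# aligned query, then the aligned template, whose label carries a '+'/'-'
+# strand suffix, with a blank line between pairs. A minimal pairing reader,
+# sketched under that assumption (the real parsing, including percent
+# identities from the .uc file, is done by
+# process_uclust_pw_alignment_results):
+def iter_alignment_pairs(fasta_pair_lines):
+ """Yield (query_label, hit_label, query_aln, hit_aln) tuples."""
+ records = []
+ for line in fasta_pair_lines:
+ line = line.strip()
+ if not line:
+ continue
+ if line.startswith('>'):
+ records.append([line[1:], []])
+ else:
+ records[-1][1].append(line)
+ for i in range(0, len(records) - 1, 2):
+ (q_label, q_seq), (h_label, h_seq) = records[i], records[i + 1]
+ yield q_label, h_label, ''.join(q_seq), ''.join(h_seq)
+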
+search_align_out_uc1 = """# uclust --input sm_query.fasta --lib sm_template.fasta --id 0.75 --libonly --rev --maxaccepts 8 --maxrejects 32 --fastapairs sm_pw.fasta --uc sm_result.uc
+# version=1.1.577
+# Tab-separated fields:
+# 1=Type, 2=ClusterNr, 3=SeqLength or ClusterSize, 4=PctId, 5=Strand, 6=QueryStart, 7=SeedStart, 8=Alignment, 9=QueryLabel, 10=TargetLabel
+# Record types (field 1): L=LibSeed, S=NewSeed, H=Hit, R=Reject, D=LibCluster, C=NewCluster, N=NoHit
+# For C and D types, PctId is average id with seed.
+# QueryStart and SeedStart are zero-based relative to start of sequence.
+# If minus strand, SeedStart is relative to reverse-complemented seed.
+L 0 128 * * * * * 1 *
+H 0 55 98.2 + 0 0 31I55M42I 1_like 1
+L 1 92 * * * * * 2 *
+H 1 58 100.0 + 0 0 19I58M15I 2_like 2
+H 1 58 100.0 - 0 0 15I58M19I 2_like_rc 2
+N * 74 * * * * * rand *
+D 0 2 * * * * 98.2 1 *
+D 1 3 * * * * 100.0 2 *
+""".split('\n')
+
+search_align_out1_expected = [
+ ('1_like', '1', '-------------------------------TACGGCTACCTTGTTACGACTTCATCCCAATCATTTGTTCCACCTTCGACGGCTA------------------------------------------',
+ 'AGAAAGGAGGTGATCCAGCCGCACCTTCCGATACGGCTACCTTGTTACGACTTCACCCCAATCATTTGTTCCACCTTCGACGGCTAGCTCCAAATGGTTACTCCACCGGCTTCGGGTGTTACAAACTC', 98.2),
+
+ ('2_like', '2', '-------------------ATGATGATTTGACGTCATCCCCACCTTCCTCCGGTTTGTCACCGGGATGGCAACTAAG---------------',
+ 'AGCCCAAATCATAAGGGGCATGATGATTTGACGTCATCCCCACCTTCCTCCGGTTTGTCACCGGGATGGCAACTAAGCTTAAGGGTTGCGCT', 100.0),
+
+ ('2_like_rc RC', '2', '-------------------ATGATGATTTGACGTCATCCCCACCTTCCTCCGGTTTGTCACCGGGATGGCAACTAAG---------------', 'AGCCCAAATCATAAGGGGCATGATGATTTGACGTCATCCCCACCTTCCTCCGGTTTGTCACCGGGATGGCAACTAAGCTTAAGGGTTGCGCT', 100.0)]
+
+uc_lines1 = """# uclust --input q.fasta --lib r.fasta --uc results.uc --id 0.90 --libonly --rev
+# version=1.1.579
+# Tab-separated fields:
+# 1=Type, 2=ClusterNr, 3=SeqLength or ClusterSize, 4=PctId, 5=Strand, 6=QueryStart, 7=SeedStart, 8=Alignment, 9=QueryLabel, 10=TargetLabel
+# Record types (field 1): L=LibSeed, S=NewSeed, H=Hit, R=Reject, D=LibCluster, C=NewCluster, N=NoHit
+# For C and D types, PctId is average id with seed.
+# QueryStart and SeedStart are zero-based relative to start of sequence.
+# If minus strand, SeedStart is relative to reverse-complemented seed.
+N * 80 * * * * * s1 some comment *
+S 4 80 * * * * * s2 some other comment *
+H 2 78 100.0 + 0 0 5I78M10I s3 yet another comment s2""".split('\n')
+
+uc_lines_w_multiple_hits_per_query = """# uclust --input q.fasta --lib r.fasta --uc results.uc --id 0.90 --libonly --rev
+# version=1.1.579
+# Tab-separated fields:
+# 1=Type, 2=ClusterNr, 3=SeqLength or ClusterSize, 4=PctId, 5=Strand, 6=QueryStart, 7=SeedStart, 8=Alignment, 9=QueryLabel, 10=TargetLabel
+# Record types (field 1): L=LibSeed, S=NewSeed, H=Hit, R=Reject, D=LibCluster, C=NewCluster, N=NoHit
+# For C and D types, PctId is average id with seed.
+# QueryStart and SeedStart are zero-based relative to start of sequence.
+# If minus strand, SeedStart is relative to reverse-complemented seed.
+N * 80 * * * * * s1 some comment *
+S 4 80 * * * * * s2 some other comment *
+S 4 80 * * * * * s4 *
+H 2 78 100.0 + 0 0 5I78M10I s3 yet another comment s2
+H 2 78 98.0 + 0 0 5I78M10I s3 yet another comment s4
+""".split('\n')
+
+uc_lines_overlapping_lib_input_seq_ids = """# uclust --maxrejects 32 --input /tmp/OtuPickerbb092OWRWLWqlBR2BmTZ.fasta --id 0.97 --uc /tmp/uclust_clustersLf5Oqv0SvGTZo1mVWBqK.uc --rev --usersort --maxaccepts 8 --lib r.fasta
+# version=1.1.16
+# Tab-separated fields:
+# 1=Type, 2=ClusterNr, 3=SeqLength or ClusterSize, 4=PctId, 5=Strand, 6=QueryStart, 7=SeedStart, 8=Alignment, 9=QueryLabel, 10=TargetLabel
+# Record types (field 1): L=LibSeed, S=NewSeed, H=Hit, R=Reject, D=LibCluster, C=NewCluster, N=NoHit
+# For C and D types, PctId is average id with seed.
+# QueryStart and SeedStart are zero-based relative to start of sequence.
+# If minus strand, SeedStart is relative to reverse-complemented seed.
+S 1 24 * * * * * 3 *
+H 1 24 100.0 + 0 0 24M 4 3
+L 0 54 * * * * * 3 *
+H 0 54 100.0 + 0 0 54M 2 3
+D 0 2 * * * * 100.0 3 *
+C 1 2 100.0 * * * * 3 *
+""".split('\n')
+
+if __name__ == '__main__':
+ main()
diff --git a/bfillings/tests/test_usearch.py b/bfillings/tests/test_usearch.py
new file mode 100755
index 0000000..5e0ba0b
--- /dev/null
+++ b/bfillings/tests/test_usearch.py
@@ -0,0 +1,2000 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+
+"""
+Provides unit tests for the usearch.py module.
+"""
+
+from os import close
+from os.path import basename, join, exists
+from shutil import rmtree
+from glob import glob
+from unittest import TestCase, main
+from tempfile import mkstemp, mkdtemp
+
+from skbio.util import remove_files
+from skbio.parse.sequences import parse_fasta
+
+from bfillings.usearch import (clusters_from_blast_uc_file,
+ usearch_fasta_sort_from_filepath,
+ usearch_dereplicate_exact_subseqs,
+ usearch_dereplicate_exact_seqs,
+ usearch_sort_by_abundance,
+ usearch_cluster_error_correction,
+ usearch_chimera_filter_de_novo,
+ usearch_chimera_filter_ref_based,
+ usearch_cluster_seqs,
+ enumerate_otus, assign_reads_to_otus,
+ usearch_qf, concatenate_fastas,
+ get_retained_chimeras,
+ assign_dna_reads_to_protein_database,
+ usearch61_ref_cluster,
+ usearch61_denovo_cluster,
+ sort_by_abundance_usearch61,
+ sort_by_length_usearch61,
+ usearch61_cluster_ref,
+ usearch61_fast_cluster,
+ usearch61_smallmem_cluster,
+ parse_dereplicated_uc,
+ parse_usearch61_clusters,
+ merge_clusters_dereplicated_seqs,
+ merge_failures_dereplicated_seqs,
+ parse_usearch61_failures,
+ usearch61_chimera_check_denovo,
+ usearch61_chimera_check_ref)
+
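+
+# Several tests below (test_parse_dereplicated_uc through
+# test_merge_failures_dereplicated_seqs) exercise how duplicates collapsed
+# during dereplication are folded back into the final clusters. A minimal
+# sketch of that expansion -- an illustration only, not the
+# merge_clusters_dereplicated_seqs implementation -- where each cluster
+# member expands to itself plus its dereplicated duplicates:
+def merge_sketch(clustered_ids, derep_ids):
+ merged = {}
+ for otu_id, members in clustered_ids.items():
+ expanded = []
+ for seq_id in members:
+ expanded.append(seq_id)
+ expanded.extend(derep_ids.get(seq_id, []))
+ merged[otu_id] = expanded
+ return merged
+
+# e.g. merge_sketch({'seq4': ['seq2'], 'seq1': ['seq1']},
+# {'seq2': ['seq3', 'seq4'], 'seq1': []})
+# == {'seq4': ['seq2', 'seq3', 'seq4'], 'seq1': ['seq1']}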
+
+class Usearch61Tests(TestCase):
+
+ """ Tests for usearch 6.1 functionality """
+
+ def setUp(self):
+ # create the temporary input files
+
+ self.output_dir = '/tmp/'
+
+ self.dna_seqs_1 = dna_seqs_1
+ self.usearch_ref_seqs1 = usearch_ref_seqs1
+ self.dna_seqs_1_subset = dna_seqs_1_subset
+ self.dna_seqs_with_dups = dna_seqs_with_dups2
+ self.usearch61_dereplicated_uc_lines = usearch61_dereplicated_uc_lines
+ self.usearch61_clustered_uc_lines = usearch61_clustered_uc_lines
+ self.usearch61_clustered_uc_lines_ref =\
+ usearch61_clustered_uc_lines_ref
+ self.usearch61_clustered_ref_lines = usearch61_clustered_ref_lines
+ self.de_novo_chimera_seqs = de_novo_chimera_seqs
+ self.expected_usearch61_denovo_uchime_file =\
+ expected_usearch61_denovo_uchime_file
+ self.reference_seqs_fp = reference_seqs_fp
+ self.expected_usearch61_ref_uchime_file =\
+ expected_usearch61_ref_uchime_file
+
+ f, self.tmp_dna_seqs_1 = mkstemp(prefix='UsearchOtuPickerTest_',
+ suffix='.fasta')
+ close(f)
+ seq_file = open(self.tmp_dna_seqs_1, 'w')
+ seq_file.write(self.dna_seqs_1)
+ seq_file.close()
+
+ f, self.tmp_usearch_ref_seqs1 = mkstemp(prefix='UsearchOtuPickerTest_',
+ suffix='.fasta')
+ close(f)
+ seq_file = open(self.tmp_usearch_ref_seqs1, 'w')
+ seq_file.write(self.usearch_ref_seqs1)
+ seq_file.close()
+
+ f, self.tmp_dna_seqs_1_subset = mkstemp(prefix='UsearchOtuPickerTest_',
+ suffix='.fasta')
+ close(f)
+ seq_file = open(self.tmp_dna_seqs_1_subset, 'w')
+ seq_file.write(self.dna_seqs_1_subset)
+ seq_file.close()
+
+ f, self.tmp_dna_seqs_with_dups = \
+ mkstemp(prefix='UsearchOtuPickerTest_', suffix='.fasta')
+ close(f)
+ seq_file = open(self.tmp_dna_seqs_with_dups, "w")
+ seq_file.write(self.dna_seqs_with_dups)
+ seq_file.close()
+
+ f, self.tmp_de_novo_chimera_seqs = \
+ mkstemp(prefix='Usearch61denovoChimera_', suffix='.fasta')
+ close(f)
+ seq_file = open(self.tmp_de_novo_chimera_seqs, 'w')
+ seq_file.write(self.de_novo_chimera_seqs)
+ seq_file.close()
+
+ f, self.tmp_ref_chimera_seqs = mkstemp(prefix="Usearch61refChimera_",
+ suffix='.fasta')
+ close(f)
+ seq_file = open(self.tmp_ref_chimera_seqs, "w")
+ seq_file.write(self.reference_seqs_fp)
+ seq_file.close()
+
+ self._files_to_remove =\
+ [self.tmp_dna_seqs_1, self.tmp_usearch_ref_seqs1,
+ self.tmp_dna_seqs_1_subset, self.tmp_dna_seqs_with_dups,
+ self.tmp_de_novo_chimera_seqs, self.tmp_ref_chimera_seqs]
+
+ self._dirs_to_remove = []
+
+ def tearDown(self):
+ remove_files(self._files_to_remove)
+ if self._dirs_to_remove:
+ for curr_dir in self._dirs_to_remove:
+ rmtree(curr_dir)
+
+ def test_usearch61_ref_default_params(self):
+ """ usearch61 reference OTU picking works with default settings """
+
+ clusters, failures = usearch61_ref_cluster(self.tmp_dna_seqs_1,
+ self.tmp_usearch_ref_seqs1, output_dir=self.output_dir,
+ save_intermediate_files=False, remove_usearch_logs=True)
+
+ # Should all fall into single, de novo clusters
+
+ expected_failures = []
+
+ self.assertEqual(failures, expected_failures)
+
+ expected_clusters = [['uclust_test_seqs_9'], ['uclust_test_seqs_8'],
+ ['uclust_test_seqs_3'], ['uclust_test_seqs_5'], ['uclust_test_seqs_4'],
+ ['uclust_test_seqs_1'], ['uclust_test_seqs_0'], ['uclust_test_seqs_2'],
+ ['uclust_test_seqs_7'], ['uclust_test_seqs_6']]
+
+ self.assertEqual(len(clusters), 10)
+
+ for curr_cluster in clusters.values():
+ self.assertTrue(curr_cluster in expected_clusters)
+
+ def test_usearch61_ref_default_params_suppressed_clusters(self):
+ """ usearch61 reference OTU picking, suppressed clusters """
+
+ clusters, failures = usearch61_ref_cluster(self.tmp_dna_seqs_1,
+ self.tmp_usearch_ref_seqs1, suppress_new_clusters=True,
+ output_dir=self.output_dir,
+ save_intermediate_files=False, remove_usearch_logs=True)
+
+ # Should all fail as the reference database does not match.
+
+ expected_clusters = {}
+
+ expected_failures = ['uclust_test_seqs_0', 'uclust_test_seqs_9',
+ 'uclust_test_seqs_4', 'uclust_test_seqs_7', 'uclust_test_seqs_2',
+ 'uclust_test_seqs_1', 'uclust_test_seqs_3', 'uclust_test_seqs_8',
+ 'uclust_test_seqs_6', 'uclust_test_seqs_5']
+
+ self.assertEqual(clusters, expected_clusters)
+
+ for curr_failure in failures:
+ self.assertTrue(curr_failure in expected_failures)
+
+ def test_usearch61_ref_default_params_matches_ref(self):
+ """ usearch61 reference OTU picking, matches ref OTU IDs """
+
+ clusters, failures = usearch61_ref_cluster(self.tmp_dna_seqs_1,
+ self.tmp_dna_seqs_1, suppress_new_clusters=True,
+ output_dir=self.output_dir,
+ save_intermediate_files=False, remove_usearch_logs=True)
+
+ # Should all fall into single, ref-based clusters
+
+ expected_clusters = {'uclust_test_seqs_5': ['uclust_test_seqs_5'],
+ 'uclust_test_seqs_4': ['uclust_test_seqs_4'],
+ 'uclust_test_seqs_7': ['uclust_test_seqs_7'],
+ 'uclust_test_seqs_6': ['uclust_test_seqs_6'],
+ 'uclust_test_seqs_1': ['uclust_test_seqs_1'],
+ 'uclust_test_seqs_0': ['uclust_test_seqs_0'],
+ 'uclust_test_seqs_3': ['uclust_test_seqs_3'],
+ 'uclust_test_seqs_2': ['uclust_test_seqs_2'],
+ 'uclust_test_seqs_9': ['uclust_test_seqs_9'],
+ 'uclust_test_seqs_8': ['uclust_test_seqs_8']}
+
+ expected_failures = []
+
+ self.assertEqual(clusters, expected_clusters)
+ self.assertEqual(failures, expected_failures)
+
+ def test_usearch61_ref_open_ref(self):
+ """ usearch61 does open reference OTU picking """
+
+ clusters, failures = usearch61_ref_cluster(self.tmp_dna_seqs_1,
+ self.tmp_dna_seqs_1_subset, percent_id=0.98, rev=True,
+ save_intermediate_files=False, minlen=44,
+ output_dir=self.output_dir, remove_usearch_logs=True,
+ verbose=False, wordlength=12, usearch_fast_cluster=False,
+ usearch61_sort_method='abundance', otu_prefix="denovo",
+ usearch61_maxrejects=100, usearch61_maxaccepts=4,
+ sizeorder=True)
+
+ # Should all fall into single, ref-based & denovo clusters
+
+ expected_ref_results = {'uclust_test_seqs_1': ['uclust_test_seqs_1'],
+ 'uclust_test_seqs_0': ['uclust_test_seqs_0'],
+ 'uclust_test_seqs_3': ['uclust_test_seqs_3'],
+ 'uclust_test_seqs_2': ['uclust_test_seqs_2']}
+
+ expected_denovo_results = [['uclust_test_seqs_5'],
+ ['uclust_test_seqs_7'], ['uclust_test_seqs_8'], ['uclust_test_seqs_4'],
+ ['uclust_test_seqs_6'], ['uclust_test_seqs_9']]
+
+ self.assertEqual(len(clusters), 10)
+
+ for curr_ref_result in expected_ref_results:
+ self.assertEqual(clusters[curr_ref_result],
+ expected_ref_results[curr_ref_result])
+ for curr_denovo_result in expected_denovo_results:
+ self.assertTrue(curr_denovo_result in clusters.values())
+
+ expected_failures = []
+
+ self.assertEqual(failures, expected_failures)
+
+ def test_usearch61_denovo_default_params(self):
+ """ usearch61 denovo OTU picking works with default settings """
+
+ clusters = usearch61_denovo_cluster(self.tmp_dna_seqs_1,
+ output_dir=self.output_dir, save_intermediate_files=False,
+ remove_usearch_logs=True)
+
+ # Should all fall into single, de novo clusters
+
+ expected_clusters = [['uclust_test_seqs_9'], ['uclust_test_seqs_8'],
+ ['uclust_test_seqs_3'], ['uclust_test_seqs_5'], ['uclust_test_seqs_4'],
+ ['uclust_test_seqs_1'], ['uclust_test_seqs_0'], ['uclust_test_seqs_2'],
+ ['uclust_test_seqs_7'], ['uclust_test_seqs_6']]
+
+ self.assertEqual(len(clusters), 10)
+
+ for curr_cluster in clusters.values():
+ self.assertTrue(curr_cluster in expected_clusters)
+
+ def test_usearch61_denovo_length_sorting(self):
+ """ usearch61 denovo OTU picking works with length sorting """
+
+ clusters = usearch61_denovo_cluster(self.tmp_dna_seqs_1,
+ output_dir=self.output_dir, save_intermediate_files=False,
+ remove_usearch_logs=True, usearch61_sort_method='length')
+
+ # Should all fall into single, de novo clusters
+
+ expected_clusters = [['uclust_test_seqs_9'], ['uclust_test_seqs_8'],
+ ['uclust_test_seqs_3'], ['uclust_test_seqs_5'], ['uclust_test_seqs_4'],
+ ['uclust_test_seqs_1'], ['uclust_test_seqs_0'], ['uclust_test_seqs_2'],
+ ['uclust_test_seqs_7'], ['uclust_test_seqs_6']]
+
+ self.assertEqual(len(clusters), 10)
+
+ for curr_cluster in clusters.values():
+ self.assertTrue(curr_cluster in expected_clusters)
+
+ def test_usearch61_denovo_no_sorting(self):
+ """ usearch61 denovo OTU picking works with no sorting """
+
+ clusters = usearch61_denovo_cluster(self.tmp_dna_seqs_1,
+ output_dir=self.output_dir, save_intermediate_files=False,
+ remove_usearch_logs=True, usearch61_sort_method='None')
+
+ # Should all fall into single, de novo clusters
+
+ expected_clusters = [['uclust_test_seqs_9'], ['uclust_test_seqs_8'],
+ ['uclust_test_seqs_3'], ['uclust_test_seqs_5'], ['uclust_test_seqs_4'],
+ ['uclust_test_seqs_1'], ['uclust_test_seqs_0'], ['uclust_test_seqs_2'],
+ ['uclust_test_seqs_7'], ['uclust_test_seqs_6']]
+
+ self.assertEqual(len(clusters), 10)
+
+ for curr_cluster in clusters.values():
+ self.assertTrue(curr_cluster in expected_clusters)
+
+ def test_usearch61_denovo_fast_cluster(self):
+ """ usearch61 denovo OTU picking works with fast_cluster sorting """
+
+ clusters = usearch61_denovo_cluster(self.tmp_dna_seqs_1,
+ output_dir=self.output_dir, save_intermediate_files=False,
+ remove_usearch_logs=True, usearch61_sort_method='length',
+ usearch_fast_cluster=True)
+
+ # Should all fall into single, de novo clusters
+
+ expected_clusters = [['uclust_test_seqs_9'], ['uclust_test_seqs_8'],
+ ['uclust_test_seqs_3'], ['uclust_test_seqs_5'], ['uclust_test_seqs_4'],
+ ['uclust_test_seqs_1'], ['uclust_test_seqs_0'], ['uclust_test_seqs_2'],
+ ['uclust_test_seqs_7'], ['uclust_test_seqs_6']]
+
+ self.assertEqual(len(clusters), 10)
+
+ for curr_cluster in clusters.values():
+ self.assertTrue(curr_cluster in expected_clusters)
+
+ def test_sort_by_abundance_usearch61(self):
+ """ usearch61 sorts by abundance successfully """
+
+ f, sorted_fna_fp = mkstemp(prefix='UsearchOtuPickerTest_',
+ suffix='.fasta')
+ close(f)
+ f, sorted_uc_fp = mkstemp(prefix='UsearchOtuPickerTest_', suffix='.uc')
+ close(f)
+
+ output_fna_filepath, output_uc_filepath, app_result =\
+ sort_by_abundance_usearch61(self.tmp_dna_seqs_with_dups,
+ self.output_dir, remove_usearch_logs=True,
+ output_fna_filepath=sorted_fna_fp,
+ output_uc_filepath=sorted_uc_fp, log_name="abundance_sorted.log")
+
+ output_fna = [
+ line for line in parse_fasta(open(output_fna_filepath, "U"))]
+
+ expected_fna = [('seq2;size=3;',
+ 'TTGGGCCGTGTCTCAGTCCCAATGTGGCCGTCACCCTCTCAGGCCGGCTACTGATCGTCGCCTTGGTGGGCCTTTACCCC'),
+ ('seq1;size=1;',
+ 'GCCAACCAGCTAATCAGACGCGGGTCCATCTTGCACCACCGGAGTTTTTCACACTGCTTCATGCGAAGCTGTGCGCTTAA')]
+
+ self._files_to_remove.append(sorted_fna_fp)
+ self._files_to_remove.append(sorted_uc_fp)
+
+ self.assertEqual(output_fna, expected_fna)
+
+ def test_sort_by_length_usearch61(self):
+ """ usearch61 sorts by length successfully """
+
+ f, sorted_fna_fp = mkstemp(prefix='UsearchOtuPickerTest_',
+ suffix='.fasta')
+ close(f)
+
+ output_fna_filepath, app_result =\
+ sort_by_length_usearch61(self.tmp_usearch_ref_seqs1,
+ self.output_dir, remove_usearch_logs=True,
+ output_fna_filepath=sorted_fna_fp)
+
+ output_fna = [
+ line for line in parse_fasta(open(output_fna_filepath, "U"))]
+
+ expected_fna = [('ref1',
+ 'CGCGTGTATGAAGAAGGCCTTCGGGTTGTAAAGTACTTTCAGCGGGGAGGAGGGAGTAAAGTTAATACCTTTGCTCATTGACGTTACCCGCAGAAGAAGCACCGGCTAACTCCGTGCCAGCAGCCGCGGTAATACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGCACGCAGGCGGTTTGTTAAGTCA'),
+ ('L07864',
+ 'GGCTCAGATTGAACGCTGGCGGCATGCCTAACACATGCAAGTCGAACGGTAACAGGCGGAGCTTGCTCTGCGCTGACGAGTGGCGGACGGGTGAGTAATGCATGGGAATCTGCCATATAGTGGGGGACAACTGGGGAAACCCAGGCTAATACCGCATAATCTCTACGGAGGAAAGGCTTC'),
+ ('EU199232',
+ 'TACGCGCGGAAATCGAGCGAGATTGGGAACGCAAGTTCCTGAGTATTGCGGCGAACGGGTGAGTAAGACGTGGGTGATCTACCCCTAGGGTGGGAATAACCCGGGGAAACCCGGGCTAATACCGAATAAGACCACAGGAGGCGACTCCAGAGGGTCAAAGGGAGCCTTGGCCTCCCCC')]
+ self._files_to_remove.append(sorted_fna_fp)
+
+ self.assertEqual(output_fna, expected_fna)
+
+ def test_usearch61_cluster_ref(self):
+ """ usearch61 reference OTU picking application call successful """
+
+ f, output_uc_fp = mkstemp(prefix='UsearchOtuPickerTest_', suffix='.uc')
+ close(f)
+
+ uc_fp, failures = usearch61_cluster_ref(self.tmp_dna_seqs_1,
+ self.tmp_dna_seqs_1, output_dir=self.output_dir,
+ remove_usearch_logs=True, output_uc_filepath=output_uc_fp)
+
+ self._files_to_remove.append(uc_fp)
+
+ actual_uc_lines = [line.strip() for line in open(uc_fp, "U")]
+
+ # Exact output is difficult to test, as the numbers and the order of
+ # lines change between runs, so for now just check the line count.
+
+ self.assertEqual(len(actual_uc_lines), 10)
+
+ def test_usearch61_fast_cluster(self):
+ """ usearch61 fast cluster OTU picking application call successful """
+
+ f, output_uc_fp = mkstemp(prefix='UsearchOtuPickerTest_', suffix='.uc')
+ close(f)
+
+ uc_fp, failures = usearch61_fast_cluster(self.tmp_dna_seqs_1,
+ output_dir=self.output_dir,
+ remove_usearch_logs=True, output_uc_filepath=output_uc_fp)
+
+ self._files_to_remove.append(uc_fp)
+
+ actual_uc_lines = [line.strip() for line in open(uc_fp, "U")]
+
+ # Exact output is difficult to test, as the numbers and the order of
+ # lines change between runs, so for now just check the line count.
+
+ self.assertEqual(len(actual_uc_lines), 20)
+
+ def test_usearch61_cluster_smallmem(self):
+ """ usearch61 smallmem OTU picking application call successful """
+
+ f, output_uc_fp = mkstemp(prefix='UsearchOtuPickerTest_', suffix='.uc')
+ close(f)
+
+ uc_fp, failures = usearch61_smallmem_cluster(self.tmp_dna_seqs_1,
+ output_dir=self.output_dir,
+ remove_usearch_logs=True, output_uc_filepath=output_uc_fp)
+
+ self._files_to_remove.append(uc_fp)
+
+ actual_uc_lines = [line.strip() for line in open(uc_fp, "U")]
+
+ # Exact output is difficult to test, as the numbers and the order of
+ # lines change between runs, so for now just check the line count.
+
+ self.assertEqual(len(actual_uc_lines), 20)
+
+ def test_parse_dereplicated_uc(self):
+ """ Parses dereplicated usearch61 uc file successfully """
+
+ actual_derep_ids =\
+ parse_dereplicated_uc(self.usearch61_dereplicated_uc_lines)
+
+ expected_derep_ids = {'seq2': ['seq3', 'seq4'], 'seq1': []}
+
+ self.assertEqual(actual_derep_ids, expected_derep_ids)
+
+ def test_parse_usearch61_clusters_denovo(self):
+ """ Parses usearch61 de novo clusters uc file correctly """
+
+ actual_parsed_clusters, failures =\
+ parse_usearch61_clusters(self.usearch61_clustered_uc_lines,
+ ref_clustered=False)
+
+ expected_parsed_clusters =\
+ ({'denovo0': ['seq2'], 'denovo1': ['seq1']})
+
+ self.assertEqual(actual_parsed_clusters, expected_parsed_clusters)
+
+ def test_parse_usearch61_clusters_ref(self):
+ """ Parses usearch61 ref clusters uc file correctly """
+
+ actual_parsed_clusters, failures =\
+ parse_usearch61_clusters(self.usearch61_clustered_uc_lines_ref,
+ otu_prefix='', ref_clustered=True)
+
+ expected_parsed_clusters =\
+ ({'seq4': ['seq2'], 'seq1': ['seq1']})
+
+ self.assertEqual(actual_parsed_clusters, expected_parsed_clusters)
+
+ def test_merge_clusters_dereplicated_seqs(self):
+ """ Properly merges dereplicated and clustered sequences """
+
+ derep_ids = {'seq2': ['seq3', 'seq4'], 'seq1': []}
+
+ clustered_ids = ({'seq4': ['seq2'], 'seq1': ['seq1']})
+
+ merged_ids = merge_clusters_dereplicated_seqs(clustered_ids,
+ derep_ids)
+
+ expected_ids = {'seq1': ['seq1'], 'seq4': ['seq2', 'seq3', 'seq4']}
+
+ self.assertEqual(merged_ids, expected_ids)
+
+ def test_merge_failures_dereplicated_seqs(self):
+ """ Usearch61 properly merges dereplicated seqs, ref based failures """
+
+ failures = ['seq2']
+ derep_ids = {'seq2': ['seq3', 'seq4'], 'seq1': []}
+
+ merged_failures = merge_failures_dereplicated_seqs(failures,
+ derep_ids)
+
+ expected_failures = ['seq2', 'seq3', 'seq4']
+
+ self.assertEqual(merged_failures, expected_failures)
+
+ def test_parse_usearch61_failures(self):
+ """ Writes failures out to fasta file """
+
+ failures = ['seq2', 'seq3', 'seq4']
+ f, filtered_fna_fp = mkstemp(prefix='UsearchOtuPickerTest_',
+ suffix='.fasta')
+ close(f)
+ output_fp = parse_usearch61_failures(self.tmp_dna_seqs_with_dups,
+ failures, filtered_fna_fp)
+
+ self._files_to_remove.append(output_fp)
+
+ output_fna = [
+ line for line in parse_fasta(open(output_fp, "U"))]
+
+ expected_fna = [(
+ 'seq2',
+ 'TTGGGCCGTGTCTCAGTCCCAATGTGGCCGTCACCCTCTCAGGCCGGCTACTGATCGTCGCCTTGGTGGGCCTTTACCCC'),
+ ('seq3',
+ 'TTGGGCCGTGTCTCAGTCCCAATGTGGCCGTCACCCTCTCAGGCCGGCTACTGATCGTCGCCTTGGTGGGCCTTTACCCC'),
+ ('seq4',
+ 'TTGGGCCGTGTCTCAGTCCCAATGTGGCCGTCACCCTCTCAGGCCGGCTACTGATCGTCGCCTTGGTGGGCCTTTACCCC')]
+ self.assertEqual(output_fna, expected_fna)
+
+ # Chimera tests
+
+ def test_usearch61_denovo_chimera_detection(self):
+ """ usearch61 denovo chimera detection correctly flags chimeras """
+
+ uchime_fp = join(self.output_dir, "uchime_denovo.uchime")
+
+ uchime_fp, app_result =\
+ usearch61_chimera_check_denovo(self.tmp_de_novo_chimera_seqs,
+ uchime_denovo_fp=uchime_fp,
+ output_dir=self.output_dir,
+ remove_usearch_logs=True)
+
+ uchime_f = open(uchime_fp, "U")
+
+ actual_lines = [line.strip() for line in uchime_f]
+
+ # Chimera calculations have a system-dependent stochastic component,
+ # so only check the Y/N chimera flag on each line
+
+ expected_chimera_ixs = [11, 16]
+
+ for ix, line in enumerate(actual_lines):
+ curr_chimera_flag = line.split('\t')[-1]
+ if ix in expected_chimera_ixs:
+ self.assertEqual(curr_chimera_flag, "Y")
+ else:
+ self.assertEqual(curr_chimera_flag, "N")
+
+ self._files_to_remove.append(uchime_fp)
+
+ def test_usearch61_ref_chimera_detection(self):
+ """ usearch61 ref chimera detection correctly flags chimeras """
+
+ uchime_fp = join(self.output_dir, "uchime_ref.uchime")
+
+ uchime_fp, app_result =\
+ usearch61_chimera_check_ref(self.tmp_de_novo_chimera_seqs,
+ uchime_ref_fp=uchime_fp,
+ reference_seqs_fp=
+ self.tmp_ref_chimera_seqs,
+ output_dir=self.output_dir,
+ remove_usearch_logs=True)
+
+ uchime_f = open(uchime_fp, "U")
+
+ actual_lines = [line.strip() for line in uchime_f]
+
+ self.assertEqual(actual_lines,
+ self.expected_usearch61_ref_uchime_file)
+
+ self._files_to_remove.append(uchime_fp)
+
+
+class UsearchTests(TestCase):
+
+ def setUp(self):
+ # create the temporary input files
+ self.dna_seqs_1 = dna_seqs_1
+ self.dna_seqs_2 = dna_seqs_usearch
+ self.dna_seqs_3 = dna_seqs_3
+ self.dna_seqs_4 = dna_seqs_4
+ self.protein_ref_seqs1 = protein_ref_seqs1
+ self.ref_database = usearch_ref_seqs1
+ self.dna_seqs_with_abundance = dna_seqs_with_abundance
+ self.de_novo_chimera_seqs = de_novo_chimera_seqs
+ self.dna_seqs_with_dups = dna_seqs_with_dups
+ self.dna_seqs_reference_otu_picking = dna_seqs_reference_otu_picking
+
+ # Expected output files
+ self.uc_lines1 = uc_lines1
+ self.expected_otu_assignments = expected_otu_assignments
+ self.expected_enumerated_fasta = expected_enumerated_fasta
+ self.expected_enumerated_fasta_added_options =\
+ expected_enumerated_fasta_added_options
+ self.expected_clusters_w_abundance_default_settings =\
+ expected_clusters_w_abundance_default_settings
+ self.expected_clusters_w_abundance_low_setting =\
+ expected_clusters_w_abundance_low_setting
+ self.expected_reference_filtered_seqs =\
+ expected_reference_filtered_seqs
+ self.expected_de_novo_chimeras_default =\
+ expected_de_novo_chimeras_default
+ self.expected_de_novo_chimera_filtered_skew11 =\
+ expected_de_novo_chimera_filtered_skew11
+ self.expected_cluster_err_seqs =\
+ expected_cluster_err_seqs
+ self.expected_sorted_by_abundance_no_filter =\
+ expected_sorted_by_abundance_no_filter
+ self.expected_derep_seqs = expected_derep_seqs
+ self.expected_abundance_sort_filtered = expected_abundance_sort_filtered
+ self.expected_len_sorted_seqs = expected_len_sorted_seqs
+ self.expected_combined_dna_seqs_1_seqs_usearch =\
+ expected_combined_dna_seqs_1_seqs_usearch
+ self.retained_chimeras_seqs1 = retained_chimeras_seqs1
+ self.retained_chimeras_seqs2 = retained_chimeras_seqs2
+ self.expected_retained_chimeras_union =\
+ expected_retained_chimeras_union
+ self.expected_retained_chimeras_intersection =\
+ expected_retained_chimeras_intersection
+ self.expected_derep_seqs_full_len =\
+ expected_derep_seqs_full_len
+
+ # Create temporary files for use with unit tests
+
+ self.tmp_dir = '/tmp/'
+
+ f, self.tmp_seq_filepath1 = mkstemp(prefix='UsearchOtuPickerTest_',
+ suffix='.fasta')
+ close(f)
+ seq_file = open(self.tmp_seq_filepath1, 'w')
+ seq_file.write(self.dna_seqs_1)
+ seq_file.close()
+
+ f, self.tmp_seq_filepath2 = mkstemp(prefix='UsearchOtuPickerTest_',
+ suffix='.fasta')
+ close(f)
+ seq_file = open(self.tmp_seq_filepath2, 'w')
+ seq_file.write(self.dna_seqs_2)
+ seq_file.close()
+
+ f, self.dna_seqs3_filepath = mkstemp(prefix='UsearchOtuPickerTest_',
+ suffix='.fasta')
+ close(f)
+ seq_file = open(self.dna_seqs3_filepath, 'w')
+ seq_file.write(self.dna_seqs_3)
+ seq_file.close()
+
+ f, self.dna_seqs4_filepath = mkstemp(prefix='UsearchOtuPickerTest_',
+ suffix='.fasta')
+ close(f)
+ seq_file = open(self.dna_seqs4_filepath, 'w')
+ seq_file.write(self.dna_seqs_4)
+ seq_file.close()
+
+ f, self.protein_ref_seqs1_filepath = \
+ mkstemp(prefix='UsearchOtuPickerTest_', suffix='.fasta')
+ close(f)
+ seq_file = open(self.protein_ref_seqs1_filepath, 'w')
+ seq_file.write(self.protein_ref_seqs1)
+ seq_file.close()
+
+ f, self.tmp_ref_database = mkstemp(prefix='UsearchRefDatabase_',
+ suffix='.fasta')
+ close(f)
+ seq_file = open(self.tmp_ref_database, 'w')
+ seq_file.write(self.ref_database)
+ seq_file.close()
+
+ f, self.tmp_seqs_w_abundance = mkstemp(prefix='UsearchSeqsAbundance_',
+ suffix='.fasta')
+ close(f)
+ seq_file = open(self.tmp_seqs_w_abundance, 'w')
+ seq_file.write(self.dna_seqs_with_abundance)
+ seq_file.close()
+
+ f, self.tmp_de_novo_chimera_seqs = \
+ mkstemp(prefix='UsearchdenovoChimera_', suffix='.fasta')
+ close(f)
+ seq_file = open(self.tmp_de_novo_chimera_seqs, 'w')
+ seq_file.write(self.de_novo_chimera_seqs)
+ seq_file.close()
+
+ f, self.tmp_dna_seqs_with_dups = mkstemp(prefix='UsearchDupDNASeqs_',
+ suffix='.fasta')
+ close(f)
+ seq_file = open(self.tmp_dna_seqs_with_dups, 'w')
+ seq_file.write(self.dna_seqs_with_dups)
+ seq_file.close()
+
+ f, self.tmp_retained_chimeras_seqs1 = \
+ mkstemp(prefix="UsearchRetainedChimeras1_", suffix=".fasta")
+ close(f)
+ seq_file = open(self.tmp_retained_chimeras_seqs1, 'w')
+ seq_file.write(self.retained_chimeras_seqs1)
+ seq_file.close()
+
+ f, self.tmp_retained_chimeras_seqs2 = \
+ mkstemp(prefix="UsearchRetainedChimeras1_", suffix=".fasta")
+ close(f)
+ seq_file = open(self.tmp_retained_chimeras_seqs2, 'w')
+ seq_file.write(self.retained_chimeras_seqs2)
+ seq_file.close()
+
+ f, self.tmp_dna_seqs_ref_otu_picking = \
+ mkstemp(prefix="UsearchRefOtuPicking_", suffix=".fasta")
+ close(f)
+ seq_file = open(self.tmp_dna_seqs_ref_otu_picking, "w")
+ seq_file.write(self.dna_seqs_reference_otu_picking)
+ seq_file.close()
+
+ self._files_to_remove =\
+ [self.tmp_seq_filepath1, self.tmp_seq_filepath2,
+ self.tmp_ref_database, self.tmp_seqs_w_abundance,
+ self.tmp_de_novo_chimera_seqs, self.tmp_dna_seqs_with_dups,
+ self.tmp_retained_chimeras_seqs1, self.tmp_retained_chimeras_seqs2,
+ self.tmp_dna_seqs_ref_otu_picking, self.dna_seqs3_filepath,
+ self.protein_ref_seqs1_filepath, self.dna_seqs4_filepath]
+
+ self._dirs_to_remove = []
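+
+ # The nine mkstemp/write/close blocks above repeat one pattern; a
+ # hypothetical helper (a sketch, not part of this module) could
+ # express it in one place:
+ #
+ # def _write_tmp_fasta(self, data, prefix):
+ #     """Write data to a new temp fasta file and return its path."""
+ #     f, fp = mkstemp(prefix=prefix, suffix='.fasta')
+ #     close(f)
+ #     with open(fp, 'w') as seq_file:
+ #         seq_file.write(data)
+ #     self._files_to_remove.append(fp)
+ #     return fp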
+
+ def tearDown(self):
+ remove_files(self._files_to_remove)
+ if self._dirs_to_remove:
+ for curr_dir in self._dirs_to_remove:
+ rmtree(curr_dir)
+
+ def test_usearch_qf(self):
+ """ Main program loop test, with default parameters """
+
+ # cluster size filtering set to 1 instead of default 4
+ clusters, failures = usearch_qf(self.tmp_seq_filepath2,
+ output_dir=self.tmp_dir,
+ db_filepath=self.tmp_ref_database,
+ minsize=1,
+ remove_usearch_logs=True,
+ chimeras_retention='intersection')
+
+ expected_clusters = {'1': ['Solemya', 'Solemya_seq2'],
+ '0': ['usearch_ecoli_seq', 'usearch_ecoli_seq2']}
+ expected_failures = ['chimera']
+
+ self.assertEqual(clusters, expected_clusters)
+ self.assertEqual(failures, expected_failures)
+
+ def test_usearch_qf_minlen(self):
+ """ Main program loop test, with longer minlen """
+
+ # cluster size filtering set to 1 instead of default 4
+ clusters, failures = usearch_qf(self.tmp_seq_filepath2,
+ output_dir=self.tmp_dir,
+ db_filepath=self.tmp_ref_database,
+ minsize=1,
+ remove_usearch_logs=True,
+ chimeras_retention='intersection',
+ minlen=110)
+
+ expected_clusters = {'0': ['usearch_ecoli_seq', 'usearch_ecoli_seq2']}
+ expected_failures = ['Solemya', 'Solemya_seq2', 'chimera']
+
+ self.assertEqual(clusters, expected_clusters)
+ self.assertEqual(failures, expected_failures)
+
+ def test_usearch_qf_reference_otu_picking(self):
+ """ Main program loop test, with reference + new clusters """
+
+ # cluster size filtering set to 1 instead of default 4
+ clusters, failures = usearch_qf(self.tmp_dna_seqs_ref_otu_picking,
+ output_dir=self.tmp_dir,
+ refseqs_fp=self.tmp_ref_database,
+ reference_chimera_detection=False,
+ minsize=1,
+ remove_usearch_logs=True,
+ suppress_new_clusters=False)
+
+ # Will cluster everything, including RandomCrap, as new clusters
+ # are allowed.
+ expected_clusters = {'1': ['Solemya', 'Solemya_seq2'],
+ '0': ['usearch_ecoli_seq', 'usearch_ecoli_seq2'],
+ '2': ['RandomCrap']}
+ expected_failures = []
+
+ self.assertEqual(clusters, expected_clusters)
+ self.assertEqual(failures, expected_failures)
+
+ def test_usearch_qf_reference_otu_picking_no_new_clusters(self):
+ """ Main program loop test, with reference and no new clusters """
+
+ # cluster size filtering set to 1 instead of default 4
+ clusters, failures = usearch_qf(self.tmp_dna_seqs_ref_otu_picking,
+ output_dir=self.tmp_dir,
+ refseqs_fp=self.tmp_ref_database,
+ reference_chimera_detection=False,
+ minsize=1,
+ remove_usearch_logs=True,
+ suppress_new_clusters=True)
+
+ # Will cluster everything but RandomCrap, as no new clusters are allowed.
+ expected_clusters = {'L07864': ['Solemya', 'Solemya_seq2'],
+ 'ref1': ['usearch_ecoli_seq', 'usearch_ecoli_seq2']}
+ expected_failures = ['RandomCrap']
+
+ self.assertEqual(clusters, expected_clusters)
+ self.assertEqual(failures, expected_failures)
+
+ def test_usearch_qf_no_ref_database(self):
+ """ Main program loop with no reference chimera testing """
+
+ # cluster size filtering set to 1 instead of default 4
+ clusters, failures = usearch_qf(self.tmp_seq_filepath2,
+ output_dir=self.tmp_dir,
+ reference_chimera_detection=False,
+ minsize=1,
+ remove_usearch_logs=True)
+
+ # Chimera sequence should not be detected without reference test.
+ expected_clusters = {'1': ['Solemya', 'Solemya_seq2'],
+ '0': ['usearch_ecoli_seq', 'usearch_ecoli_seq2'],
+ '2': ['chimera']}
+
+ expected_failures = []
+
+ self.assertEqual(clusters, expected_clusters)
+ self.assertEqual(failures, expected_failures)
+
+ def test_usearch_qf_union(self):
+ """ Main program loop with union nonchimera retention """
+
+ # cluster size filtering set to 1 instead of default 4
+ clusters, failures = usearch_qf(self.tmp_seq_filepath2,
+ output_dir=self.tmp_dir,
+ reference_chimera_detection=False,
+ minsize=1,
+ remove_usearch_logs=True,
+ chimeras_retention='union')
+
+ # Chimera sequence retained as passes de novo test
+ expected_clusters = {'1': ['Solemya', 'Solemya_seq2'],
+ '0': ['usearch_ecoli_seq', 'usearch_ecoli_seq2'],
+ '2': ['chimera']}
+
+ expected_failures = []
+
+ self.assertEqual(clusters, expected_clusters)
+ self.assertEqual(failures, expected_failures)
+
+ def test_usearch_qf_disabled_filters(self):
+ """ Returns expected clustering with no filtering """
+
+ # cluster size filtering set to 1 instead of default 4
+ clusters, failures = usearch_qf(self.tmp_seq_filepath2,
+ output_dir=self.tmp_dir,
+ de_novo_chimera_detection=False,
+ reference_chimera_detection=False,
+ cluster_size_filtering=False,
+ remove_usearch_logs=True)
+
+ # Chimera sequence should not be detected without reference test.
+ expected_clusters = {'1': ['Solemya', 'Solemya_seq2'],
+ '0': ['usearch_ecoli_seq', 'usearch_ecoli_seq2'],
+ '2': ['chimera']}
+
+ expected_failures = []
+
+ self.assertEqual(clusters, expected_clusters)
+ self.assertEqual(failures, expected_failures)
+
+ def test_usearch_qf_generates_logs(self):
+ """ Generates expected log files """
+ curr_output_dir = mkdtemp(dir=self.tmp_dir)
+
+ self._dirs_to_remove.append(curr_output_dir)
+
+ # cluster size filtering set to 1 instead of default 4
+ clusters, failures = usearch_qf(self.tmp_seq_filepath2,
+ output_dir=curr_output_dir,
+ db_filepath=self.tmp_ref_database,
+ minsize=1,
+ remove_usearch_logs=False,
+ chimeras_retention='intersection')
+
+ expected_clusters = {'1': ['Solemya', 'Solemya_seq2'],
+ '0': ['usearch_ecoli_seq', 'usearch_ecoli_seq2']}
+ expected_failures = ['chimera']
+
+ self.assertEqual(clusters, expected_clusters)
+ self.assertEqual(failures, expected_failures)
+
+ # Only checking for creation of files, as file contents contain
+ # tmp file names.
+ expected_log_names = ['assign_reads_to_otus.log',
+ 'uchime_de_novo_chimera_filtering.log',
+ 'derep.log',
+ 'uchime_reference_chimera_filtering.log',
+ 'minsize_0_abundance_sort.log',
+ 'usearch_cluster_err_corrected.log',
+ 'minsize_1_abundance_sort.log',
+ 'usearch_cluster_seqs.log',
+ 'sortlen.log']
+
+ actual_logs =\
+ [basename(curr_file)
+ for curr_file in glob(curr_output_dir + "/*.*")]
+
+ self.assertItemsEqual(actual_logs, expected_log_names)
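+
+ # The log names above appear to trace the usearch_qf pipeline stages:
+ # length sort, dereplication, abundance sorts, error-correction
+ # clustering, de novo and reference chimera filtering, final
+ # clustering, and read assignment (ordering inferred from the log
+ # names, not asserted by this test).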
+
+ def test_concatenate_fastas(self):
+ """ Properly concatenates two fasta files """
+
+ f, out_f = mkstemp(prefix='UsearchConcatFileTest_', suffix='.fasta')
+ close(f)
+
+ actual_concatenated_seqs = concatenate_fastas(self.tmp_seq_filepath1,
+ self.tmp_seq_filepath2, out_f)
+
+ self._files_to_remove.append(out_f)
+
+ actual_lines =\
+ [line.strip() for line in open(actual_concatenated_seqs, "U")]
+
+ self.assertEqual(actual_lines,
+ expected_combined_dna_seqs_1_seqs_usearch)
+
+ def test_assign_reads_to_otus(self):
+ """ Properly assigns reads back to original ID """
+
+ app_result, output_filepath =\
+ assign_reads_to_otus(original_fasta=self.tmp_ref_database,
+ filtered_fasta=self.tmp_seq_filepath2,
+ remove_usearch_logs=True,
+ working_dir=self.tmp_dir)
+
+ self._files_to_remove.append(output_filepath)
+
+ # Stripping off the first two lines (the command line, which contains
+ # tmp file names, and the version line), retaining the actual results.
+ actual_assignments =\
+ [line.strip() for line in open(output_filepath, "U")][2:]
+
+ self.assertEqual(actual_assignments, self.expected_otu_assignments)
+
+ def test_enumerate_otus(self):
+ """ Enumerates OTUs properly """
+
+ output_filepath = enumerate_otus(self.tmp_seq_filepath1)
+
+ self._files_to_remove.append(output_filepath)
+
+ actual_fasta = [line.strip() for line in open(output_filepath, "U")]
+
+ self.assertEqual(actual_fasta, self.expected_enumerated_fasta)
+
+ def test_enumerate_otus_added_options(self):
+ """ Enumerates with all options properly """
+
+ output_filepath = enumerate_otus(self.tmp_seq_filepath1,
+ label_prefix="Big",
+ label_suffix="Ern",
+ retain_label_as_comment=True,
+ count_start=255)
+
+ self._files_to_remove.append(output_filepath)
+
+ actual_fasta = [line.strip() for line in open(output_filepath, "U")]
+
+ self.assertEqual(actual_fasta,
+ self.expected_enumerated_fasta_added_options)
+
+ def test_usearch_cluster_seqs(self):
+ """ Clusters sequences correctly """
+
+ # clusters all seqs with default 97% identity
+ app_result, output_filepath =\
+ usearch_cluster_seqs(self.tmp_seqs_w_abundance,
+ save_intermediate_files=False,
+ remove_usearch_logs=True,
+ percent_id=0.97,
+ working_dir=self.tmp_dir)
+
+ self._files_to_remove.append(output_filepath)
+
+ actual_clusters = [line.strip() for line in open(output_filepath, "U")]
+
+ self.assertEqual(actual_clusters,
+ self.expected_clusters_w_abundance_default_settings)
+
+ def test_usearch_cluster_seqs_high_identity(self):
+ """ Clusters sequences correctly """
+
+ # Should get two clusters with 99.9% identity
+ app_result, output_filepath =\
+ usearch_cluster_seqs(self.tmp_seqs_w_abundance,
+ save_intermediate_files=False,
+ remove_usearch_logs=True,
+ percent_id=0.999,
+ working_dir=self.tmp_dir)
+
+ self._files_to_remove.append(output_filepath)
+
+ actual_clusters = [line.strip() for line in open(output_filepath, "U")]
+
+ self.assertEqual(actual_clusters,
+ self.expected_clusters_w_abundance_low_setting)
+
+ def test_usearch_chimera_filter_ref_based(self):
+ """ Properly detects chimeras against reference database """
+
+ app_result, output_filepath =\
+ usearch_chimera_filter_ref_based(self.tmp_seq_filepath2,
+ self.tmp_ref_database,
+ remove_usearch_logs=True,
+ working_dir=self.tmp_dir)
+
+ self._files_to_remove.append(output_filepath)
+
+ actual_filtered_chimeras =\
+ [line.strip() for line in open(output_filepath, "U")]
+
+ self.assertEqual(actual_filtered_chimeras,
+ self.expected_reference_filtered_seqs)
+
+ def test_usearch_chimera_filter_de_novo(self):
+ """ Properly detects de novo chimeras """
+
+ app_result, output_filepath =\
+ usearch_chimera_filter_de_novo(self.tmp_de_novo_chimera_seqs,
+ remove_usearch_logs=True,
+ abundance_skew=2,
+ working_dir=self.tmp_dir)
+
+ self._files_to_remove.append(output_filepath)
+
+ actual_seqs = \
+ [line.strip() for line in open(output_filepath, "U")]
+
+ self.assertEqual(actual_seqs, self.expected_de_novo_chimeras_default)
+
+ def test_usearch_chimera_filter_de_novo_abundance_skew(self):
+ """ Properly detects de novo chimeras with skew changes """
+
+ app_result, output_filepath =\
+ usearch_chimera_filter_de_novo(self.tmp_de_novo_chimera_seqs,
+ remove_usearch_logs=True,
+ abundance_skew=11,
+ working_dir=self.tmp_dir)
+
+ self._files_to_remove.append(output_filepath)
+
+ actual_seqs = \
+ [line.strip() for line in open(output_filepath, "U")]
+
+ self.assertEqual(actual_seqs,
+ self.expected_de_novo_chimera_filtered_skew11)
+
+ def test_usearch_cluster_error_correction(self):
+ """ Properly clusters seqs for chimera testing/filtering """
+
+ # clusters all seqs with default 97% identity
+ app_result, output_filepath =\
+ usearch_cluster_error_correction(self.tmp_seqs_w_abundance,
+ save_intermediate_files=False,
+ remove_usearch_logs=True,
+ percent_id_err=0.97,
+ working_dir=self.tmp_dir)
+
+ self._files_to_remove.append(output_filepath)
+
+ actual_clusters = [line.strip() for line in open(output_filepath, "U")]
+
+ self.assertEqual(actual_clusters,
+ self.expected_cluster_err_seqs)
+
+ def test_usearch_sort_by_abundance(self):
+ """ Properly sorts fasta by abundance """
+
+ app_result, output_filepath =\
+ usearch_sort_by_abundance(self.tmp_de_novo_chimera_seqs,
+ remove_usearch_logs=True,
+ working_dir=self.tmp_dir)
+
+ self._files_to_remove.append(output_filepath)
+
+ actual_seqs = [line.strip() for line in open(output_filepath, "U")]
+
+ self.assertEqual(actual_seqs,
+ self.expected_sorted_by_abundance_no_filter)
+
+ def test_usearch_sort_by_abundance_filter(self):
+ """ Properly sorts fasta by abundance, filters low count otus """
+
+ app_result, output_filepath =\
+ usearch_sort_by_abundance(self.tmp_de_novo_chimera_seqs,
+ remove_usearch_logs=True,
+ minsize=40,
+ working_dir=self.tmp_dir)
+
+ self._files_to_remove.append(output_filepath)
+
+ actual_seqs = [line.strip() for line in open(output_filepath, "U")]
+
+ self.assertEqual(actual_seqs,
+ self.expected_abundance_sort_filtered)
+
+ def test_usearch_dereplicate_exact_subseqs(self):
+ """ Properly dereplicates fasta file """
+
+ app_result, output_filepath =\
+ usearch_dereplicate_exact_subseqs(self.tmp_dna_seqs_with_dups,
+ remove_usearch_logs=True,
+ working_dir=self.tmp_dir)
+
+ self._files_to_remove.append(output_filepath)
+
+ actual_seqs = [line.strip() for line in open(output_filepath, "U")]
+
+ self.assertEqual(actual_seqs, self.expected_derep_seqs)
+
+ def test_usearch_dereplicate_exact_seqs(self):
+ """ Properly dereplicates fasta file """
+
+ app_result, output_filepath =\
+ usearch_dereplicate_exact_seqs(self.tmp_dna_seqs_with_dups,
+ remove_usearch_logs=True,
+ working_dir=self.tmp_dir)
+
+ self._files_to_remove.append(output_filepath)
+
+ actual_seqs = [line.strip() for line in open(output_filepath, "U")]
+
+ self.assertEqual(actual_seqs, self.expected_derep_seqs_full_len)
+
+ def test_usearch_fasta_sort_from_filepath(self):
+ """ Properly sorts fasta according to seq length """
+
+ app_result, output_filepath =\
+ usearch_fasta_sort_from_filepath(self.tmp_seq_filepath2,
+ remove_usearch_logs=True,
+ working_dir=self.tmp_dir)
+
+ self._files_to_remove.append(output_filepath)
+
+ actual_seqs = [line.strip() for line in open(output_filepath, "U")]
+
+ self.assertEqual(actual_seqs, self.expected_len_sorted_seqs)
+
+ def test_clusters_from_blast_uc_file(self):
+ """ clusters_from_uc_file functions as expected """
+
+ expected_clusters = {'19': ['PC.634_4'], '42': ['PC.test2_1',
+ 'PC.test1_2', 'PC.634_3'], '6': ['PC.269_5']}
+ expected_failures = ['PC.481_6']
+
+ self.assertEqual(clusters_from_blast_uc_file(self.uc_lines1),
+ (expected_clusters, expected_failures))
+
+ def test_get_retained_chimeras_union(self):
+ """ Properly returns union of two fastas """
+
+ f, out_f = mkstemp(prefix='UsearchUnionTest_', suffix='.fasta')
+ close(f)
+
+ actual_out_fp = get_retained_chimeras(self.tmp_retained_chimeras_seqs1,
+ self.tmp_retained_chimeras_seqs2, out_f, chimeras_retention='union')
+
+ self._files_to_remove.append(out_f)
+
+ actual_out_f = [line.strip() for line in open(actual_out_fp, "U")]
+
+ self.assertEqual(actual_out_f, self.expected_retained_chimeras_union)
+
+ def test_get_retained_chimeras_intersection(self):
+ """ Properly returns intersection of two fastas """
+
+ f, out_f = mkstemp(prefix='UsearchIntersectionTest_', suffix='.fasta')
+ close(f)
+
+ actual_out_fp = get_retained_chimeras(self.tmp_retained_chimeras_seqs1,
+ self.tmp_retained_chimeras_seqs2, out_f,
+ chimeras_retention='intersection')
+
+ self._files_to_remove.append(out_f)
+
+ actual_out_f = [line.strip() for line in open(actual_out_fp, "U")]
+
+ self.assertEqual(actual_out_f,
+ self.expected_retained_chimeras_intersection)
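+
+ # A minimal sketch of the retention logic the two tests above exercise
+ # (illustrative; the real implementation is get_retained_chimeras):
+ #
+ # def retained_labels(labels1, labels2, chimeras_retention='union'):
+ #     """Combine two collections of retained sequence labels."""
+ #     if chimeras_retention == 'union':
+ #         return set(labels1) | set(labels2)
+ #     return set(labels1) & set(labels2)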
+
+ def test_assign_dna_reads_to_protein_database(self):
+ """assign_dna_reads_to_protein_database wrapper functions as expected
+ """
+ output_dir = mkdtemp(dir=self.tmp_dir)
+ self._dirs_to_remove.append(output_dir)
+ output_fp = join(output_dir, 'out.uc')
+ assign_dna_reads_to_protein_database(self.dna_seqs3_filepath,
+ self.protein_ref_seqs1_filepath,
+ output_fp,
+ temp_dir=self.tmp_dir)
+
+ self.assertTrue(exists(output_fp))
+ self.assertTrue(exists(output_fp.replace('.uc', '.bl6')))
+
+ # confirm that the clusters look like what we expect
+ expected_clusters = sorted(
+ [['eco:b0015'], ['eco:b0122', 'eco:b0122-like']])
+ actual_clusters = sorted(clusters_from_blast_uc_file(
+ open(output_fp))[0].values())
+ self.assertEqual(actual_clusters, expected_clusters)
+
+ def test_assign_dna_reads_to_protein_database_alt_params(self):
+ """assign_dna_reads_to_protein_database wrapper functions with alt params
+ """
+ output_dir = mkdtemp(dir=self.tmp_dir)
+ self._dirs_to_remove.append(output_dir)
+ output_fp = join(output_dir, 'out.uc')
+ assign_dna_reads_to_protein_database(self.dna_seqs3_filepath,
+ self.protein_ref_seqs1_filepath,
+ output_fp,
+ temp_dir=self.tmp_dir,
+ params={'--id': 1.0})
+
+ self.assertTrue(exists(output_fp))
+ self.assertTrue(exists(output_fp.replace('.uc', '.bl6')))
+
+ # confirm that the clusters look like what we expect
+ expected_clusters = sorted([['eco:b0015'], ['eco:b0122']])
+ actual_clusters = sorted(clusters_from_blast_uc_file(
+ open(output_fp))[0].values())
+ self.assertEqual(actual_clusters, expected_clusters)
+
+ def test_assign_dna_reads_to_dna_database(self):
+ """assign_dna_reads_to_protein_database wrapper functions as expected
+ """
+ output_dir = mkdtemp(dir=self.tmp_dir)
+ self._dirs_to_remove.append(output_dir)
+ output_fp = join(output_dir, 'out.uc')
+ assign_dna_reads_to_dna_database(self.dna_seqs3_filepath,
+ self.dna_seqs4_filepath,
+ output_fp,
+ temp_dir=self.tmp_dir)
+
+ self.assertTrue(exists(output_fp))
+ self.assertTrue(exists(output_fp.replace('.uc', '.bl6')))
+
+ # confirm that the clusters look like what we expect
+ expected_clusters = sorted(
+ [['eco:b0015'], ['eco:b0122', 'eco:b0122-like']])
+ actual_clusters = sorted(clusters_from_blast_uc_file(
+ open(output_fp))[0].values())
+ self.assertEqual(actual_clusters, expected_clusters)
+
+# Long strings for test files, output, etc.
+# *************************************************
+
+retained_chimeras_seqs1 = """>seq1
+ACAGGCC
+>seq2
+ACAGGCCCCC
+>seq3
+TTATCCATT"""
+
+retained_chimeras_seqs2 = """>seq3
+ACAGGCC
+>seq4
+ACAGGCCCCC
+>seq5
+TTATCCATT"""
+
+dna_seqs_1 = """>uclust_test_seqs_0 some comment0
+AACCCCCACGGTGGATGCCACACGCCCCATACAAAGGGTAGGATGCTTAAGACACATCGCGTCAGGTTTGTGTCAGGCCT
+>uclust_test_seqs_1 some comment1
+ACCCACACGGTGGATGCAACAGATCCCATACACCGAGTTGGATGCTTAAGACGCATCGCGTGAGTTTTGCGTCAAGGCT
+>uclust_test_seqs_2 some comment2
+CCCCCACGGTGGCAGCAACACGTCACATACAACGGGTTGGATTCTAAAGACAAACCGCGTCAAAGTTGTGTCAGAACT
+>uclust_test_seqs_3 some comment3
+CCCCACGGTAGCTGCAACACGTCCCATACCACGGGTAGGATGCTAAAGACACATCGGGTCTGTTTTGTGTCAGGGCT
+>uclust_test_seqs_4 some comment4
+GCCACGGTGGGTACAACACGTCCACTACATCGGCTTGGAAGGTAAAGACACGTCGCGTCAGTATTGCGTCAGGGCT
+>uclust_test_seqs_5 some comment4_again
+CCGCGGTAGGTGCAACACGTCCCATACAACGGGTTGGAAGGTTAAGACACAACGCGTTAATTTTGTGTCAGGGCA
+>uclust_test_seqs_6 some comment6
+CGCGGTGGCTGCAAGACGTCCCATACAACGGGTTGGATGCTTAAGACACATCGCAACAGTTTTGAGTCAGGGCT
+>uclust_test_seqs_7 some comment7
+ACGGTGGCTACAAGACGTCCCATCCAACGGGTTGGATACTTAAGGCACATCACGTCAGTTTTGTGTCAGAGCT
+>uclust_test_seqs_8 some comment8
+CGGTGGCTGCAACACGTGGCATACAACGGGTTGGATGCTTAAGACACATCGCCTCAGTTTTGTGTCAGGGCT
+>uclust_test_seqs_9 some comment9
+GGTGGCTGAAACACATCCCATACAACGGGTTGGATGCTTAAGACACATCGCATCAGTTTTATGTCAGGGGA"""
+
+dna_seqs_1_subset = """>uclust_test_seqs_0 some comment0
+AACCCCCACGGTGGATGCCACACGCCCCATACAAAGGGTAGGATGCTTAAGACACATCGCGTCAGGTTTGTGTCAGGCCT
+>uclust_test_seqs_1 some comment1
+ACCCACACGGTGGATGCAACAGATCCCATACACCGAGTTGGATGCTTAAGACGCATCGCGTGAGTTTTGCGTCAAGGCT
+>uclust_test_seqs_2 some comment2
+CCCCCACGGTGGCAGCAACACGTCACATACAACGGGTTGGATTCTAAAGACAAACCGCGTCAAAGTTGTGTCAGAACT
+>uclust_test_seqs_3 some comment3
+CCCCACGGTAGCTGCAACACGTCCCATACCACGGGTAGGATGCTAAAGACACATCGGGTCTGTTTTGTGTCAGGGCT"""
+
+dna_seqs_3 = """>eco:b0001 thrL; thr operon leader peptide; K08278 thr operon leader peptide (N)
+atgaaacgcattagcaccaccattaccaccaccatcaccattaccacaggtaacggtgcg
+ggctga
+>eco:b0015 dnaJ; chaperone Hsp40, co-chaperone with DnaK; K03686 molecular chaperone DnaJ (N)
+atggctaagcaagattattacgagattttaggcgtttccaaaacagcggaagagcgtgaa
+atcagaaaggcctacaaacgcctggccatgaaataccacccggaccgtaaccagggtgac
+aaagaggccgaggcgaaatttaaagagatcaaggaagcttatgaagttctgaccgactcg
+caaaaacgtgcggcatacgatcagtatggtcatgctgcgtttgagcaaggtggcatgggc
+ggcggcggttttggcggcggcgcagacttcagcgatatttttggtgacgttttcggcgat
+atttttggcggcggacgtggtcgtcaacgtgcggcgcgcggtgctgatttacgctataac
+atggagctcaccctcgaagaagctgtacgtggcgtgaccaaagagatccgcattccgact
+ctggaagagtgtgacgtttgccacggtagcggtgcaaaaccaggtacacagccgcagact
+tgtccgacctgtcatggttctggtcaggtgcagatgcgccagggattcttcgctgtacag
+cagacctgtccacactgtcagggccgcggtacgctgatcaaagatccgtgcaacaaatgt
+catggtcatggtcgtgttgagcgcagcaaaacgctgtccgttaaaatcccggcaggggtg
+gacactggagaccgcatccgtcttgcgggcgaaggtgaagcgggcgagcatggcgcaccg
+gcaggcgatctgtacgttcaggttcaggttaaacagcacccgattttcgagcgtgaaggc
+aacaacctgtattgcgaagtcccgatcaacttcgctatggcggcgctgggtggcgaaatc
+gaagtaccgacccttgatggtcgcgtcaaactgaaagtgcctggcgaaacccagaccggt
+aagctattccgtatgcgcggtaaaggcgtcaagtctgtccgcggtggcgcacagggtgat
+ttgctgtgccgcgttgtcgtcgaaacaccggtaggcctgaacgaaaggcagaaacagctg
+ctgcaagagctgcaagaaagcttcggtggcccaaccggcgagcacaacagcccgcgctca
+aagagcttctttgatggtgtgaagaagttttttgacgacctgacccgctaa
+>eco:b0122 yacC; conserved protein, PulS_OutS family (N)
+atgaagacgtttttcagaacagtgttattcggcagcctgatggccgtctgcgcaaacagt
+tacgcgctcagcgagtctgaagccgaagatatggccgatttaacggcagtttttgtcttt
+ctgaagaacgattgtggttaccagaacttacctaacgggcaaattcgtcgcgcactggtc
+tttttcgctcagcaaaaccagtgggacctcagtaattacgacaccttcgacatgaaagcc
+ctcggtgaagacagctaccgcgatctcagcggcattggcattcccgtcgctaaaaaatgc
+aaagccctggcccgcgattccttaagcctgcttgcctacgtcaaataa
+>eco:b0122-like
+atgaagaaaattttcagaacagtgttattcggcagcctgatggccgtctgcgcaaacagt
+tacgcgctcagcgagtctgaagccgaagatatggccgatttaacggcagtttttgtcttt
+ctgaagaacgattgtggttaccagaacttacctaacgggcaaattcgtcgcgcactggtc
+tttttcgctcagcaaaaccagtgggacctcagtaattacgacaccttcgacatgaaagcc
+ctcggtgaagacagctaccgcgatctcagcggcattggcattcccgtcgctaaaaaatgc
+aaagccctggcccgcgattccttaagcctgcttgcctacgtcaaatcc"""
+
+dna_seqs_4 = """>eco:b0015 dnaJ; chaperone Hsp40, co-chaperone with DnaK; K03686 molecular chaperone DnaJ (N)
+atggctaagcaagattattacgagattttaggcgtttccaaaacagcggaagagcgtgaa
+atcagaaaggcctacaaacgcctggccatgaaataccacccggaccgtaaccagggtgac
+aaagaggccgaggcgaaatttaaagagatcaaggaagcttatgaagttctgaccgactcg
+caaaaacgtgcggcatacgatcagtatggtcatgctgcgtttgagcaaggtggcatgggc
+ggcggcggttttggcggcggcgcagacttcagcgatatttttggtgacgttttcggcgat
+atttttggcggcggacgtggtcgtcaacgtgcggcgcgcggtgctgatttacgctataac
+atggagctcaccctcgaagaagctgtacgtggcgtgaccaaagagatccgcattccgact
+ctggaagagtgtgacgtttgccacggtagcggtgcaaaaccaggtacacagccgcagact
+tgtccgacctgtcatggttctggtcaggtgcagatgcgccagggattcttcgctgtacag
+cagacctgtccacactgtcagggccgcggtacgctgatcaaagatccgtgcaacaaatgt
+catggtcatggtcgtgttgagcgcagcaaaacgctgtccgttaaaatcccggcaggggtg
+gacactggagaccgcatccgtcttgcgggcgaaggtgaagcgggcgagcatggcgcaccg
+gcaggcgatctgtacgttcaggttcaggttaaacagcacccgattttcgagcgtgaaggc
+aacaacctgtattgcgaagtcccgatcaacttcgctatggcggcgctgggtggcgaaatc
+gaagtaccgacccttgatggtcgcgtcaaactgaaagtgcctggcgaaacccagaccggt
+aagctattccgtatgcgcggtaaaggcgtcaagtctgtccgcggtggcgcacagggtgat
+ttgctgtgccgcgttgtcgtcgaaacaccggtaggcctgaacgaaaggcagaaacagctg
+ctgcaagagctgcaagaaagcttcggtggcccaaccggcgagcacaacagcccgcgctca
+aagagcttctttgatggtgtgaagaagttttttgacgacctgacccgctaa
+>eco:b0122 yacC; conserved protein, PulS_OutS family (N)
+atgaagacgtttttcagaacagtgttattcggcagcctgatggccgtctgcgcaaacagt
+tacgcgctcagcgagtctgaagccgaagatatggccgatttaacggcagtttttgtcttt
+ctgaagaacgattgtggttaccagaacttacctaacgggcaaattcgtcgcgcactggtc
+tttttcgctcagcaaaaccagtgggacctcagtaattacgacaccttcgacatgaaagcc
+ctcggtgaagacagctaccgcgatctcagcggcattggcattcccgtcgctaaaaaatgc
+aaagccctggcccgcgattccttaagcctgcttgcctacgtcaaataa
+>eco:b0122-like
+atgaagacgtttttcagaacagtgttattcggcagcctgatggccgtctgcgcaaacagt
+tacgcgctcagcgagtctgaagccgaagatatggccgatttaacggcagtttttgtcttt
+ctgaagaacgattgtggttaccagaacttacctaacgggcaaattcgtcgcgcactggtc
+tttttcgctcagcaaaaccagtgggacctcagtaattacgacaccttcgacatgaaagcc
+ctcggtgaagacagctaccgcgatctcagcggcattggcattcccgtcgctaaaaaatgc
+aaagccctggcccgcgattccttaagcctgcttgcctacgtcaaatcc"""
+
+protein_ref_seqs1 = """>eco:b0001 thrL; thr operon leader peptide; K08278 thr operon leader peptide (A)
+MKRISTTITTTITITTGNGAG
+>eco:b0015 dnaJ; chaperone Hsp40, co-chaperone with DnaK; K03686 molecular chaperone DnaJ (A)
+MAKQDYYEILGVSKTAEEREIRKAYKRLAMKYHPDRNQGDKEAEAKFKEIKEAYEVLTDS
+QKRAAYDQYGHAAFEQGGMGGGGFGGGADFSDIFGDVFGDIFGGGRGRQRAARGADLRYN
+MELTLEEAVRGVTKEIRIPTLEECDVCHGSGAKPGTQPQTCPTCHGSGQVQMRQGFFAVQ
+QTCPHCQGRGTLIKDPCNKCHGHGRVERSKTLSVKIPAGVDTGDRIRLAGEGEAGEHGAP
+AGDLYVQVQVKQHPIFEREGNNLYCEVPINFAMAALGGEIEVPTLDGRVKLKVPGETQTG
+KLFRMRGKGVKSVRGGAQGDLLCRVVVETPVGLNERQKQLLQELQESFGGPTGEHNSPRS
+KSFFDGVKKFFDDLTR
+>eco:b0015:rep
+MAKQDYYEILGVSKTAEEREIRKAYKRLAMKYHPDRNQGDKEAEAKFKEIKEAYEVLTDS
+QKRAAYDQYGHAAFEQGGMGGGGFGGGADFSDIFGDVFGDIFGGGRGRQRAARGADLRYN
+MELTLEEAVRGVTKEIRIPTLEECDVCHGSGAKPGTQPQTCPTCHGSGQVQMRQGFFAVQ
+QTCPHCQGRGTLIKDPCNKCHGHGRVERSKTLSVKIPAGVDTGDRIRLAGEGEAGEHGAP
+AGDLYVQVQVKQHPIFEREGNNLYCEVPINFAMAALGGEIEVPTLDGRVKLKVPGETQTG
+KLFRMRGKGVKSVRGGAQGDLLCRVVVETPVGLNERQKQLLQELQESFGGPTGEHNSPRS
+KSFFDGVKKFFDDLTR
+>eco:b0122 yacC; conserved protein, PulS_OutS family (A)
+MKTFFRTVLFGSLMAVCANSYALSESEAEDMADLTAVFVFLKNDCGYQNLPNGQIRRALV
+FFAQQNQWDLSNYDTFDMKALGEDSYRDLSGIGIPVAKKCKALARDSLSLLAYVK"""
+
+usearch_ref_seqs1 = """>ref1 ecoli sequence
+CGCGTGTATGAAGAAGGCCTTCGGGTTGTAAAGTACTTTCAGCGGGGAGGAGGGAGTAAAGTTAATACCTTTGCTCATTGACGTTACCCGCAGAAGAAGCACCGGCTAACTCCGTGCCAGCAGCCGCGGTAATACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGCACGCAGGCGGTTTGTTAAGTCA
+>EU199232 1 1236 Bacteria/Deltaproteobacteria/Desulfurella - Hippea/uncultured
+TACGCGCGGAAATCGAGCGAGATTGGGAACGCAAGTTCCTGAGTATTGCGGCGAACGGGTGAGTAAGACGTGGGTGATCTACCCCTAGGGTGGGAATAACCCGGGGAAACCCGGGCTAATACCGAATAAGACCACAGGAGGCGACTCCAGAGGGTCAAAGGGAGCCTTGGCCTCCCCC
+>L07864 1 1200 Bacteria/Beta Gammaproteobacteria/Solemya symbiont
+GGCTCAGATTGAACGCTGGCGGCATGCCTAACACATGCAAGTCGAACGGTAACAGGCGGAGCTTGCTCTGCGCTGACGAGTGGCGGACGGGTGAGTAATGCATGGGAATCTGCCATATAGTGGGGGACAACTGGGGAAACCCAGGCTAATACCGCATAATCTCTACGGAGGAAAGGCTTC
+"""
+
+dna_seqs_usearch = """>usearch_ecoli_seq
+CGCGTGTATGAAGAAGGCCTTCGGGTTGTAAAGTACTTTCAGCGGGGAGGAGGGAGTAAAGTTAATACCTTTGCTCATTGACGTTACCCGCAGAAGAAGCACCGGCTAACTCCGT
+>Solemya seq
+GGCTCAGATTGAACGCTGGCGGCATGCCTAACACATGCAAGTCGAACGGTAACAGGCGGAGCTTGCTCTGCGCTGACGAGTGGCGGACGGGTGAGTA
+>usearch_ecoli_seq2
+CGCGTGTATGAAGAAGGCCTTCGGGTTGTAAAGTACTTTCAGCGGGGAGGAGGGAGTAAAGTTAATACCTTTGCTCATTGACGTTACCCGCAGAAGAAGCACCGGCTAACTCCGTCCAT
+>Solemya_seq2
+GGCTCAGATTGAACGCTGGCGGCATGCCTAACACATGCAAGTCGAACGGTAACAGGCGGAGCTTGCTCTGCGCTGACGAGTGGCGGACGGGTGAGTATCAAG
+>chimera
+CGCGTGTATGAAGAAGGCCTTCGGGTTGTAAAGTACTTTCAGCGGGGAGGAGGGAGTAAAGTTAATACCTTTGCTCATTGACCCCTAGGGTGGGAATAACCCGGGGAAACCCGGGCTAATACCGAATAAGACCACAGGAGGCGACTCCAGAGGGTCAAAGGGAGCCTTGGCCTCCCCC
+"""
+
+dna_seqs_reference_otu_picking = """>usearch_ecoli_seq
+CGCGTGTATGAAGAAGGCCTTCGGGTTGTAAAGTACTTTCAGCGGGGAGGAGGGAGTAAAGTTAATACCTTTGCTCATTGACGTTACCCGCAGAAGAAGCACCGGCTAACTCCGT
+>Solemya seq
+GGCTCAGATTGAACGCTGGCGGCATGCCTAACACATGCAAGTCGAACGGTAACAGGCGGAGCTTGCTCTGCGCTGACGAGTGGCGGACGGGTGAGTA
+>usearch_ecoli_seq2
+CGCGTGTATGAAGAAGGCCTTCGGGTTGTAAAGTACTTTCAGCGGGGAGGAGGGAGTAAAGTTAATACCTTTGCTCATTGACGTTACCCGCAGAAGAAGCACCGGCTAACTCCGTCCAT
+>Solemya_seq2
+GGCTCAGATTGAACGCTGGCGGCATGCCTAACACATGCAAGTCGAACGGTAACAGGCGGAGCTTGCTCTGCGCTGACGAGTGGCGGACGGGTGAGTATCAAG
+>RandomCrap
+ACACAAACAGTATATTATATCCCCAGACAGGGACCGAGATTTACCACACCCAAAAAAAAAAAAAACACACCCCCCCCCCCCCCACACACACACTTATTTT
+"""
+
+dna_seqs_with_abundance = """>Cluster1;size=114
+AACCCCCACGGTGGATGCCACACGCCCCATACAAAGGGTAGGATGCTTAAGACACATCGCGTCAGGTTTGTGTCAGGCCTAACCCCCACGGTGGATGCCACACGCCCCATACAAAGGGTAGGATGCTTAAGACACATCGCGTCAGGTTTGTGTCAGGCCT
+>Cluster2;size=45
+AACCCCCACGGTGGATGCCACACGCCCCATACAAAGGGTAGGATGCTTAAGACACATCGCGTCAGGTTTGTGTCAGGCCTAACCCCCACGGTGGATGCCACACGCCCCATACAAAGGGTAGGATGCTTAAGACACATCGCGTCAGGTTTGTGTCAGGCC
+>Cluster0;size=37
+AACCCCCACGGTGGATGCCACACGCCCCATACAAAGGGTAGGATGCTTAAGACACATCGCGTCAGGTTTGTGTCAGGCCTAACCCCCACGGTGGATGCCACACGCCCCATACAAAGGGTAGGATGCTTAAGAACATCGCGTCAGGTTTGTGTCAGGCCT
+>Cluster7;size=33
+AACCCCCACGGTGGATGCCACACGCCCCATACAAAGGGTAGGATGCTTAAGACACATCGCGTCAGGTTTGTGTCAGGCCTAACCCCCACGGTGGATGCCACACGCCCCATACAAAGTAGGATGCTTAAGACACATCGCGTCAGGTTTGTGTCAGGCCT
+>Cluster6;size=32
+AACCCCCACGGTGGATGCCACACGCCCCATACAAAGGGTAGGATGCTTAAGACACATCGCGTCAGGTTTGTGTCAGGCCTAACCCCCACGGTGGATGCCACACGCCCCATACAAAGGGGATGCTTAAGACACATCGCGTCAGGTTTGTGTCAGGCCT
+>Cluster5;size=25
+AACCCCCACGGTGGATGCCACACGCCCCATACAAAGGGTAGGATGCTTAAGACACATCGCGTCAGGTTTGTGTCAGGCCTAACCCCCACGGTGGATGCCACACGCCCCATACAGTAGGATGCTTAAGACACATCGCGTCAGGTTTGTGTCAGGCCT
+>Cluster11;size=22
+AACCCCCACGGTGGATGCCACACGCCCCATACAAAGGGTGCTTAAGACACATCGCGTCAGGTTTGTGTCAGGCCTAACCCCCACGGTGGATGCCACACGCCCCATACAAAGGGTAGGATGCTTAAGACACATCGCGTCAGGTTTGTGTCAGGCCT
+>Cluster12;size=15
+AACCCCCACGGTGGATGCCACACGCCCCATACAAAGGGTAGGATGCTTAAGACACATCGCGTCAGGTTTGTGTCAGGCCTAACCCCCACGGTGGATGCCACACGCCCCATACAAGGATGCTTAAGACACATCGCGTCAGGTTTGTGTCAGGCCT
+>Cluster13;size=2
+AACCCCCACGGTGGATGCCACACGCCCCATACAAAGGGTAGGATGCTTAAGACACATCGCGTCAGGTTTGTGTCAGGCCTAACCCCCACGGTGGATGCCACACGCCCCATACAAAGGGTAGGATGCTTAAGACACATCGCGTGTGTCAGGCCT
+>Cluster14;size=1
+AACCCCCACGGTGGATGCCACACGCCCCATACAAAGGGTAGGATGCTTAAGACACATCGCGTCAGGTTTGTGTCAGGCCTAACCCCCACGGTGGATGCCACACGCCCCATACAAAGGGTAGGATGCTTAAGACACATCGCGTCAGGTTTGTG"""
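+
+# The ';size=N' suffix on the labels above is the usearch abundance
+# annotation. A minimal parsing sketch (illustrative only, not part of
+# this module):
+#
+# def parse_abundance(label):
+#     """Return (cluster_id, size) from a label like 'Cluster1;size=114'."""
+#     cluster_id, size_field = label.split(';size=')
+#     return cluster_id, int(size_field)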
+
+de_novo_chimera_seqs = """>Cluster1;size=52
+CTGGTCCGTGTCTCAGTACCAGTGTGGGGGACCTTCCTCTCAGAACCCCTACGCATCGTCGGCTTGGTGGTCCGTTACAC
+CGCCAACTACCTAATGCGACGCATGCCCATCCGCTACCGGATCGCTCCTTTGGAATCCCGGGGATGTCCCCGGAACTCGT
+TATGCGGTATTAGACGGAATTTCTTCCGCTTATCCCCCTGTAGCGGGCAGGTTGCATACGTGTTACTCACCCGTGCGCCG
+GTCGCCGG
+>Cluster0;size=50
+TTGGACCGTGTCTCAGTTCCAATGTGGGGGACCTTCCTCTCAGAACCCCTATCCATCGAAGACTAGGTGGGCCGTTACCC
+CGCCTACTATCTAATGGAACGCATCCCCATCGTCTACCGGAATACCTTTAATCATGTGAACATGCGGACTCATGATGCCA
+TCTTGTATTAATCTTCCTTTCAGAAGGCTGTCCAAGAGTAGACGGCAGGTTGGATACGTGTTACTCACCCGG
+>Cluster2;size=45
+TTGGGCCGTGTCTCAGTCCCAATGTGGCCGTTCACCCTCTCAGGCCGGCTACTGATCGTCGCCTTGGTGGGCTGTTACCC
+CGCCAACCAGCTAATCAGACGCGGATCCATCGTATACCACCGGAGTTTTTCACACTGCTTCATGCGAAGCTGTGCGCTTA
+TGCGGTATTAGCACCTATTTCTAAGTGTTATCCCCCAGTATACGGCAGGTTCTCCACGCGTT
+>Cluster10;size=43
+CTGGGCCGTGTCTCAGTCCCAATGTGGCCGTCCGCCCTCTCAGGCCGGCTATGCATCATCGTCTTGGTGGGCCTTTACCC
+CGCCAACCAACTAATGCACCGCAGGTCCATCCGCGCCCCATCCCCTAAAGGATGTTTCACAGAAAGAAGATGCCTCCTTC
+CTGTACATCGGGATTTGTTCTCCGTTTCCAGAGCGTATTCCCGGTGCGCGGGCAGGTTCCCTACGTGTTACTCACCCG
+>Cluster4;size=40
+TTGGGCCGTGTCTCAGTCCCAATGTGGCCGTCCACCCTCTCAGGCCGGCTACTGATCGTCGCCTTGGTGGGCCTTTACCC
+CGCCAACCAGCTAATCAGACGCGGGTCCATCTTGCACCACCGGAGTTTTTCACACTGTCCCATGCAGGACCGTGCGCTTA
+TGCGGTATTAGCACCTATTTCTAAGTGTTATCCCCCAGTGCAAGGCAGGTTACCCACGCGTTACTCACCCGTCCG
+>Cluster6;size=40
+CTGGTCCGTGTCTCAGTACCAGTGTGGGGGACCTTCCTCTCAGAACCCCTACGCATCGTCGCCTCGGTGGGCCGTTACCC
+CGCCGACTAGCTAATGCGCCGCATGGCCATCCGCAGCCGATAAATCTTTAAACATCGGGAGATGCCTCCCAACGTTGTTA
+CGCGGTATTAGACGGAATTTCTTCCGCTTATCCCCCTGCTGCGGGCAGGTTCCATACGTGTTACTCACCCGTGCGCCGG
+>Cluster3;size=30
+TTGGGCCGTGTCTCAGTCCCAATGTGGCCGATCAGTCTCTCAACTCGGCTATGCATCATTGCCTTGGTAAGCCGTTACCT
+TACCAACTAGCTAATGCACCGCAGGTCCATCCAAGAGTGATAGCAGAACCATCTTTCAAACTCTAGACATGCGTCTAGTG
+TTGTTATCCGGTATTAGCATCTGTTTCCAGGTGTTATCCCAGTCTCTTGGG
+>Cluster12;size=19
+TTGGTCCGTGTCTCAGTACCAATGTGGGGGGTTAACCTCTCAGTCCCCCTATGTATCGTGGTCTTGGTGAGCCGTTACCC
+CACCAACTAACTAATACAACGCATGCCCATCCATTACCACCGGAGTTTTCAACCCAAGAAGATGCCTCCCTGGATGTTAT
+GGGGTATTAGTACCGATTTCTCAGTGTTATCCCCCTGTAATGGGTAGGTTGCATACGCGTTACGCACCCGTGCGCCGGTC
+GCCGACAAT
+>Cluster30;size=18
+CTGGTCCGTGTCTCAGTACCAGTGTGGGGGACCTTCCTCTCAGAACCCCTACGCATCGTCGCCTTGGTGGGCCGTTACCC
+CGCCAACTAGCTAATGCGCCGCATGGCCATCCGTAGCCGGTGTTACCCTTTAAACCCCAAGAGATGCCTCTCGGAGTTAT
+TACGCGGTATTAGACGGAATTTCTTCCGCTTATCCCCCTGCTACGGGCAGGTTCCATACGTGTTACTCACCCGTGCGCCG
+GTCGCCGG
+>Cluster29;size=18
+CTGGTCCGTGTCTCAGTACCAGTGTGGGGGACCTTCCTCTCAGAACCCCTACGCATCGTCGCCTCGGTGGGCCGTTACCC
+CGCCGACTAGCTAATGCGCCGCATGCCCATCCGCCACCGGTAATCCCTTTGGCGGCACCGGGATGCCCCGACGCCGCGTC
+ACGCGGTATTAGACGGAATTTCTTCCGCTTATCCCCCTGTGGCGGGCAGGTTGCATACGTGTTACTCACCCGTGCGCCGG
+TCGCCGG
+>Cluster16;size=16
+CTGGGCCGTGTCTCAGTCCCAATGTGGCCGTTCACCCTCTCAGGCCGGCTACTGATCGTCGCCTTGGTGGGCCGTTACCC
+CTCCAACCAGCTAATCAGACGCGGGTCCATCCTGTACCACCGGAGTTTTTCACACTGTACCATGCGGTACTGTGCGCTTA
+TGCGGTTTTAGCACCTATTTCTAAGTGTTATCCCCCTGTACAGGGCAGGTTACCCACGCGTTACTCACCCGTCCGCCACT
+>Cluster222;size=1
+TTGGGCCGTGTCTCAGTCCCAATGTGGCCGTCACCCTCTCAGGCCGGCTACTGATCGTCGCCTTGGTGGGCCTTTACCCC
+GCCAACCAGCTAATCAGACGCGGGTCCATCTTGCACCACCGGAGTTTTTCACACTGCTTCATGCGAAGCTGTGCGCTTAT
+GCGGTATTAGCACCTATTTCTAAGTGTTATCCCCCAGTATACGGCAGGTTCTCCACGCGTTACT
+>Cluster221;size=1
+CTGGGCCGTATCTCAGTCCCAATGTGGCCGTTCAACCTCTCAGTCCGGCTACTGATCGTCGCCTTGGTAGGCCGTTGCCC
+CGCCAACTACCTAATCGGACGCGAGCCCATCTTTCAGCGGATTGCTCCTTTGATTATCTCACCATGCGGCAAAATAATGT
+CATGCGGTATTAGCGTTCGTTTCCAAACGTTATCCCCCTCTGAAAGGCAGGTTGCTCACGCGTT
+>Cluster218;size=1
+TTGGGCCGTGTCTCAGTCCCAATGTGGCCGGCCACCCTCTCAGGTCGGCTACTGATCGTCACCTTGGTAGGCCGTTACCC
+CACCAACTAGCTAATCAGACGCAAGCCCATCTATCAGCGGATTGCTCCTTTTCTAGCTATATCATGCGATACTACTAGCT
+TATGCGGTATTAGCAATGATTTCTCACTGTTATTCCCCTCTGATAGGCAGG
+>Cluster217;size=1
+TTGGGCCGTGTCTCAGTCCCAATGTGGCCGTCCACCCTCTCAGGCCGGCTACTGATCGTCGCCTTGGTGGGCCTTTACCC
+CGCCAACCAGCTAATCAGACGCGAGTCCATCTCAGAGCGATAAATCTTTGATATCCAGAGCCATGCGACCCAGATATATT
+ATGCGGTATTAGCAGCTGTTTCCAGCTGTTATTCCCCATCCAAGGCAGGTT
+>Cluster216;size=1
+CTGGGCCGTGTCTCAGTCCCAGTGTGGCCGTCCGCCCTCTCAGGTCAGCTACTGATCGTCGCCTTGGTAGGCCATTACCC
+TACCAACTAGCTAATCAGACGCGAGGCCATCTCTCAGCGATAAATCTTTGATATATCTGCCATGCGACAAACATATATTA
+TGCGGTATTAGCAGTCGTTTCCAACTGTTGTCCCCCTCTGAAAGGCAGGTT
+>Cluster522;size=10
+TTGGGCCGTGTCTCAGTCCCAATGTGGCCGTCACCCTCTCAGGCCGGCTACTGATCGTCGCCTTGGTGGGCCTTTACCCC
+GCCAACCAGCTAATCAGACGCGGGTCCATCTTGCACCACCGGAGTTTTTCACACTGCTTCATGCGAAGCTGTGCGCTTAT
+GCGGTATTAGCACCTATTTCTAAGTGTTATCCCCCAGTATACGGCAGGTTCTCCACGCGTTACT"""
+
+reference_seqs_fp = """>seq1
+CTGGTCCGTGTCTCAGTACCAGTGTGGGGGACCTTCCTCTCAGAACCCCTACGCATCGTCGGCTTGGTGGTCCGTTACAC
+CGCCAACTACCTAATGCGACGCATGCCCATCCGCTACCGGATCGCTCCTTTGGAATCCCGGGGATGTCCCCGGAACTCGT
+TATGCGGTATTAGACGGAATTTCTTCCGCTTATCCCCCTGTAGCGGGCAGGTTGCATACGTGTTACTCACCCGTGCGCCG
+GTCGCCGG
+>seq2
+TTGGACCGTGTCTCAGTTCCAATGTGGGGGACCTTCCTCTCAGAACCCCTATCCATCGAAGACTAGGTGGGCCGTTACCC
+CGCCTACTATCTAATGGAACGCATCCCCATCGTCTACCGGAATACCTTTAATCATGTGAACATGCGGACTCATGATGCCA
+TCTTGTATTAATCTTCCTTTCAGAAGGCTGTCCAAGAGTAGACGGCAGGTTGGATACGTGTTACTCACCCGG
+>seq3
+TTGGGCCGTGTCTCAGTCCCAATGTGGCCGTTCACCCTCTCAGGCCGGCTACTGATCGTCGCCTTGGTGGGCTGTTACCC
+CGCCAACCAGCTAATCAGACGCGGATCCATCGTATACCACCGGAGTTTTTCACACTGCTTCATGCGAAGCTGTGCGCTTA
+TGCGGTATTAGCACCTATTTCTAAGTGTTATCCCCCAGTATACGGCAGGTTCTCCACGCGTT
+>mixed_seq
+TTGGGCCGTGTCTCAGTCCCAATGTGGCCGTCCACCCTCTCAGGCCGGCTACTGATCGTCGCCTTGGTGGGCCTTTACCC
+CGCCAACCAGCTAATCAGACGCGGGTCCATCTTGCAACATATTTCGGGACAGATTAACACACAAAGGATTTACACAAAAT
+ACATTAGACCAAACCCCAAGATTTAGACAGGATTACAGGATTTACAGATTTTTACCAACATTAGACAGGGG"""
+
+dna_seqs_with_dups = """>seq1
+GCCAACCAGCTAATCAGACGCGGGTCCATCTTGCACCACCGGAGTTTTTCACACTGCTTCATGCGAAGCTGTGCGCTTAA
+>seq2
+TTGGGCCGTGTCTCAGTCCCAATGTGGCCGTCACCCTCTCAGGCCGGCTACTGATCGTCGCCTTGGTGGGCCTTTACCCC
+>seq3
+TTGGGCCGTGTCTCAGTCCCAATGTGGCCGTCACCCTCTCAGGCCGGCTACTGATCGTCGCCTTGGTGGGCCTTTACCCC
+>seq4
+GCCAACCAGCTAATCAGACGCGGGTCCATCTTGCACCACCGGAGTTTTTCACACTGCTTCATGCGAAGCTGTGCGCTT"""
+
+dna_seqs_with_dups2 = """>seq1
+GCCAACCAGCTAATCAGACGCGGGTCCATCTTGCACCACCGGAGTTTTTCACACTGCTTCATGCGAAGCTGTGCGCTTAA
+>seq2
+TTGGGCCGTGTCTCAGTCCCAATGTGGCCGTCACCCTCTCAGGCCGGCTACTGATCGTCGCCTTGGTGGGCCTTTACCCC
+>seq3
+TTGGGCCGTGTCTCAGTCCCAATGTGGCCGTCACCCTCTCAGGCCGGCTACTGATCGTCGCCTTGGTGGGCCTTTACCCC
+>seq4
+TTGGGCCGTGTCTCAGTCCCAATGTGGCCGTCACCCTCTCAGGCCGGCTACTGATCGTCGCCTTGGTGGGCCTTTACCCC"""
+
+
+# Expected output file data
+uc_lines1 = """# usearch --id 0.97 --uc usearch_picked_otus/assign_reads_to_otus.uc --query seqs.fna --global --db usearch_picked_otus/enumerated_otus.fasta
+# version=4.2.66
+# Tab-separated fields:
+# 1=Type, 2=ClusterNr, 3=SeqLength or ClusterSize, 4=PctId, 5=Strand, 6=QueryStart, 7=SeedStart, 8=Alignment, 9=QueryLabel, 10=TargetLabel
+# Record types (field 1): L=LibSeed, S=NewSeed, H=Hit, R=Reject, D=LibCluster, C=NewCluster, N=NoHit
+# For C and D types, PctId is average id with seed.
+# QueryStart and SeedStart are zero-based relative to start of sequence.
+# If minus strand, SeedStart is relative to reverse-complemented seed.
+H\t42\t217\t99.1\t+\t0\t0\t217MI\tPC.test2_1 FLP3FBN02xELBSXx orig_bc=ACAGAGTCGGCG new_bc=ACAGAGTCGGCG,FLP3FBN02x bc_diffs=0\t42
+H\t42\t217\t99.1\t+\t0\t0\t217MI\tPC.test1_2 FLP3FBN03ELBSXx orig_bc=ACAGAGTCGGCG new_bc=ACAGAGTCGGCG,FLP3FBN03 bc_diffs=0\t42
+H\t42\t217\t99.1\t+\t0\t0\t217MI\tPC.634_3 FLP3FBN01ELBSX orig_bc=TCAGAGTCGGCT new_bc=ACAGAGTCGGCT,FLP3FBN01 bc_diffs=1\t42
+H\t19\t243\t100.0\t+\t0\t0\t25MI218M\tPC.634_4 FLP3FBN01EG8AX orig_bc=ACAGAGTCGGCT new_bc=ACAGAGTCGGCT,FLP3FBN01 bc_diffs=0\t19
+N\t*\t219\t*\t*\t*\t*\t*\tPC.481_6\tFLP3FBN01DEHK3 orig_bc=ACCAGCGACTAG new_bc=ACCAGCGACTAG,FLP3FBN01 bc_diffs=0\t*
+H\t6\t211\t99.5\t+\t0\t0\t211M\tPC.269_5 FLP3FBN01EEWKD orig_bc=AGCACGAGCCTA new_bc=AGCACGAGCCTA,FLP3FBN01 bc_diffs=0\t6
+""".split('\n')
+
+expected_otu_assignments = """# Tab-separated fields:
+# 1=Type, 2=ClusterNr, 3=SeqLength or ClusterSize, 4=PctId, 5=Strand, 6=QueryStart, 7=SeedStart, 8=Alignment, 9=QueryLabel, 10=TargetLabel
+# Record types (field 1): L=LibSeed, S=NewSeed, H=Hit, R=Reject, D=LibCluster, C=NewCluster, N=NoHit
+# For C and D types, PctId is average id with seed.
+# QueryStart and SeedStart are zero-based relative to start of sequence.
+# If minus strand, SeedStart is relative to reverse-complemented seed.
+H\t2\t199\t97.5\t.\t0\t0\t119M80D\tref1 ecoli sequence\tusearch_ecoli_seq2
+N\t*\t178\t*\t*\t*\t*\t*\tEU199232 1 1236 Bacteria/Deltaproteobacteria/Desulfurella - Hippea/uncultured\t*
+H\t1\t180\t100.0\t.\t0\t0\t97M83D\tL07864 1 1200 Bacteria/Beta Gammaproteobacteria/Solemya symbiont\tSolemya seq""".split('\n')
+
+expected_enumerated_fasta = """>0
+AACCCCCACGGTGGATGCCACACGCCCCATACAAAGGGTAGGATGCTTAAGACACATCGCGTCAGGTTTGTGTCAGGCCT
+>1
+ACCCACACGGTGGATGCAACAGATCCCATACACCGAGTTGGATGCTTAAGACGCATCGCGTGAGTTTTGCGTCAAGGCT
+>2
+CCCCCACGGTGGCAGCAACACGTCACATACAACGGGTTGGATTCTAAAGACAAACCGCGTCAAAGTTGTGTCAGAACT
+>3
+CCCCACGGTAGCTGCAACACGTCCCATACCACGGGTAGGATGCTAAAGACACATCGGGTCTGTTTTGTGTCAGGGCT
+>4
+GCCACGGTGGGTACAACACGTCCACTACATCGGCTTGGAAGGTAAAGACACGTCGCGTCAGTATTGCGTCAGGGCT
+>5
+CCGCGGTAGGTGCAACACGTCCCATACAACGGGTTGGAAGGTTAAGACACAACGCGTTAATTTTGTGTCAGGGCA
+>6
+CGCGGTGGCTGCAAGACGTCCCATACAACGGGTTGGATGCTTAAGACACATCGCAACAGTTTTGAGTCAGGGCT
+>7
+ACGGTGGCTACAAGACGTCCCATCCAACGGGTTGGATACTTAAGGCACATCACGTCAGTTTTGTGTCAGAGCT
+>8
+CGGTGGCTGCAACACGTGGCATACAACGGGTTGGATGCTTAAGACACATCGCCTCAGTTTTGTGTCAGGGCT
+>9
+GGTGGCTGAAACACATCCCATACAACGGGTTGGATGCTTAAGACACATCGCATCAGTTTTATGTCAGGGGA""".split('\n')
+
+expected_enumerated_fasta_added_options = """>Big255Ern\tuclust_test_seqs_0 some comment0
+AACCCCCACGGTGGATGCCACACGCCCCATACAAAGGGTAGGATGCTTAAGACACATCGCGTCAGGTTTGTGTCAGGCCT
+>Big256Ern\tuclust_test_seqs_1 some comment1
+ACCCACACGGTGGATGCAACAGATCCCATACACCGAGTTGGATGCTTAAGACGCATCGCGTGAGTTTTGCGTCAAGGCT
+>Big257Ern\tuclust_test_seqs_2 some comment2
+CCCCCACGGTGGCAGCAACACGTCACATACAACGGGTTGGATTCTAAAGACAAACCGCGTCAAAGTTGTGTCAGAACT
+>Big258Ern\tuclust_test_seqs_3 some comment3
+CCCCACGGTAGCTGCAACACGTCCCATACCACGGGTAGGATGCTAAAGACACATCGGGTCTGTTTTGTGTCAGGGCT
+>Big259Ern\tuclust_test_seqs_4 some comment4
+GCCACGGTGGGTACAACACGTCCACTACATCGGCTTGGAAGGTAAAGACACGTCGCGTCAGTATTGCGTCAGGGCT
+>Big260Ern\tuclust_test_seqs_5 some comment4_again
+CCGCGGTAGGTGCAACACGTCCCATACAACGGGTTGGAAGGTTAAGACACAACGCGTTAATTTTGTGTCAGGGCA
+>Big261Ern\tuclust_test_seqs_6 some comment6
+CGCGGTGGCTGCAAGACGTCCCATACAACGGGTTGGATGCTTAAGACACATCGCAACAGTTTTGAGTCAGGGCT
+>Big262Ern\tuclust_test_seqs_7 some comment7
+ACGGTGGCTACAAGACGTCCCATCCAACGGGTTGGATACTTAAGGCACATCACGTCAGTTTTGTGTCAGAGCT
+>Big263Ern\tuclust_test_seqs_8 some comment8
+CGGTGGCTGCAACACGTGGCATACAACGGGTTGGATGCTTAAGACACATCGCCTCAGTTTTGTGTCAGGGCT
+>Big264Ern\tuclust_test_seqs_9 some comment9
+GGTGGCTGAAACACATCCCATACAACGGGTTGGATGCTTAAGACACATCGCATCAGTTTTATGTCAGGGGA""".split('\n')
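+
+# In expected_enumerated_fasta_added_options above, each label is
+# label_prefix + count + label_suffix ('Big' + n + 'Ern', counting up
+# from count_start=255), with the original label kept after a tab
+# because retain_label_as_comment=True is passed to enumerate_otus.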
+
+expected_clusters_w_abundance_default_settings = """>Cluster1;size=326
+AACCCCCACGGTGGATGCCACACGCCCCATACAAAGGGTAGGATGCTTAAGACACATCGCGTCAGGTTTGTGTCAGGCCT
+AACCCCCACGGTGGATGCCACACGCCCCATACAAAGGGTAGGATGCTTAAGACACATCGCGTCAGGTTTGTGTCAGGCCT""".split('\n')
+
+expected_clusters_w_abundance_low_setting = """>Cluster1;size=304
+AACCCCCACGGTGGATGCCACACGCCCCATACAAAGGGTAGGATGCTTAAGACACATCGCGTCAGGTTTGTGTCAGGCCT
+AACCCCCACGGTGGATGCCACACGCCCCATACAAAGGGTAGGATGCTTAAGACACATCGCGTCAGGTTTGTGTCAGGCCT
+>Cluster11;size=22
+AACCCCCACGGTGGATGCCACACGCCCCATACAAAGGGTGCTTAAGACACATCGCGTCAGGTTTGTGTCAGGCCTAACCC
+CCACGGTGGATGCCACACGCCCCATACAAAGGGTAGGATGCTTAAGACACATCGCGTCAGGTTTGTGTCAGGCCT""".split('\n')
+
+expected_reference_filtered_seqs = """>usearch_ecoli_seq
+CGCGTGTATGAAGAAGGCCTTCGGGTTGTAAAGTACTTTCAGCGGGGAGGAGGGAGTAAAGTTAATACCTTTGCTCATTG
+ACGTTACCCGCAGAAGAAGCACCGGCTAACTCCGT
+>Solemya seq
+GGCTCAGATTGAACGCTGGCGGCATGCCTAACACATGCAAGTCGAACGGTAACAGGCGGAGCTTGCTCTGCGCTGACGAG
+TGGCGGACGGGTGAGTA
+>usearch_ecoli_seq2
+CGCGTGTATGAAGAAGGCCTTCGGGTTGTAAAGTACTTTCAGCGGGGAGGAGGGAGTAAAGTTAATACCTTTGCTCATTG
+ACGTTACCCGCAGAAGAAGCACCGGCTAACTCCGTCCAT
+>Solemya_seq2
+GGCTCAGATTGAACGCTGGCGGCATGCCTAACACATGCAAGTCGAACGGTAACAGGCGGAGCTTGCTCTGCGCTGACGAG
+TGGCGGACGGGTGAGTATCAAG""".split('\n')
+
+expected_de_novo_chimeras_default = """>Cluster1;size=52
+CTGGTCCGTGTCTCAGTACCAGTGTGGGGGACCTTCCTCTCAGAACCCCTACGCATCGTCGGCTTGGTGGTCCGTTACAC
+CGCCAACTACCTAATGCGACGCATGCCCATCCGCTACCGGATCGCTCCTTTGGAATCCCGGGGATGTCCCCGGAACTCGT
+TATGCGGTATTAGACGGAATTTCTTCCGCTTATCCCCCTGTAGCGGGCAGGTTGCATACGTGTTACTCACCCGTGCGCCG
+GTCGCCGG
+>Cluster0;size=50
+TTGGACCGTGTCTCAGTTCCAATGTGGGGGACCTTCCTCTCAGAACCCCTATCCATCGAAGACTAGGTGGGCCGTTACCC
+CGCCTACTATCTAATGGAACGCATCCCCATCGTCTACCGGAATACCTTTAATCATGTGAACATGCGGACTCATGATGCCA
+TCTTGTATTAATCTTCCTTTCAGAAGGCTGTCCAAGAGTAGACGGCAGGTTGGATACGTGTTACTCACCCGG
+>Cluster2;size=45
+TTGGGCCGTGTCTCAGTCCCAATGTGGCCGTTCACCCTCTCAGGCCGGCTACTGATCGTCGCCTTGGTGGGCTGTTACCC
+CGCCAACCAGCTAATCAGACGCGGATCCATCGTATACCACCGGAGTTTTTCACACTGCTTCATGCGAAGCTGTGCGCTTA
+TGCGGTATTAGCACCTATTTCTAAGTGTTATCCCCCAGTATACGGCAGGTTCTCCACGCGTT
+>Cluster10;size=43
+CTGGGCCGTGTCTCAGTCCCAATGTGGCCGTCCGCCCTCTCAGGCCGGCTATGCATCATCGTCTTGGTGGGCCTTTACCC
+CGCCAACCAACTAATGCACCGCAGGTCCATCCGCGCCCCATCCCCTAAAGGATGTTTCACAGAAAGAAGATGCCTCCTTC
+CTGTACATCGGGATTTGTTCTCCGTTTCCAGAGCGTATTCCCGGTGCGCGGGCAGGTTCCCTACGTGTTACTCACCCG
+>Cluster4;size=40
+TTGGGCCGTGTCTCAGTCCCAATGTGGCCGTCCACCCTCTCAGGCCGGCTACTGATCGTCGCCTTGGTGGGCCTTTACCC
+CGCCAACCAGCTAATCAGACGCGGGTCCATCTTGCACCACCGGAGTTTTTCACACTGTCCCATGCAGGACCGTGCGCTTA
+TGCGGTATTAGCACCTATTTCTAAGTGTTATCCCCCAGTGCAAGGCAGGTTACCCACGCGTTACTCACCCGTCCG
+>Cluster6;size=40
+CTGGTCCGTGTCTCAGTACCAGTGTGGGGGACCTTCCTCTCAGAACCCCTACGCATCGTCGCCTCGGTGGGCCGTTACCC
+CGCCGACTAGCTAATGCGCCGCATGGCCATCCGCAGCCGATAAATCTTTAAACATCGGGAGATGCCTCCCAACGTTGTTA
+CGCGGTATTAGACGGAATTTCTTCCGCTTATCCCCCTGCTGCGGGCAGGTTCCATACGTGTTACTCACCCGTGCGCCGG
+>Cluster3;size=30
+TTGGGCCGTGTCTCAGTCCCAATGTGGCCGATCAGTCTCTCAACTCGGCTATGCATCATTGCCTTGGTAAGCCGTTACCT
+TACCAACTAGCTAATGCACCGCAGGTCCATCCAAGAGTGATAGCAGAACCATCTTTCAAACTCTAGACATGCGTCTAGTG
+TTGTTATCCGGTATTAGCATCTGTTTCCAGGTGTTATCCCAGTCTCTTGGG
+>Cluster12;size=19
+TTGGTCCGTGTCTCAGTACCAATGTGGGGGGTTAACCTCTCAGTCCCCCTATGTATCGTGGTCTTGGTGAGCCGTTACCC
+CACCAACTAACTAATACAACGCATGCCCATCCATTACCACCGGAGTTTTCAACCCAAGAAGATGCCTCCCTGGATGTTAT
+GGGGTATTAGTACCGATTTCTCAGTGTTATCCCCCTGTAATGGGTAGGTTGCATACGCGTTACGCACCCGTGCGCCGGTC
+GCCGACAAT
+>Cluster29;size=18
+CTGGTCCGTGTCTCAGTACCAGTGTGGGGGACCTTCCTCTCAGAACCCCTACGCATCGTCGCCTCGGTGGGCCGTTACCC
+CGCCGACTAGCTAATGCGCCGCATGCCCATCCGCCACCGGTAATCCCTTTGGCGGCACCGGGATGCCCCGACGCCGCGTC
+ACGCGGTATTAGACGGAATTTCTTCCGCTTATCCCCCTGTGGCGGGCAGGTTGCATACGTGTTACTCACCCGTGCGCCGG
+TCGCCGG
+>Cluster30;size=18
+CTGGTCCGTGTCTCAGTACCAGTGTGGGGGACCTTCCTCTCAGAACCCCTACGCATCGTCGCCTTGGTGGGCCGTTACCC
+CGCCAACTAGCTAATGCGCCGCATGGCCATCCGTAGCCGGTGTTACCCTTTAAACCCCAAGAGATGCCTCTCGGAGTTAT
+TACGCGGTATTAGACGGAATTTCTTCCGCTTATCCCCCTGCTACGGGCAGGTTCCATACGTGTTACTCACCCGTGCGCCG
+GTCGCCGG
+>Cluster16;size=16
+CTGGGCCGTGTCTCAGTCCCAATGTGGCCGTTCACCCTCTCAGGCCGGCTACTGATCGTCGCCTTGGTGGGCCGTTACCC
+CTCCAACCAGCTAATCAGACGCGGGTCCATCCTGTACCACCGGAGTTTTTCACACTGTACCATGCGGTACTGTGCGCTTA
+TGCGGTTTTAGCACCTATTTCTAAGTGTTATCCCCCTGTACAGGGCAGGTTACCCACGCGTTACTCACCCGTCCGCCACT
+>Cluster221;size=1
+CTGGGCCGTATCTCAGTCCCAATGTGGCCGTTCAACCTCTCAGTCCGGCTACTGATCGTCGCCTTGGTAGGCCGTTGCCC
+CGCCAACTACCTAATCGGACGCGAGCCCATCTTTCAGCGGATTGCTCCTTTGATTATCTCACCATGCGGCAAAATAATGT
+CATGCGGTATTAGCGTTCGTTTCCAAACGTTATCCCCCTCTGAAAGGCAGGTTGCTCACGCGTT
+>Cluster218;size=1
+TTGGGCCGTGTCTCAGTCCCAATGTGGCCGGCCACCCTCTCAGGTCGGCTACTGATCGTCACCTTGGTAGGCCGTTACCC
+CACCAACTAGCTAATCAGACGCAAGCCCATCTATCAGCGGATTGCTCCTTTTCTAGCTATATCATGCGATACTACTAGCT
+TATGCGGTATTAGCAATGATTTCTCACTGTTATTCCCCTCTGATAGGCAGG
+>Cluster217;size=1
+TTGGGCCGTGTCTCAGTCCCAATGTGGCCGTCCACCCTCTCAGGCCGGCTACTGATCGTCGCCTTGGTGGGCCTTTACCC
+CGCCAACCAGCTAATCAGACGCGAGTCCATCTCAGAGCGATAAATCTTTGATATCCAGAGCCATGCGACCCAGATATATT
+ATGCGGTATTAGCAGCTGTTTCCAGCTGTTATTCCCCATCCAAGGCAGGTT
+>Cluster216;size=1
+CTGGGCCGTGTCTCAGTCCCAGTGTGGCCGTCCGCCCTCTCAGGTCAGCTACTGATCGTCGCCTTGGTAGGCCATTACCC
+TACCAACTAGCTAATCAGACGCGAGGCCATCTCTCAGCGATAAATCTTTGATATATCTGCCATGCGACAAACATATATTA
+TGCGGTATTAGCAGTCGTTTCCAACTGTTGTCCCCCTCTGAAAGGCAGGTT""".split('\n')
+
+expected_de_novo_chimera_filtered_skew11 = """>Cluster1;size=52
+CTGGTCCGTGTCTCAGTACCAGTGTGGGGGACCTTCCTCTCAGAACCCCTACGCATCGTCGGCTTGGTGGTCCGTTACAC
+CGCCAACTACCTAATGCGACGCATGCCCATCCGCTACCGGATCGCTCCTTTGGAATCCCGGGGATGTCCCCGGAACTCGT
+TATGCGGTATTAGACGGAATTTCTTCCGCTTATCCCCCTGTAGCGGGCAGGTTGCATACGTGTTACTCACCCGTGCGCCG
+GTCGCCGG
+>Cluster0;size=50
+TTGGACCGTGTCTCAGTTCCAATGTGGGGGACCTTCCTCTCAGAACCCCTATCCATCGAAGACTAGGTGGGCCGTTACCC
+CGCCTACTATCTAATGGAACGCATCCCCATCGTCTACCGGAATACCTTTAATCATGTGAACATGCGGACTCATGATGCCA
+TCTTGTATTAATCTTCCTTTCAGAAGGCTGTCCAAGAGTAGACGGCAGGTTGGATACGTGTTACTCACCCGG
+>Cluster2;size=45
+TTGGGCCGTGTCTCAGTCCCAATGTGGCCGTTCACCCTCTCAGGCCGGCTACTGATCGTCGCCTTGGTGGGCTGTTACCC
+CGCCAACCAGCTAATCAGACGCGGATCCATCGTATACCACCGGAGTTTTTCACACTGCTTCATGCGAAGCTGTGCGCTTA
+TGCGGTATTAGCACCTATTTCTAAGTGTTATCCCCCAGTATACGGCAGGTTCTCCACGCGTT
+>Cluster10;size=43
+CTGGGCCGTGTCTCAGTCCCAATGTGGCCGTCCGCCCTCTCAGGCCGGCTATGCATCATCGTCTTGGTGGGCCTTTACCC
+CGCCAACCAACTAATGCACCGCAGGTCCATCCGCGCCCCATCCCCTAAAGGATGTTTCACAGAAAGAAGATGCCTCCTTC
+CTGTACATCGGGATTTGTTCTCCGTTTCCAGAGCGTATTCCCGGTGCGCGGGCAGGTTCCCTACGTGTTACTCACCCG
+>Cluster4;size=40
+TTGGGCCGTGTCTCAGTCCCAATGTGGCCGTCCACCCTCTCAGGCCGGCTACTGATCGTCGCCTTGGTGGGCCTTTACCC
+CGCCAACCAGCTAATCAGACGCGGGTCCATCTTGCACCACCGGAGTTTTTCACACTGTCCCATGCAGGACCGTGCGCTTA
+TGCGGTATTAGCACCTATTTCTAAGTGTTATCCCCCAGTGCAAGGCAGGTTACCCACGCGTTACTCACCCGTCCG
+>Cluster6;size=40
+CTGGTCCGTGTCTCAGTACCAGTGTGGGGGACCTTCCTCTCAGAACCCCTACGCATCGTCGCCTCGGTGGGCCGTTACCC
+CGCCGACTAGCTAATGCGCCGCATGGCCATCCGCAGCCGATAAATCTTTAAACATCGGGAGATGCCTCCCAACGTTGTTA
+CGCGGTATTAGACGGAATTTCTTCCGCTTATCCCCCTGCTGCGGGCAGGTTCCATACGTGTTACTCACCCGTGCGCCGG
+>Cluster3;size=30
+TTGGGCCGTGTCTCAGTCCCAATGTGGCCGATCAGTCTCTCAACTCGGCTATGCATCATTGCCTTGGTAAGCCGTTACCT
+TACCAACTAGCTAATGCACCGCAGGTCCATCCAAGAGTGATAGCAGAACCATCTTTCAAACTCTAGACATGCGTCTAGTG
+TTGTTATCCGGTATTAGCATCTGTTTCCAGGTGTTATCCCAGTCTCTTGGG
+>Cluster12;size=19
+TTGGTCCGTGTCTCAGTACCAATGTGGGGGGTTAACCTCTCAGTCCCCCTATGTATCGTGGTCTTGGTGAGCCGTTACCC
+CACCAACTAACTAATACAACGCATGCCCATCCATTACCACCGGAGTTTTCAACCCAAGAAGATGCCTCCCTGGATGTTAT
+GGGGTATTAGTACCGATTTCTCAGTGTTATCCCCCTGTAATGGGTAGGTTGCATACGCGTTACGCACCCGTGCGCCGGTC
+GCCGACAAT
+>Cluster29;size=18
+CTGGTCCGTGTCTCAGTACCAGTGTGGGGGACCTTCCTCTCAGAACCCCTACGCATCGTCGCCTCGGTGGGCCGTTACCC
+CGCCGACTAGCTAATGCGCCGCATGCCCATCCGCCACCGGTAATCCCTTTGGCGGCACCGGGATGCCCCGACGCCGCGTC
+ACGCGGTATTAGACGGAATTTCTTCCGCTTATCCCCCTGTGGCGGGCAGGTTGCATACGTGTTACTCACCCGTGCGCCGG
+TCGCCGG
+>Cluster30;size=18
+CTGGTCCGTGTCTCAGTACCAGTGTGGGGGACCTTCCTCTCAGAACCCCTACGCATCGTCGCCTTGGTGGGCCGTTACCC
+CGCCAACTAGCTAATGCGCCGCATGGCCATCCGTAGCCGGTGTTACCCTTTAAACCCCAAGAGATGCCTCTCGGAGTTAT
+TACGCGGTATTAGACGGAATTTCTTCCGCTTATCCCCCTGCTACGGGCAGGTTCCATACGTGTTACTCACCCGTGCGCCG
+GTCGCCGG
+>Cluster16;size=16
+CTGGGCCGTGTCTCAGTCCCAATGTGGCCGTTCACCCTCTCAGGCCGGCTACTGATCGTCGCCTTGGTGGGCCGTTACCC
+CTCCAACCAGCTAATCAGACGCGGGTCCATCCTGTACCACCGGAGTTTTTCACACTGTACCATGCGGTACTGTGCGCTTA
+TGCGGTTTTAGCACCTATTTCTAAGTGTTATCCCCCTGTACAGGGCAGGTTACCCACGCGTTACTCACCCGTCCGCCACT
+>Cluster522;size=10
+TTGGGCCGTGTCTCAGTCCCAATGTGGCCGTCACCCTCTCAGGCCGGCTACTGATCGTCGCCTTGGTGGGCCTTTACCCC
+GCCAACCAGCTAATCAGACGCGGGTCCATCTTGCACCACCGGAGTTTTTCACACTGCTTCATGCGAAGCTGTGCGCTTAT
+GCGGTATTAGCACCTATTTCTAAGTGTTATCCCCCAGTATACGGCAGGTTCTCCACGCGTTACT
+>Cluster221;size=1
+CTGGGCCGTATCTCAGTCCCAATGTGGCCGTTCAACCTCTCAGTCCGGCTACTGATCGTCGCCTTGGTAGGCCGTTGCCC
+CGCCAACTACCTAATCGGACGCGAGCCCATCTTTCAGCGGATTGCTCCTTTGATTATCTCACCATGCGGCAAAATAATGT
+CATGCGGTATTAGCGTTCGTTTCCAAACGTTATCCCCCTCTGAAAGGCAGGTTGCTCACGCGTT
+>Cluster218;size=1
+TTGGGCCGTGTCTCAGTCCCAATGTGGCCGGCCACCCTCTCAGGTCGGCTACTGATCGTCACCTTGGTAGGCCGTTACCC
+CACCAACTAGCTAATCAGACGCAAGCCCATCTATCAGCGGATTGCTCCTTTTCTAGCTATATCATGCGATACTACTAGCT
+TATGCGGTATTAGCAATGATTTCTCACTGTTATTCCCCTCTGATAGGCAGG
+>Cluster217;size=1
+TTGGGCCGTGTCTCAGTCCCAATGTGGCCGTCCACCCTCTCAGGCCGGCTACTGATCGTCGCCTTGGTGGGCCTTTACCC
+CGCCAACCAGCTAATCAGACGCGAGTCCATCTCAGAGCGATAAATCTTTGATATCCAGAGCCATGCGACCCAGATATATT
+ATGCGGTATTAGCAGCTGTTTCCAGCTGTTATTCCCCATCCAAGGCAGGTT
+>Cluster216;size=1
+CTGGGCCGTGTCTCAGTCCCAGTGTGGCCGTCCGCCCTCTCAGGTCAGCTACTGATCGTCGCCTTGGTAGGCCATTACCC
+TACCAACTAGCTAATCAGACGCGAGGCCATCTCTCAGCGATAAATCTTTGATATATCTGCCATGCGACAAACATATATTA
+TGCGGTATTAGCAGTCGTTTCCAACTGTTGTCCCCCTCTGAAAGGCAGGTT""".split('\n')
+
+expected_cluster_err_seqs = """>Cluster0;size=326
+AACCCCCACGGTGGATGCCACACGCCCCATACAAAGGGTAGGATGCTTAAGACACATCGCGTCAGGTTTGTGTCAGGCCT
+AACCCCCACGGTGGATGCCACACGCCCCATACAAAGGGTAGGATGCTTAAGACACATCGCGTCAGGTTTGTGTCAGGCCT""".split('\n')
+
+expected_sorted_by_abundance_no_filter = """>Cluster1;size=52
+CTGGTCCGTGTCTCAGTACCAGTGTGGGGGACCTTCCTCTCAGAACCCCTACGCATCGTCGGCTTGGTGGTCCGTTACAC
+CGCCAACTACCTAATGCGACGCATGCCCATCCGCTACCGGATCGCTCCTTTGGAATCCCGGGGATGTCCCCGGAACTCGT
+TATGCGGTATTAGACGGAATTTCTTCCGCTTATCCCCCTGTAGCGGGCAGGTTGCATACGTGTTACTCACCCGTGCGCCG
+GTCGCCGG
+>Cluster0;size=50
+TTGGACCGTGTCTCAGTTCCAATGTGGGGGACCTTCCTCTCAGAACCCCTATCCATCGAAGACTAGGTGGGCCGTTACCC
+CGCCTACTATCTAATGGAACGCATCCCCATCGTCTACCGGAATACCTTTAATCATGTGAACATGCGGACTCATGATGCCA
+TCTTGTATTAATCTTCCTTTCAGAAGGCTGTCCAAGAGTAGACGGCAGGTTGGATACGTGTTACTCACCCGG
+>Cluster2;size=45
+TTGGGCCGTGTCTCAGTCCCAATGTGGCCGTTCACCCTCTCAGGCCGGCTACTGATCGTCGCCTTGGTGGGCTGTTACCC
+CGCCAACCAGCTAATCAGACGCGGATCCATCGTATACCACCGGAGTTTTTCACACTGCTTCATGCGAAGCTGTGCGCTTA
+TGCGGTATTAGCACCTATTTCTAAGTGTTATCCCCCAGTATACGGCAGGTTCTCCACGCGTT
+>Cluster10;size=43
+CTGGGCCGTGTCTCAGTCCCAATGTGGCCGTCCGCCCTCTCAGGCCGGCTATGCATCATCGTCTTGGTGGGCCTTTACCC
+CGCCAACCAACTAATGCACCGCAGGTCCATCCGCGCCCCATCCCCTAAAGGATGTTTCACAGAAAGAAGATGCCTCCTTC
+CTGTACATCGGGATTTGTTCTCCGTTTCCAGAGCGTATTCCCGGTGCGCGGGCAGGTTCCCTACGTGTTACTCACCCG
+>Cluster4;size=40
+TTGGGCCGTGTCTCAGTCCCAATGTGGCCGTCCACCCTCTCAGGCCGGCTACTGATCGTCGCCTTGGTGGGCCTTTACCC
+CGCCAACCAGCTAATCAGACGCGGGTCCATCTTGCACCACCGGAGTTTTTCACACTGTCCCATGCAGGACCGTGCGCTTA
+TGCGGTATTAGCACCTATTTCTAAGTGTTATCCCCCAGTGCAAGGCAGGTTACCCACGCGTTACTCACCCGTCCG
+>Cluster6;size=40
+CTGGTCCGTGTCTCAGTACCAGTGTGGGGGACCTTCCTCTCAGAACCCCTACGCATCGTCGCCTCGGTGGGCCGTTACCC
+CGCCGACTAGCTAATGCGCCGCATGGCCATCCGCAGCCGATAAATCTTTAAACATCGGGAGATGCCTCCCAACGTTGTTA
+CGCGGTATTAGACGGAATTTCTTCCGCTTATCCCCCTGCTGCGGGCAGGTTCCATACGTGTTACTCACCCGTGCGCCGG
+>Cluster3;size=30
+TTGGGCCGTGTCTCAGTCCCAATGTGGCCGATCAGTCTCTCAACTCGGCTATGCATCATTGCCTTGGTAAGCCGTTACCT
+TACCAACTAGCTAATGCACCGCAGGTCCATCCAAGAGTGATAGCAGAACCATCTTTCAAACTCTAGACATGCGTCTAGTG
+TTGTTATCCGGTATTAGCATCTGTTTCCAGGTGTTATCCCAGTCTCTTGGG
+>Cluster12;size=19
+TTGGTCCGTGTCTCAGTACCAATGTGGGGGGTTAACCTCTCAGTCCCCCTATGTATCGTGGTCTTGGTGAGCCGTTACCC
+CACCAACTAACTAATACAACGCATGCCCATCCATTACCACCGGAGTTTTCAACCCAAGAAGATGCCTCCCTGGATGTTAT
+GGGGTATTAGTACCGATTTCTCAGTGTTATCCCCCTGTAATGGGTAGGTTGCATACGCGTTACGCACCCGTGCGCCGGTC
+GCCGACAAT
+>Cluster29;size=18
+CTGGTCCGTGTCTCAGTACCAGTGTGGGGGACCTTCCTCTCAGAACCCCTACGCATCGTCGCCTCGGTGGGCCGTTACCC
+CGCCGACTAGCTAATGCGCCGCATGCCCATCCGCCACCGGTAATCCCTTTGGCGGCACCGGGATGCCCCGACGCCGCGTC
+ACGCGGTATTAGACGGAATTTCTTCCGCTTATCCCCCTGTGGCGGGCAGGTTGCATACGTGTTACTCACCCGTGCGCCGG
+TCGCCGG
+>Cluster30;size=18
+CTGGTCCGTGTCTCAGTACCAGTGTGGGGGACCTTCCTCTCAGAACCCCTACGCATCGTCGCCTTGGTGGGCCGTTACCC
+CGCCAACTAGCTAATGCGCCGCATGGCCATCCGTAGCCGGTGTTACCCTTTAAACCCCAAGAGATGCCTCTCGGAGTTAT
+TACGCGGTATTAGACGGAATTTCTTCCGCTTATCCCCCTGCTACGGGCAGGTTCCATACGTGTTACTCACCCGTGCGCCG
+GTCGCCGG
+>Cluster16;size=16
+CTGGGCCGTGTCTCAGTCCCAATGTGGCCGTTCACCCTCTCAGGCCGGCTACTGATCGTCGCCTTGGTGGGCCGTTACCC
+CTCCAACCAGCTAATCAGACGCGGGTCCATCCTGTACCACCGGAGTTTTTCACACTGTACCATGCGGTACTGTGCGCTTA
+TGCGGTTTTAGCACCTATTTCTAAGTGTTATCCCCCTGTACAGGGCAGGTTACCCACGCGTTACTCACCCGTCCGCCACT
+>Cluster522;size=10
+TTGGGCCGTGTCTCAGTCCCAATGTGGCCGTCACCCTCTCAGGCCGGCTACTGATCGTCGCCTTGGTGGGCCTTTACCCC
+GCCAACCAGCTAATCAGACGCGGGTCCATCTTGCACCACCGGAGTTTTTCACACTGCTTCATGCGAAGCTGTGCGCTTAT
+GCGGTATTAGCACCTATTTCTAAGTGTTATCCCCCAGTATACGGCAGGTTCTCCACGCGTTACT
+>Cluster222;size=1
+TTGGGCCGTGTCTCAGTCCCAATGTGGCCGTCACCCTCTCAGGCCGGCTACTGATCGTCGCCTTGGTGGGCCTTTACCCC
+GCCAACCAGCTAATCAGACGCGGGTCCATCTTGCACCACCGGAGTTTTTCACACTGCTTCATGCGAAGCTGTGCGCTTAT
+GCGGTATTAGCACCTATTTCTAAGTGTTATCCCCCAGTATACGGCAGGTTCTCCACGCGTTACT
+>Cluster221;size=1
+CTGGGCCGTATCTCAGTCCCAATGTGGCCGTTCAACCTCTCAGTCCGGCTACTGATCGTCGCCTTGGTAGGCCGTTGCCC
+CGCCAACTACCTAATCGGACGCGAGCCCATCTTTCAGCGGATTGCTCCTTTGATTATCTCACCATGCGGCAAAATAATGT
+CATGCGGTATTAGCGTTCGTTTCCAAACGTTATCCCCCTCTGAAAGGCAGGTTGCTCACGCGTT
+>Cluster218;size=1
+TTGGGCCGTGTCTCAGTCCCAATGTGGCCGGCCACCCTCTCAGGTCGGCTACTGATCGTCACCTTGGTAGGCCGTTACCC
+CACCAACTAGCTAATCAGACGCAAGCCCATCTATCAGCGGATTGCTCCTTTTCTAGCTATATCATGCGATACTACTAGCT
+TATGCGGTATTAGCAATGATTTCTCACTGTTATTCCCCTCTGATAGGCAGG
+>Cluster217;size=1
+TTGGGCCGTGTCTCAGTCCCAATGTGGCCGTCCACCCTCTCAGGCCGGCTACTGATCGTCGCCTTGGTGGGCCTTTACCC
+CGCCAACCAGCTAATCAGACGCGAGTCCATCTCAGAGCGATAAATCTTTGATATCCAGAGCCATGCGACCCAGATATATT
+ATGCGGTATTAGCAGCTGTTTCCAGCTGTTATTCCCCATCCAAGGCAGGTT
+>Cluster216;size=1
+CTGGGCCGTGTCTCAGTCCCAGTGTGGCCGTCCGCCCTCTCAGGTCAGCTACTGATCGTCGCCTTGGTAGGCCATTACCC
+TACCAACTAGCTAATCAGACGCGAGGCCATCTCTCAGCGATAAATCTTTGATATATCTGCCATGCGACAAACATATATTA
+TGCGGTATTAGCAGTCGTTTCCAACTGTTGTCCCCCTCTGAAAGGCAGGTT""".split('\n')
+
+expected_abundance_sort_filtered = """>Cluster1;size=52
+CTGGTCCGTGTCTCAGTACCAGTGTGGGGGACCTTCCTCTCAGAACCCCTACGCATCGTCGGCTTGGTGGTCCGTTACAC
+CGCCAACTACCTAATGCGACGCATGCCCATCCGCTACCGGATCGCTCCTTTGGAATCCCGGGGATGTCCCCGGAACTCGT
+TATGCGGTATTAGACGGAATTTCTTCCGCTTATCCCCCTGTAGCGGGCAGGTTGCATACGTGTTACTCACCCGTGCGCCG
+GTCGCCGG
+>Cluster0;size=50
+TTGGACCGTGTCTCAGTTCCAATGTGGGGGACCTTCCTCTCAGAACCCCTATCCATCGAAGACTAGGTGGGCCGTTACCC
+CGCCTACTATCTAATGGAACGCATCCCCATCGTCTACCGGAATACCTTTAATCATGTGAACATGCGGACTCATGATGCCA
+TCTTGTATTAATCTTCCTTTCAGAAGGCTGTCCAAGAGTAGACGGCAGGTTGGATACGTGTTACTCACCCGG
+>Cluster2;size=45
+TTGGGCCGTGTCTCAGTCCCAATGTGGCCGTTCACCCTCTCAGGCCGGCTACTGATCGTCGCCTTGGTGGGCTGTTACCC
+CGCCAACCAGCTAATCAGACGCGGATCCATCGTATACCACCGGAGTTTTTCACACTGCTTCATGCGAAGCTGTGCGCTTA
+TGCGGTATTAGCACCTATTTCTAAGTGTTATCCCCCAGTATACGGCAGGTTCTCCACGCGTT
+>Cluster10;size=43
+CTGGGCCGTGTCTCAGTCCCAATGTGGCCGTCCGCCCTCTCAGGCCGGCTATGCATCATCGTCTTGGTGGGCCTTTACCC
+CGCCAACCAACTAATGCACCGCAGGTCCATCCGCGCCCCATCCCCTAAAGGATGTTTCACAGAAAGAAGATGCCTCCTTC
+CTGTACATCGGGATTTGTTCTCCGTTTCCAGAGCGTATTCCCGGTGCGCGGGCAGGTTCCCTACGTGTTACTCACCCG
+>Cluster4;size=40
+TTGGGCCGTGTCTCAGTCCCAATGTGGCCGTCCACCCTCTCAGGCCGGCTACTGATCGTCGCCTTGGTGGGCCTTTACCC
+CGCCAACCAGCTAATCAGACGCGGGTCCATCTTGCACCACCGGAGTTTTTCACACTGTCCCATGCAGGACCGTGCGCTTA
+TGCGGTATTAGCACCTATTTCTAAGTGTTATCCCCCAGTGCAAGGCAGGTTACCCACGCGTTACTCACCCGTCCG
+>Cluster6;size=40
+CTGGTCCGTGTCTCAGTACCAGTGTGGGGGACCTTCCTCTCAGAACCCCTACGCATCGTCGCCTCGGTGGGCCGTTACCC
+CGCCGACTAGCTAATGCGCCGCATGGCCATCCGCAGCCGATAAATCTTTAAACATCGGGAGATGCCTCCCAACGTTGTTA
+CGCGGTATTAGACGGAATTTCTTCCGCTTATCCCCCTGCTGCGGGCAGGTTCCATACGTGTTACTCACCCGTGCGCCGG""".split('\n')
+
+expected_derep_seqs = """>seq1;size=2
+GCCAACCAGCTAATCAGACGCGGGTCCATCTTGCACCACCGGAGTTTTTCACACTGCTTCATGCGAAGCTGTGCGCTTAA
+>seq2;size=2
+TTGGGCCGTGTCTCAGTCCCAATGTGGCCGTCACCCTCTCAGGCCGGCTACTGATCGTCGCCTTGGTGGGCCTTTACCCC""".split('\n')
+
+expected_derep_seqs_full_len = """>Cluster0;size=1
+GCCAACCAGCTAATCAGACGCGGGTCCATCTTGCACCACCGGAGTTTTTCACACTGCTTCATGCGAAGCTGTGCGCTTAA
+>Cluster1;size=2
+TTGGGCCGTGTCTCAGTCCCAATGTGGCCGTCACCCTCTCAGGCCGGCTACTGATCGTCGCCTTGGTGGGCCTTTACCCC
+>Cluster2;size=1
+GCCAACCAGCTAATCAGACGCGGGTCCATCTTGCACCACCGGAGTTTTTCACACTGCTTCATGCGAAGCTGTGCGCTT""".split('\n')
+
+expected_len_sorted_seqs = """>chimera
+CGCGTGTATGAAGAAGGCCTTCGGGTTGTAAAGTACTTTCAGCGGGGAGGAGGGAGTAAAGTTAATACCTTTGCTCATTGACCCCTAGGGTGGGAATAACCCGGGGAAACCCGGGCTAATACCGAATAAGACCACAGGAGGCGACTCCAGAGGGTCAAAGGGAGCCTTGGCCTCCCCC
+>usearch_ecoli_seq2
+CGCGTGTATGAAGAAGGCCTTCGGGTTGTAAAGTACTTTCAGCGGGGAGGAGGGAGTAAAGTTAATACCTTTGCTCATTGACGTTACCCGCAGAAGAAGCACCGGCTAACTCCGTCCAT
+>usearch_ecoli_seq
+CGCGTGTATGAAGAAGGCCTTCGGGTTGTAAAGTACTTTCAGCGGGGAGGAGGGAGTAAAGTTAATACCTTTGCTCATTGACGTTACCCGCAGAAGAAGCACCGGCTAACTCCGT
+>Solemya_seq2
+GGCTCAGATTGAACGCTGGCGGCATGCCTAACACATGCAAGTCGAACGGTAACAGGCGGAGCTTGCTCTGCGCTGACGAGTGGCGGACGGGTGAGTATCAAG
+>Solemya seq
+GGCTCAGATTGAACGCTGGCGGCATGCCTAACACATGCAAGTCGAACGGTAACAGGCGGAGCTTGCTCTGCGCTGACGAGTGGCGGACGGGTGAGTA""".split('\n')
+
+expected_combined_dna_seqs_1_seqs_usearch = """>uclust_test_seqs_0 some comment0
+AACCCCCACGGTGGATGCCACACGCCCCATACAAAGGGTAGGATGCTTAAGACACATCGCGTCAGGTTTGTGTCAGGCCT
+>uclust_test_seqs_1 some comment1
+ACCCACACGGTGGATGCAACAGATCCCATACACCGAGTTGGATGCTTAAGACGCATCGCGTGAGTTTTGCGTCAAGGCT
+>uclust_test_seqs_2 some comment2
+CCCCCACGGTGGCAGCAACACGTCACATACAACGGGTTGGATTCTAAAGACAAACCGCGTCAAAGTTGTGTCAGAACT
+>uclust_test_seqs_3 some comment3
+CCCCACGGTAGCTGCAACACGTCCCATACCACGGGTAGGATGCTAAAGACACATCGGGTCTGTTTTGTGTCAGGGCT
+>uclust_test_seqs_4 some comment4
+GCCACGGTGGGTACAACACGTCCACTACATCGGCTTGGAAGGTAAAGACACGTCGCGTCAGTATTGCGTCAGGGCT
+>uclust_test_seqs_5 some comment4_again
+CCGCGGTAGGTGCAACACGTCCCATACAACGGGTTGGAAGGTTAAGACACAACGCGTTAATTTTGTGTCAGGGCA
+>uclust_test_seqs_6 some comment6
+CGCGGTGGCTGCAAGACGTCCCATACAACGGGTTGGATGCTTAAGACACATCGCAACAGTTTTGAGTCAGGGCT
+>uclust_test_seqs_7 some comment7
+ACGGTGGCTACAAGACGTCCCATCCAACGGGTTGGATACTTAAGGCACATCACGTCAGTTTTGTGTCAGAGCT
+>uclust_test_seqs_8 some comment8
+CGGTGGCTGCAACACGTGGCATACAACGGGTTGGATGCTTAAGACACATCGCCTCAGTTTTGTGTCAGGGCT
+>uclust_test_seqs_9 some comment9
+GGTGGCTGAAACACATCCCATACAACGGGTTGGATGCTTAAGACACATCGCATCAGTTTTATGTCAGGGGA
+>usearch_ecoli_seq
+CGCGTGTATGAAGAAGGCCTTCGGGTTGTAAAGTACTTTCAGCGGGGAGGAGGGAGTAAAGTTAATACCTTTGCTCATTGACGTTACCCGCAGAAGAAGCACCGGCTAACTCCGT
+>Solemya seq
+GGCTCAGATTGAACGCTGGCGGCATGCCTAACACATGCAAGTCGAACGGTAACAGGCGGAGCTTGCTCTGCGCTGACGAGTGGCGGACGGGTGAGTA
+>usearch_ecoli_seq2
+CGCGTGTATGAAGAAGGCCTTCGGGTTGTAAAGTACTTTCAGCGGGGAGGAGGGAGTAAAGTTAATACCTTTGCTCATTGACGTTACCCGCAGAAGAAGCACCGGCTAACTCCGTCCAT
+>Solemya_seq2
+GGCTCAGATTGAACGCTGGCGGCATGCCTAACACATGCAAGTCGAACGGTAACAGGCGGAGCTTGCTCTGCGCTGACGAGTGGCGGACGGGTGAGTATCAAG
+>chimera
+CGCGTGTATGAAGAAGGCCTTCGGGTTGTAAAGTACTTTCAGCGGGGAGGAGGGAGTAAAGTTAATACCTTTGCTCATTGACCCCTAGGGTGGGAATAACCCGGGGAAACCCGGGCTAATACCGAATAAGACCACAGGAGGCGACTCCAGAGGGTCAAAGGGAGCCTTGGCCTCCCCC""".split('\n')
+
+expected_retained_chimeras_union = """>seq1
+ACAGGCC
+>seq2
+ACAGGCCCCC
+>seq3
+TTATCCATT
+>seq4
+ACAGGCCCCC
+>seq5
+TTATCCATT""".split('\n')
+
+expected_retained_chimeras_intersection = """>seq3
+TTATCCATT""".split('\n')
+
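+# Expected uchime tabular output: each tab-separated record starts with the
+# chimera score and the query label, lists the candidate parent labels and
+# alignment statistics, and ends with the Y/N chimera verdict.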
+expected_usearch61_denovo_uchime_file = """0.0000\tCluster1;size=52\t*\t*\t*\t*\t*\t*\t*\t*\t0\t0\t0\t0\t0\t0\t*\tN
+0.0000\tCluster0;size=50\t*\t*\t*\t*\t*\t*\t*\t*\t0\t0\t0\t0\t0\t0\t*\tN
+0.0000\tCluster2;size=45\t*\t*\t*\t*\t*\t*\t*\t*\t0\t0\t0\t0\t0\t0\t*\tN
+0.0000\tCluster10;size=43\t*\t*\t*\t*\t*\t*\t*\t*\t0\t0\t0\t0\t0\t0\t*\tN
+0.0000\tCluster4;size=40\t*\t*\t*\t*\t*\t*\t*\t*\t0\t0\t0\t0\t0\t0\t*\tN
+0.0000\tCluster6;size=40\t*\t*\t*\t*\t*\t*\t*\t*\t0\t0\t0\t0\t0\t0\t*\tN
+0.0000\tCluster3;size=30\t*\t*\t*\t*\t*\t*\t*\t*\t0\t0\t0\t0\t0\t0\t*\tN
+0.0263\tCluster12;size=19\tCluster2;size=45\tCluster1;size=52\tCluster1;size=52\t75.6\t73.3\t76.5\t67.3\t75.6\t20\t21\t26\t6\t1\t3\t*\tN
+0.0000\tCluster30;size=18\t*\t*\tCluster6;size=40\t*\t*\t*\t*\t*\t0\t0\t0\t0\t0\t0\t*\tN
+0.0924\tCluster29;size=18\tCluster6;size=40\tCluster1;size=52\tCluster1;size=52\t92.0\t88.6\t89.0\t86.5\t88.7\t7\t0\t0\t12\t7\t14\t3.3\tN
+0.0187\tCluster16;size=16\tCluster2;size=45\tCluster4;size=40\tCluster4;size=40\t94.5\t92.3\t94.1\t90.9\t94.0\t2\t1\t0\t9\t4\t7\t0.5\tN
+0.4232\tCluster222;size=1\tCluster4;size=40\tCluster2;size=45\tCluster2;size=45\t100.0\t94.1\t97.3\t91.3\t96.8\t7\t1\t0\t13\t0\t0\t3.2\tY
+0.0759\tCluster221;size=1\tCluster16;size=16\tCluster1;size=52\tCluster16;size=16\t74.5\t75.9\t67.3\t66.8\t75.4\t15\t0\t5\t16\t19\t32\t*\tN
+0.0107\tCluster218;size=1\tCluster2;size=45\tCluster4;size=40\tCluster4;size=40\t81.7\t80.7\t80.7\t90.6\t78.7\t6\t5\t28\t2\t0\t3\t3.0\tN
+0.0086\tCluster217;size=1\tCluster4;size=40\tCluster2;size=45\tCluster4;size=40\t83.1\t83.1\t80.7\t90.8\t82.1\t4\t0\t1\t2\t4\t33\t1.0\tN
+0.0000\tCluster216;size=1\t*\t*\tCluster16;size=16\t*\t*\t*\t*\t*\t0\t0\t0\t0\t0\t0\t*\tN
+0.4232\tCluster522;size=10\tCluster4;size=40\tCluster2;size=45\tCluster2;size=45\t100.0\t94.1\t97.3\t91.3\t96.8\t7\t1\t0\t13\t0\t0\t3.2\tY""".split('\n')
+
+expected_usearch61_ref_uchime_file = """0.0000\tCluster1;size=52\t*\t*\tseq1\t*\t*\t*\t*\t*\t0\t0\t0\t0\t0\t0\t*\tN
+0.0000\tCluster0;size=50\t*\t*\tseq2\t*\t*\t*\t*\t*\t0\t0\t0\t0\t0\t0\t*\tN
+0.0000\tCluster2;size=45\t*\t*\tseq3\t*\t*\t*\t*\t*\t0\t0\t0\t0\t0\t0\t*\tN
+0.1074\tCluster10;size=43\tmixed_seq\tseq1\tseq1\t70.3\t67.0\t65.1\t54.1\t65.7\t11\t0\t1\t31\t27\t33\t4.6\tN
+0.6322\tCluster4;size=40\tmixed_seq\tseq3\tseq3\t96.0\t77.6\t92.5\t73.6\t91.0\t6\t0\t0\t38\t2\t5\t5.0\tY
+0.1101\tCluster6;size=40\tseq2\tseq1\tseq1\t82.6\t71.3\t85.2\t69.6\t85.1\t12\t19\t16\t25\t0\t4\t*\tN
+0.0258\tCluster3;size=30\tmixed_seq\tseq3\tseq3\t71.6\t66.0\t68.0\t71.1\t66.4\t12\t7\t36\t16\t7\t5\t5.3\tN
+0.0263\tCluster12;size=19\tseq3\tseq1\tseq1\t75.6\t73.3\t76.5\t67.3\t75.6\t20\t21\t26\t6\t1\t3\t*\tN
+0.0530\tCluster30;size=18\tseq2\tseq1\tseq1\t79.6\t68.3\t85.7\t70.4\t85.9\t8\t24\t16\t25\t0\t6\t*\tN
+0.0534\tCluster29;size=18\tseq2\tseq1\tseq1\t80.9\t70.4\t88.3\t70.0\t88.7\t7\t25\t17\t23\t0\t2\t*\tN
+0.0699\tCluster16;size=16\tmixed_seq\tseq3\tseq3\t94.0\t74.6\t93.5\t73.6\t91.9\t2\t2\t2\t41\t3\t5\t2.1\tN
+1.2277\tCluster222;size=1\tmixed_seq\tseq3\tseq3\t100.0\t78.4\t97.1\t75.5\t96.8\t6\t1\t0\t44\t0\t0\t3.2\tY
+0.0855\tCluster221;size=1\tseq3\tseq1\tseq3\t75.8\t77.2\t68.8\t65.1\t72.9\t14\t0\t4\t17\t18\t28\t2.9\tN
+0.0174\tCluster218;size=1\tmixed_seq\tseq3\tseq3\t81.7\t70.3\t80.7\t70.3\t78.0\t1\t0\t4\t34\t12\t21\t3.6\tN
+0.0713\tCluster217;size=1\tmixed_seq\tseq3\tseq3\t83.3\t77.5\t79.9\t68.6\t79.7\t4\t0\t1\t27\t12\t17\t3.6\tN
+0.0505\tCluster216;size=1\tmixed_seq\tseq3\tseq3\t77.5\t72.5\t71.6\t70.1\t72.0\t14\t4\t27\t15\t5\t8\t5.4\tN
+1.2277\tCluster522;size=10\tmixed_seq\tseq3\tseq3\t100.0\t78.4\t97.1\t75.5\t96.8\t6\t1\t0\t44\t0\t0\t3.2\tY""".split('\n')
+
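+# Expected .uc (USEARCH cluster format) records: the leading field gives the
+# record type -- S opens a new cluster with the query as seed, H maps a query
+# onto an existing seed, C summarises a finished cluster (third field =
+# cluster size) and N flags a query with no hit against the reference.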
+usearch61_dereplicated_uc_lines = """S 0 80 * * * * * seq2 *
+H 0 80 100.0 * 0 0 * seq3 seq2
+H 0 80 100.0 * 0 0 * seq4 seq2
+S 1 80 * * * * * seq1 *
+C 0 3 * * * * * seq2 *
+C 1 1 * * * * * seq1 *""".split('\n')
+
+usearch61_clustered_uc_lines = """S 0 80 * * * * * seq2;size=3; *
+S 1 80 * * * * * seq1;size=1; *
+C 0 1 * * * * * seq2;size=3; *
+C 1 1 * * * * * seq1;size=1; *""".split('\n')
+
+usearch61_clustered_uc_lines_ref = """H 3 80 100.0 + 0 0 80M seq2;size=3; seq4
+H 0 80 100.0 + 0 0 80M seq1;size=1; seq1""".split('\n')
+
+usearch61_clustered_ref_lines = """H 0 80 100.0 + 0 0 80M seq2;size=3; seq2
+N * * * . * * * seq1;size=1; *""".split('\n')
+
+if __name__ == '__main__':
+ main()
diff --git a/bfillings/tests/test_vsearch.py b/bfillings/tests/test_vsearch.py
new file mode 100644
index 0000000..68c87a6
--- /dev/null
+++ b/bfillings/tests/test_vsearch.py
@@ -0,0 +1,1686 @@
+#!/usr/bin/env python
+
+# -----------------------------------------------------------------------------
+# Copyright (c) 2015--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+# -----------------------------------------------------------------------------
+"""
+Unit tests for the VSEARCH version 1.1.1 Application controller
+===============================================================
+"""
+
+
+from unittest import TestCase, main
+from os import close
+from os.path import exists, join, dirname
+from tempfile import mkstemp, mkdtemp
+from shutil import rmtree
+
+from skbio.util import remove_files
+from skbio.parse.sequences import parse_fasta
+
+from bfillings.vsearch import (vsearch_dereplicate_exact_seqs,
+ vsearch_sort_by_abundance,
+ vsearch_chimera_filter_de_novo,
+ vsearch_chimera_filter_ref)
+
+
+# Test class and cases
+class VsearchTests(TestCase):
+ """ Tests for VSEARCH version 1.1.1 functionality """
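+    # NOTE: these tests shell out to an external vsearch executable through
+    # the burrito application controller, so a vsearch binary (version 1.1.1
+    # is assumed) must be installed and on PATH for them to run.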
+
+ def setUp(self):
+ self.output_dir = mkdtemp()
+ self.seqs_to_derep = seqs_to_derep
+ self.seqs_to_derep_max_min_abundance =\
+ seqs_to_derep_max_min_abundance
+ self.seqs_to_derep_merged_derep_files =\
+ seqs_to_derep_merged_derep_files
+ self.seqs_to_sort = seqs_to_sort
+ self.amplicon_reads = amplicon_reads
+ self.single_chimera = single_chimera
+ self.single_chimera_ref = single_chimera_ref
+ self.uchime_ref_db = uchime_ref_db
+ self.uchime_single_ref_db = uchime_single_ref_db
+
+ # temporary file for seqs_to_derep
+ f, self.seqs_to_derep_fp = mkstemp(prefix='tmp_seqs_to_derep_',
+ suffix='.fasta')
+ close(f)
+
+ # write seqs_to_derep to file
+ with open(self.seqs_to_derep_fp, 'w') as tmp:
+ tmp.write(self.seqs_to_derep)
+
+ # temporary file for seqs_to_derep_max_min_abundance
+ f, self.seqs_to_derep_max_min_abundance_fp =\
+ mkstemp(prefix='tmp_seqs_to_derep_abun_',
+ suffix='.fasta')
+ close(f)
+
+ # write seqs_to_derep_max_min_abundance to file
+ with open(self.seqs_to_derep_max_min_abundance_fp, 'w') as tmp:
+ tmp.write(self.seqs_to_derep_max_min_abundance)
+
+ # temporary file for seqs_to_derep_merged_derep_files
+ f, self.seqs_to_derep_merged_derep_files_fp =\
+ mkstemp(prefix='tmp_seqs_to_derep_concat_',
+ suffix='.fasta')
+ close(f)
+
+ # write seqs_to_derep_merged_derep_files to file
+ with open(self.seqs_to_derep_merged_derep_files_fp, 'w') as tmp:
+ tmp.write(self.seqs_to_derep_merged_derep_files)
+
+ # temporary file for seqs_to_sort
+ f, self.seqs_to_sort_fp = mkstemp(prefix='tmp_seqs_to_sort_',
+ suffix='.fasta')
+ close(f)
+
+ # write seqs_to_sort to file
+ with open(self.seqs_to_sort_fp, 'w') as tmp:
+ tmp.write(self.seqs_to_sort)
+
+ # temporary file for amplicon_reads
+ f, self.amplicon_reads_fp = mkstemp(prefix='tmp_amplicon_reads_',
+ suffix='.fasta')
+ close(f)
+
+ # write amplicon_reads to file
+ with open(self.amplicon_reads_fp, 'w') as tmp:
+ tmp.write(self.amplicon_reads)
+
+ # temporary file for single_chimera
+ f, self.single_chimera_fp = mkstemp(prefix='tmp_single_chimera_',
+ suffix='.fasta')
+ close(f)
+
+ # write single_chimera to file
+ # (de novo chimera checking)
+ with open(self.single_chimera_fp, 'w') as tmp:
+ tmp.write(self.single_chimera)
+
+ # temporary file for single_chimera_ref
+ f, self.single_chimera_ref_fp = mkstemp(prefix='tmp_single_chimera_',
+ suffix='.fasta')
+ close(f)
+
+ # write single_chimera_ref to file
+ # (reference chimera checking)
+ with open(self.single_chimera_ref_fp, 'w') as tmp:
+ tmp.write(self.single_chimera_ref)
+
+ # temporary file for uchime_ref_db
+ f, self.uchime_ref_db_fp = mkstemp(prefix='tmp_uchime_ref_db_',
+ suffix='.fasta')
+ close(f)
+
+ # write uchime_ref_db to file
+ with open(self.uchime_ref_db_fp, 'w') as tmp:
+ tmp.write(self.uchime_ref_db)
+
+ # temporary file for uchime_single_ref_db
+ f, self.uchime_single_ref_db_fp =\
+ mkstemp(prefix='tmp_uchime_single_ref_db_',
+ suffix='.fasta')
+ close(f)
+
+ # write uchime_single_ref_db to file
+ with open(self.uchime_single_ref_db_fp, 'w') as tmp:
+ tmp.write(self.uchime_single_ref_db)
+
+ # list of files to remove
+ self.files_to_remove = [self.seqs_to_derep_fp,
+ self.seqs_to_derep_max_min_abundance_fp,
+ self.seqs_to_derep_merged_derep_files_fp,
+ self.seqs_to_sort_fp,
+ self.amplicon_reads_fp,
+ self.single_chimera_fp,
+ self.single_chimera_ref_fp,
+ self.uchime_ref_db_fp,
+ self.uchime_single_ref_db_fp]
+
+ def tearDown(self):
+ remove_files(self.files_to_remove)
+ rmtree(self.output_dir)
+
+ def test_vsearch_chimera_filter_ref(self):
+ """ Test reference chimera filter, output only
+ chimeric sequences and log
+ """
+ chimeras_fp, nonchimeras_fp, alns_fp, tabular_fp, log_fp =\
+ vsearch_chimera_filter_ref(
+ self.amplicon_reads_fp,
+ self.output_dir,
+ self.uchime_ref_db_fp,
+ output_chimeras=True,
+ output_nonchimeras=False,
+ output_alns=False,
+ output_tabular=False,
+ log_name="vsearch_uchime_ref_chimera_filtering.log",
+ HALT_EXEC=False)
+
+ self.assertTrue(nonchimeras_fp is None)
+ self.assertTrue(alns_fp is None)
+ self.assertTrue(tabular_fp is None)
+ self.assertTrue(exists(log_fp))
+
+ expected_chimeras = ['251;size=2;', '320;size=2;', '36;size=2;',
+ '672;size=2;', '142;size=1;', '201;size=1;',
+ '241;size=1;', '279;size=1;', '299;size=1;',
+ '359;size=1;', '375;size=1;', '407;size=1;',
+ '423;size=1;', '516;size=1;', '618;size=1;',
+ '717;size=1;', '902;size=1;', '918;size=1;',
+ '941;size=1;']
+
+ num_seqs = 0
+
+ with open(chimeras_fp, "U") as chimeras_f:
+ for label, seq in parse_fasta(chimeras_f):
+ # check label represents chimeric sequence
+ self.assertTrue(label in expected_chimeras)
+ # check sequence exists
+ self.assertTrue(len(seq) > 0)
+ num_seqs += 1
+
+        self.assertEquals(num_seqs, 19)
+
+ def test_vsearch_chimera_filter_ref_output(self):
+ """ Raise error when no output is selected for
+ reference chimera filtering
+ """
+
+ self.assertRaises(ValueError,
+ vsearch_chimera_filter_ref,
+ fasta_filepath=self.amplicon_reads_fp,
+ working_dir=self.output_dir,
+ db_filepath=self.uchime_ref_db_fp,
+ output_chimeras=False,
+ output_nonchimeras=False,
+ output_alns=False,
+ output_tabular=False,
+ log_name="vsearch_uchime_ref_chimera_filtering.log",
+ HALT_EXEC=False)
+
+ def test_vsearch_chimera_filter_ref_output_nonchimeras(self):
+ """ Test ref chimera filter, output nonchimeric sequences
+ """
+ chimeras_fp, nonchimeras_fp, alns_fp, tabular_fp, log_fp =\
+ vsearch_chimera_filter_ref(
+ self.amplicon_reads_fp,
+ self.output_dir,
+ self.uchime_ref_db_fp,
+ output_chimeras=False,
+ output_nonchimeras=True,
+ output_alns=False,
+ output_tabular=False,
+ log_name="vsearch_uchime_ref_chimera_filtering.log",
+ HALT_EXEC=False)
+
+ self.assertTrue(chimeras_fp is None)
+ self.assertTrue(alns_fp is None)
+ self.assertTrue(tabular_fp is None)
+ self.assertTrue(exists(log_fp))
+
+ expected_nonchimeras =\
+ ['3;size=102;', '16;size=95;', '22;size=93;', '2;size=87;', '39;size=84;',
+ '4;size=79;', '6;size=72;', '11;size=70;', '45;size=67;', '1;size=65;',
+ '425;size=2;', '100;size=1;', '102;size=1;', '10;size=1;', '115;size=1;',
+ '123;size=1;', '132;size=1;', '134;size=1;', '140;size=1;', '144;size=1;',
+ '148;size=1;', '14;size=1;', '156;size=1;', '15;size=1;', '161;size=1;',
+ '162;size=1;', '186;size=1;', '203;size=1;', '217;size=1;', '218;size=1;',
+ '21;size=1;', '221;size=1;', '222;size=1;', '225;size=1;', '233;size=1;',
+ '234;size=1;', '235;size=1;', '249;size=1;', '24;size=1;', '259;size=1;',
+ '266;size=1;', '26;size=1;', '27;size=1;', '296;size=1;', '303;size=1;',
+ '306;size=1;', '307;size=1;', '322;size=1;', '326;size=1;', '32;size=1;',
+ '332;size=1;', '333;size=1;', '338;size=1;', '360;size=1;', '362;size=1;',
+ '364;size=1;', '366;size=1;', '369;size=1;', '371;size=1;', '373;size=1;',
+ '374;size=1;', '37;size=1;', '386;size=1;', '387;size=1;', '392;size=1;',
+ '393;size=1;', '397;size=1;', '405;size=1;', '414;size=1;', '418;size=1;',
+ '431;size=1;', '436;size=1;', '444;size=1;', '445;size=1;', '456;size=1;',
+ '460;size=1;', '469;size=1;', '470;size=1;', '477;size=1;', '479;size=1;',
+ '486;size=1;', '500;size=1;', '515;size=1;', '528;size=1;', '530;size=1;',
+ '531;size=1;', '549;size=1;', '551;size=1;', '557;size=1;', '559;size=1;',
+ '561;size=1;', '562;size=1;', '564;size=1;', '566;size=1;', '568;size=1;',
+ '570;size=1;', '578;size=1;', '57;size=1;', '586;size=1;', '596;size=1;',
+ '600;size=1;', '612;size=1;', '625;size=1;', '632;size=1;', '649;size=1;',
+ '650;size=1;', '651;size=1;', '664;size=1;', '66;size=1;', '673;size=1;',
+ '675;size=1;', '682;size=1;', '690;size=1;', '699;size=1;', '709;size=1;',
+ '73;size=1;', '740;size=1;', '745;size=1;', '746;size=1;', '748;size=1;',
+ '760;size=1;', '766;size=1;', '778;size=1;', '77;size=1;', '791;size=1;',
+ '797;size=1;', '7;size=1;', '809;size=1;', '813;size=1;', '814;size=1;',
+ '816;size=1;', '817;size=1;', '821;size=1;', '824;size=1;', '827;size=1;',
+ '82;size=1;', '83;size=1;', '842;size=1;', '851;size=1;', '853;size=1;',
+ '862;size=1;', '863;size=1;', '866;size=1;', '871;size=1;', '879;size=1;',
+ '886;size=1;', '892;size=1;', '895;size=1;', '897;size=1;', '904;size=1;',
+ '912;size=1;', '916;size=1;', '91;size=1;', '920;size=1;', '921;size=1;',
+ '925;size=1;', '930;size=1;', '942;size=1;', '945;size=1;', '947;size=1;',
+ '948;size=1;', '952;size=1;', '956;size=1;', '958;size=1;', '964;size=1;',
+ '967;size=1;', '984;size=1;', '992;size=1;', '993;size=1;']
+
+ num_seqs = 0
+
+ # check nonchimeras fasta file
+ with open(nonchimeras_fp, "U") as nonchimeras_f:
+ for label, seq in parse_fasta(nonchimeras_f):
+                # check label represents a nonchimeric sequence
+ self.assertTrue(label in expected_nonchimeras)
+ # check sequence exists
+ self.assertTrue(len(seq) > 0)
+ num_seqs += 1
+
+        self.assertEquals(num_seqs, 169)
+
+ def test_vsearch_chimera_filter_ref_output_alns_tab(self):
+ """ Test ref chimera filter, output only
+ chimeric alignments and tabular format
+ """
+ chimeras_fp, nonchimeras_fp, alns_fp, tabular_fp, log_fp =\
+ vsearch_chimera_filter_ref(
+ self.single_chimera_ref_fp,
+ self.output_dir,
+ self.uchime_single_ref_db_fp,
+ output_chimeras=False,
+ output_nonchimeras=False,
+ output_alns=True,
+ output_tabular=True,
+ log_name="vsearch_uchime_ref_chimera_filtering.log",
+ HALT_EXEC=False)
+
+ self.assertTrue(chimeras_fp is None)
+ self.assertTrue(nonchimeras_fp is None)
+ self.assertTrue(exists(log_fp))
+
+ # check alignment is correct
+ with open(alns_fp, 'U') as alns_f:
+ actual_alns = alns_f.read()
+ self.assertEquals(single_chimera_ref_aln, actual_alns)
+
+ # check tabular output is correct
+ with open(tabular_fp, 'U') as tabular_f:
+ actual_tab = tabular_f.read()
+
+ self.assertEquals(single_chimera_ref_tab, actual_tab)
+
+ def test_vsearch_chimera_filter_de_novo(self):
+ """ Test de novo chimera filter, output only
+ chimeric sequences
+ """
+ chimeras_fp, nonchimeras_fp, alns_fp, tabular_fp, log_fp =\
+ vsearch_chimera_filter_de_novo(
+ self.amplicon_reads_fp,
+ self.output_dir,
+ output_chimeras=True,
+ output_nonchimeras=False,
+ output_alns=False,
+ output_tabular=False,
+ log_name="vsearch_uchime_de_novo_chimera_filtering.log",
+ HALT_EXEC=False)
+
+ self.assertTrue(nonchimeras_fp is None)
+ self.assertTrue(alns_fp is None)
+ self.assertTrue(tabular_fp is None)
+ self.assertTrue(exists(log_fp))
+
+ expected_chimeras = ['251;size=2;', '320;size=2;', '36;size=2;',
+ '672;size=2;', '142;size=1;', '201;size=1;',
+ '241;size=1;', '279;size=1;', '299;size=1;',
+ '359;size=1;', '375;size=1;', '407;size=1;',
+ '423;size=1;', '516;size=1;', '618;size=1;',
+ '717;size=1;', '902;size=1;', '918;size=1;',
+ '941;size=1;']
+
+ num_seqs = 0
+
+ with open(chimeras_fp, "U") as chimeras_f:
+ for label, seq in parse_fasta(chimeras_f):
+ # check label represents chimeric sequence
+ self.assertTrue(label in expected_chimeras)
+ # check sequence exists
+ self.assertTrue(len(seq) > 0)
+ num_seqs += 1
+
+        self.assertEquals(num_seqs, 19)
+
+ def test_vsearch_chimera_filter_de_novo_output(self):
+ """ Raise error when no output is selected for
+ de novo chimera filtering
+ """
+
+ self.assertRaises(ValueError,
+ vsearch_chimera_filter_de_novo,
+ fasta_filepath=self.amplicon_reads_fp,
+ working_dir=self.output_dir,
+ output_chimeras=False,
+ output_nonchimeras=False,
+ output_alns=False,
+ output_tabular=False,
+ log_name="vsearch_uchime_de_novo_chimera_filter.log",
+ HALT_EXEC=False)
+
+ def test_vsearch_chimera_filter_de_novo_output_nonchimeras(self):
+ """ Test de novo chimera filter, output nonchimeric sequences
+ """
+ chimeras_fp, nonchimeras_fp, alns_fp, tabular_fp, log_fp =\
+ vsearch_chimera_filter_de_novo(
+ self.amplicon_reads_fp,
+ self.output_dir,
+ output_chimeras=False,
+ output_nonchimeras=True,
+ output_alns=False,
+ output_tabular=False,
+ log_name="vsearch_uchime_de_novo_chimera_filter.log",
+ HALT_EXEC=False)
+
+ self.assertTrue(chimeras_fp is None)
+ self.assertTrue(alns_fp is None)
+ self.assertTrue(tabular_fp is None)
+ self.assertTrue(exists(log_fp))
+
+ expected_nonchimeras =\
+ ['3;size=102;', '16;size=95;', '22;size=93;', '2;size=87;', '39;size=84;',
+ '4;size=79;', '6;size=72;', '11;size=70;', '45;size=67;', '1;size=65;',
+ '425;size=2;', '100;size=1;', '102;size=1;', '10;size=1;', '115;size=1;',
+ '123;size=1;', '132;size=1;', '134;size=1;', '140;size=1;', '144;size=1;',
+ '148;size=1;', '14;size=1;', '156;size=1;', '15;size=1;', '161;size=1;',
+ '162;size=1;', '186;size=1;', '203;size=1;', '217;size=1;', '218;size=1;',
+ '21;size=1;', '221;size=1;', '222;size=1;', '225;size=1;', '233;size=1;',
+ '234;size=1;', '235;size=1;', '249;size=1;', '24;size=1;', '259;size=1;',
+ '266;size=1;', '26;size=1;', '27;size=1;', '296;size=1;', '303;size=1;',
+ '306;size=1;', '307;size=1;', '322;size=1;', '326;size=1;', '32;size=1;',
+ '332;size=1;', '333;size=1;', '338;size=1;', '360;size=1;', '362;size=1;',
+ '364;size=1;', '366;size=1;', '369;size=1;', '371;size=1;', '373;size=1;',
+ '374;size=1;', '37;size=1;', '386;size=1;', '387;size=1;', '392;size=1;',
+ '393;size=1;', '397;size=1;', '405;size=1;', '414;size=1;', '418;size=1;',
+ '431;size=1;', '436;size=1;', '444;size=1;', '445;size=1;', '456;size=1;',
+ '460;size=1;', '469;size=1;', '470;size=1;', '477;size=1;', '479;size=1;',
+ '486;size=1;', '500;size=1;', '515;size=1;', '528;size=1;', '530;size=1;',
+ '531;size=1;', '549;size=1;', '551;size=1;', '557;size=1;', '559;size=1;',
+ '561;size=1;', '562;size=1;', '564;size=1;', '566;size=1;', '568;size=1;',
+ '570;size=1;', '578;size=1;', '57;size=1;', '586;size=1;', '596;size=1;',
+ '600;size=1;', '612;size=1;', '625;size=1;', '632;size=1;', '649;size=1;',
+ '650;size=1;', '651;size=1;', '664;size=1;', '66;size=1;', '673;size=1;',
+ '675;size=1;', '682;size=1;', '690;size=1;', '699;size=1;', '709;size=1;',
+ '73;size=1;', '740;size=1;', '745;size=1;', '746;size=1;', '748;size=1;',
+ '760;size=1;', '766;size=1;', '778;size=1;', '77;size=1;', '791;size=1;',
+ '797;size=1;', '7;size=1;', '809;size=1;', '813;size=1;', '814;size=1;',
+ '816;size=1;', '817;size=1;', '821;size=1;', '824;size=1;', '827;size=1;',
+ '82;size=1;', '83;size=1;', '842;size=1;', '851;size=1;', '853;size=1;',
+ '862;size=1;', '863;size=1;', '866;size=1;', '871;size=1;', '879;size=1;',
+ '886;size=1;', '892;size=1;', '895;size=1;', '897;size=1;', '904;size=1;',
+ '912;size=1;', '916;size=1;', '91;size=1;', '920;size=1;', '921;size=1;',
+ '925;size=1;', '930;size=1;', '942;size=1;', '945;size=1;', '947;size=1;',
+ '948;size=1;', '952;size=1;', '956;size=1;', '958;size=1;', '964;size=1;',
+ '967;size=1;', '984;size=1;', '992;size=1;', '993;size=1;']
+
+ num_seqs = 0
+
+ # check nonchimeras fasta file
+ with open(nonchimeras_fp, "U") as nonchimeras_f:
+ for label, seq in parse_fasta(nonchimeras_f):
+                # check label represents a nonchimeric sequence
+ self.assertTrue(label in expected_nonchimeras)
+ # check sequence exists
+ self.assertTrue(len(seq) > 0)
+ num_seqs += 1
+
+        self.assertEquals(num_seqs, 169)
+
+ def test_vsearch_chimera_filter_de_novo_output_alns_tab(self):
+ """ Test de novo chimera filter, output only
+ chimeric alignments and tabular format
+ """
+ chimeras_fp, nonchimeras_fp, alns_fp, tabular_fp, log_fp =\
+ vsearch_chimera_filter_de_novo(
+ self.single_chimera_fp,
+ self.output_dir,
+ output_chimeras=False,
+ output_nonchimeras=False,
+ output_alns=True,
+ output_tabular=True,
+ log_name="vsearch_uchime_de_novo_chimera_filter.log",
+ HALT_EXEC=False)
+
+ self.assertTrue(chimeras_fp is None)
+ self.assertTrue(nonchimeras_fp is None)
+ self.assertTrue(exists(log_fp))
+
+ # check alignment is correct
+ with open(alns_fp, 'U') as alns_f:
+ actual_alns = alns_f.read()
+ self.assertEquals(single_chimera_aln, actual_alns)
+
+ # check tabular output is correct
+ with open(tabular_fp, 'U') as tabular_f:
+ actual_tab = tabular_f.read()
+ self.assertEquals(single_chimera_tab, actual_tab)
+
+ def test_vsearch_sort_by_abundance(self):
+ """ Test sorting sequences by abundance
+ """
+ tmp_fp = join(self.output_dir, "tmp_sorted_reads.fasta")
+
+ output_sorted, log_fp = vsearch_sort_by_abundance(
+ self.seqs_to_sort_fp,
+ tmp_fp,
+ working_dir=None,
+ minsize=None,
+ maxsize=None,
+ log_name="abundance_sort.log",
+ HALT_EXEC=False)
+
+ self.assertTrue(exists(log_fp))
+
+ expected_order = ['HWI-ST157_0368:1:2107:19923:3944#0/1;size=100;',
+ 'HWI-ST157_0368:1:1201:8401:113582#0/1;size=10;',
+ 'HWI-ST157_0368:1:2204:20491:181552#0/1;size=10;',
+ 'HWI-ST157_0368:1:2105:3428:36721#0/1;size=5;',
+ 'HWI-ST157_0368:1:2105:6731:137157#0/1;size=4;',
+ 'HWI-ST157_0368:1:2106:18272:88408#0/1;size=2;',
+ 'HWI-ST157_0368:1:1106:12200:200135#0/1;size=1;',
+ 'HWI-ST157_0368:1:2208:9135:145970#0/1;size=1;']
+
+ num_seqs = 0
+
+ with open(output_sorted, "U") as tmp_f:
+ for label, seq in parse_fasta(tmp_f):
+ # check label is in correct order
+ self.assertEquals(label, expected_order[num_seqs])
+ # check sequence exists
+ self.assertTrue(len(seq) > 0)
+ num_seqs += 1
+
+        self.assertEquals(num_seqs, 8)
+
+    def test_vsearch_sort_by_abundance_minsize_2_maxsize_10(self):
+        """ Test sorting sequences by abundance,
+            discarding sequences with an abundance value smaller
+            than 2 or greater than 10
+        """
+ tmp_fp = join(self.output_dir, "tmp_sorted_reads.fasta")
+
+ output_sorted, log_fp = vsearch_sort_by_abundance(
+ self.seqs_to_sort_fp,
+ tmp_fp,
+ working_dir=None,
+ minsize=2,
+ maxsize=10,
+ log_name="abundance_sort.log",
+ HALT_EXEC=False)
+
+ self.assertTrue(exists(log_fp))
+
+ expected_order = ['HWI-ST157_0368:1:1201:8401:113582#0/1;size=10;',
+ 'HWI-ST157_0368:1:2204:20491:181552#0/1;size=10;',
+ 'HWI-ST157_0368:1:2105:3428:36721#0/1;size=5;',
+ 'HWI-ST157_0368:1:2105:6731:137157#0/1;size=4;',
+ 'HWI-ST157_0368:1:2106:18272:88408#0/1;size=2;']
+
+ num_seqs = 0
+
+ with open(output_sorted, "U") as tmp_f:
+ for label, seq in parse_fasta(tmp_f):
+ # check label is in correct order
+ self.assertEquals(label, expected_order[num_seqs])
+ # check sequence exists
+ self.assertTrue(len(seq) > 0)
+ num_seqs += 1
+
+        self.assertEquals(num_seqs, 5)
+
+ def test_vsearch_dereplicate_exact_seqs(self):
+ """ Test dereplicating sequences
+ """
+ tmp_fp = join(self.output_dir, "tmp_derep_reads.fasta")
+
+ dereplicated_seqs_fp, uc_fp, log_fp = vsearch_dereplicate_exact_seqs(
+ self.seqs_to_derep_fp,
+ tmp_fp,
+ output_uc=False,
+ working_dir=self.output_dir,
+ strand="both",
+ maxuniquesize=None,
+ minuniquesize=None,
+ sizein=False,
+ sizeout=True)
+
+ # no output for .uc
+ self.assertTrue(uc_fp is None)
+ self.assertTrue(exists(log_fp))
+
+ num_seqs = 0
+ expected_derep = ['HWI-ST157_0368:1:1207:16180:126921#0/1;size=3;',
+ 'HWI-ST157_0368:1:2103:7895:197066#0/1;size=3;',
+ 'HWI-ST157_0368:1:1106:11378:83198#0/1;size=1;',
+ 'HWI-ST157_0368:1:2102:15078:69955#0/1;size=1;']
+
+ with open(tmp_fp, "U") as tmp_f:
+ for label, seq in parse_fasta(tmp_f):
+ num_seqs += 1
+ # check output labels are correct
+ self.assertTrue(label in expected_derep)
+ # check sequence exists
+ self.assertTrue(len(seq) > 0)
+
+ # check there are 4 sequences after dereplication
+ self.assertEquals(num_seqs, 4)
+
+ def test_vsearch_dereplicate_exact_seqs_uc(self):
+ """ Test dereplicating sequences with .uc output
+ """
+ tmp_fp = join(self.output_dir, "tmp_derep_reads.fasta")
+
+ dereplicated_seqs_fp, uc_fp, log_fp = vsearch_dereplicate_exact_seqs(
+ self.seqs_to_derep_fp,
+ tmp_fp,
+ output_uc=True,
+ working_dir=self.output_dir,
+ strand="both",
+ maxuniquesize=None,
+ minuniquesize=None,
+ sizein=False,
+ sizeout=True)
+
+ # .uc exists
+ self.assertTrue(exists(uc_fp))
+ self.assertTrue(exists(log_fp))
+
+ id_to_count = {}
+
+ num_seqs = 0
+ expected_derep = {'HWI-ST157_0368:1:1207:16180:126921#0/1': 3,
+ 'HWI-ST157_0368:1:2103:7895:197066#0/1': 3,
+ 'HWI-ST157_0368:1:1106:11378:83198#0/1': 1,
+ 'HWI-ST157_0368:1:2102:15078:69955#0/1': 1}
+
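+        # walk the .uc records: an S line seeds a new cluster (tab field 9,
+        # index 8, holds its label) and an H line attaches a duplicate to its
+        # seed (tab field 10, index 9), so the tally reconstructs per-seed
+        # abundances for comparison against expected_derep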
+ with open(uc_fp, 'U') as uc_f:
+ for line in uc_f:
+ if line.startswith('S'):
+ num_seqs += 1
+ label = line.strip().split('\t')[8]
+ # check output labels are correct
+ self.assertTrue(label in expected_derep)
+ id_to_count[label] = 1
+ elif line.startswith('H'):
+ seed = line.strip().split('\t')[9]
+ id_to_count[seed] += 1
+
+ # check there are 4 sequences after dereplication
+ self.assertEquals(num_seqs, 4)
+
+ for label in id_to_count:
+ self.assertEquals(expected_derep[label], id_to_count[label])
+
+ def test_vsearch_dereplicate_exact_seqs_empty_working_dir(self):
+ """ Test dereplicating sequences without passing
+ a working directory
+ """
+ tmp_fp = join(self.output_dir, "tmp_derep_reads.fasta")
+
+ dereplicated_seqs_fp, uc_fp, log_fp = vsearch_dereplicate_exact_seqs(
+ self.seqs_to_derep_fp,
+ tmp_fp,
+ output_uc=True,
+ working_dir=None,
+ strand="both",
+ maxuniquesize=None,
+ minuniquesize=None,
+ sizein=False,
+ sizeout=True)
+
+ self.assertTrue(exists(log_fp))
+
+ # check dereplicated seqs and uc file in the same
+ # directory (same path as tmp_fp)
+ self.assertEquals(dirname(tmp_fp), dirname(dereplicated_seqs_fp))
+ self.assertEquals(dirname(tmp_fp), dirname(uc_fp))
+
+ def test_vsearch_dereplicate_exact_seqs_abundance(self):
+        """ Test dereplicating sequences and discarding those with
+        abundance < 2 or abundance > 6
+ """
+ tmp_fp = join(self.output_dir, "tmp_derep_reads.fasta")
+
+ dereplicated_seqs_fp, uc_fp, log_fp = vsearch_dereplicate_exact_seqs(
+ self.seqs_to_derep_max_min_abundance_fp,
+ tmp_fp,
+ output_uc=False,
+ working_dir=self.output_dir,
+ strand="both",
+ maxuniquesize=6,
+ minuniquesize=2,
+ sizein=False,
+ sizeout=True)
+
+ # no output for .uc
+ self.assertTrue(uc_fp is None)
+ self.assertTrue(exists(log_fp))
+
+ num_seqs = 0
+ expected_derep = ['HWI-ST157_0368:1:1106:10560:153880#0/1;size=6;',
+ 'HWI-ST157_0368:1:2103:12440:90119#0/1;size=2;',
+ 'HWI-ST157_0368:1:1106:15269:103850#0/1;size=3;',
+ 'HWI-ST157_0368:1:1205:9745:86166#0/1;size=5;']
+
+ with open(tmp_fp, "U") as tmp_f:
+ for label, seq in parse_fasta(tmp_f):
+ num_seqs += 1
+ # check output labels are correct
+ self.assertTrue(label in expected_derep)
+ # check sequence exists
+ self.assertTrue(len(seq) > 0)
+
+ # check there are 4 sequences after dereplication
+ self.assertEquals(num_seqs, 4)
+
+ def test_vsearch_dereplicate_exact_seqs_merged(self):
+ """ Test dereplicating sequences which already contain
+ abundance information in the label from previous
+        dereplication (e.g. two dereplicated files have been
+ merged into a new file for dereplication)
+ """
+ tmp_fp = join(self.output_dir, "tmp_derep_reads.fasta")
+
+ dereplicated_seqs_fp, uc_fp, log_fp = vsearch_dereplicate_exact_seqs(
+ self.seqs_to_derep_merged_derep_files_fp,
+ tmp_fp,
+ output_uc=False,
+ working_dir=self.output_dir,
+ strand="both",
+ maxuniquesize=None,
+ minuniquesize=None,
+ sizein=True,
+ sizeout=True)
+
+ # no output for .uc
+ self.assertTrue(uc_fp is None)
+ self.assertTrue(exists(log_fp))
+
+ num_seqs = 0
+ expected_derep = ['HWI-ST157_0368:1:1207:16180:126921#0/1;size=6;',
+ 'HWI-ST157_0368:1:2103:7895:197066#0/1;size=6;',
+ 'HWI-ST157_0368:1:1106:11378:83198#0/1;size=2;',
+ 'HWI-ST157_0368:1:2102:15078:69955#0/1;size=2;']
+
+ with open(tmp_fp, "U") as tmp_f:
+ for label, seq in parse_fasta(tmp_f):
+ num_seqs += 1
+ # check output labels are correct
+ self.assertTrue(label in expected_derep)
+ # check sequence exists
+ self.assertTrue(len(seq) > 0)
+
+ # check there are 4 sequences after dereplication
+ self.assertEquals(num_seqs, 4)
+
+ def test_vsearch_dereplicate_exact_seqs_strand(self):
+ """ Raise error when strand parameter is something
+ other than 'plus' or 'both'
+ """
+ tmp_fp = join(self.output_dir, "tmp_derep_reads.fasta")
+
+ self.assertRaises(ValueError,
+ vsearch_dereplicate_exact_seqs,
+ fasta_filepath=self.seqs_to_derep_fp,
+ output_filepath=tmp_fp,
+ output_uc=False,
+ working_dir=None,
+ strand="minus",
+ maxuniquesize=None,
+ minuniquesize=None,
+ sizein=False,
+ sizeout=True,
+ log_name="derep.log",
+ HALT_EXEC=False)
+
+
+# Test dereplicating sequences using default parameters
+seqs_to_derep = """>HWI-ST157_0368:1:2102:15078:69955#0/1
+TACGTAGGGCCCGAGCGTTGTCCGGATTTATTGGGCGTAAAGCGTGCGCAGGCGGTCTGTTAAGTCTGTAGTTAAAGGCTGTGGCTCAACTATGGTTAGTT
+>HWI-ST157_0368:1:2103:7895:197066#0/1
+TACGTAGGGGGCAAGCGTTGTCCGAATTTACTGGGTGTAAAGGGAGCGCAGACGGCACGGCAAGCCAGATGTGAAAGCCCGGGGCTCAACCCCGGGACTGC
+>HWI-ST157_0368:1:1207:16180:126921#0/1
+TACGTAGGTCCCGAGCGTTGTCCGGATTTATTGGGCGTAAAGCGAGCGCAGGCGGATACTTAAGTCTGGTGTGAAAACCTAGGGCTCAACCCTGGGACTGC
+>HWI-ST157_0368:1:1106:11378:83198#0/1
+TACGTAGGGAGCAAGCGTTGTCCGGATTTACTGGGTGTAAAGGGTGCGTAGGCGGCTTTGCAAGTCAGATGTGAAATCTATGGGCTCAACCCATAAACTGC
+>HWI-ST157_0368:1:2103:7895:197066#0/2
+TACGTAGGGGGCAAGCGTTGTCCGAATTTACTGGGTGTAAAGGGAGCGCAGACGGCACGGCAAGCCAGATGTGAAAGCCCGGGGCTCAACCCCGGGACTGC
+>HWI-ST157_0368:1:2103:7895:197066#0/3
+TACGTAGGGGGCAAGCGTTGTCCGAATTTACTGGGTGTAAAGGGAGCGCAGACGGCACGGCAAGCCAGATGTGAAAGCCCGGGGCTCAACCCCGGGACTGC
+>HWI-ST157_0368:1:1207:16180:126921#0/2
+TACGTAGGTCCCGAGCGTTGTCCGGATTTATTGGGCGTAAAGCGAGCGCAGGCGGATACTTAAGTCTGGTGTGAAAACCTAGGGCTCAACCCTGGGACTGC
+>HWI-ST157_0368:1:1207:16180:126921#0/3
+TACGTAGGTCCCGAGCGTTGTCCGGATTTATTGGGCGTAAAGCGAGCGCAGGCGGATACTTAAGTCTGGTGTGAAAACCTAGGGCTCAACCCTGGGACTGC
+"""
+
+# Test dereplicating a file which is a concatenation of two separately
+# dereplicated files. The input fasta file contains abundance information.
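+# With sizein=True the existing ;size=N; annotations are summed into the new
+# cluster sizes instead of each record counting as one, which is why the
+# corresponding test expects the abundances below to double.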
+seqs_to_derep_merged_derep_files = """>HWI-ST157_0368:1:2102:15078:69955#0/1;size=1;
+TACGTAGGGCCCGAGCGTTGTCCGGATTTATTGGGCGTAAAGCGTGCGCAGGCGGTCTGTTAAGTCTGTAGTTAAAGGCTGTGGCTCAACTATGGTTAGTT
+>HWI-ST157_0368:1:2103:7895:197066#0/1;size=3;
+TACGTAGGGGGCAAGCGTTGTCCGAATTTACTGGGTGTAAAGGGAGCGCAGACGGCACGGCAAGCCAGATGTGAAAGCCCGGGGCTCAACCCCGGGACTGC
+>HWI-ST157_0368:1:1207:16180:126921#0/1;size=3;
+TACGTAGGTCCCGAGCGTTGTCCGGATTTATTGGGCGTAAAGCGAGCGCAGGCGGATACTTAAGTCTGGTGTGAAAACCTAGGGCTCAACCCTGGGACTGC
+>HWI-ST157_0368:1:1106:11378:83198#0/1;size=1;
+TACGTAGGGAGCAAGCGTTGTCCGGATTTACTGGGTGTAAAGGGTGCGTAGGCGGCTTTGCAAGTCAGATGTGAAATCTATGGGCTCAACCCATAAACTGC
+>HWI-ST157_0368:1:2102:15078:69955#1/1;size=1;
+TACGTAGGGCCCGAGCGTTGTCCGGATTTATTGGGCGTAAAGCGTGCGCAGGCGGTCTGTTAAGTCTGTAGTTAAAGGCTGTGGCTCAACTATGGTTAGTT
+>HWI-ST157_0368:1:2103:7895:197066#1/1;size=3;
+TACGTAGGGGGCAAGCGTTGTCCGAATTTACTGGGTGTAAAGGGAGCGCAGACGGCACGGCAAGCCAGATGTGAAAGCCCGGGGCTCAACCCCGGGACTGC
+>HWI-ST157_0368:1:1207:16180:126921#1/1;size=3;
+TACGTAGGTCCCGAGCGTTGTCCGGATTTATTGGGCGTAAAGCGAGCGCAGGCGGATACTTAAGTCTGGTGTGAAAACCTAGGGCTCAACCCTGGGACTGC
+>HWI-ST157_0368:1:1106:11378:83198#1/1;size=1;
+TACGTAGGGAGCAAGCGTTGTCCGGATTTACTGGGTGTAAAGGGTGCGTAGGCGGCTTTGCAAGTCAGATGTGAAATCTATGGGCTCAACCCATAAACTGC
+"""
+
+# Sequences to dereplicate with final clusters as follows:
+# 2 clusters with abundance 6 and 7
+# 3 clusters with abundance 1
+# 3 clusters with abundance 2, 3, 5
+seqs_to_derep_max_min_abundance = """>HWI-ST157_0368:1:1106:10560:153880#0/1
+TACGTAGGTGGCGAGCGTTATCCGGATTTACTGGGTGTAAAGGGCGCGTAGGCGGGAATGCAAGTCAGATGTGAAATCCAGGGGCTTAACCCTTGAACTGC
+>HWI-ST157_0368:1:1106:10560:153880#0/2
+TACGTAGGTGGCGAGCGTTATCCGGATTTACTGGGTGTAAAGGGCGCGTAGGCGGGAATGCAAGTCAGATGTGAAATCCAGGGGCTTAACCCTTGAACTGC
+>HWI-ST157_0368:1:1106:10560:153880#0/3
+TACGTAGGTGGCGAGCGTTATCCGGATTTACTGGGTGTAAAGGGCGCGTAGGCGGGAATGCAAGTCAGATGTGAAATCCAGGGGCTTAACCCTTGAACTGC
+>HWI-ST157_0368:1:1106:10560:153880#0/4
+TACGTAGGTGGCGAGCGTTATCCGGATTTACTGGGTGTAAAGGGCGCGTAGGCGGGAATGCAAGTCAGATGTGAAATCCAGGGGCTTAACCCTTGAACTGC
+>HWI-ST157_0368:1:1106:10560:153880#0/5
+TACGTAGGTGGCGAGCGTTATCCGGATTTACTGGGTGTAAAGGGCGCGTAGGCGGGAATGCAAGTCAGATGTGAAATCCAGGGGCTTAACCCTTGAACTGC
+>HWI-ST157_0368:1:1106:10560:153880#0/6
+TACGTAGGTGGCGAGCGTTATCCGGATTTACTGGGTGTAAAGGGCGCGTAGGCGGGAATGCAAGTCAGATGTGAAATCCAGGGGCTTAACCCTTGAACTGC
+>HWI-ST157_0368:1:2104:14337:180515#0/1
+TACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGGGCGCGCAGGCGGTCTGGCAAGTCGGATGTGAAATCCCCGGGCTCAACCTGGGAACTGC
+>HWI-ST157_0368:1:2104:14337:180515#0/2
+TACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGGGCGCGCAGGCGGTCTGGCAAGTCGGATGTGAAATCCCCGGGCTCAACCTGGGAACTGC
+>HWI-ST157_0368:1:2104:14337:180515#0/3
+TACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGGGCGCGCAGGCGGTCTGGCAAGTCGGATGTGAAATCCCCGGGCTCAACCTGGGAACTGC
+>HWI-ST157_0368:1:2104:14337:180515#0/4
+TACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGGGCGCGCAGGCGGTCTGGCAAGTCGGATGTGAAATCCCCGGGCTCAACCTGGGAACTGC
+>HWI-ST157_0368:1:2104:14337:180515#0/5
+TACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGGGCGCGCAGGCGGTCTGGCAAGTCGGATGTGAAATCCCCGGGCTCAACCTGGGAACTGC
+>HWI-ST157_0368:1:2104:14337:180515#0/6
+TACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGGGCGCGCAGGCGGTCTGGCAAGTCGGATGTGAAATCCCCGGGCTCAACCTGGGAACTGC
+>HWI-ST157_0368:1:2104:14337:180515#0/7
+TACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGGGCGCGCAGGCGGTCTGGCAAGTCGGATGTGAAATCCCCGGGCTCAACCTGGGAACTGC
+>HWI-ST157_0368:1:1102:8490:14349#0/1
+AACGTAGGTCACAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGCAGGCGGGAAGACAAGTTGGAAGTGAAATCTATGGGCTCAACCCATAAACTGC
+>HWI-ST157_0368:1:1205:18016:113727#0/1
+TACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGCACGCAGGCGGTTGATTAAGTTAGATGTGAAATCCCCGGGCTTAACCTGGGGATGGC
+>HWI-ST157_0368:1:1201:16382:127646#0/1
+TACAGAGGTCTCAAGCGTTGTTCGGAATCACTGGGCGTAAAGCGTGCGTAGGCGGTTTCGTAAGTCGGGTGTGAAAGGCGGGGGCTTAACGCCCGGACTGG
+>HWI-ST157_0368:1:2103:12440:90119#0/1
+TACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGCACGCCAAGTCAGCGGTGAAATTTCCGGGCTCAACCCGGAGTGTGC
+>HWI-ST157_0368:1:2103:12440:90119#0/2
+TACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGCACGCCAAGTCAGCGGTGAAATTTCCGGGCTCAACCCGGAGTGTGC
+>HWI-ST157_0368:1:1106:15269:103850#0/1
+TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGATGGGTTGTTAAGTCAGTTGTGAAAGTTTGCGGCTCAACCGTAAAATTGC
+>HWI-ST157_0368:1:1106:15269:103850#0/2
+TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGATGGGTTGTTAAGTCAGTTGTGAAAGTTTGCGGCTCAACCGTAAAATTGC
+>HWI-ST157_0368:1:1106:15269:103850#0/3
+TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGATGGGTTGTTAAGTCAGTTGTGAAAGTTTGCGGCTCAACCGTAAAATTGC
+>HWI-ST157_0368:1:1205:9745:86166#0/1
+TACGTAGGTCCCGAGCGTTATCCGGATTTACTGGGCGTAAAGGGAGCGTAGGCGGATGATTAAGTGGGATGTGAAATACCCGGGCTCAACTTGGGTGCTGC
+>HWI-ST157_0368:1:1205:9745:86166#0/2
+TACGTAGGTCCCGAGCGTTATCCGGATTTACTGGGCGTAAAGGGAGCGTAGGCGGATGATTAAGTGGGATGTGAAATACCCGGGCTCAACTTGGGTGCTGC
+>HWI-ST157_0368:1:1205:9745:86166#0/3
+TACGTAGGTCCCGAGCGTTATCCGGATTTACTGGGCGTAAAGGGAGCGTAGGCGGATGATTAAGTGGGATGTGAAATACCCGGGCTCAACTTGGGTGCTGC
+>HWI-ST157_0368:1:1205:9745:86166#0/4
+TACGTAGGTCCCGAGCGTTATCCGGATTTACTGGGCGTAAAGGGAGCGTAGGCGGATGATTAAGTGGGATGTGAAATACCCGGGCTCAACTTGGGTGCTGC
+>HWI-ST157_0368:1:1205:9745:86166#0/5
+TACGTAGGTCCCGAGCGTTATCCGGATTTACTGGGCGTAAAGGGAGCGTAGGCGGATGATTAAGTGGGATGTGAAATACCCGGGCTCAACTTGGGTGCTGC
+"""
+
+# Test sort by abundance functionality in VSEARCH
+seqs_to_sort = """>HWI-ST157_0368:1:2105:3428:36721#0/1;size=5;
+TACGTAGGGTGCAAGCGTTATCCGGAATTATTGGGCGTAAAGGGCTCGTAGGCGGTTCGTCGCGTCCGGTGTGAAAGTCCATCGCTTAACGGTGGATCTGC
+>HWI-ST157_0368:1:2106:18272:88408#0/1;size=2;
+TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGCGGAGCAAGTCTGAAGTGAAAGCCCGGGGCTCAACCCCGGGACTGC
+>HWI-ST157_0368:1:1106:12200:200135#0/1;size=1;
+TACGTAGGTGGCGAGCGTTATCCGGAATTATTGGGCGTAAAGAGGGAGCAGGCGGCACTAAGGGTCTGTGGTGAAAGATCGAAGCTTAACTTCGGTAAGCC
+>HWI-ST157_0368:1:1201:8401:113582#0/1;size=10;
+TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGTGTGGCAAGTCTGATGTGAAAGGCATGGGCTTAACCTGTGGACTGC
+>HWI-ST157_0368:1:2208:9135:145970#0/1;size=1;
+TACGTAGGGGGCGAGCGTTGTCCGGAATTACTGGGCGTAAGGGGAGCGTAGGCGGTCGATTAAGTTAGATGTGAAACCCCCGGGCTTAACTTGGGGACTGC
+>HWI-ST157_0368:1:2204:20491:181552#0/1;size=10;
+TACGTAGGGTGCAAGCGTTATCCGGAATTATTGGGCGTAAAGGGCTCGTAGGCGGTTCGTCGAGTCTGGTGTGAAAGTCCATCGCTTAACGGTGGATCCGC
+>HWI-ST157_0368:1:2105:6731:137157#0/1;size=4;
+TACGTAGGTCCCGAGCGTTGTCCGGATTTATTGGGCGTAAAGCGAGCGCAGGCGGTTTGATAAGTCTGAAGTTAAAGGCTGTGGCTCAACCATAGTTCGCT
+>HWI-ST157_0368:1:2107:19923:3944#0/1;size=100;
+TGCATTTTCTCTTATCGAAAACCTTCAGCGTTCTGATCTGAATCCCGTCGAAGAGGCTAAGGGCTATCGCCAACTCATTGATGCCAGCGGGATGACCCAGG
+"""
+
+# Grinder simulated chimeric reads using Greengenes 13.8 release
+# command used: grinder -random_seed 100 -reference_file 97_otus_gg_13_8.fasta \
+# -forward_reverse ./primers.fna -length_bias 0 -copy_bias 1 \
+# -unidirectional 1 -read_dist 150 -mutation_dist uniform 0.1 \
+# -mutation_ratio 100 0 -total_reads 1000 -diversity 10 -chimera_perc 10 \
+# -od grinder_chimeric_reads_illumina
+# primers.fna contain 515f and 806r primers
+# reads >251 reference=4370324,646991 amplicon=488..779,499..789 position=1..150
+# >320 reference=646991,4370324,646991 amplicon=499..789,488..779,499..789 position=1..150
+# >36 reference=4370324,814974 amplicon=488..779,479..769 position=1..150
+# >672 reference=814974,160832 amplicon=479..769,436..727 position=1..150
+# >142 reference=160832,814974 amplicon=436..727,479..769 position=1..150 errors=2%G
+# >201 reference=4304512,510574 amplicon=451..742,501..793 position=1..150
+# >241 reference=646991,4370324 amplicon=499..789,488..779 position=1..150 errors=13%A
+# >279 reference=311922,160832,510574 amplicon=481..773,436..727,501..793 position=1..150
+# >299 reference=4370324,4304512 amplicon=488..779,451..742 position=1..150
+# >359 reference=646991,4370324 amplicon=499..789,488..779 position=1..150 errors=52%A
+# >375 reference=4304512,769294 amplicon=451..742,504..795 position=1..150
+# >407 reference=4304512,579954 amplicon=451..742,488..779 position=1..150
+# >423 reference=4370324,579954 amplicon=488..779,488..779 position=1..150
+# >516 reference=814974,579954 amplicon=479..769,488..779 position=1..150
+# >618 reference=814974,646991 amplicon=479..769,499..789 position=1..150 errors=32%C
+# >717 reference=814974,510574 amplicon=479..769,501..793 position=1..150
+# >902 reference=510574,579954 amplicon=501..793,488..779 position=1..150
+# >918 reference=814974,4370324 amplicon=479..769,488..779 position=1..150
+# >941 reference=579954,4304512 amplicon=488..779,451..742 position=1..150
+# are chimeric
+amplicon_reads = """>3;size=102;
+GTGCCAGCAGCCGCGGTAATACATAGGTCACAAGCGTTATCCGGATTTATTGGGCGTAAAGCGTTCGTAGGCGGTTTGTT
+AAGTCTAGAGTTAAAGCCTGGGGTTCAACCCCAGCCCGCTTTGGATACTGACAAACTAGAGTTACATAGA
+>16;size=95;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGCACGCAGGCGGTTAATT
+AAGCTAGAAGTGAAAGCCCTGCGCTCAACGTGGGAAGGCCTTTTAGAACTGGTTAACTAGAGTATTGGAG
+>22;size=93;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTGTCCGGATTTATTGGGTTTAAAGGGTACGTAGGCGGTGTATT
+AAGTCAGTGGTGAAAGCCTGCGGCTCAACCGTAGGAGTGCCATTGATACTGGTATACTTGAGTGTTGTAA
+>2;size=87;
+GTGCCAGCAGCCGCGGTAATACGTAGGGCGCAAGCGTTATCCGGAATTATTGGGCGTAAAGAGCTCGTAGGCGGTTTGTC
+GCGTCTGCCGTGAAAGTCCGGGGCTTAACTCCGGATCTGCGGTGGGTACGGGCAGACTTGAGTGATGTAG
+>39;size=84;
+GTGCCAGCAGCCGCGGTAATACGTAGGGGGCAAGCGTTGTCCGGATTTACTGGGTATAAAGGGCTCGCAGGCGGGTTTGT
+AAGTCAGAGGTGAAATCCTACAGCTTAACTGTAGAACTGCCTTTGAAACTGCAAGTCTTGAGATCGGAAG
+>4;size=79;
+GTGCCAGCAGCCGCGGTAATACGTAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGTGCGCAGGCGGCTACAT
+AAGACAGGTGTGAAATCCCCGGGCTCAACCTGGGAATGGCGCTTGTGACTGTGTGGCTTGAGTTCGGCAG
+>6;size=72;
+GTGCCAGCAGCCGCGGTAATACGAAGGGGGCTAGCGTTGCTCGGAATTACTGGGCGTAAAGGGAGCGTAGGCGGACATTT
+AAGTCAGGGGTGAAATCCCAGAGCTCAACTCTGGAACTGCCTTTGATACTGGATGTCTTGAGTGTGAGAG
+>11;size=70;
+GTGCCAGCAGCCGCGGTAATACAGAGGTCTCAAGCGTTGTTCGGATTCATTGGGCGTAAAGGGTGCGTAGGTGGCGGGGT
+AAGTCAGGTGTGAAATCTCGGAGCTCAACTCCGAAACTGCACTTGATACTGCCTTGCTCGAGTACTGGAG
+>45;size=67;
+GTGCCAGCAGCCGCGGTAATACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGAATGGT
+AAGTCAGTGGTGAAATCCTGCAGCTCAACTGTAGAGTTGCCATTGATACTGTCATTCTTGAGTATAGATG
+>1;size=65;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGCATGCAGGCGGTTTGTT
+AAGTCAGATGTGAAAGCCCGGGGCTCAACCTCGGAACCGCATTTGAAACTGGCAGGCTAGAGTCTTGTAG
+>251;size=2;
+GTGCCAGCAGCCGCGGTAATACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGAATGGT
+AAGTCAGTGGTGAAATCCTGCAGCTCAACTGTAGAGTTGCCATTGATACTGGTATACTTGAGTGTTGTAA
+>30;size=2;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTGTCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGAATGGT
+AAGTCAGTGGTGAAATCCTGCAGCTCAACTGTAGAGTTGCCATTGATACTGTCATTCTTGAGTATAGATG
+>320;size=2;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTGTCCGGATTTATTGGGTTTAAAGGGTACGTAGGCGGTGTATT
+AAGTCAGTGGTGAAATCCTGCAGCTCAACTGTAGAGTTGCCATTGATACTGTCATTCTTGAGTATAGATG
+>36;size=2;
+GTGCCAGCAGCCGCGGTAATACGGAGGATGCGAGCGTTATCCGGATTTATTGGGCGTAAAGCGTTCGTAGGCGGTTTGTT
+AAGTCTAGAGTTAAAGCCTGGGGTTCAACCCCAGCCCGCTTTGGATACTGACAAACTAGAGTTACATAGA
+>425;size=2;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGTGCGCAGGCGGCTACAT
+AAGACAGGTGTGAAATCCCCGGGCTCAACCTGGGAATGGCGCTTGTGACTGTGTGGCTTGAGTTCGGCAG
+>672;size=2;
+GTGCCAGCAGCCGCGGTAATACATAGGTCACAAGCGTTATCCGGATTTATTGGGCGTAAAGCGTTCGTAGGCGGTTTGTT
+AAGTCAGATGTGAAAGCCCGGGGCTCAACCTCGGAACCGCATTTGAAACTGGCAGGCTAGAGTCTTGTAG
+>10;size=1;
+GTGCCAGCAGCCGCGGTAATACGTAGGGCGCAAGCGTTATCCGGAATTATTGGGCGTAAAGAGCTCGTAGGCGGTTTGTC
+GGGTCTGCCGTGAAAGTCCGGGGCTTAACTCCGGATCTGCGGTGGGTACGGGCAGACTTGAGTGATGTAG
+>100;size=1;
+TTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTGTCCGGATTTATTGGGTTTAAAGGGTACGTAGGCGGTGTATT
+AAGTCAGTGGTGAAAGCCTGCGGCTCAACCGTAGGAGTGCCATTGATACTGGTATACTTGAGTGTTGTAA
+>102;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTGTCCGGACTTATTGGGTTTAAAGGGTACGTAGGCGGTGTATT
+AAGTCAGTGGTGAAAGCCTGCGGCTCAACCGTAGGAGTGCCATTGATACTGGTATACTTGAGTGTTGTAA
+>115;size=1;
+GTGCCAGCAGCCGCGGTATTACATAGGTCACAAGCGTTATCCGGATTTATTGGGCGTAAAGCGTTCGTAGGCGGTTTGTT
+AAGTCTAGAGTTAAAGCCTGGGGTTCAACCCCAGCCCGCTTTGGATACTGACAAACTAGAGTTACATAGA
+>123;size=1;
+GTGACAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGCATGCAGGCGGTTTGTT
+AAGTCAGATGTGAAAGCCCGGGGCTCAACCTCGGAACCGCATTTGAAACTGGCAGGCTAGAGTCTTGTAG
+>132;size=1;
+GTGCCAGCAGCCGCGGTAATACGTAGGGGGCAAGCGTTGTCCGGATTTACTGGGTATAAAGGGCTCGCAGGCGGGTTTGT
+AAGTCAGAGGTGAAATCCTACAGCTTAACTGTAGAACTGCCTTTGACACTGCAAGTCTTGAGATCGGAAG
+>134;size=1;
+GTGCCGGCAGCCGCGGTAATACGTAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGTGCGCAGGCGGCTACAT
+AAGACAGGTGTGAAATCCCCGGGCTCAACCTGGGAATGGCGCTTGTGACTGTGTGGCTTGAGTTCGGCAG
+>14;size=1;
+GTGCCAGCAGCCGCGGTAATACAGAGGTCTCACGCGTTGTTCGGATTCATTGGGCGTAAAGGGTGCGTAGGTGGCGGGGT
+AAGTCAGGTGTGAAATCTCGGAGCTCAACTCCGAAACTGCACTTGATACTGCCTTGCTCGAGTACTGGAG
+>140;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTAATCGGAATTACTGGGCGGAAAGCGCATGCAGGCGGTTTGTT
+AAGTCAGATGTGAAAGCCCGGGGCTCAACCTCGGAACCGCATTTGAAACTGGCAGGCTAGAGTCTTGTAG
+>142;size=1;
+GGGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGCATGCAGGCGGTTTGTT
+AAGTCTAGAGTTAAAGCCTGGGGTTCAACCCCAGCCCGCTTTGGATACTGACAAACTAGAGTTACATAGA
+>144;size=1;
+GTGTCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGCATGCAGGCGGTTTGTT
+AAGTCAGATGTGAAAGCCCGGGGCTCAACCTCGGAACCGCATTTGAAACTGGCAGGCTAGAGTCTTGTAG
+>148;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGAATGGT
+AAGTCAGTGGTGAAATCCTGCAGCTCAACTGTAGAGTTGCCATTGATACTATCATTCTTGAGTATAGATG
+>15;size=1;
+GTGCCAGCAGCCGCGGTAATACATAGGTCACAAGCGTTATCCGGATTTATTGGGAGTAAAGCGTTCGTAGGCGGTTTGTT
+AAGTCTAGAGTTAAAGCCTGGGGTTCAACCCCAGCCCGCTTTGGATACTGACAAACTAGAGTTACATAGA
+>156;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGCACGCAGACGGTTAATT
+AAGCTAGAAGTGAAAGCCCTGCGCTCAACGTGGGAAGGCCTTTTAGAACTGGTTAACTAGAGTATTGGAG
+>161;size=1;
+GTGCCAGCAGCCGCGGTAATACAGAGGTCTCAAGCGTTGTTCGGATTCATTGGGCGTAAAGGGTGCGTAGGTGGCGGGGT
+AAGTCAGGGGTGAAATCTCGGAGCTCAACTCCGAAACTGCACTTGATACTGCCTTGCTCGAGTACTGGAG
+>162;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGCTAATCGGAATTACTGGGCGTAAAGCGCACGCAGGCGGTTAATT
+AAGCTAGAAGTGAAAGCCCTGCGCTCAACGTGGGAAGGCCTTTTAGAACTGGTTAACTAGAGTATTGGAG
+>186;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTGTCCGGATTAATTGGGTTTAAAGGGTACGTAGGCGGTGTATT
+AAGTCAGTGGTGAAAGCCTGCGGCTCAACCGTAGGAGTGCCATTGATACTGGTATACTTGAGTGTTGTAA
+>201;size=1;
+GTGCCAGCAGCCGCGGTAATACGAAGGGGGCTAGCGTTGCTCGGAATTACTGGGCGTAAAGCGTGCGCAGGCGGCTACAT
+AAGACAGGTGTGAAATCCCCGGGCTCAACCTGGGAATGGCGCTTGTGACTGTGTGGCTTGAGTTCGGCAG
+>203;size=1;
+GTGCCAGCAGCCGCGGTAATACGTAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGTGCGCAGGCGGCTACAT
+AAGACAGGTGTGAAATCCCCGGGCTCAACCTGGGAACGGCGCTTGTGACTGTGTGGCTTGAGTTCGGCAG
+>21;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTCAAGCGCATGCAGGCGGTTTGTT
+AAGTCAGATGTGAAAGCCCGGGGCTCAACCTCGGAACCGCATTTGAAACTGGCAGGCTAGAGTCTTGTAG
+>217;size=1;
+GTGCCAGCAGCCGCGGTAATACGAAGGGGGCTAGCGTTGCTCGGAATTACTGGGCGTAAAGGGAGCGTAGGCGGACATTT
+AAGTCAGGGGTGTAATCCCAGAGCTCAACTCTGGAACTGCCTTTGATACTGGATGTCTTGAGTGTGAGAG
+>218;size=1;
+GTGCCAGCAGCCGCGGTAATACGTAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAACGCGTGCGCAGGCGGCTACAT
+AAGACAGGTGTGAAATCCCCGGGCTCAACCTGGGAATGGCGCTTGTGACTGTGTGGCTTGAGTTCGGCAG
+>221;size=1;
+GTGCCAGCAGCCGCGGTAATACAGAGGTCTCAAGCGTTGTTCGGATTCATTGGGCGTAAAGGGTGCGTAGGTGGCGGGGT
+AAGTCAGGTGTGAAATCTCGGAGCTCAACTCCGAAACTGCACTTGATACTGCCTTGCTCGAATACTGGAG
+>222;size=1;
+GTGCCAGCAGCCGCGGTAATACGAAGGGGGCTAGGGTTGCTCGGAATTACTGGGCGTAAAGGGAGCGTAGGCGGACATTT
+AAGTCAGGGGTGAAATCCCAGAGCTCAACTCTGGAACTGCCTTTGATACTGGATGTCTTGAGTGTGAGAG
+>225;size=1;
+GTGCCAGCAGCCGCGGTAATACGAAGGGGGCTAGCGTTGCTCGGAATTACTGGGAGTAAAGGGAGCGTAGGCGGACATTT
+AAGTCAGGGGTGAAATCCCAGAGCTCAACTCTGGAACTGCCTTTGATACTGGATGTCTTGAGTGTGAGAG
+>233;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTGTCCGGATTTATTGGGTTTAAAGGGTACGTAGGCGGTGTATT
+AAGTCAGTGGTGAAAGCCTGCGGCTCAACCGTAGGAGTGCCATTGATACTGGTGTACTTGAGTGTTGTAA
+>234;size=1;
+GTGCCAGCAGCCTCGGTAATACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGAATGGT
+AAGTCAGTGGTGAAATCCTGCAGCTCAACTGTAGAGTTGCCATTGATACTGTCATTCTTGAGTATAGATG
+>235;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAACGTTAATCGGAATTACTGGGCGTAAAGCGCATGCAGGCGGTTTGTT
+AAGTCAGATGTGAAAGCCCGGGGCTCAACCTCGGAACCGCATTTGAAACTGGCAGGCTAGAGTCTTGTAG
+>24;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTAATCGGAAGTACTGGGCGTAAAGCGCACGCAGGCGGTTAATT
+AAGCTAGAAGTGAAAGCCCTGCGCTCAACGTGGGAAGGCCTTTTAGAACTGGTTAACTAGAGTATTGGAG
+>241;size=1;
+GTGCCAGCAGCCACGGTAATACGGAGGGTGCGAGCGTTGTCCGGATTTATTGGGTTTAAAGGGTACGTAGGCGGTGTATT
+AAGTCAGTGGTGAAAGCCTGCGGCTCAACCGTAGGAGTGCCATTGATACTGTCATTCTTGAGTATAGATG
+>249;size=1;
+GTGCCAGCAGCCGCGGTAATACGTAGGGGGCAAGCGTTGTCCGGATTTACTGGGTATAAAGGGCTCGCAGGCGGGTTTGT
+AAGTCAGAGGTGAAATCCTACAGCTTAACTGTAGAACTGCCTTTCAAACTGCAAGTCTTGAGATCGGAAG
+>259;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGCATGCAGGCGGTTTGTT
+AAGTCAGATGTGAAAGCGCGGGGCTCAACCTCGGAACCGCATTTGAAACTGGCAGGCTAGAGTCTTGTAG
+>26;size=1;
+GTGCCAGCAGCCGCGGTAATACAGAGGTCTCAAGCGTTGTTCGGATTCATTGGGCGTAAAGGGTGCGTAGGTGGCGGGGT
+AAGTCAGGTGTGAAATCTCGGAGCTCAACTCCGAAACTGCACATGATACTGCCTTGCTCGAGTACTGGAG
+>266;size=1;
+GTGCCAGCGGCCGCGGTAATACGTAGGGCGCAAGCGTTATCCGGAATTATTGGGCGTAAAGAGCTCGTAGGCGGTTTGTC
+GCGTCTGCCGTGAAAGTCCGGGGCTTAACTCCGGATCTGCGGTGGGTACGGGCAGACTTGAGTGATGTAG
+>27;size=1;
+GTGCCAGCAGCCGCGGTAATACATAGGTCACAAGCGTTATCCGGATTTATTGGGTGTAAAGCGTTCGTAGGCGGTTTGTT
+AAGTCTAGAGTTAAAGCCTGGGGTTCAACCCCAGCCCGCTTTGGATACTGACAAACTAGAGTTACATAGA
+>279;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGCATGCAGGCGGTTTGTT
+AAGTCAGATGTGAAAGCCCGGGGCTCAACCTGGGAATGGCGCTTGTGACTGTGTGGCTTGAGTTCGGCAG
+>296;size=1;
+GTGCCAGCAGCCGCGGTAATACGTAGGCGGCAAGCGTTGTCCGGATTTACTGGGTATAAAGGGCTCGCAGGCGGGTTTGT
+AAGTCAGAGGTGAAATCCTACAGCTTAACTGTAGAACTGCCTTTGAAACTGCAAGTCTTGAGATCGGAAG
+>299;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGAATGGT
+AAGTCAGTGGTGAAATCCCAGAGCTCAACTCTGGAACTGCCTTTGATACTGGATGTCTTGAGTGTGAGAG
+>303;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGAATGGT
+AAGTCAGTGGTGAAATCCAGCAGCTCAACTGTAGAGTTGCCATTGATACTGTCATTCTTGAGTATAGATG
+>306;size=1;
+GTGCCAGCAGCCGCGGTAATACGTAGGGGGCAAGCGTTGTCCGGATTTACTGGGTATAAAGGGCTCGCAGGCGGGTTTGT
+AAGTCAGAGGTGAAATCCTACAGCTTAACTGTAGAACTGCCTTTGAAACTGCAAGTCTTGAGATCGGAAA
+>307;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTGTCCGGATTTATTGGGTTTAAAGGGTACGTAGGCGGTGTATT
+AAGTCAGTGGTGAAAGCCTGCGGCTCAACCGTAGGAGTGCCATTGTTACTGGTATACTTGAGTGTTGTAA
+>32;size=1;
+GTGCCAGCAGCCGCGGTAATACGTAGGGCGCAAGCGGTATCCGGAATTATTGGGCGTAAAGAGCTCGTAGGCGGTTTGTC
+GCGTCTGCCGTGAAAGTCCGGGGCTTAACTCCGGATCTGCGGTGGGTACGGGCAGACTTGAGTGATGTAG
+>322;size=1;
+GTGCCAGCAGCCGCGGTAATACGTAGGGCGCAAGCGTTATCCGGAATTATTGGGCGTAAAGAGCTCGTAGGCGGTTTGTC
+GCGTCTGGCGTGAAAGTCCGGGGCTTAACTCCGGATCTGCGGTGGGTACGGGCAGACTTGAGTGATGTAG
+>326;size=1;
+GTGCCAGCAGCCGCGGTAATACGGTGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGAATGGT
+AAGTCAGTGGTGAAATCCTGCAGCTCAACTGTAGAGTTGCCATTGATACTGTCATTCTTGAGTATAGATG
+>332;size=1;
+GTGCCAGCAGCCGCGGTAATACGTAGGGCGCAAGCGTTATCCGGAATTATTGGGCGTAAAGAGCTCGTAGGCGGGTTGTC
+GCGTCTGCCGTGAAAGTCCGGGGCTTAACTCCGGATCTGCGGTGGGTACGGGCAGACTTGAGTGATGTAG
+>333;size=1;
+GTGCCAGCAGCCGCGGTAATACGTAGGGCGCAAGCGTTATCCGGAATTATTGGGCGTAAAGAGCTCGTATGCGGTTTGTC
+GCGTCTGCCGTGAAAGTCCGGGGCTTAACTCCGGATCTGCGGTGGGTACGGGCAGACTTGAGTGATGTAG
+>338;size=1;
+GTGCCAGCAGCCGCGGTATTACGGAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGCATGCAGGCGGTTTGTT
+AAGTCAGATGTGAAAGCCCGGGGCTCAACCTCGGAACCGCATTTGAAACTGGCAGGCTAGAGTCTTGTAG
+>359;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTGTCCGGATTTATTAGGTTTAAAGGGTACGTAGGCGGTGTATT
+AAGTCAGTGGTGAAAGCCTGCGGCTCAACCGTAGGAGTGCCATTGATACTGTCATTCTTGAGTATAGATG
+>360;size=1;
+GTGCCAGCAGCCGCGGTAATACGTAGGGGGCAAGCGTTGTCCGGATTTACTGGGTATAAAGGGTTCGCAGGCGGGTTTGT
+AAGTCAGAGGTGAAATCCTACAGCTTAACTGTAGAACTGCCTTTGAAACTGCAAGTCTTGAGATCGGAAG
+>362;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGCATGCAGGCGGTTTGTT
+AAGTCAGATGTGAAAGCCCGGGGCTCAACCTCGGAACCGAATTTGAAACTGGCAGGCTAGAGTCTTGTAG
+>364;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGCACGCAGGCGGTTAATT
+AAGCTAGAAGTGAAAGCCCTGCGCTCAACGTGGGAAGGCCATTTAGAACTGGTTAACTAGAGTATTGGAG
+>366;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCCTTAATCGGAATTACTGGGCGTAAAGCGCATGCAGGCGGTTTGTT
+AAGTCAGATGTGAAAGCCCGGGGCTCAACCTCGGAACCGCATTTGAAACTGGCAGGCTAGAGTCTTGTAG
+>369;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTAATCGGAATTAATGGGCGTAAAGCGCATGCAGGCGGTTTGTT
+AAGTCAGATGTGAAAGCCCGGGGCTCAACCTCGGAACCGCATTTGAAACTGGCAGGCTAGAGTCTTGTAG
+>37;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTGTCCGGATTTATTGGGTTTAAAGGGTACTTAGGCGGTGTATT
+AAGTCAGTGGTGAAAGCCTGCGGCTCAACCGTAGGAGTGCCATTGATACTGGTATACTTGAGTGTTGTAA
+>371;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGCACGCAGGCGGTTAATT
+AAGCTAGAAGTGAAAGCCCTGCGCTCAACGCGGGAAGGCCTTTTAGAACTGGTTAACTAGAGTATTGGAG
+>373;size=1;
+GTGCCAGCAGACGCGGTAATACGTAGGGCGCAAGCGTTATCCGGAATTATTGGGCGTAAAGAGCTCGTAGGCGGTTTGTC
+GCGTCTGCCGTGAAAGTCCGGGGCTTAACTCCGGATCTGCGGTGGGTACGGGCAGACTTGAGTGATGTAG
+>374;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTGTCCGGATTTATTGGGTTTAAAGGCTACGTAGGCGGTGTATT
+AAGTCAGTGGTGAAAGCCTGCGGCTCAACCGTAGGAGTGCCATTGATACTGGTATACTTGAGTGTTGTAA
+>375;size=1;
+GTGCCAGCAGCCGCGGTAATACGAAGGGGGCTAGCGTTGCTCGGAATTACTGGGCGTAAAGGGAGCGTAGGCGGACATTT
+AAGTCAGGGGTGAAATCCCAGAGCTCAACTCTGGAACTGCCTTTGAAACTGCAAGTCTTGAGATCGGAAG
+>386;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGTAATGGT
+AAGTCAGTGGTGAAATCCTGCAGCTCAACTGTAGAGTTGCCATTGATACTGTCATTCTTGAGTATAGATG
+>387;size=1;
+GTGCCAGCAGCCGCGGTAATACGAAGGGGGCTAGCGTTGCTCGGAATTACAGGGCGTAAAGGGAGCGTAGGCGGACATTT
+AAGTCAGGGGTGAAATCCCAGAGCTCAACTCTGGAACTGCCTTTGATACTGGATGTCTTGAGTGTGAGAG
+>392;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTAACCGGAATTACTGGGCGTAAAGCGCATGCAGGCGGTTTGTT
+AAGTCAGATGTGAAAGCCCGGGGCTCAACCTCGGAACCGCATTTGAAACTGGCAGGCTAGAGTCTTGTAG
+>393;size=1;
+GTGCCAGCAGCCGCGGTAATACGTAGGGCGCAAGTGTTATCCGGAATTATTGGGCGTAAAGAGCTCGTAGGCGGTTTGTC
+GCGTCTGCCGTGAAAGTCCGGGGCTTAACTCCGGATCTGCGGTGGGTACGGGCAGACTTGAGTGATGTAG
+>397;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTGTCCGGATTTATTGGGTTTAAAGGGTACGTAGGCGGTGTATT
+AAGTCAGTGGTGAAAGCCTGCGGCTCAACCGTAGGAGTTCCATTGATACTGGTATACTTGAGTGTTGTAA
+>405;size=1;
+GTACCAGCAGCCGCGGTAATACGTAGGGCGCAAGCGTTATCCGGAATTATTGGGCGTAAAGAGCTCGTAGGCGGTTTGTC
+GCGTCTGCCGTGAAAGTCCGGGGCTTAACTCCGGATCTGCGGTGGGTACGGGCAGACTTGAGTGATGTAG
+>407;size=1;
+GTGCCAGCAGCCGCGGTAATACGAAGGGGGCTAGCGTTGCTCGGAATTACTGGGCGTAAAGGGTGCGTAGGTGGCGGGGT
+AAGTCAGGTGTGAAATCTCGGAGCTCAACTCCGAAACTGCACTTGATACTGCCTTGCTCGAGTACTGGAG
+>414;size=1;
+GTGCCAGCAGCCGCGGTAATACGTAGGGTGCGAGCGTTAATCGGAATTACTGGGTGTAAAGCGTGCGCAGGCGGCTACAT
+AAGACAGGTGTGAAATCCCCGGGCTCAACCTGGGAATGGCGCTTGTGACTGTGTGGCTTGAGTTCGGCAG
+>418;size=1;
+GTGCCAGCAGCCGCGGTAATACGTAGGGCGCAAGCGTTATCCGGAATTATTGGGCGTTAAGAGCTCGTAGGCGGTTTGTC
+GCGTCTGCCGTGAAAGTCCGGGGCTTAACTCCGGATCTGCGGTGGGTACGGGCAGACTTGAGTGATGTAG
+>423;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGAATGGT
+AAGTCAGGTGTGAAATCTCGGAGCTCAACTCCGAAACTGCACTTGATACTGCCTTGCTCGAGTACTGGAG
+>431;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGCATGCAGGCGGTTTGTT
+AGGTCAGATGTGAAAGCCCGGGGCTCAACCTCGGAACCGCATTTGAAACTGGCAGGCTAGAGTCTTGTAG
+>436;size=1;
+GTGCCAGCAGCCGCGGTAATACGTAGGGCGTAAGCGTTATCCGGAATTATTGGGCGTAAAGAGCTCGTAGGCGGTTTGTC
+GCGTCTGCCGTGAAAGTCCGGGGCTTAACTCCGGATCTGCGGTGGGTACGGGCAGACTTGAGTGATGTAG
+>444;size=1;
+GTGCCAGCAGCCGCGGTAATACATAGGTCACAAGCGTTATCCGGATTTATTGGGCGTGAAGCGTTCGTAGGCGGTTTGTT
+AAGTCTAGAGTTAAAGCCTGGGGTTCAACCCCAGCCCGCTTTGGATACTGACAAACTAGAGTTACATAGA
+>445;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTGTCCGGATTTATTGGGTTTAAAGCGTACGTAGGCGGTGTATT
+AAGTCAGTGGTGAAAGCCTGCGGCTCAACCGTAGGAGTGCCATTGATACTGGTATACTTGAGTGTTGTAA
+>456;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGAATGGT
+AAGCCAGTGGTGAAATCCTGCAGCTCAACTGTAGAGTTGCCATTGATACTGTCATTCTTGAGTATAGATG
+>460;size=1;
+GTGCCAGCAGCCGCGGTAATACGTAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGTGCGCAGGCGGCTACAT
+CAGACAGGTGTGAAATCCCCGGGCTCAACCTGGGAATGGCGCTTGTGACTGTGTGGCTTGAGTTCGGCAG
+>469;size=1;
+GTGCCAGCAGCCGCGGTAATACGTAGGGGGCAAGCGTTGTCCGGATTTACTGGGTATAAAGGGCTCGCAGGCGGGTTTGT
+AAGTCAGAGGAGAAATCCTACAGCTTAACTGTAGAACTGCCTTTGAAACTGCAAGTCTTGAGATCGGAAG
+>470;size=1;
+GTGCCAGCAGCCGCGGTAATACAGAGGTCTCAAGCGTTGTTCGGATTCATTGGGCGTAAAGGGTGCGTAGGTGGCGGGGT
+AAGTCAGGTGTGAAATCTCGGAGCTCAACTCCGAAACTGCACTTGATACTGCCTTGCGCGAGTACTGGAG
+>477;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGAATGGT
+AAGTCAGTGGTGAAATCCTGCGGCTCAACTGTAGAGTTGCCATTGATACTGTCATTCTTGAGTATAGATG
+>479;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGGGAGCGTTAATCGGAATTACTGGGCGTAAAGCGCATGCAGGCGGTTTGTT
+AAGTCAGATGTGAAAGCCCGGGGCTCAACCTCGGAACCGCATTTGAAACTGGCAGGCTAGAGTCTTGTAG
+>486;size=1;
+GTGCCAGCAGCTGCGGTAATACGTAGGGGGCAAGCGTTGTCCGGATTTACTGGGTATAAAGGGCTCGCAGGCGGGTTTGT
+AAGTCAGAGGTGAAATCCTACAGCTTAACTGTAGAACTGCCTTTGAAACTGCAAGTCTTGAGATCGGAAG
+>500;size=1;
+GTGCCAGCAGCCGCGGTAATACGTAGGGGGCAAGCGTTGTCCGGATTTACTGGGTATAAAGGGCTCGCAGGCGGGTTTGT
+AAGTCAGAGGTGAAATCCTACAGCTTAACTGTAGAACTGCCTTTGAAACTACAAGTCTTGAGATCGGAAG
+>515;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGAATGGT
+AAGTCAGTGGTGCAATCCTGCAGCTCAACTGTAGAGTTGCCATTGATACTGTCATTCTTGAGTATAGATG
+>516;size=1;
+GTGCCAGCAGCCGCGGTAATACATAGGTCACAAGCGTTATCCGGATTTATTGGGCGTAAAGGGTGCGTAGGTGGCGGGGT
+AAGTCAGGTGTGAAATCTCGGAGCTCAACTCCGAAACTGCACTTGATACTGCCTTGCTCGAGTACTGGAG
+>528;size=1;
+GTTCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTGTCCGGATTTATTGGGTTTAAAGGGTACGTAGGCGGTGTATT
+AAGTCAGTGGTGAAAGCCTGCGGCTCAACCGTAGGAGTGCCATTGATACTGGTATACTTGAGTGTTGTAA
+>530;size=1;
+GTGCCAGCAGGCGCGGTAATACGAAGGGGGCTAGCGTTGCTCGGAATTACTGGGCGTAAAGGGAGCGTAGGCGGACATTT
+AAGTCAGGGGTGAAATCCCAGAGCTCAACTCTGGAACTGCCTTTGATACTGGATGTCTTGAGTGTGAGAG
+>531;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGCATGCAGGCGGTTTGTT
+AAGTCAGATGTGAAAGCCCGGGGCTCAACCTCGGAACCGCATTTGAAACTGGCAGGCTAGAGGCTTGTAG
+>549;size=1;
+GTCCCAGCAGCCGCGGTAATACGTAGGGGGCAAGCGTTGTCCGGATTTACTGGGTATAAAGGGCTCGCAGGCGGGTTTGT
+AAGTCAGAGGTGAAATCCTACAGCTTAACTGTAGAACTGCCTTTGAAACTGCAAGTCTTGAGATCGGAAG
+>551;size=1;
+GTGCCAGCAGCCGCGGTAATACAGAGGTCTCAAGCGTTGTTCGGATTCATTGGGCGTAAAGGGTGCGTAGGTGGCGGGGT
+AAGTCAGGTGTGAAATCTCGGAGCTCAACTCCGAAACTGCAGTTGATACTGCCTTGCTCGAGTACTGGAG
+>557;size=1;
+GTGCCAGCAGCCGCTGTAATACGTAGGGCGCAAGCGTTATCCGGAATTATTGGGCGTAAAGAGCTCGTAGGCGGTTTGTC
+GCGTCTGCCGTGAAAGTCCGGGGCTTAACTCCGGATCTGCGGTGGGTACGGGCAGACTTGAGTGATGTAG
+>559;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTGTCCGGATTTATTGGGCTTAAAGGGTACGTAGGCGGTGTATT
+AAGTCAGTGGTGAAAGCCTGCGGCTCAACCGTAGGAGTGCCATTGATACTGGTATACTTGAGTGTTGTAA
+>561;size=1;
+GTGCCAGCAGCCGCGGTAATACGTAGGGGGCAAGCGTTGTCCGGATTTACTGGGTATAAAGGGCTCGCAGGCGGGTTTGT
+AAGTCAGAGGTGAAATCCTACAGCTTAACTGTAGAACTGCCTTTGAAACTGCATGTCTTGAGATCGGAAG
+>562;size=1;
+GTGCCAGCAGCCGCGGTAATACGAAGGGGGCTAGCGTTGCTCGGAATTACTGGGCGTAAAGGGAGCGTAGGCGGACATTT
+AAGTCAGGGGTGAAATCCCAGAGCTCAACTCTGGAACTGCCATTGATACTGGATGTCTTGAGTGTGAGAG
+>564;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGAATGGT
+AAGTCAGTGGTGAAATCCTGCAGCTCAACTGTAGAATTGCCATTGATACTGTCATTCTTGAGTATAGATG
+>566;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTGTCCGGATTTAATGGGTTTAAAGGGTACGTAGGCGGTGTATT
+AAGTCAGTGGTGAAAGCCTGCGGCTCAACCGTAGGAGTGCCATTGATACTGGTATACTTGAGTGTTGTAA
+>568;size=1;
+GTGCCAGCAGCCGCGGTAATACAGAGGTCTCAAGCGTTGTTCGGATTCATTGGGCGTAAAGGGTGCGTAGGTGGCGGGGT
+AAGTCAGGTGTGAAATCTCGGAGCTTAACTCCGAAACTGCACTTGATACTGCCTTGCTCGAGTACTGGAG
+>57;size=1;
+GTGCCAGCAGCCGCGGTAATACAGAGGTCTCAAGCGTTGTTCGGATTCATTGGGCGTAAAGGGTGCGTAGGTGGAGGGGT
+AAGTCAGGTGTGAAATCTCGGAGCTCAACTCCGAAACTGCACTTGATACTGCCTTGCTCGAGTACTGGAG
+>570;size=1;
+GTGCCAGCAGCCGCGGTAATACATAGGTCACAAGCGTTATCCGGATTTATTGGGCGTAAAGCGTTCGTAGGCGGTTTGTT
+AAGTCTAGAGTTAAAGCCTGGGGTTCAACCCCAGCCCGCTTTGGATACTGACAAACTAGCGTTACATAGA
+>578;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGATGCGAGCGTTATCCGGATGTATTGGGTTTAAAGGGTGCGTAGGCGGAATGGT
+AAGTCAGTGGTGAAATCCTGCAGCTCAACTGTAGAGTTGCCATTGATACTGTCATTCTTGAGTATAGATG
+>586;size=1;
+GTGCCAGCAGCCGCGGTAATACGAAGGGGGCTAGCGTTGCTCGGAATTACTGGGCGTAAAGGGAGCGTAGGCGGACATTT
+AAGTCAGGGGTGAAATCCCAGAGCTCAACTCTGGAACTGCCTTTGATACTGGATATCTTGAGTGTGAGAG
+>587;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTACGTAGGCGGTGTATT
+AAGTCAGTGGTGAAAGCCTGCGGCTCAACCGTAGGAGTGCCATTGATACTGGTATACTTGAGTGTTGTAA
+>596;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTGTCCGGATTTATTGTGTTTAAAGGGTACGTAGGCGGTGTATT
+AAGTCAGTGGTGAAAGCCTGCGGCTCAACCGTAGGAGTGCCATTGATACTGGTATACTTGAGTGTTGTAA
+>600;size=1;
+GTGCCAGCAGCCGCGGTAATACGTAGGGGGCAAGCGTTGTCCGGTTTTACTGGGTATAAAGGGCTCGCAGGCGGGTTTGT
+AAGTCAGAGGTGAAATCCTACAGCTTAACTGTAGAACTGCCTTTGAAACTGCAAGTCTTGAGATCGGAAG
+>612;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCAGAATGGT
+AAGTCAGTGGTGAAATCCTGCAGCTCAACTGTAGAGTTGCCATTGATACTGTCATTCTTGAGTATAGATG
+>618;size=1;
+GTGCCAGCAGCCGCGGTAATACATAGGTCACCAGCGTTATCCGGATTTATTGGGCGTAAAGCGTTCGTAGGCGGTGTATT
+AAGTCAGTGGTGAAAGCCTGCGGCTCAACCGTAGGAGTGCCATTGATACTGGTATACTTGAGTGTTGTAA
+>625;size=1;
+GTGCCAGCAGCCGCGGTAATACAGAGGTCTCAAGCGTTATTCGGATTCATTGGGCGTAAAGGGTGCGTAGGTGGCGGGGT
+AAGTCAGGTGTGAAATCTCGGAGCTCAACTCCGAAACTGCACTTGATACTGCCTTGCTCGAGTACTGGAG
+>632;size=1;
+GTGCCAGCAGCCGCGGTAATACGTAGGGCGCAAGCGTTATCCGGAATTATTGGGCGTAAAGAGCTCGTAGGCGGTTTGTC
+GCGTCTGCCGTGAAAGTCCGGGGCTTAACTCCGGATCTGCGGTGGGTACGCGCAGACTTGAGTGATGTAG
+>649;size=1;
+GTGCCAGCAGCCGCGGTAATACGTAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGTGCGCAGGCGGCGACAT
+AAGACAGGTGTGAAATCCCCGGGCTCAACCTGGGAATGGCGCTTGTGACTGTGTGGCTTGAGTTCGGCAG
+>650;size=1;
+GTGCCAGCAGCCGTGGTAATACGTAGGGGGCAAGCGTTGTCCGGATTTACTGGGTATAAAGGGCTCGCAGGCGGGTTTGT
+AAGTCAGAGGTGAAATCCTACAGCTTAACTGTAGAACTGCCTTTGAAACTGCAAGTCTTGAGATCGGAAG
+>651;size=1;
+GTGCCAGCAGCCGCGGTAATACGTAGGGCGCAAGCGTTATCCGGAATTATTGGGCGTAAAGAGCTCGTAGGCGGTTTGTC
+GCGTCTGCCGTGAAAGTCCGGGGCTTAACTCCGGATCTGCGGTGGGTACCGGCAGACTTGAGTGATGTAG
+>66;size=1;
+GTGCCAGCAGCCGCGGTAATACATAGGTCACAAGCGTTATCCGGATTTATTGGGCGTAAAGCGTTGGTAGGCGGTTTGTT
+AAGTCTAGAGTTAAAGCCTGGGGTTCAACCCCAGCCCGCTTTGGATACTGACAAACTAGAGTTACATAGA
+>664;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTGTCCGGATTGATTGGGTTTAAAGGGTACGTAGGCGGTGTATT
+AAGTCAGTGGTGAAAGCCTGCGGCTCAACCGTAGGAGTGCCATTGATACTGGTATACTTGAGTGTTGTAA
+>673;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTAATCGGAATTACTGCGCGTAAAGCGCACGCAGGCGGTTAATT
+AAGCTAGAAGTGAAAGCCCTGCGCTCAACGTGGGAAGGCCTTTTAGAACTGGTTAACTAGAGTATTGGAG
+>675;size=1;
+GTGCCAGCAGCCGCGGTAATACCTAGGTCACAAGCGTTATCCGGATTTATTGGGCGTAAAGCGTTCGTAGGCGGTTTGTT
+AAGTCTAGAGTTAAAGCCTGGGGTTCAACCCCAGCCCGCTTTGGATACTGACAAACTAGAGTTACATAGA
+>682;size=1;
+GTGCCAGCAGCCGCGGTAATACGTAGGGGGCAAGCGTTGTCCGGATTTACTGGGTATAAAGGGCTCGCAGGCGGGTTTTT
+AAGTCAGAGGTGAAATCCTACAGCTTAACTGTAGAACTGCCTTTGAAACTGCAAGTCTTGAGATCGGAAG
+>690;size=1;
+GTGCCAGCAGCCGCGGTAATACAGAGGTCTCAAGCGTTGTTCGGATTCATTGGGCGTAAAGGGTGCGTAGGTGGCGGGGT
+AAGTCAGGTGTGAAATCTCGGAGCTCAACTCCGAAACTGCACTGGATACTGCCTTGCTCGAGTACTGGAG
+>699;size=1;
+GTTCCAGCAGCCGCGGTAATACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGAATGGT
+AAGTCAGTGGTGAAATCCTGCAGCTCAACTGTAGAGTTGCCATTGATACTGTCATTCTTGAGTATAGATG
+>7;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCATAGGCGGAATGGT
+AAGTCAGTGGTGAAATCCTGCAGCTCAACTGTAGAGTTGCCATTGATACTGTCATTCTTGAGTATAGATG
+>709;size=1;
+GTGCCAGCAGCCGCGGTAATACGTAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGTGCGCAGGCGGCTACAT
+AAGACAGGTGTGAAATCCCCGGGCTCAACCTGGGAATGGCGCTTGTGACTGTCTGGCTTGAGTTCGGCAG
+>717;size=1;
+GTGCCAGCAGCCGCGGTAATACATAGGTCACAAGCGTTATCCGGATTTATTGGGCGTAAAGCGTGCGCAGGCGGCTACAT
+AAGACAGGTGTGAAATCCCCGGGCTCAACCTGGGAATGGCGCTTGTGACTGTGTGGCTTGAGTTCGGCAG
+>73;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGCATGCAGGCGGTTTGTT
+AAGTCAGATGTGAAAGCCCGGGGCTCAACCGCGGAACCGCATTTGAAACTGGCAGGCTAGAGTCTTGTAG
+>740;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGCATGCAGGCGGTTTGTT
+AAGTCAGATGTGAAAGCCCGTGGCTCAACCTCGGAACCGCATTTGAAACTGGCAGGCTAGAGTCTTGTAG
+>745;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTAATCGGAATTACTGAGCGTAAAGCGCACGCAGGCGGTTAATT
+AAGCTAGAAGTGAAAGCCCTGCGCTCAACGTGGGAAGGCCTTTTAGAACTGGTTAACTAGAGTATTGGAG
+>746;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTAATCGGAATTACTGGTCGTAAAGCGCATGCAGGCGGTTTGTT
+AAGTCAGATGTGAAAGCCCGGGGCTCAACCTCGGAACCGCATTTGAAACTGGCAGGCTAGAGTCTTGTAG
+>748;size=1;
+GTGCCAGCAGCCGCGGTAATACATAGGTCACAAGCGTTATCCGGATTTATTGGGCGTAAAGCATTCGTAGGCGGTTTGTT
+AAGTCTAGAGTTAAAGCCTGGGGTTCAACCCCAGCCCGCTTTGGATACTGACAAACTAGAGTTACATAGA
+>760;size=1;
+GTGCCAGCAGCCGCGGTAATACAGAGGTCTCAAGCGTTGTTCGGATTCATTGCGCGTAAAGGGTGCGTAGGTGGCGGGGT
+AAGTCAGGTGTGAAATCTCGGAGCTCAACTCCGAAACTGCACTTGATACTGCCTTGCTCGAGTACTGGAG
+>766;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTGTCCGGATTTATTGGGTTTAAAGGGTACGTAGGCGGTGTATT
+AAGTCAGTGGTGAAAGCCTGCGGCTCAACCGTAGGAGTGCCATTAATACTGGTATACTTGAGTGTTGTAA
+>77;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTGTCGGGATTTATTGGGTTTAAAGGGTACGTAGGCGGTGTATT
+AAGTCAGTGGTGAAAGCCTGCGGCTCAACCGTAGGAGTGCCATTGATACTGGTATACTTGAGTGTTGTAA
+>778;size=1;
+GTGCCAGCAGCCGCGGTAATACATAGGTCACAAGCGTTATCCGGATTTATTGGGCGTAAAGCGTTTGTAGGCGGTTTGTT
+AAGTCTAGAGTTAAAGCCTGGGGTTCAACCCCAGCCCGCTTTGGATACTGACAAACTAGAGTTACATAGA
+>791;size=1;
+GTGCCAGCAGCCGCGGTAATACGAAGGGGGCTAGCGTTGCTCGGAATTACTGGGCGTAAAGGGAGCGTAGGCGGACATTT
+TAGTCAGGGGTGAAATCCCAGAGCTCAACTCTGGAACTGCCTTTGATACTGGATGTCTTGAGTGTGAGAG
+>797;size=1;
+GTGCCAGCAGCCGCGGTAATACGTAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGTGCGCAGGCGGCTACAT
+AAGACAGGTGTGAAATCCCCGGGCTCAACCTGGGAATGGCGCTTGTGACTGTTTGGCTTGAGTTCGGCAG
+>809;size=1;
+GTGCCAGCAGCCGCGGTAATACAGAGGTCTCAAGCGTTGTTCGGATTCATTGAGCGTAAAGGGTGCGTAGGTGGCGGGGT
+AAGTCAGGTGTGAAATCTCGGAGCTCAACTCCGAAACTGCACTTGATACTGCCTTGCTCGAGTACTGGAG
+>813;size=1;
+GTGCCAGCAGCCGCGGTAATACGTAGGGGGCAAGCGTTGTCCGGATTTACTGGGTATAAAGGGCTCGCAGGCGGGTTTGT
+AAGTCAGAGGTGAAATCCTACAGCTTAACAGTAGAACTGCCTTTGAAACTGCAAGTCTTGAGATCGGAAG
+>814;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGGGCAAGCGTTGTCCGGATTTACTGGGTATAAAGGGCTCGCAGGCGGGTTTGT
+AAGTCAGAGGTGAAATCCTACAGCTTAACTGTAGAACTGCCTTTGAAACTGCAAGTCTTGAGATCGGAAG
+>816;size=1;
+GTGCCAGCAGCCGCGGTAATACGTAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGTGCGCAGGCGGCTACAT
+AAGACAGGTGTGAAATCCCCGGGCTCAACCTGAGAATGGCGCTTGTGACTGTGTGGCTTGAGTTCGGCAG
+>817;size=1;
+GTGCCAGCAGCCGCGGTAATACGTAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGCATGCAGGCGGTTTGTT
+AAGTCAGATGTGAAAGCCCGGGGCTCAACCTCGGAACCGCATTTGAAACTGGCAGGCTAGAGTCTTGTAG
+>82;size=1;
+GTGCCAGCAGGCGCGGTAATACGGAGGGTGCGAGCGTTGTCCGGATTTATTGGGTTTAAAGGGTACGTAGGCGGTGTATT
+AAGTCAGTGGTGAAAGCCTGCGGCTCAACCGTAGGAGTGCCATTGATACTGGTATACTTGAGTGTTGTAA
+>821;size=1;
+GTGCCAGCAGCCGCGGTAATACAGAGGTCTCAAGCGTTGTTCGGATTCATTGGGCGTAAAGGGTGCGTAGGTGGCGGGGT
+AAGTCAGGTGTGAAATCTCGGAGCTCAACTCCGAAACTGCACTTGATACTGCCTTGCTCGTGTACTGGAG
+>824;size=1;
+GTGCCAGCAGCCGCGGTAATACGTAGGGTGCGAGTGTTAATCGGAATTACTGGGCGTAAAGCGTGCGCAGGCGGCTACAT
+AAGACAGGTGTGAAATCCCCGGGCTCAACCTGGGAATGGCGCTTGTGACTGTGTGGCTTGAGTTCGGCAG
+>827;size=1;
+GTGCCAGCAGCCGCGGTAATACGTAGGGTGCGAGCGTTAATCGGAATTAGTGGGCGTAAAGCGTGCGCAGGCGGCTACAT
+AAGACAGGTGTGAAATCCCCGGGCTCAACCTGGGAATGGCGCTTGTGACTGTGTGGCTTGAGTTCGGCAG
+>83;size=1;
+GTGCCAGCAGCCGCGGTAATACGAAGGGGGCTAGCGTTGCTCGGAATTACTGGGCGTAAAGGGAGCGTAGGCGGACATTT
+AAGTCAGGGGTGAAATCCCAGAGCTCAACTCTGGAAGTGCCTTTGATACTGGATGTCTTGAGTGTGAGAG
+>842;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGGGCGTTGTCCGGATTTATTGGGTTTAAAGGGTACGTAGGCGGTGTATT
+AAGTCAGTGGTGAAAGCCTGCGGCTCAACCGTAGGAGTGCCATTGATACTGGTATACTTGAGTGTTGTAA
+>851;size=1;
+GTGCCAGCAGCCGCGGTAATACAGAGGTCTCAAGCGTTGTTCGGATTCATTGGGCGTAAAGGGTGCGTAGGTGTCGGGGT
+AAGTCAGGTGTGAAATCTCGGAGCTCAACTCCGAAACTGCACTTGATACTGCCTTGCTCGAGTACTGGAG
+>853;size=1;
+GTTCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGCACGCAGGCGGTTAATT
+AAGCTAGAAGTGAAAGCCCTGCGCTCAACGTGGGAAGGCCTTTTAGAACTGGTTAACTAGAGTATTGGAG
+>862;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTGTCCGGATTTATTGGGTTTAAAGGGTACGTAGGCGGTGTATT
+AAGTCAGTGGTGAAAGCCTGCGGCTCAACCGTAGGAGTGCCATTGATACTGGTATACTGGAGTGTTGTAA
+>863;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGAATGGT
+AAGTCAGTGGTGAAATCCTGCAGCTGAACTGTAGAGTTGCCATTGATACTGTCATTCTTGAGTATAGATG
+>866;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTGATCGGAATTACTGGGCGTAAAGCGCATGCAGGCGGTTTGTT
+AAGTCAGATGTGAAAGCCCGGGGCTCAACCTCGGAACCGCATTTGAAACTGGCAGGCTAGAGTCTTGTAG
+>871;size=1;
+GCGCCAGCAGCCGCGGTAATACAGAGGTCTCAAGCGTTGTTCGGATTCATTGGGCGTAAAGGGTGCGTAGGTGGCGGGGT
+AAGTCAGGTGTGAAATCTCGGAGCTCAACTCCGAAACTGCACTTGATACTGCCTTGCTCGAGTACTGGAG
+>879;size=1;
+GTGCCAGCAGCCGCGGTAATACGTAGGGCGCAAGCGTTATCCGGAATTATTGGGCGTAAAGAGCTCGTAGGCGGTTTGTC
+GCGTCTGCTGTGAAAGTCCGGGGCTTAACTCCGGATCTGCGGTGGGTACGGGCAGACTTGAGTGATGTAG
+>886;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGAATGGT
+AAGTCAGTGGTGAAATCCTGCAGCTCAACTGTAGAGTTGCCATTGATACTGTCATTCTTGAGTATAGAGG
+>892;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGCATGCAGGCGGTTTGTT
+AAGTCAGATGTGAAAGCCCGGGGCTCAACCTCGGAACCGCATTTGAAACTGGGAGGCTAGAGTCTTGTAG
+>895;size=1;
+GTGCCAGCAGCCGCGGTAATACGAAGGGGGCTAGCGTTGCTCGGAATTACTGGGCGTAAAGGGATCGTAGGCGGACATTT
+AAGTCAGGGGTGAAATCCCAGAGCTCAACTCTGGAACTGCCTTTGATACTGGATGTCTTGAGTGTGAGAG
+>897;size=1;
+GTGCCAGCAGCCGCGGTAATACATAGGTCACAAGCGTTATCCGGAATTATTGGGCGTAAAGCGTTCGTAGGCGGTTTGTT
+AAGTCTAGAGTTAAAGCCTGGGGTTCAACCCCAGCCCGCTTTGGATACTGACAAACTAGAGTTACATAGA
+>902;size=1;
+GTGCCAGCAGCCGCGGTAATACGTAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGTGCGCAGGCGGCTACAT
+AAGACAGGTGTGAAATCTCGGAGCTCAACTCCGAAACTGCACTTGATACTGCCTTGCTCGAGTACTGGAG
+>904;size=1;
+CTGCCAGCAGCCGCGGTAATACAGAGGTCTCAAGCGTTGTTCGGATTCATTGGGCGTAAAGGGTGCGTAGGTGGCGGGGT
+AAGTCAGGTGTGAAATCTCGGAGCTCAACTCCGAAACTGCACTTGATACTGCCTTGCTCGAGTACTGGAG
+>91;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTGTCCGGATTTATTGGGTTTAAAGGGTACGTAGGCGGTGTATT
+AAGTCAGTGGTGAAAGCCTGCGGCTCAACCGTAGGAGTGCCATTGATACTGGTATACTTGAGTGTTGTAC
+>912;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGCACGCAGGCGGTTAATT
+AAGCTAGAAGTGAAAGCCCTGCGCTCAACGTGGGAAGGCATTTTAGAACTGGTTAACTAGAGTATTGGAG
+>916;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGAATGGT
+AAGTCAGGGGTGAAATCCTGCAGCTCAACTGTAGAGTTGCCATTGATACTGTCATTCTTGAGTATAGATG
+>918;size=1;
+GTGCCAGCAGCCGCGGTAATACATAGGTCACAAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGAATGGT
+AAGTCAGTGGTGAAATCCTGCAGCTCAACTGTAGAGTTGCCATTGATACTGTCATTCTTGAGTATAGATG
+>920;size=1;
+GTGCCAGCAGCCGCGGTAATACGTCGGGGGCAAGCGTTGTCCGGATTTACTGGGTATAAAGGGCTCGCAGGCGGGTTTGT
+AAGTCAGAGGTGAAATCCTACAGCTTAACTGTAGAACTGCCTTTGAAACTGCAAGTCTTGAGATCGGAAG
+>921;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTGTCCGGATTTATTGCGTTTAAAGGGTACGTAGGCGGTGTATT
+AAGTCAGTGGTGAAAGCCTGCGGCTCAACCGTAGGAGTGCCATTGATACTGGTATACTTGAGTGTTGTAA
+>925;size=1;
+GTGCCAGCAGCCGCGGTAATACAGAGGTCTCAAGCGTTGTTCGGATTCATTGGGCGTAAAGGGTGCGGAGGTGGCGGGGT
+AAGTCAGGTGTGAAATCTCGGAGCTCAACTCCGAAACTGCACTTGATACTGCCTTGCTCGAGTACTGGAG
+>930;size=1;
+GTGCCAGCAGCCGCGGTAATACATAGGGGGCAAGCGTTGTCCGGATTTACTGGGTATAAAGGGCTCGCAGGCGGGTTTGT
+AAGTCAGAGGTGAAATCCTACAGCTTAACTGTAGAACTGCCTTTGAAACTGCAAGTCTTGAGATCGGAAG
+>941;size=1;
+GTGCCAGCAGCCGCGGTAATACAGAGGTCTCAAGCGTTGTTCGGATTCATTGGGCGTAAAGGGTGCGTAGGTGGCGGGGT
+AAGTCAGGTGTGAAATCTCGGAGCTCAACTCTGGAACTGCCTTTGATACTGGATGTCTTGAGTGTGAGAG
+>942;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGCACGCAGGCGGTTAATT
+AAGCTAGAAGTGAAAGCCCTGCGCTCAACGTGGGAAGGCCTTTTAGAACTGGTTAACTATAGTATTGGAG
+>945;size=1;
+GTGCCAGCAGCCGTGGTAATACGTAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGTGCGCAGGCGGCTACAT
+AAGACAGGTGTGAAATCCCCGGGCTCAACCTGGGAATGGCGCTTGTGACTGTGTGGCTTGAGTTCGGCAG
+>947;size=1;
+GTGCCAGCAGCCGCGGTAATACGTAGGGCGCAAGCGTCATCCGGAATTATTGGGCGTAAAGAGCTCGTAGGCGGTTTGTC
+GCGTCTGCCGTGAAAGTCCGGGGCTTAACTCCGGATCTGCGGTGGGTACGGGCAGACTTGAGTGATGTAG
+>948;size=1;
+GTGCCAGCAGCCGCGGTAATACATAGGTCACAAGCGTTATCCGGATTTATTGGGCGTAAAGCGTTCGTAGGCGGTTTGTT
+AAGTCTAGAGTTAAAACCTGGGGTTCAACCCCAGCCCGCTTTGGATACTGACAAACTAGAGTTACATAGA
+>952;size=1;
+GTGCCAGCAGCCGCGGTAATACAGAGGTCTCAAGCGTTGTTCGGATTCATTGGGCGTAAAGGGTGCGTAGGTGGCGGGGT
+AAGTCAGGCGTGAAATCTCGGAGCTCAACTCCGAAACTGCACTTGATACTGCCTTGCTCGAGTACTGGAG
+>956;size=1;
+GTGCCAGCAGCCGCGGTAATACGAAGGGGGCTAGCGTTGCTCGGAATTACTGGGCGTAAAGGGAGCGTAGGCGGACATTT
+AAGTCAGGGGTGAAATCCCAGAGCTCAACTCTGGAACTGCGTTTGATACTGGATGTCTTGAGTGTGAGAG
+>958;size=1;
+GTGCCAGCAGCCGCGGTAATACGTAGGGCGCAAGCGTTATCCCGAATTATTGGGCGTAAAGAGCTCGTAGGCGGTTTGTC
+GCGTCTGCCGTGAAAGTCCGGGGCTTAACTCCGGATCTGCGGTGGGTACGGGCAGACTTGAGTGATGTAG
+>964;size=1;
+GTGCCAGCAGCCGCCGTAATACGGAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGCACGCAGGCGGTTAATT
+AAGCTAGAAGTGAAAGCCCTGCGCTCAACGTGGGAAGGCCTTTTAGAACTGGTTAACTAGAGTATTGGAG
+>967;size=1;
+GTGCCAGCAGCCGCGGTAATACAGAGGTCTCAAGCGTTGTTCGGATTCATTGGGCGTAAAGGGTGCGTAGGTGGCGGGGT
+AAGTCAGGTGTGAAATCTCGGAGCTCAACTCCGAAACTGCACTTGATACTGCCTTGCTCGAGTACTAGAG
+>984;size=1;
+GTGCCAGCAGCCGCGGTAATACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGAATGGT
+AAGTCAGTGGTGAAATCCTGCAGCTCAACTGTAGAGTTGCCATTGATACTGTCATTCTTGAGTAGAGATG
+>992;size=1;
+GTGCCAGCAGCCGCGGTAATACAGAGGTCTCAAGCGTTGTTCGGATTCATTGGGCGTAAAGGGTGCGTAGGTGGCGGGGT
+AAATCAGGTGTGAAATCTCGGAGCTCAACTCCGAAACTGCACTTGATACTGCCTTGCTCGAGTACTGGAG
+>993;size=1;
+GTGCCAGCAGCCGCGGTAATACGTAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGTGCGCAGGCGACTACAT
+AAGACAGGTGTGAAATCCCCGGGCTCAACCTGGGAATGGCGCTTGTGACTGTGTGGCTTGAGTTCGGCAG
+"""
+
+# Single chimeric sequence (251) with both parents to test alignment
+# and tabular output de novo
+single_chimera = """>22;size=93;
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTGTCCGGATTTATTGGGTTTAAAGGGTACGTAGGCGGTGTATT
+AAGTCAGTGGTGAAAGCCTGCGGCTCAACCGTAGGAGTGCCATTGATACTGGTATACTTGAGTGTTGTAA
+>45;size=67;
+GTGCCAGCAGCCGCGGTAATACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGAATGGT
+AAGTCAGTGGTGAAATCCTGCAGCTCAACTGTAGAGTTGCCATTGATACTGTCATTCTTGAGTATAGATG
+>251;size=2;
+GTGCCAGCAGCCGCGGTAATACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGAATGGT
+AAGTCAGTGGTGAAATCCTGCAGCTCAACTGTAGAGTTGCCATTGATACTGGTATACTTGAGTGTTGTAA
+"""
+
+# Alignment for chimeric sequence (251) against parents (22 and 45) de novo
+single_chimera_aln = """
+------------------------------------------------------------------------
+Query ( 150 nt) 251;size=2;
+ParentA ( 150 nt) 45;size=67;
+ParentB ( 150 nt) 22;size=93;
+
+A 1 GTGCCAGCAGCCGCGGTAATACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGAATGGT 80
+Q 1 GTGCCAGCAGCCGCGGTAATACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGAATGGT 80
+B 1 GTGCCAGCAGCCGCGGTAATACGGAGGgTGCGAGCGTTgTCCGGATTTATTGGGTTTAAAGGGTaCGTAGGCGGtgTatT 80
+Diffs A A A AA AA
+Votes + + + ++ ++
+Model AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+
+A 81 AAGTCAGTGGTGAAATCCTGCAGCTCAACTGTAGAGTTGCCATTGATACTGtcATtCTTGAGTaTaGatg 150
+Q 81 AAGTCAGTGGTGAAATCCTGCAGCTCAACTGTAGAGTTGCCATTGATACTGGTATACTTGAGTGTTGTAA 150
+B 81 AAGTCAGTGGTGAAAgCCTGCgGCTCAACcGTAGgagTGCCATTGATACTGGTATACTTGAGTGTTGTAA 150
+Diffs A A A AAA BB B B B BBB
+Votes + + + +++ ++ + + + +++
+Model AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAxxxxxxxxxxxxxxBBBBBBBBBBBBBBBBBBB
+
+Ids. QA 94.7%, QB 91.3%, AB 86.0%, QModel 100.0%, Div. +5.6%
+Diffs Left 13: N 0, A 0, Y 13 (100.0%); Right 8: N 0, A 0, Y 8 (100.0%), Score 0.8291
+"""
+
+# Tabular format for UCHIME output
+single_chimera_tab = """0.0000\t22;size=93;\t*\t*\t*\t*\t*\t*\t*\t*\t0\t0\t0\t0\t0\t0\t*\tN
+0.0000\t45;size=67;\t*\t*\t*\t*\t*\t*\t*\t*\t0\t0\t0\t0\t0\t0\t*\tN
+0.8291\t251;size=2;\t45;size=67;\t22;size=93;\t45;size=67;\t100.0\t94.7\t91.3\t86.0\t94.7\t13\t0\t0\t8\t0\t0\t5.3\tY
+"""
+
+# Single chimeric sequence for reference chimera checking
+single_chimera_ref = """>251;size=2;
+GTGCCAGCAGCCGCGGTAATACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGAATGGT
+AAGTCAGTGGTGAAATCCTGCAGCTCAACTGTAGAGTTGCCATTGATACTGGTATACTTGAGTGTTGTAA
+"""
+
+# Reference database for UCHIME ref
+uchime_ref_db = """>4304512
+TGAGTTTGATCCTGGCTCAGAACGAACGCTGGCGGCAGGCCTAACACATGCAAGTCGAACGCGTCCTTCGGGACGAGTGGCAGACGGGTGAGTAACGCGTGGGAACGTACCCTTTGGTTCGGAACAACTCCGGGAAACTGGAGCTAATACCGGATAAGCCCTTCGGGGGAAAGATTTATCGCCTTTAGAGCGGCCCGCGTCTGATTAGCTAGTTGGTGGTGTAATGGACCACCAAGGCGACGATCAGTAGCTGGTCTGAGAGGATGACCAGCCACATTGGGACTGAGACACGGCTCAAACTCCTACGGGAGGCAGCAGTGGGGAATCTTGCGCAATGGGCGAAAGCCTGACGCAGCCATGCCGCGTGTATGATGAAGGTCTTAGGATTGTAAAATACTTTCACCGGTGAAGATAATGACTGTAGCCGGAGAAGAAGCCCCGGCTAACTTCGTGCCAGCAGCCGCGGTAATACGAAGGGGGCTAGCGTTGCTC [...]
+>814974
+GATGAACGCTAGCCGTGTGCCTAATACATGCATGTCGTACGAGAGTACTTGTACTCTAGTGGCGAATGGGTGAGTAACACGTACCTAACCTACTTTTAAGATTGGAATAACTACTGGAAACAGTAGCTAATGCCGAATACGTATTAACTTCGCATGAAGATAATATAAAAGGAGCGTTTGCTCCGCTTAGAAATGGGGGTGCGCCATATTAGTTAGTTGGTAGGGTAATGGCCTACCAAGACGATGATATGTAGCCGGGCTGAGAAGCTGATCGGCCACACTGGGACTGAGATACGGCCCAGACTCCTACGGGAGGCAGCAGTAGGGAATTTTCCGCAATGAGCGAAAGCTTGACGGAGCGACACGGCGTGCAGGATGAAGGTCTTCGGATCGTAAACTGCTGTGGTTAGGGAAGAAAAGCAAAATAGGAAATGATTTTGCCCTGACGGTACCTAACTAGAAAGTGACGGCTAACTATGTGCCAGCAGCCGC [...]
+>160832
+GTCGAGCGGCGGACGGGTGAGTAACGGCTGGGAACCTGCCCTGACGCGGGGGATAACCGTTGGAAACGACGGCTAATACCGCATAATGTCTTAGTTCATTACGAGCTGGGACCAAAGGTGGCCTCTACATGTAAGCTATCGCGTTGGGATGGGCCCAGTTAGGATTAGCTAGTTGGTAAGGTAATGGCTTACCAAGGCRACGATCCTTAKCTGGTTTGAGAGGATGATCAGCCACACTGGGACTGAGACACGGCCCAGACTCCTACGGGAGGCAGCAGTGGGGAATATTGCACAATGGGGGAGACCCTGATGCAGCCATGCCGCGTGTGTGAAGAAGGCCTTCGGGTTGTAAAGCACTTTCAGCAGTGAGGAAGGTGGTGTACTTAATAAGTGCATGGCTTGACGTTAGCTGCAGAAGAAGCACCGGCTAACTCCGTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTAATCGGAATTACTGGGCGT [...]
+>573757
+GCCTAAGGCAGGCAAGTCGAACGATGATCTCCAGCCTGCTGGGGGGATTAGAGGAGAACGGGTGAGTAACACGTGAGTAACCTGCCCTTGACTCTGGGATAAGCCTGGGAAACTGGGTCTAATACTGGATACGACCTTCCCACGCATGTGGTGTTGGTGGAAAGCTTTTGTGGTTTTGGATGGACTCGCGGCCTATCAGCTTGTTGGTGGGGTAATGGCCTACCAAGGCGACGACGGGTAGCCGGCCTGAGAGGGTGGACGGCCACACTGGGACTGAGACACGGCCCAGACTCCTACGGGAGGCAGCAGTGGGGAATATTGCACAATGGGCGCAAGCCTGATGCAGCGACGCCGCGTGAGGGATGAAGGCCTTCGGGTTGTAAACCTCTTTCAGTAGGGAAGAAGCGAAAGTGACGGTACCTGCAGAAGAAGCGCCGGCTAACTACGTGCCAGCAGCCGCGGTAATACGTAGGGCGCAAGCGTTATCCGGAA [...]
+>579954
+TGGCGGCGTGGATAAGACATGCAAGTCGAACGGGATATTGTTTGTAGCAATACAAGCGATGTCTAGTGGCGTAAGGGTGCGTAACACGTGGGGAATCTGCCGAGAAGTGGGGGATAGCTCGCCGAAAGGCGAATTAATACCGCATGTGGTTAGGGAAGACATCTTCCCGACACTAAAGCCGGGGCAACCTGGCGCTTCTTGATGACCCCGCGGCCTATCAGCTAGTCGGTGAGGTAACGGCTCACCAAGGCTATGACGGGTAGCTGGTCTGAGAGGACGACCAGCCACACTGGAACTGAGACACGGTCCAGACACCTACGGGTGGCAGCAGTCGAGAATTTTTCTCAATGGGGGAAACCCTGAAGGAGCGACGCCGCGTGGAGGATGAAGGTCTTCGGATTGTAAACTCCTGTCATGCGGGAACAATTGTCACCGATTAACTGTCGGGGGCTTGATAGTACCAGAAGAGGAAGAGACGGCTAACTCTGTGCC [...]
+>311922
+GATTGAACGCTGGCGGCAGGCCTAACACATGCAAGTCGAACGGTAACATGAAGTGCTTGCACTTTGATGACGAGTGGCGGACGGGTGAGTAATGCTTGGGAATTTGCCTTTGCGCGGGGGATAACCATTGGAAACGATGGCTAATACCGCATAATGTCTACGGACCAAAGGGGGCTTAGGCTCCCACGTGAAGAGAAGCCCAAGTGAGATTAGCTAGTTGGTGGGGTAAAGGCTCACCAAGGCGACGATCTCTAGCTGTTCCGAGAGGAAGATCAGCCACACTGGGACTGAGACACGGCCCAGACTCCTACGGGAGGCAGCAGTGGGGAATATTGCGCAATGGGGGGAACCCTGACGCAGCCATGCCGCGTGTGTGAAGAAGGCCTTCGGGTTGTAAAGCACTTTCAGTTATGAGGAAAGGTTGTTGGTTAATACCCAGCAGCTGTGACGTTAATAACAGAAGAAGCACCGGCTAACTCCGTGCCAGCAGCC [...]
+>4370324
+TCAGGATGAACGCTAGCGACAGGCCTAACACATGCAAGTCGAGGGGTAACATTGGTAGCTTGCTACCAGATGACGACCGGCGCACGGGTGAGTAACGCGTATGCAACCTTCCTTTAACAGGAGAATAGCCCCCGGAAACGGGGATTAATGCTCCATGGCACTCTAATTTCGCATGGAATAAGAGTTAAAGTTCCGACGGTTAAAGATGGGCATGCGTGACATTAGCCAGTTGGCGGGGTAACGGCCCACCAAAGCAACGATGTCTAGGGGTTCTGAGAGGAAGGTCCCCCACACTGGTACTGAGACACGGACCAGACTCCTACGGGAGGCAGCAGTGAGGAATATTGGTCAATGGACGCAAGTCTGAACCAGCCATGTCGCGTGCAGGATGACTGCCCTATGGGTTGTAAACTGCTTTTGTACGGGAAGAAATGTACTTACGAGTAAGTATTTGCCGGTACCGTACGAATAAGCATCGGCTAACTCCGTGCC [...]
+>646991
+AGTTTGATCTTGGCTCAGGATGAACGCTAGCGGCAGGCCTAATACATGCAAGTCGTGGGGCATCAGCGCCTTCGGGCGGCTGGCGACCGGCGCACGGGTGCGTAACGCGTATGCAACCTGCCCACAACAGGGGGACAGCCTTCGGAAACGAGGATTAATACCCCATGATACAGGGGTACCGCATGGTGCCTTTCGTCAAAGGTTTCGGCCGGTTGTGGATGGGCATGCGTCCCATTAGCTAGTAGGCGGGGTAACGGCCCACCTAGGCTATGATGGGTAGGGGTTCTGAGAGGACGATCCCCCACACTGGTACTGAGATACGGACCAGACTCCTACGGGAGGCAGCAGTAGGGAATATTGGGCAATGGGCGGAAGCCTGACCCAGCCATGCCGCGTGCAGGACGAAGGCCCTCGGGTCGTAAACTGCTTTTATACGGGAAGAACTGCGTCCTGCGGGACGCGCTGACGGTACCGTACGAATAAGCACCGGCT [...]
+>510574
+GAGTTTGATCCTGGCTCAGATTGAACGCTGGCGGAATGCTTTACACATGCAAGTCGAGCGGCAGCGCGGGGGCAACCCTGGCGGCGAGCGGCGAACGGGTGAGTAACACATCGGAACGTACCCAATTGAGGGGGATAGCCCGGCGAAAGCCGGATTAATACCGCATAAGTCCTGAGGGAGAAAGCGGGGGACCGCAAGGCCTCGCGCGATTGGAGCGGCCGATGTCGGATTAGCTAGTTGGTGGGGTAAAGGCTCACCAAGGCGACGATCCGTAGCTGGTCTGAGAGGATGATCAGCCACACTGGGACTGAGACACGGCCCAGACTCCTACGGGAGGCAGCAGTGGGGAATTTTGGACAATGGGCGCAAGCCTGATCCAGCCATTCCGCGTGAGTGAAGAAGGCCTTCGGGTTGTAAAGCTCTTTCGGACGGAAAGAAATCGCCCGGGTAAATAATCCGGGTGGATGACGGTACCGTAAGAAGAAGCACCGG [...]
+>769294
+AGAGTTTGATCCTGGCTCAGGACGAACGCTGGCGGCGTGCTTAACACATGCAAGTCAAGGGGAAAGTTTTCTTCGGAGAATTAGTATACTGGCGCACGGGTGAGTAATGTATAAGTAATCTACCTATAGGAAAGGAATAACTCTAAGAAATTGGGGCTAATACCATATAATGCAGCGGCACCGCATGGTGATGTTGTTAAAGTAATTTATTACGCCTATAGATGAGCTTGTATTCGATTAGCTTGTTGGTAAGGTAACGGCTTACCAAGGCGACGATCGATAGCTGGTCTGAGAGGATGATCAGCCACACTGGAACTGAGACACGGTCCAGACTCCTACGGGAGGCAGCAGTGAGGAATATTGGGCAATGGACGAAAGTCTGACCCAGCAACGCCGCGTGGAGGATGAAGGTCGTAAGATCGTAAACTCCTTTTTTGGGGGAAGAAAAAACAGGTTTGTAGCCTGTATTGACTGTACCCTAAGAATAAGCCC [...]
+"""
+
+# Reference database for single chimera sequence (ref)
+uchime_single_ref_db = """>646991
+AGTTTGATCTTGGCTCAGGATGAACGCTAGCGGCAGGCCTAATACATGCAAGTCGTGGGGCATCAGCGCCTTCGGGCGGCTGGCGACCGGCGCACGGGTGCGTAACGCGTATGCAACCTGCCCACAACAGGGGGACAGCCTTCGGAAACGAGGATTAATACCCCATGATACAGGGGTACCGCATGGTGCCTTTCGTCAAAGGTTTCGGCCGGTTGTGGATGGGCATGCGTCCCATTAGCTAGTAGGCGGGGTAACGGCCCACCTAGGCTATGATGGGTAGGGGTTCTGAGAGGACGATCCCCCACACTGGTACTGAGATACGGACCAGACTCCTACGGGAGGCAGCAGTAGGGAATATTGGGCAATGGGCGGAAGCCTGACCCAGCCATGCCGCGTGCAGGACGAAGGCCCTCGGGTCGTAAACTGCTTTTATACGGGAAGAACTGCGTCCTGCGGGACGCGCTGACGGTACCGTACGAATAAGCACCGGCT [...]
+>4370324
+TCAGGATGAACGCTAGCGACAGGCCTAACACATGCAAGTCGAGGGGTAACATTGGTAGCTTGCTACCAGATGACGACCGGCGCACGGGTGAGTAACGCGTATGCAACCTTCCTTTAACAGGAGAATAGCCCCCGGAAACGGGGATTAATGCTCCATGGCACTCTAATTTCGCATGGAATAAGAGTTAAAGTTCCGACGGTTAAAGATGGGCATGCGTGACATTAGCCAGTTGGCGGGGTAACGGCCCACCAAAGCAACGATGTCTAGGGGTTCTGAGAGGAAGGTCCCCCACACTGGTACTGAGACACGGACCAGACTCCTACGGGAGGCAGCAGTGAGGAATATTGGTCAATGGACGCAAGTCTGAACCAGCCATGTCGCGTGCAGGATGACTGCCCTATGGGTTGTAAACTGCTTTTGTACGGGAAGAAATGTACTTACGAGTAAGTATTTGCCGGTACCGTACGAATAAGCATCGGCTAACTCCGTGCC [...]
+"""
+
+# 3-way alignment for single chimeric sequence against reference
+# database using UCHIME
+single_chimera_ref_aln = """
+------------------------------------------------------------------------
+Query ( 150 nt) 251;size=2;
+ParentA ( 1403 nt) 4370324
+ParentB ( 1480 nt) 646991
+
+A 1 tcaggatgaacgctagcgacaggcctaacacatgcaagtcgaggggtaacattggtagcttgctaccagatgacgaccgg 80
+Q 1 -------------------------------------------------------------------------------- 0
+B 1 agtttgatcttggctcaggatgaacgctagcggcaggcctaatacatgcaagtcgtggggcatcagcgccttcgggcggc 80
+Diffs
+Votes
+Model AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+
+A 81 cgcacgggtgagtaacgcgtatgcaaccttcctttaacaggagaatagcccccggaaacggggattaatgctccatggca 160
+Q 1 -------------------------------------------------------------------------------- 0
+B 81 tggcgaccggcgcacgggtgcgtaacgcgtatgcaacctgcccacaacagggggacagccttcggaaacgaggattaata 160
+Diffs
+Votes
+Model AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+
+A 161 ctctaatttcgcatggaataagagttaaagttccgacggttaaagatgggcatgcgtgacattagccagttggcggggta 240
+Q 1 -------------------------------------------------------------------------------- 0
+B 161 ccccatgatacaggggtaccgcatggtgcctttcgtcaaaggtttcggccggttgtggatgggcatgcgtcccattagct 240
+Diffs
+Votes
+Model AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+
+A 241 acggcccaccaaagcaacgatgtctaggggttctgagaggaaggtcccccacactggtactgagacacggaccagactcc 320
+Q 1 -------------------------------------------------------------------------------- 0
+B 241 agtaggcggggtaacggcccacctaggctatgatgggtaggggttctgagaggacgatcccccacactggtactgagata 320
+Diffs
+Votes
+Model AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+
+A 321 tacgggaggcagcagtgaggaatattggtcaatggacgcaagtctgaaccagccatgtcgcgtgcaggatgactgcccta 400
+Q 1 -------------------------------------------------------------------------------- 0
+B 321 cggaccagactcctacgggaggcagcagtagggaatattgggcaatgggcggaagcctgacccagccatgccgcgtgcag 400
+Diffs
+Votes
+Model AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+
+A 401 tgggttgtaaactgcttttgtacgggaagaaatgtacttacgagtaagtatttgccggtaccgtacgaataagcatcggc 480
+Q 1 -------------------------------------------------------------------------------- 0
+B 401 gacgaaggccctcgggtcgtaaactgcttttatacgggaagaactgcgtcctgcgggacgcgctgacggtaccgtacgaa 480
+Diffs
+Votes
+Model AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+
+A 481 taactcc-----------GTGCCAGCAGCCGCGGTAATACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGG 549
+Q 1 ------------------GTGCCAGCAGCCGCGGTAATACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGG 62
+B 481 taagcaccggctaactccGTGCCAGCAGCCGCGGTAATACGGAGGgTGCGAGCGTTgTCCGGATTTATTGGGTTTAAAGG 560
+Diffs A A
+Votes + +
+Model AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+
+A 550 GTGCGTAGGCGGAATGGTAAGTCAGTGGTGAAATCCTGCAGCTCAACTGTAGAGTTGCCATTGATACTGtcATtCTTGAG 629
+Q 63 GTGCGTAGGCGGAATGGTAAGTCAGTGGTGAAATCCTGCAGCTCAACTGTAGAGTTGCCATTGATACTGGTATACTTGAG 142
+B 561 GTaCGTAGGCGGtgTatTAAGTCAGTGGTGAAAgCCTGCgGCTCAACcGTAGgagTGCCATTGATACTGGTATACTTGAG 640
+Diffs A AA AA A A A AAA BB B
+Votes + ++ ++ + + + +++ ++ +
+Model AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAxxxxxxxxxxxxxxBBBBBBBBBBB
+
+A 630 TaTaGatgaggtaggcggaatgagtagtgtagcggtgaaatgcatagatattactcagaacaccaattgcgaaggcagct 709
+Q 143 TGTTGTAA------------------------------------------------------------------------ 150
+B 641 TGTTGTAAgggtgggcggaattccgcatgtagcggtgaaatgcatagatatgcggaggaacaccgagagcgaaggcagct 720
+Diffs B B BBB
+Votes + + ++
+Model BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
+
+A 710 tactaaactataactgacgctgaagcacgaaagcgtgggtatcaaacaggattagataccctggtagtccacgccgtaaa 789
+Q 151 -------------------------------------------------------------------------------- 150
+B 721 cactaggcacgactgacgctgaggtacgaaagcgtggggagcgaacaggattagataccctggtagtccacgccgtaaac 800
+Diffs
+Votes
+Model BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
+
+A 790 cgatgattactggttgtttgcaatacaccgcaagcgactgagcgaaagcattaagtaatccacctggggagtacgtcggc 869
+Q 151 -------------------------------------------------------------------------------- 150
+B 801 gatggtaactaggtgtgtgcgacacagagtgcgcgcccaagcgaaagcgataagttacccacctggggagtacgctcgca 880
+Diffs
+Votes
+Model BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
+
+A 870 aacgatgaaactcaaaggaattgacgggggcccgcacaagcggtggaacatgtggtttaattcgatgatacgcgaggaac 949
+Q 151 -------------------------------------------------------------------------------- 150
+B 881 agagtgaaactcaaaggaattgacgggggtccgcacaagcggtggagcatgtggtttaattcgatgatacgcgaggaacc 960
+Diffs
+Votes
+Model BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
+
+A 950 cttacctgggtttaaatgggaagtgacaggggtagaaatacctttttcttcggacacttttcaaggtgctgcatggttgt 1029
+Q 151 -------------------------------------------------------------------------------- 150
+B 961 ttacctgggctcgaatggcctatgacaggcccagagatgggcccttcctcggacataggtcaaggtgctgcatggctgtc 1040
+Diffs
+Votes
+Model BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
+
+A 1030 cgtcagctcgtgccgtgaggtgtcgggttaagtcccataacgagcgcaacccctgttgttagttaccagcatgtaaagat 1109
+Q 151 -------------------------------------------------------------------------------- 150
+B 1041 gtcagctcgtgccgtgaggtgttgggttaagtcccgcaacgagcgcaacccttgcccctagttgccatcaggtaaagctg 1120
+Diffs
+Votes
+Model BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
+
+A 1110 ggggactctaacaagactgccggtgtaaaccgcgaggaaggtggggatgacgtcaaatcagcacggcccttacatccagg 1189
+Q 151 -------------------------------------------------------------------------------- 150
+B 1121 gggactctagggggactgcctgcgcaagcagagaggaaggaggggacgatgtcaagtcatcatggcccttacgcccaggg 1200
+Diffs
+Votes
+Model BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
+
+A 1190 gctacacacgtgttacaatggcaggtacaaagggcagctacacagcgatgtgatgctaatctcgaaaacctgtcccagtt 1269
+Q 151 -------------------------------------------------------------------------------- 150
+B 1201 ctacacacgtgctacaatggcgcatacagagggtagccacctggcgacagggcgccaatctcaaaaagtgcgtctcagtt 1280
+Diffs
+Votes
+Model BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
+
+A 1270 cggattgaagtctgcaacccgacttcatgaagctggaatcgctagtaatcgcgcatcagccatggcgcggtgaatacgtt 1349
+Q 151 -------------------------------------------------------------------------------- 150
+B 1281 cggatcggggcctgcaactcggccccgtgaagtcggaatcgctagtaatcgcagatcagccatgctgcggtgaatacgtt 1360
+Diffs
+Votes
+Model BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
+
+A 1350 cccgggccttgtacactccgcccgtcaagccatggaagccgggagtacctgaag-------------------------- 1403
+Q 151 -------------------------------------------------------------------------------- 150
+B 1361 cccgggccttgtacacaccgcccgtcaagccatggaagccgggggcacctgaagtcgggggtaacaacccgcctagggtg 1440
+Diffs
+Votes
+Model BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
+
+A 1404 ---------------------------------------- 1403
+Q 151 ---------------------------------------- 150
+B 1441 aaactggtaactggggctaagtcgtaacaaggtaaccgta 1480
+Diffs
+Votes
+Model BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
+
+Ids. QA 95.3%, QB 91.2%, AB 86.5%, QModel 100.0%, Div. +5.0%
+Diffs Left 13: N 0, A 0, Y 13 (100.0%); Right 7: N 0, A 0, Y 7 (100.0%), Score 0.7254
+"""
+
+# UCHIME tabular output for single chimeric sequence
+single_chimera_ref_tab = """0.7254\t251;size=2;\t4370324\t646991\t4370324\t100.0\t95.3\t91.2\t86.5\t95.3\t13\t0\t0\t7\t0\t0\t4.7\tY
+"""
+
+
+if __name__ == '__main__':
+ main()
diff --git a/bfillings/uclust.py b/bfillings/uclust.py
new file mode 100644
index 0000000..9c89691
--- /dev/null
+++ b/bfillings/uclust.py
@@ -0,0 +1,606 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+
+"""Application controller for uclust version 1.1.579
+
+Includes application controllers for uclust and
+convenience wrappers for different functions of uclust, including
+sorting fasta files, finding clusters, converting to cd-hit format and
+searching and aligning against a database. Also contains
+a parser for the resulting .uc file.
+
+Modified from cogent.app.cd_hit.py on 1-21-10, written by Daniel McDonald.
+"""
+
+from os.path import splitext, basename, join
+from tempfile import gettempdir, mkstemp
+
+from cogent import DNA
+
+from burrito.util import (CommandLineApplication, ResultPath,
+ ApplicationError, ApplicationNotFoundError)
+from burrito.parameters import ValuedParameter, FlagParameter
+from skbio.parse.sequences import parse_fasta
+from skbio.util import remove_files
+
+
+class UclustParseError(Exception):
+ pass
+
+
+class Uclust(CommandLineApplication):
+
+ """ Uclust ApplicationController
+
+ """
+
+ _command = 'uclust'
+ _input_handler = '_input_as_parameters'
+ _parameters = {
+
+ # Fasta input file for merge-sort function
+ '--mergesort': ValuedParameter('--', Name='mergesort', Delimiter=' ',
+ IsPath=True),
+
+ # Output file, used by several different functions
+ '--output': ValuedParameter('--', Name='output', Delimiter=' ',
+ IsPath=True),
+
+ # Sets temp directory for uclust to create temp fasta file
+ '--tmpdir': ValuedParameter('--', Name='tmpdir', Delimiter=' ',
+ IsPath=True),
+
+ # input filename, fasta format
+ '--input': ValuedParameter('--', Name='input', Delimiter=' ',
+ IsPath=True),
+
+ # Output filename will be in uclust (.uc) format
+ # Output cluster file, required parameter
+ '--uc': ValuedParameter('--', Name='uc', Delimiter=' ',
+ IsPath=True),
+
+ # ID percent for OTU, by default is 97%
+ '--id': ValuedParameter('--', Name='id', Delimiter=' ', IsPath=False),
+
+ # Enable reverse strand matching; when enabled, memory usage is
+ # expected to double for uclust
+ '--rev': FlagParameter('--', Name='rev'),
+
+ # 'library' file -- a reference of sequences representing pre-existing
+ # clusters
+ '--lib': ValuedParameter('--', Name='lib', Delimiter=' ', IsPath=True),
+
+ # only compare sequences to the library file, don't add new clusters
+ # for sequences which don't hit the library
+ '--libonly': FlagParameter('--', Name='libonly'),
+
+ # Maximum hits before quitting search (default 1, 0=infinity).
+ '--maxaccepts':
+ ValuedParameter('--', Name='maxaccepts', Delimiter=' '),
+
+ # Maximum rejects before quitting search (default 8, 0=infinity).
+ '--maxrejects':
+ ValuedParameter('--', Name='maxrejects', Delimiter=' '),
+
+ # Target nr. of common words (default 8, 0=don't step)
+ '--stepwords': ValuedParameter('--', Name='stepwords', Delimiter=' '),
+
+ # Word length for windex (default 5 aa.s, 8 nuc.s).
+ '--w': ValuedParameter('--', Name='w', Delimiter=' '),
+
+ # output fp for pairwise aligned sequences
+ '--fastapairs': ValuedParameter('--', Name='fastapairs', Delimiter=' ',
+ IsPath=True),
+
+ # input filename, .uc format
+ '--uc2clstr': ValuedParameter('--', Name='uc2clstr', Delimiter=' ',
+ IsPath=True),
+
+ # Don't assume input is sorted by length (default assume sorted).
+ '--usersort': FlagParameter('--', Name='usersort'),
+
+ # Same as --maxrejects 0 --nowordcountreject.
+ # comes with a performance hit.
+ '--exact': FlagParameter('--', Name='exact'),
+
+ # Same as --maxrejects 0 --maxaccepts 0 --nowordcountreject --
+ # comes with a performance hit.
+ '--optimal': FlagParameter('--', Name='optimal'),
+
+ '--stable_sort': FlagParameter('--', Name='stable_sort'),
+
+ # From uclust help:
+ # Write all accepts to .uc file (default top hit/no match only).
+ '--allhits': FlagParameter('--', Name='allhits'),
+ }
+
+ _suppress_stdout = False
+ _suppress_stderr = False
+
+ def _input_as_parameters(self, data):
+ """ Set the input path (a fasta filepath)
+ """
+ # The list of values which can be passed on a per-run basis
+ allowed_values = ['--input', '--uc', '--fastapairs',
+ '--uc2clstr', '--output', '--mergesort']
+
+ unsupported_parameters = set(data.keys()) - set(allowed_values)
+ if unsupported_parameters:
+ raise ApplicationError(
+ "Unsupported parameter(s) passed when calling uclust: %s" %
+ ' '.join(unsupported_parameters))
+
+ for v in allowed_values:
+ # turn the parameter off so subsequent runs are not
+ # affected by parameter settings from previous runs
+ self.Parameters[v].off()
+ if v in data:
+ # turn the parameter on if specified by the user
+ self.Parameters[v].on(data[v])
+
+ return ''
+
+ def _get_result_paths(self, data):
+ """ Set the result paths """
+
+ result = {}
+
+ result['Output'] = ResultPath(
+ Path=self.Parameters['--output'].Value,
+ IsWritten=self.Parameters['--output'].isOn())
+
+ result['ClusterFile'] = ResultPath(
+ Path=self.Parameters['--uc'].Value,
+ IsWritten=self.Parameters['--uc'].isOn())
+
+ result['PairwiseAlignments'] = ResultPath(
+ Path=self.Parameters['--fastapairs'].Value,
+ IsWritten=self.Parameters['--fastapairs'].isOn())
+
+ return result
+
+ def _accept_exit_status(self, exit_status):
+ """ Test for acceptable exit status
+
+ uclust can seg fault and still generate a parsable .uc file
+ so we explicitly check the exit status
+
+ """
+ return exit_status == 0
+
+ def getHelp(self):
+ """Method that points to documentation"""
+ help_str =\
+ """
+ UCLUST is hosted at:
+ http://www.drive5.com/uclust/
+
+ The following papers should be cited if this resource is used:
+
+ Paper pending. Check with Robert Edgar who is writing the paper
+ for uclust as of March 2010. Cite the above URL for the time being.
+ """
+ return help_str
+
+# Start functions for processing uclust output files
+
+
+def get_next_record_type(lines, types):
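+ """Yield the tab-split fields of lines whose first character is in types."""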
+ for line in lines:
+ line = line.strip()
+ if line and line[0] in types:
+ yield line.split('\t')
+ return
+
+
+def get_next_two_fasta_records(lines):
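+ """Yield successive pairs of (label, sequence) records parsed from lines."""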
+ result = []
+ for record in parse_fasta(lines):
+ result.append(record)
+ if len(result) == 2:
+ yield result
+ result = []
+ return
+
+
+def process_uclust_pw_alignment_results(fasta_pairs_lines, uc_lines):
+ """ Process results of uclust search and align """
+ alignments = get_next_two_fasta_records(fasta_pairs_lines)
+ for hit in get_next_record_type(uc_lines, 'H'):
+ matching_strand = hit[4]
+ if matching_strand == '-':
+ strand_id = '-'
+ target_rev_match = True
+ elif matching_strand == '+':
+ strand_id = '+'
+ target_rev_match = False
+ elif matching_strand == '.':
+ # protein sequence, so no strand information
+ strand_id = ''
+ target_rev_match = False
+ else:
+ raise UclustParseError("Unknown strand type: %s" % matching_strand)
+ uc_query_id = hit[8]
+ uc_target_id = hit[9]
+ percent_id = float(hit[3])
+
+ fasta_pair = next(alignments)
+
+ fasta_query_id = fasta_pair[0][0]
+ aligned_query = fasta_pair[0][1]
+
+ if fasta_query_id != uc_query_id:
+ raise UclustParseError("Order of fasta and uc files do not match." +
+ " Got query %s but expected %s." %
+ (fasta_query_id, uc_query_id))
+
+ fasta_target_id = fasta_pair[1][0]
+ aligned_target = fasta_pair[1][1]
+
+ if fasta_target_id != uc_target_id + strand_id:
+ raise UclustParseError("Order of fasta and uc files do not match." +
+ " Got target %s but expected %s." %
+ (fasta_target_id, uc_target_id + strand_id))
+
+ if target_rev_match:
+ query_id = uc_query_id + ' RC'
+ aligned_query = DNA.rc(aligned_query)
+ target_id = uc_target_id
+ aligned_target = DNA.rc(aligned_target)
+ else:
+ # no reverse-complementing needed; use ids and alignments as-is
+ query_id = uc_query_id
+ target_id = uc_target_id
+
+ yield (query_id, target_id, aligned_query, aligned_target, percent_id)
+
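+# A minimal usage sketch for process_uclust_pw_alignment_results (the
+# records below are illustrative, not output from a real uclust run).
+# Note the fasta target label carries the strand suffix from the .uc hit:
+#
+# fasta_pairs = [">q1", "ACGT", ">t1+", "ACGT"]
+# uc = ["H\t0\t4\t100.0\t+\t0\t0\t4M\tq1\tt1"]
+# for rec in process_uclust_pw_alignment_results(fasta_pairs, uc):
+# print(rec) # ('q1', 't1', 'ACGT', 'ACGT', 100.0)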
+
+def clusters_from_uc_file(uc_lines,
+ error_on_multiple_hits=True):
+ """ Given an open .uc file, return lists (clusters, failures, new_seeds)
+
+ uc_lines: open .uc file, or similar object -- this is the output
+ generated by uclust's -uc parameter
+ error_on_multiple_hits: if True (default), when a single query hits
+ to multiple seeds, as can happen when --allhits is passed to uclust,
+ throw a UclustParseError. if False, when a single query hits to
+ multiple seeds, it will appear in each cluster.
+
+ This function processes all hit (H), seed (S), and no hit (N) lines
+ to return all clusters, failures, and new_seeds generated in
+ a uclust run. failures should only arise when users have passed
+ --lib and --libonly, and a sequence doesn't cluster to any existing
+ reference database sequences.
+
+ """
+ clusters = {}
+ failures = []
+ seeds = []
+ all_hits = set()
+ # the types of hit lines we're interested in here
+ # are hit (H), seed (S), library seed (L) and no hit (N)
+ hit_types = frozenset('HSNL')
+ for record in get_next_record_type(uc_lines, hit_types):
+ hit_type = record[0]
+ # sequence identifiers from the fasta header lines only
+ # (no comment data) are stored to identify a sequence in
+ # a cluster -- strip off any comments here as this value
+ # is used in several places
+ query_id = record[8].split()[0]
+ target_cluster = record[9].split()[0]
+ if hit_type == 'H':
+ if error_on_multiple_hits and query_id in all_hits:
+ raise UclustParseError("Query id " + query_id + " hit multiple seeds. "
+ "This can happen if --allhits is "
+ "enabled in the call to uclust, which isn't supported by default. "
+ "Call clusters_from_uc_file(lines, error_on_multiple_hits=False) to "
+ "allow a query to cluster to multiple seeds.")
+ else:
+ # add the hit to its existing cluster (either library
+ # or new cluster)
+ clusters[target_cluster].append(query_id)
+ all_hits.add(query_id)
+ elif hit_type == 'S':
+ # a new seed was identified -- create a cluster with this
+ # sequence as the first instance
+ if query_id in clusters:
+ raise UclustParseError("A seq id was provided as a seed, but that seq id already "
+ "represents a cluster. Are there overlapping seq ids in your "
+ "reference and input files or repeated seq ids in either? "
+ "Offending seq id is %s" % query_id)
+ clusters[query_id] = [query_id]
+ seeds.append(query_id)
+ elif hit_type == 'L':
+ # a library seed was identified -- create a cluster with this
+ # id as the index, but don't give it any instances yet bc the hit
+ # line will be specified separately. note we need to handle these
+ # lines separately from the H lines to detect overlapping seq ids
+ # between the reference and the input fasta files
+ if query_id in clusters:
+ raise UclustParseError("A seq id was provided as a seed, but that seq id already "
+ "represents a cluster. Are there overlapping seq ids in your "
+ "reference and input files or repeated seq ids in either? "
+ "Offending seq id is %s" % query_id)
+ clusters[query_id] = []
+ elif hit_type == 'N':
+ # a failure was identified -- add it to the failures list
+ failures.append(query_id)
+ else:
+ # shouldn't be possible to get here, but provided for
+ # clarity
+ raise UclustParseError(
+ "Unexpected result parsing line:\n%s" %
+ '\t'.join(record))
+
+ # will need to return the full clusters dict, I think, to support
+ # useful identifiers in reference database clustering
+ # return clusters.values(), failures, seeds
+ return clusters, failures, seeds
+
+# End functions for processing uclust output files
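+
+# A minimal usage sketch for clusters_from_uc_file (the .uc records below
+# are illustrative, not output from a real uclust run):
+#
+# uc_lines = [
+# "S\t0\t150\t*\t*\t*\t*\t*\tseq1\t*",
+# "H\t0\t150\t97.3\t+\t0\t0\t150M\tseq2\tseq1",
+# "N\t*\t150\t*\t*\t*\t*\t*\tseq3\t*",
+# ]
+# clusters, failures, seeds = clusters_from_uc_file(uc_lines)
+# clusters == {'seq1': ['seq1', 'seq2']}
+# failures == ['seq3']; seeds == ['seq1']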
+
+
+# Start uclust convenience functions
+def uclust_fasta_sort_from_filepath(
+ fasta_filepath,
+ output_filepath=None,
+ tmp_dir=gettempdir(),
+ HALT_EXEC=False):
+ """Generates sorted fasta file via uclust --mergesort."""
+ if not output_filepath:
+ _, output_filepath = mkstemp(dir=tmp_dir, prefix='uclust_fasta_sort',
+ suffix='.fasta')
+
+ app = Uclust(params={'--tmpdir': tmp_dir},
+ TmpDir=tmp_dir, HALT_EXEC=HALT_EXEC)
+
+ app_result = app(data={'--mergesort': fasta_filepath,
+ '--output': output_filepath})
+
+ return app_result
+
+
+def uclust_search_and_align_from_fasta_filepath(
+ query_fasta_filepath,
+ subject_fasta_filepath,
+ percent_ID=0.75,
+ enable_rev_strand_matching=True,
+ max_accepts=8,
+ max_rejects=32,
+ tmp_dir=gettempdir(),
+ HALT_EXEC=False):
+ """ query seqs against subject fasta using uclust,
+
+ return global pw alignment of best match
+ """
+
+ # Explanation of parameter settings
+ # id - min percent id to count a match
+ # maxaccepts = 8, searches for best match rather than first match
+ # (0 => infinite accepts, or good matches before
+ # quitting search)
+ # maxrejects = 32, max rejected candidates before giving up on a query
+ # libonly = True, does not add sequences to the library if they don't
+ # match something there already. This effectively makes
+ # uclust a search tool rather than a clustering tool
+
+ params = {'--id': percent_ID,
+ '--maxaccepts': max_accepts,
+ '--maxrejects': max_rejects,
+ '--libonly': True,
+ '--lib': subject_fasta_filepath,
+ '--tmpdir': tmp_dir}
+
+ if enable_rev_strand_matching:
+ params['--rev'] = True
+
+ # instantiate the application controller
+ app = Uclust(params,
+ TmpDir=tmp_dir, HALT_EXEC=HALT_EXEC)
+
+ # apply uclust
+ _, alignment_filepath = mkstemp(dir=tmp_dir, prefix='uclust_alignments',
+ suffix='.fasta')
+ _, uc_filepath = mkstemp(dir=tmp_dir, prefix='uclust_results',
+ suffix='.uc')
+ input_data = {'--input': query_fasta_filepath,
+ '--fastapairs': alignment_filepath,
+ '--uc': uc_filepath}
+ app_result = app(input_data)
+
+ # yield the pairwise alignments
+ for result in process_uclust_pw_alignment_results(
+ app_result['PairwiseAlignments'], app_result['ClusterFile']):
+ try:
+ yield result
+ except GeneratorExit:
+ break
+
+ # clean up the temp files that were generated
+ app_result.cleanUp()
+
+ return
+
+
+def uclust_cluster_from_sorted_fasta_filepath(
+ fasta_filepath,
+ uc_save_filepath=None,
+ percent_ID=0.97,
+ max_accepts=1,
+ max_rejects=8,
+ stepwords=8,
+ word_length=8,
+ optimal=False,
+ exact=False,
+ suppress_sort=False,
+ enable_rev_strand_matching=False,
+ subject_fasta_filepath=None,
+ suppress_new_clusters=False,
+ stable_sort=False,
+ tmp_dir=gettempdir(),
+ HALT_EXEC=False):
+ """ Returns clustered uclust file from sorted fasta"""
+ output_filepath = uc_save_filepath
+ if not output_filepath:
+ _, output_filepath = mkstemp(dir=tmp_dir, prefix='uclust_clusters',
+ suffix='.uc')
+
+ params = {'--id': percent_ID,
+ '--maxaccepts': max_accepts,
+ '--maxrejects': max_rejects,
+ '--stepwords': stepwords,
+ '--w': word_length,
+ '--tmpdir': tmp_dir}
+ app = Uclust(params,
+ TmpDir=tmp_dir, HALT_EXEC=HALT_EXEC)
+
+ # Set any additional parameters specified by the user
+ if enable_rev_strand_matching:
+ app.Parameters['--rev'].on()
+ if optimal:
+ app.Parameters['--optimal'].on()
+ if exact:
+ app.Parameters['--exact'].on()
+ if suppress_sort:
+ app.Parameters['--usersort'].on()
+ if subject_fasta_filepath:
+ app.Parameters['--lib'].on(subject_fasta_filepath)
+ if suppress_new_clusters:
+ app.Parameters['--libonly'].on()
+ if stable_sort:
+ app.Parameters['--stable_sort'].on()
+
+ app_result = app({'--input': fasta_filepath, '--uc': output_filepath})
+ return app_result
+
+
+def get_output_filepaths(output_dir, fasta_filepath):
+ """ Returns filepaths for intermediate file to be kept """
+ return join(output_dir,
+ splitext(basename(fasta_filepath))[0] + '_clusters.uc')
+
+
+def get_clusters_from_fasta_filepath(
+ fasta_filepath,
+ original_fasta_path,
+ percent_ID=0.97,
+ max_accepts=1,
+ max_rejects=8,
+ stepwords=8,
+ word_length=8,
+ optimal=False,
+ exact=False,
+ suppress_sort=False,
+ output_dir=None,
+ enable_rev_strand_matching=False,
+ subject_fasta_filepath=None,
+ suppress_new_clusters=False,
+ return_cluster_maps=False,
+ stable_sort=False,
+ tmp_dir=gettempdir(),
+ save_uc_files=True,
+ HALT_EXEC=False):
+ """ Main convenience wrapper for using uclust to generate cluster files
+
+ A source fasta file is required for the fasta_filepath. This will be
+ sorted from longest to shortest sequence. The sorted fasta file is then
+ used to generate a cluster file in the uclust (.uc) format, which is
+ parsed and returned as a list of lists, where each sublist is a cluster
+ of sequences. If an output_dir is specified, the intermediate .uc file
+ will be preserved; otherwise all files created are temporary and will be
+ deleted at the end of this function.
+
+ The percent_ID parameter specifies the percent identity for a cluster,
+ i.e., if 99% were the parameter, all sequences that were 99% identical
+ would be grouped as a cluster.
+ """
+
+ # Create readable intermediate filenames if they are to be kept
+ fasta_output_filepath = None
+
+ if output_dir and not output_dir.endswith('/'):
+ output_dir += '/'
+
+ if save_uc_files:
+ uc_save_filepath = get_output_filepaths(
+ output_dir,
+ original_fasta_path)
+ else:
+ uc_save_filepath = None
+
+ sorted_fasta_filepath = ""
+
+ # Error check in case any app controller fails
+ files_to_remove = []
+ try:
+ if not suppress_sort:
+ # Sort fasta input file from largest to smallest sequence
+ sort_fasta = uclust_fasta_sort_from_filepath(fasta_filepath,
+ output_filepath=fasta_output_filepath)
+
+ # Get sorted fasta name from application wrapper
+ sorted_fasta_filepath = sort_fasta['Output'].name
+ files_to_remove.append(sorted_fasta_filepath)
+
+ else:
+ sort_fasta = None
+ sorted_fasta_filepath = fasta_filepath
+
+ # Generate uclust cluster file (.uc format)
+ uclust_cluster = uclust_cluster_from_sorted_fasta_filepath(
+ sorted_fasta_filepath,
+ uc_save_filepath,
+ percent_ID=percent_ID,
+ max_accepts=max_accepts,
+ max_rejects=max_rejects,
+ stepwords=stepwords,
+ word_length=word_length,
+ optimal=optimal,
+ exact=exact,
+ suppress_sort=suppress_sort,
+ enable_rev_strand_matching=enable_rev_strand_matching,
+ subject_fasta_filepath=subject_fasta_filepath,
+ suppress_new_clusters=suppress_new_clusters,
+ stable_sort=stable_sort,
+ tmp_dir=tmp_dir,
+ HALT_EXEC=HALT_EXEC)
+ # Clustering succeeded; remove the sorted fasta intermediate
+ remove_files(files_to_remove)
+ except ApplicationError:
+ remove_files(files_to_remove)
+ raise ApplicationError('Error running uclust. Possible causes are '
+ 'an unsupported version (the currently supported version is '
+ 'v1.2.22) or an improperly formatted input file')
+ except ApplicationNotFoundError:
+ remove_files(files_to_remove)
+ raise ApplicationNotFoundError('uclust not found, is it properly ' +
+ 'installed?')
+
+ # Get list of lists for each cluster
+ clusters, failures, seeds = \
+ clusters_from_uc_file(uclust_cluster['ClusterFile'])
+
+ # Remove temp files unless user specifies output filepath
+ if not save_uc_files:
+ uclust_cluster.cleanUp()
+
+ if return_cluster_maps:
+ return clusters, failures, seeds
+ else:
+ return clusters.values(), failures, seeds
+
+# End uclust convenience functions
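+
+# A minimal usage sketch for the main wrapper ('seqs.fasta' is a
+# hypothetical input file, and uclust must be on the PATH):
+#
+# clusters, failures, seeds = get_clusters_from_fasta_filepath(
+# 'seqs.fasta', 'seqs.fasta', percent_ID=0.97, save_uc_files=False)
+# clusters is a list of lists of sequence ids, one sublist per cluster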
diff --git a/bfillings/usearch.py b/bfillings/usearch.py
new file mode 100644
index 0000000..fda86fc
--- /dev/null
+++ b/bfillings/usearch.py
@@ -0,0 +1,2547 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+
+"""Application controller for usearch v5.2.32
+
+Includes application controllers for usearch and
+convenience wrappers for different functions of usearch, including
+sorting fasta files, finding clusters, converting to cd-hit format and
+searching and aligning against a database. Also contains
+a parser for the resulting .clstr file.
+
+Modified from pycogent_backports/uclust.py, written by
+Greg Caporaso/William Walters
+"""
+
+from os.path import splitext, abspath, join
+from tempfile import mkstemp, gettempdir
+
+from skbio.parse.sequences import parse_fasta
+from burrito.parameters import ValuedParameter, FlagParameter
+from burrito.util import (CommandLineApplication, ResultPath,
+ ApplicationError, ApplicationNotFoundError)
+from skbio.util import remove_files
+
+
+class UsearchParseError(Exception):
+ pass
+
+
+class Usearch(CommandLineApplication):
+
+ """ Usearch ApplicationController
+
+ """
+
+ _command = 'usearch'
+ _input_handler = '_input_as_parameters'
+ _parameters = {
+
+ # Fasta input file for merge-sort function
+ '--mergesort': ValuedParameter('--', Name='mergesort', Delimiter=' ',
+ IsPath=True),
+
+        # E-value threshold for search/alignment functions
+        '--evalue': ValuedParameter('--', Name='evalue', Delimiter=' ',
+                                    IsPath=False),
+
+        # Output file, used by several different functions
+ '--output': ValuedParameter('--', Name='output', Delimiter=' ',
+ IsPath=True),
+
+ # Output filename will be in uclust (.uc) format
+ # Output cluster file, required parameter
+ '--uc': ValuedParameter('--', Name='uc', Delimiter=' ',
+ IsPath=True),
+
+ '--blast6out': ValuedParameter('--', Name='blast6out', Delimiter=' ',
+ IsPath=True),
+
+        # Percent ID for OTU clustering, 97% by default
+ '--id': ValuedParameter('--', Name='id', Delimiter=' ', IsPath=False),
+
+ '--queryalnfract':
+ ValuedParameter(
+ '--',
+ Name='queryalnfract',
+ Delimiter=' ',
+ IsPath=False),
+
+ '--targetalnfract':
+ ValuedParameter(
+ '--',
+ Name='targetalnfract',
+ Delimiter=' ',
+ IsPath=False),
+
+ # Enable reverse strand matching. Will double memory.
+ '--rev': FlagParameter('--', Name='rev'),
+
+ # Maximum hits before quitting search (default 1, 0=infinity).
+ '--maxaccepts':
+ ValuedParameter('--', Name='maxaccepts', Delimiter=' '),
+
+ # Maximum rejects before quitting search (default 8, 0=infinity).
+ '--maxrejects':
+ ValuedParameter('--', Name='maxrejects', Delimiter=' '),
+
+ # Target nr. of common words (default 8, 0=don't step)
+ '--stepwords': ValuedParameter('--', Name='stepwords', Delimiter=' '),
+
+ # Word length for windex (default 5 aa.s, 8 nuc.s).
+ '--w': ValuedParameter('--', Name='w', Delimiter=' '),
+
+ # Don't assume input is sorted by length (default assume sorted).
+ '--usersort': FlagParameter('--', Name='usersort'),
+
+ # log filepath
+ '--log': ValuedParameter('--', Name='log', Delimiter=' ', IsPath=True),
+
+ # cluster command
+ '--cluster': ValuedParameter('--', Name='cluster', Delimiter=' ',
+ IsPath=True),
+
+
+ # Size of compressed index table. Should be prime, e.g. 40000003.
+ '--slots': ValuedParameter('--', Name='slots', Delimiter=' ',
+ IsPath=False),
+
+ # Not specified in usearch helpstring...
+ '--sizein': FlagParameter('--', Name='sizein'),
+
+ # Not specified in usearch helpstring...
+ '--sizeout': FlagParameter('--', Name='sizeout'),
+
+ # Not specified in usearch helpstring...
+ '--minlen': ValuedParameter('--', Name='minlen', Delimiter=' ',
+ IsPath=False),
+
+ # output filepath for dereplicated fasta file
+ '--seedsout': ValuedParameter('--', Name='seedsout', Delimiter=' ',
+ IsPath=True),
+
+ # Dereplicate exact subsequences
+ '--derep_subseq': FlagParameter('--', Name='derep_subseq'),
+
+ # Dereplicate exact sequences
+ '--derep_fullseq': FlagParameter('--', Name='derep_fullseq'),
+
+ # Sort by abundance
+ '--sortsize': ValuedParameter('--', Name='sortsize', Delimiter=' ',
+ IsPath=True),
+
+ # usearch search plus clustering
+ '--consout': ValuedParameter('--', Name='consout', Delimiter=' ',
+ IsPath=True),
+
+ # Abundance skew setting for uchime de novo chimera detection
+ '--abskew': ValuedParameter('--', Name='abskew', Delimiter=' ',
+ IsPath=False),
+
+ # input fasta filepath for uchime chimera
+ '--uchime': ValuedParameter('--', Name='uchime', Delimiter=' ',
+ IsPath=True),
+
+ # output chimera filepath
+ '--chimeras': ValuedParameter('--', Name='chimeras', Delimiter=' ',
+ IsPath=True),
+
+ # output non-chimera filepath
+ '--nonchimeras': ValuedParameter('--', Name='nonchimeras',
+ Delimiter=' ', IsPath=True),
+
+ # reference sequence database for ref based chimera detection
+ '--db': ValuedParameter('--', Name='db', Delimiter=' ', IsPath=True),
+
+ # output clusters filepath for chimera detection
+ '--uchimeout': ValuedParameter('--', Name='uchimeout', Delimiter=' ',
+ IsPath=True),
+
+ # minimum cluster size for quality filtering
+ '--minsize': ValuedParameter('--', Name='minsize', Delimiter=' ',
+ IsPath=False),
+
+ # input fasta for blast alignments
+ '--query': ValuedParameter('--', Name='query', Delimiter=' ',
+ IsPath=True),
+
+ # global alignment flag
+ '--global': FlagParameter('--', Name='global')
+
+ }
+
+ _suppress_stdout = False
+ _suppress_stderr = False
+
+ def _input_as_parameters(self, data):
+ """ Set the input path (a fasta filepath)
+ """
+ # The list of values which can be passed on a per-run basis
+ allowed_values = ['--uc', '--output', '--mergesort', '--log',
+ '--cluster', '--seedsout', '--sortsize',
+ '--consout', '--uchime', '--chimeras',
+ '--nonchimeras', '--db', '--uchimeout',
+ '--query', '--blast6out']
+
+ unsupported_parameters = set(data.keys()) - set(allowed_values)
+ if unsupported_parameters:
+ raise ApplicationError(
+ "Unsupported parameter(s) passed when calling usearch: %s" %
+ ' '.join(unsupported_parameters))
+
+ for v in allowed_values:
+ # turn the parameter off so subsequent runs are not
+ # affected by parameter settings from previous runs
+ self.Parameters[v].off()
+ if v in data:
+ # turn the parameter on if specified by the user
+ self.Parameters[v].on(data[v])
+
+ return ''
+
+ def _get_result_paths(self, data):
+ """ Set the result paths """
+
+ result = {}
+
+ result['Output'] = ResultPath(
+ Path=self.Parameters['--output'].Value,
+ IsWritten=self.Parameters['--output'].isOn())
+
+ result['ClusterFile'] = ResultPath(
+ Path=self.Parameters['--uc'].Value,
+ IsWritten=self.Parameters['--uc'].isOn())
+
+ return result
+
+ def _accept_exit_status(self, exit_status):
+ """ Test for acceptable exit status
+
+ usearch can seg fault and still generate a parsable .uc file
+ so we explicitly check the exit status
+
+ """
+ return exit_status == 0
+
+ def getHelp(self):
+ """Method that points to documentation"""
+ help_str =\
+ """
+ USEARCH is hosted at:
+ http://www.drive5.com/usearch/
+
+ The following papers should be cited if this resource is used:
+
+ Paper pending. Check with Robert Edgar who is writing the paper
+ for usearch as of Aug. 2011
+ """
+ return help_str
+
+# Start functions for processing usearch output files
+
+
+def clusters_from_blast_uc_file(uc_lines, otu_id_field=1):
+ """ Parses out hit/miss sequences from usearch blast uc file
+
+ All lines should be 'H'it or 'N'o hit. Returns a dict of OTU ids: sequence
+ labels of the hits, and a list of all sequence labels that miss.
+
+ uc_lines = open file object of uc file
+
+ otu_id_field: uc field to use as the otu id. 1 is usearch's ClusterNr field,
+ and 9 is usearch's TargetLabel field
+
+ """
+
+ hit_miss_index = 0
+ cluster_id_index = otu_id_field
+ seq_label_index = 8
+
+ otus = {}
+ unassigned_seqs = []
+
+ for line in uc_lines:
+ # skip empty, comment lines
+ if line.startswith('#') or len(line.strip()) == 0:
+ continue
+
+ curr_line = line.split('\t')
+
+ if curr_line[hit_miss_index] == 'N':
+ # only retaining actual sequence label
+ unassigned_seqs.append(curr_line[seq_label_index].split()[0])
+
+ if curr_line[hit_miss_index] == 'H':
+
+ curr_seq_label = curr_line[seq_label_index].split()[0]
+ curr_otu_id = curr_line[cluster_id_index].split()[0]
+ # Append sequence label to dictionary, or create key
+ try:
+ otus[curr_otu_id].append(curr_seq_label)
+ except KeyError:
+ otus[curr_otu_id] = [curr_seq_label]
+
+ return otus, unassigned_seqs
+
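+# A minimal sketch of parsing usearch blast .uc output (hypothetical,
+# hand-written records; a .uc line has 10 tab-separated fields, with the
+# record type in field 0, cluster number in field 1, query label in field 8
+# and target label in field 9):
+#
+#     uc_lines = ['H\t3\t250\t99.0\t+\t0\t0\t250M\tseq1 comment\tref42\n',
+#                 'N\t*\t230\t*\t*\t*\t*\t*\tseq2\t*\n']
+#     otus, unassigned = clusters_from_blast_uc_file(uc_lines)
+#     # otus == {'3': ['seq1']}, unassigned == ['seq2']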
+
+# End functions for processing usearch output files
+# Start usearch convenience functions
+def usearch_fasta_sort_from_filepath(
+ fasta_filepath,
+ output_filepath=None,
+ log_name="sortlen.log",
+ HALT_EXEC=False,
+ save_intermediate_files=False,
+ remove_usearch_logs=False,
+ working_dir=None):
+ """Generates sorted fasta file via usearch --mergesort.
+
+ fasta_filepath: filepath to input fasta file
+ output_filepath: filepath for output sorted fasta file.
+ log_name: string to specify log filename
+ HALT_EXEC: Used for debugging app controller
+ save_intermediate_files: Preserve all intermediate files created."""
+ if not output_filepath:
+ _, output_filepath = mkstemp(prefix='usearch_fasta_sort',
+ suffix='.fasta')
+
+ log_filepath = join(working_dir, log_name)
+
+ params = {}
+
+ app = Usearch(params, WorkingDir=working_dir, HALT_EXEC=HALT_EXEC)
+
+ data = {'--mergesort': fasta_filepath,
+ '--output': output_filepath,
+ }
+
+ if not remove_usearch_logs:
+ data['--log'] = log_filepath
+
+ app_result = app(data)
+
+ return app_result, output_filepath
+
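+# A minimal usage sketch (hypothetical paths; assumes the usearch binary is
+# on the PATH):
+#
+#     app_result, sorted_fp = usearch_fasta_sort_from_filepath(
+#         '/tmp/seqs.fna', output_filepath='/tmp/seqs_len_sorted.fna',
+#         working_dir='/tmp')
+#     # sorted_fp points at the length-sorted fasta written via --mergesort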
+
+def usearch_dereplicate_exact_subseqs(
+ fasta_filepath,
+ output_filepath=None,
+ minlen=64,
+ w=64,
+ slots=16769023,
+ sizeout=True,
+ maxrejects=64,
+ log_name="derep.log",
+ usersort=False,
+ HALT_EXEC=False,
+ save_intermediate_files=False,
+ remove_usearch_logs=False,
+ working_dir=None):
+ """ Generates clusters and fasta file of dereplicated subsequences
+
+ These parameters are those specified by Robert Edgar for optimal use of
+ usearch in clustering/filtering sequences.
+
+ fasta_filepath = input filepath of fasta file to be dereplicated
+ output_filepath = output filepath of dereplicated fasta file
+ minlen = (not specified in usearch helpstring)
+ w = Word length for U-sorting
+ slots = Size of compressed index table. Should be prime, e.g. 40000003.
+ Should also specify --w, typical is --w 16 or --w 32.
+ sizeout = (not specified in usearch helpstring)
+ maxrejects = Max rejected targets, 0=ignore, default 32.
+ log_name: string to specify log filename
+    usersort = Enable if the input fasta is deliberately not sorted by
+    length; otherwise usearch will raise an error.
+ HALT_EXEC: Used for debugging app controller
+ save_intermediate_files: Preserve all intermediate files created."""
+ if not output_filepath:
+ _, output_filepath = mkstemp(prefix='usearch_fasta_dereplicated',
+ suffix='.fasta')
+
+ log_filepath = join(working_dir, log_name)
+
+ uc_filepath = join(working_dir, "derep.uc")
+
+ params = {'--derep_subseq': True,
+ '--minlen': minlen,
+ '--w': w,
+ '--slots': slots,
+ '--sizeout': sizeout,
+ '--maxrejects': maxrejects}
+
+ app = Usearch(params, WorkingDir=working_dir, HALT_EXEC=HALT_EXEC)
+
+ if usersort:
+ app.Parameters['--usersort'].on()
+
+ data = {'--cluster': fasta_filepath,
+ '--uc': uc_filepath,
+ '--seedsout': output_filepath
+ }
+
+ if not remove_usearch_logs:
+ data['--log'] = log_filepath
+
+ app_result = app(data)
+
+ if not save_intermediate_files:
+ remove_files([uc_filepath])
+
+    # Return the output filepath so the caller can remove it if desired.
+
+ return app_result, output_filepath
+
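+# A minimal usage sketch (hypothetical paths): dereplicate exact
+# subsequences of a length-sorted fasta, keeping the seed sequences with
+# abundance annotations (--sizeout):
+#
+#     app_result, derep_fp = usearch_dereplicate_exact_subseqs(
+#         '/tmp/seqs_len_sorted.fna',
+#         output_filepath='/tmp/dereplicated.fna', working_dir='/tmp')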
+
+def usearch_dereplicate_exact_seqs(
+ fasta_filepath,
+ output_filepath=None,
+ minlen=64,
+ w=64,
+ slots=16769023,
+ sizeout=True,
+ maxrejects=64,
+ log_name="derep.log",
+ usersort=False,
+ HALT_EXEC=False,
+ save_intermediate_files=False,
+ remove_usearch_logs=False,
+ working_dir=None):
+ """ Generates clusters and fasta file of dereplicated subsequences
+ for exact sequences.
+
+ These parameters are those specified by Robert Edgar for optimal use of
+ usearch in clustering/filtering sequences.
+
+ fasta_filepath = input filepath of fasta file to be dereplicated
+ output_filepath = output filepath of dereplicated fasta file
+ minlen = (not specified in usearch helpstring)
+ w = Word length for U-sorting
+ slots = Size of compressed index table. Should be prime, e.g. 40000003.
+ Should also specify --w, typical is --w 16 or --w 32.
+ sizeout = (not specified in usearch helpstring)
+ maxrejects = Max rejected targets, 0=ignore, default 32.
+ log_name: string to specify log filename
+    usersort = Enable if the input fasta is deliberately not sorted by
+    length; otherwise usearch will raise an error.
+ HALT_EXEC: Used for debugging app controller
+ save_intermediate_files: Preserve all intermediate files created."""
+ if not output_filepath:
+ _, output_filepath = mkstemp(prefix='usearch_fasta_dereplicated',
+ suffix='.fasta')
+
+ log_filepath = join(working_dir, log_name)
+
+ uc_filepath = join(working_dir, "derep.uc")
+
+ params = {'--derep_fullseq': True,
+ '--minlen': minlen,
+ '--w': w,
+ '--slots': slots,
+ '--sizeout': sizeout,
+ '--maxrejects': maxrejects}
+
+ app = Usearch(params, WorkingDir=working_dir, HALT_EXEC=HALT_EXEC)
+
+ if usersort:
+ app.Parameters['--usersort'].on()
+
+ data = {'--cluster': fasta_filepath,
+ '--uc': uc_filepath,
+ '--seedsout': output_filepath
+ }
+
+ if not remove_usearch_logs:
+ data['--log'] = log_filepath
+
+ app_result = app(data)
+
+ if not save_intermediate_files:
+ remove_files([uc_filepath])
+
+    # Return the output filepath so the caller can remove it if desired.
+
+ return app_result, output_filepath
+
+
+def usearch_sort_by_abundance(
+ fasta_filepath,
+ output_filepath=None,
+ sizein=True,
+ sizeout=True,
+ minsize=0,
+ log_name="abundance_sort.log",
+ usersort=False,
+ HALT_EXEC=False,
+ save_intermediate_files=False,
+ remove_usearch_logs=False,
+ working_dir=None):
+ """ Sorts fasta file by abundance
+
+ fasta_filepath = input fasta file, generally a dereplicated fasta
+ output_filepath = output abundance sorted fasta filepath
+ sizein = not defined in usearch helpstring
+ sizeout = not defined in usearch helpstring
+ minsize = minimum size of cluster to retain.
+ log_name = string to specify log filename
+    usersort = Enable if the input fasta is not sorted by length; otherwise
+    usearch will raise an error.
+ HALT_EXEC: Used for debugging app controller
+ save_intermediate_files: Preserve all intermediate files created.
+ """
+ if not output_filepath:
+ _, output_filepath = mkstemp(prefix='usearch_abundance_sorted',
+ suffix='.fasta')
+
+ log_filepath = join(
+ working_dir,
+ "minsize_" + str(minsize) + "_" + log_name)
+
+ params = {}
+
+ app = Usearch(params, WorkingDir=working_dir, HALT_EXEC=HALT_EXEC)
+
+ if usersort:
+ app.Parameters['--usersort'].on()
+
+ if minsize:
+ app.Parameters['--minsize'].on(minsize)
+
+ if sizein:
+ app.Parameters['--sizein'].on()
+
+ if sizeout:
+ app.Parameters['--sizeout'].on()
+
+ data = {'--sortsize': fasta_filepath,
+ '--output': output_filepath
+ }
+
+ if not remove_usearch_logs:
+ data['--log'] = log_filepath
+
+    # The filter step can leave no data, which raises an ApplicationError;
+    # catch it here to raise a more meaningful message.
+
+ try:
+ app_result = app(data)
+ except ApplicationError:
+ raise ValueError('No data following filter steps, please check ' +
+ 'parameter settings for usearch_qf.')
+
+ return app_result, output_filepath
+
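+# A minimal usage sketch (hypothetical paths): sort a dereplicated fasta by
+# abundance, discarding clusters smaller than minsize:
+#
+#     app_result, sorted_fp = usearch_sort_by_abundance(
+#         '/tmp/dereplicated.fna',
+#         output_filepath='/tmp/abundance_sorted.fna',
+#         minsize=4, working_dir='/tmp')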
+
+def usearch_cluster_error_correction(
+ fasta_filepath,
+ output_filepath=None,
+ output_uc_filepath=None,
+ percent_id_err=0.97,
+ sizein=True,
+ sizeout=True,
+ w=64,
+ slots=16769023,
+ maxrejects=64,
+ log_name="usearch_cluster_err_corrected.log",
+ usersort=False,
+ HALT_EXEC=False,
+ save_intermediate_files=False,
+ remove_usearch_logs=False,
+ working_dir=None):
+ """ Cluster for err. correction at percent_id_err, output consensus fasta
+
+ fasta_filepath = input fasta file, generally a dereplicated fasta
+ output_filepath = output error corrected fasta filepath
+ percent_id_err = minimum identity percent.
+ sizein = not defined in usearch helpstring
+ sizeout = not defined in usearch helpstring
+ w = Word length for U-sorting
+ slots = Size of compressed index table. Should be prime, e.g. 40000003.
+ Should also specify --w, typical is --w 16 or --w 32.
+ maxrejects = Max rejected targets, 0=ignore, default 32.
+ log_name = string specifying output log name
+    usersort = Enable if the input fasta is deliberately not sorted by
+    length; otherwise usearch will raise an error.
+ HALT_EXEC: Used for debugging app controller
+ save_intermediate_files: Preserve all intermediate files created.
+ """
+ if not output_filepath:
+ _, output_filepath = mkstemp(prefix='usearch_cluster_err_corrected',
+ suffix='.fasta')
+
+ log_filepath = join(working_dir, log_name)
+
+ params = {'--sizein': sizein,
+ '--sizeout': sizeout,
+ '--id': percent_id_err,
+ '--w': w,
+ '--slots': slots,
+ '--maxrejects': maxrejects}
+
+ app = Usearch(params, WorkingDir=working_dir, HALT_EXEC=HALT_EXEC)
+
+ if usersort:
+ app.Parameters['--usersort'].on()
+
+ data = {'--cluster': fasta_filepath,
+ '--consout': output_filepath
+ }
+
+ if not remove_usearch_logs:
+ data['--log'] = log_filepath
+
+ if output_uc_filepath:
+ data['--uc'] = output_uc_filepath
+
+ app_result = app(data)
+
+ return app_result, output_filepath
+
+
+def usearch_chimera_filter_de_novo(
+ fasta_filepath,
+ output_chimera_filepath=None,
+ output_non_chimera_filepath=None,
+ abundance_skew=2.0,
+ log_name="uchime_de_novo_chimera_filtering.log",
+ usersort=False,
+ HALT_EXEC=False,
+ save_intermediate_files=False,
+ remove_usearch_logs=False,
+ working_dir=None):
+ """ Chimera filter de novo, output chimeras and non-chimeras to fastas
+
+ fasta_filepath = input fasta file, generally a dereplicated fasta
+ output_chimera_filepath = output chimera filepath
+ output_non_chimera_filepath = output non chimera filepath
+ abundance_skew = abundance skew setting for de novo filtering.
+    usersort = Enable if the input fasta is deliberately not sorted by
+    length; otherwise usearch will raise an error.
+ HALT_EXEC: Used for debugging app controller
+ save_intermediate_files: Preserve all intermediate files created.
+ """
+ if not output_chimera_filepath:
+ _, output_chimera_filepath = mkstemp(prefix='uchime_chimeras_',
+ suffix='.fasta')
+
+ if not output_non_chimera_filepath:
+ _, output_non_chimera_filepath = mkstemp(prefix='uchime_non_chimeras_',
+ suffix='.fasta')
+
+ log_filepath = join(working_dir, log_name)
+
+ params = {'--abskew': abundance_skew}
+
+ app = Usearch(params, WorkingDir=working_dir, HALT_EXEC=HALT_EXEC)
+
+ if usersort:
+ app.Parameters['--usersort'].on()
+
+ data = {'--uchime': fasta_filepath,
+ '--chimeras': output_chimera_filepath,
+ '--nonchimeras': output_non_chimera_filepath
+ }
+
+ if not remove_usearch_logs:
+ data['--log'] = log_filepath
+
+ app_result = app(data)
+
+ if not save_intermediate_files:
+ remove_files([output_chimera_filepath])
+
+ return app_result, output_non_chimera_filepath
+
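+# A minimal usage sketch (hypothetical paths): de novo chimera filtering of
+# abundance-annotated consensus sequences, keeping the non-chimeras:
+#
+#     app_result, non_chimeras_fp = usearch_chimera_filter_de_novo(
+#         '/tmp/clustered_error_corrected.fna',
+#         abundance_skew=2.0, usersort=True, working_dir='/tmp')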
+
+def usearch_chimera_filter_ref_based(
+ fasta_filepath,
+ db_filepath,
+ output_chimera_filepath=None,
+ output_non_chimera_filepath=None,
+ rev=False,
+ log_name="uchime_reference_chimera_filtering.log",
+ usersort=False,
+ HALT_EXEC=False,
+ save_intermediate_files=False,
+ remove_usearch_logs=False,
+ working_dir=None):
+ """ Chimera filter against a reference database.
+
+ fasta_filepath = input fasta file, generally a dereplicated fasta
+ db_filepath = filepath to reference sequence database
+ output_chimera_filepath = output chimera filepath
+ output_non_chimera_filepath = output non chimera filepath
+ rev = search plus and minus strands of sequences
+ log_name = string specifying log filename.
+    usersort = Enable if the input fasta is deliberately not sorted by
+    length; otherwise usearch will raise an error.
+ HALT_EXEC: Used for debugging app controller
+ save_intermediate_files: Preserve all intermediate files created.
+ """
+
+ if not output_chimera_filepath:
+ _, output_chimera_filepath = mkstemp(prefix='uchime_chimeras_',
+ suffix='.fasta')
+
+ if not output_non_chimera_filepath:
+ _, output_non_chimera_filepath = mkstemp(prefix='uchime_non_chimeras_',
+ suffix='.fasta')
+
+ log_filepath = join(working_dir, log_name)
+
+ # clusters filepath created by usearch
+ cluster_filepath = join(working_dir, "refdb.uc")
+
+ params = {}
+
+ app = Usearch(params, WorkingDir=working_dir, HALT_EXEC=HALT_EXEC)
+
+ if usersort:
+ app.Parameters['--usersort'].on()
+ if rev:
+ app.Parameters['--rev'].on()
+
+ data = {'--uchime': fasta_filepath,
+ '--db': db_filepath,
+ '--chimeras': output_chimera_filepath,
+ '--nonchimeras': output_non_chimera_filepath,
+ '--uchimeout': cluster_filepath
+ }
+
+ if not remove_usearch_logs:
+ data['--log'] = log_filepath
+
+ app_result = app(data)
+
+ if not save_intermediate_files:
+ remove_files([cluster_filepath, output_chimera_filepath])
+
+ return app_result, output_non_chimera_filepath
+
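+# A minimal usage sketch (hypothetical paths): reference based chimera
+# filtering against a curated database, searching both strands:
+#
+#     app_result, non_chimeras_fp = usearch_chimera_filter_ref_based(
+#         '/tmp/clustered_error_corrected.fna', '/tmp/gold.fa',
+#         rev=True, working_dir='/tmp')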
+
+def usearch_cluster_seqs(
+ fasta_filepath,
+ output_filepath=None,
+ percent_id=0.97,
+ sizein=True,
+ sizeout=True,
+ w=64,
+ slots=16769023,
+ maxrejects=64,
+ log_name="usearch_cluster_seqs.log",
+ usersort=True,
+ HALT_EXEC=False,
+ save_intermediate_files=False,
+ remove_usearch_logs=False,
+ working_dir=None
+):
+ """ Cluster seqs at percent_id, output consensus fasta
+
+ fasta_filepath = input fasta file, generally a dereplicated fasta
+ output_filepath = output error corrected fasta filepath
+ percent_id = minimum identity percent.
+ sizein = not defined in usearch helpstring
+ sizeout = not defined in usearch helpstring
+ w = Word length for U-sorting
+ slots = Size of compressed index table. Should be prime, e.g. 40000003.
+ Should also specify --w, typical is --w 16 or --w 32.
+ maxrejects = Max rejected targets, 0=ignore, default 32.
+ log_name = string specifying output log name
+    usersort = Enable if the input fasta is deliberately not sorted by
+    length; otherwise usearch will raise an error. Post-chimera-checked
+    sequences are sorted by abundance, so this should be set to True.
+ HALT_EXEC: Used for debugging app controller
+ save_intermediate_files: Preserve all intermediate files created.
+ """
+ if not output_filepath:
+ _, output_filepath = mkstemp(prefix='usearch_cluster', suffix='.fasta')
+
+ log_filepath = join(working_dir, log_name)
+
+ uc_filepath = join(working_dir, "clustered_seqs_post_chimera.uc")
+
+ params = {'--sizein': sizein,
+ '--sizeout': sizeout,
+ '--id': percent_id,
+ '--w': w,
+ '--slots': slots,
+ '--maxrejects': maxrejects}
+
+ app = Usearch(params, WorkingDir=working_dir, HALT_EXEC=HALT_EXEC)
+
+ if usersort:
+ app.Parameters['--usersort'].on()
+
+ data = {'--cluster': fasta_filepath,
+ '--seedsout': output_filepath,
+ '--uc': uc_filepath
+ }
+
+ if not remove_usearch_logs:
+ data['--log'] = log_filepath
+
+ app_result = app(data)
+
+ if not save_intermediate_files:
+ remove_files([uc_filepath])
+
+ return app_result, output_filepath
+
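+# A minimal usage sketch (hypothetical paths): de novo clustering of
+# abundance-sorted sequences at 97% identity:
+#
+#     app_result, clustered_fp = usearch_cluster_seqs(
+#         '/tmp/abundance_sorted.fna',
+#         output_filepath='/tmp/clustered.fna',
+#         percent_id=0.97, working_dir='/tmp')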
+
+def usearch_cluster_seqs_ref(
+ fasta_filepath,
+ output_filepath=None,
+ percent_id=0.97,
+ sizein=True,
+ sizeout=True,
+ w=64,
+ slots=16769023,
+ maxrejects=64,
+ log_name="usearch_cluster_seqs.log",
+ usersort=True,
+ HALT_EXEC=False,
+ save_intermediate_files=False,
+ remove_usearch_logs=False,
+ suppress_new_clusters=False,
+ refseqs_fp=None,
+ output_dir=None,
+ working_dir=None,
+ rev=False):
+ """ Cluster seqs at percent_id, output consensus fasta
+
+ Also appends de novo clustered seqs if suppress_new_clusters is False.
+ Forced to handle reference + de novo in hackish fashion as usearch does not
+ work as listed in the helpstrings. Any failures are clustered de novo,
+ and given unique cluster IDs.
+
+ fasta_filepath = input fasta file, generally a dereplicated fasta
+ output_filepath = output reference clustered uc filepath
+ percent_id = minimum identity percent.
+ sizein = not defined in usearch helpstring
+ sizeout = not defined in usearch helpstring
+ w = Word length for U-sorting
+ slots = Size of compressed index table. Should be prime, e.g. 40000003.
+ Should also specify --w, typical is --w 16 or --w 32.
+ maxrejects = Max rejected targets, 0=ignore, default 32.
+ log_name = string specifying output log name
+    usersort = Enable if the input fasta is deliberately not sorted by
+    length; otherwise usearch will raise an error. Post-chimera-checked
+    sequences are sorted by abundance, so this should be set to True.
+ HALT_EXEC: Used for debugging app controller
+ save_intermediate_files: Preserve all intermediate files created.
+    suppress_new_clusters: Disables de novo OTUs when ref based OTU picking
+    is enabled.
+ refseqs_fp: Filepath for ref based OTU picking
+ output_dir: output directory
+ rev = search plus and minus strands of sequences
+ """
+ if not output_filepath:
+ _, output_filepath = mkstemp(prefix='usearch_cluster_ref_based',
+ suffix='.uc')
+
+ log_filepath = join(working_dir, log_name)
+
+ uc_filepath = join(working_dir, "clustered_seqs_post_chimera.uc")
+
+ params = {'--sizein': sizein,
+ '--sizeout': sizeout,
+ '--id': percent_id,
+ '--w': w,
+ '--slots': slots,
+ '--maxrejects': maxrejects}
+
+ app = Usearch(params, WorkingDir=working_dir, HALT_EXEC=HALT_EXEC)
+
+ if usersort:
+ app.Parameters['--usersort'].on()
+ if rev:
+ app.Parameters['--rev'].on()
+
+ data = {'--query': fasta_filepath,
+ '--uc': uc_filepath,
+ '--db': refseqs_fp
+ }
+
+ if not remove_usearch_logs:
+ data['--log'] = log_filepath
+
+ app_result = app(data)
+
+ files_to_remove = []
+
+ # Need to create fasta file of all hits (with reference IDs),
+ # recluster failures if new clusters allowed, and create complete fasta
+ # file, with unique fasta label IDs.
+
+ if suppress_new_clusters:
+ output_fna_filepath = join(output_dir, 'ref_clustered_seqs.fasta')
+ output_filepath, labels_hits = get_fasta_from_uc_file(fasta_filepath,
+ uc_filepath, hit_type="H", output_dir=output_dir,
+ output_fna_filepath=output_fna_filepath)
+
+ files_to_remove.append(uc_filepath)
+ else:
+ # Get fasta of successful ref based clusters
+ output_fna_clustered = join(output_dir, 'ref_clustered_seqs.fasta')
+ output_filepath_ref_clusters, labels_hits =\
+ get_fasta_from_uc_file(fasta_filepath, uc_filepath, hit_type="H",
+ output_dir=output_dir, output_fna_filepath=output_fna_clustered)
+
+ # get failures and recluster
+ output_fna_failures =\
+ join(output_dir, 'ref_clustered_seqs_failures.fasta')
+ output_filepath_failures, labels_hits =\
+ get_fasta_from_uc_file(fasta_filepath,
+ uc_filepath, hit_type="N", output_dir=output_dir,
+ output_fna_filepath=output_fna_failures)
+
+ # de novo cluster the failures
+ app_result, output_filepath_clustered_failures =\
+ usearch_cluster_seqs(output_fna_failures, output_filepath=
+ join(
+ output_dir,
+ 'clustered_seqs_reference_failures.fasta'),
+ percent_id=percent_id, sizein=sizein, sizeout=sizeout, w=w,
+ slots=slots, maxrejects=maxrejects,
+ save_intermediate_files=save_intermediate_files,
+ remove_usearch_logs=remove_usearch_logs, working_dir=working_dir)
+
+ output_filepath = concatenate_fastas(output_fna_clustered,
+ output_fna_failures, output_concat_filepath=join(
+ output_dir,
+ 'concatenated_reference_denovo_clusters.fasta'))
+
+ files_to_remove.append(output_fna_clustered)
+ files_to_remove.append(output_fna_failures)
+ files_to_remove.append(output_filepath_clustered_failures)
+
+ if not save_intermediate_files:
+ remove_files(files_to_remove)
+
+ return app_result, output_filepath
+
+
+def concatenate_fastas(output_fna_clustered,
+ output_fna_failures,
+ output_concat_filepath):
+ """ Concatenates two input fastas, writes to output_concat_filepath
+
+ output_fna_clustered: fasta of successful ref clusters
+ output_fna_failures: de novo fasta of cluster failures
+ output_concat_filepath: path to write combined fastas to
+ """
+
+ output_fp = open(output_concat_filepath, "w")
+
+ for label, seq in parse_fasta(open(output_fna_clustered, "U")):
+ output_fp.write(">%s\n%s\n" % (label, seq))
+ for label, seq in parse_fasta(open(output_fna_failures, "U")):
+ output_fp.write(">%s\n%s\n" % (label, seq))
+
+    output_fp.close()
+
+    return output_concat_filepath
+
+
+def enumerate_otus(fasta_filepath,
+ output_filepath=None,
+ label_prefix="",
+ label_suffix="",
+ retain_label_as_comment=False,
+ count_start=0):
+ """ Writes unique, sequential count to OTUs
+
+ fasta_filepath = input fasta filepath
+ output_filepath = output fasta filepath
+ label_prefix = string to place before enumeration
+ label_suffix = string to place after enumeration
+ retain_label_as_comment = if True, will place existing label in sequence
+ comment, after a tab
+ count_start = number to start enumerating OTUs with
+
+ """
+
+ fasta_i = open(fasta_filepath, "U")
+
+ if not output_filepath:
+ _, output_filepath = mkstemp(prefix='enumerated_seqs_',
+ suffix='.fasta')
+
+ fasta_o = open(output_filepath, "w")
+
+ for label, seq in parse_fasta(fasta_i):
+ curr_label = ">" + label_prefix + str(count_start) + label_suffix
+ if retain_label_as_comment:
+ curr_label += '\t' + label
+ fasta_o.write(curr_label.strip() + '\n')
+ fasta_o.write(seq.strip() + '\n')
+ count_start += 1
+
+    fasta_o.close()
+
+    return output_filepath
+
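+# A minimal sketch of the relabeling performed above (hypothetical paths):
+#
+#     out_fp = enumerate_otus('/tmp/clustered.fna',
+#                             output_filepath='/tmp/enumerated.fna',
+#                             label_prefix='denovo',
+#                             retain_label_as_comment=True)
+#     # ">seq1 comment" becomes ">denovo0\tseq1 comment", the next record
+#     # ">denovo1\t...", and so on.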
+
+def get_fasta_from_uc_file(fasta_filepath,
+ uc_filepath,
+ hit_type="H",
+ output_fna_filepath=None,
+ label_prefix="",
+ output_dir=None):
+ """ writes fasta of sequences from uc file of type hit_type
+
+ fasta_filepath: Filepath of original query fasta file
+ uc_filepath: Filepath of .uc file created by usearch post error filtering
+ hit_type: type to read from first field of .uc file, "H" for hits, "N" for
+ no hits.
+ output_fna_filepath = fasta output filepath
+ label_prefix = Added before each fasta label, important when doing ref
+ based OTU picking plus de novo clustering to preserve label matching.
+ output_dir: output directory
+ """
+
+ hit_type_index = 0
+ seq_label_index = 8
+ target_label_index = 9
+
+ labels_hits = {}
+ labels_to_keep = []
+
+ for line in open(uc_filepath, "U"):
+ if line.startswith("#") or len(line.strip()) == 0:
+ continue
+ curr_line = line.split('\t')
+ if curr_line[0] == hit_type:
+ labels_hits[curr_line[seq_label_index]] =\
+ curr_line[target_label_index].strip()
+ labels_to_keep.append(curr_line[seq_label_index])
+
+ labels_to_keep = set(labels_to_keep)
+
+ out_fna = open(output_fna_filepath, "w")
+
+ for label, seq in parse_fasta(open(fasta_filepath, "U")):
+ if label in labels_to_keep:
+ if hit_type == "H":
+ out_fna.write(">" + labels_hits[label] + "\n%s\n" % seq)
+ if hit_type == "N":
+ out_fna.write(">" + label + "\n%s\n" % seq)
+
+    out_fna.close()
+
+    return output_fna_filepath, labels_hits
+
+
+def get_retained_chimeras(output_fp_de_novo_nonchimeras,
+ output_fp_ref_nonchimeras,
+ output_combined_fp,
+ chimeras_retention='union'):
+ """ Gets union or intersection of two supplied fasta files
+
+ output_fp_de_novo_nonchimeras: filepath of nonchimeras from de novo
+ usearch detection.
+ output_fp_ref_nonchimeras: filepath of nonchimeras from reference based
+ usearch detection.
+ output_combined_fp: filepath to write retained sequences to.
+ chimeras_retention: accepts either 'intersection' or 'union'. Will test
+ for chimeras against the full input error clustered sequence set, and
+ retain sequences flagged as non-chimeras by either (union) or
+ only those flagged as non-chimeras by both (intersection)."""
+
+ de_novo_non_chimeras = []
+ reference_non_chimeras = []
+
+ de_novo_nonchimeras_f = open(output_fp_de_novo_nonchimeras, "U")
+ reference_nonchimeras_f = open(output_fp_ref_nonchimeras, "U")
+
+ output_combined_f = open(output_combined_fp, "w")
+
+ for label, seq in parse_fasta(de_novo_nonchimeras_f):
+ de_novo_non_chimeras.append(label)
+ de_novo_nonchimeras_f.close()
+ for label, seq in parse_fasta(reference_nonchimeras_f):
+ reference_non_chimeras.append(label)
+ reference_nonchimeras_f.close()
+
+ de_novo_non_chimeras = set(de_novo_non_chimeras)
+ reference_non_chimeras = set(reference_non_chimeras)
+
+ if chimeras_retention == 'union':
+ all_non_chimeras = de_novo_non_chimeras.union(reference_non_chimeras)
+ elif chimeras_retention == 'intersection':
+ all_non_chimeras =\
+ de_novo_non_chimeras.intersection(reference_non_chimeras)
+
+ de_novo_nonchimeras_f = open(output_fp_de_novo_nonchimeras, "U")
+ reference_nonchimeras_f = open(output_fp_ref_nonchimeras, "U")
+
+ # Save a list of already-written labels
+ labels_written = []
+
+ for label, seq in parse_fasta(de_novo_nonchimeras_f):
+ if label in all_non_chimeras:
+ if label not in labels_written:
+ output_combined_f.write('>%s\n%s\n' % (label, seq))
+ labels_written.append(label)
+ de_novo_nonchimeras_f.close()
+ for label, seq in parse_fasta(reference_nonchimeras_f):
+ if label in all_non_chimeras:
+ if label not in labels_written:
+ output_combined_f.write('>%s\n%s\n' % (label, seq))
+ labels_written.append(label)
+ reference_nonchimeras_f.close()
+
+ output_combined_f.close()
+
+ return output_combined_fp
+
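+# A minimal usage sketch (hypothetical paths): keep sequences flagged as
+# non-chimeric by either detector (union) or by both (intersection):
+#
+#     combined_fp = get_retained_chimeras(
+#         '/tmp/de_novo_non_chimeras.fna',
+#         '/tmp/reference_non_chimeras.fna',
+#         '/tmp/combined_non_chimeras.fna',
+#         chimeras_retention='intersection')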
+
+def assign_reads_to_otus(original_fasta,
+ filtered_fasta,
+ output_filepath=None,
+ log_name="assign_reads_to_otus.log",
+ perc_id_blast=0.97,
+ global_alignment=True,
+ HALT_EXEC=False,
+ save_intermediate_files=False,
+ remove_usearch_logs=False,
+ working_dir=None):
+ """ Uses original fasta file, blasts to assign reads to filtered fasta
+
+ original_fasta = filepath to original query fasta
+ filtered_fasta = filepath to enumerated, filtered fasta
+ output_filepath = output path to clusters (uc) file
+ log_name = string specifying output log name
+ perc_id_blast = percent ID for blasting original seqs against filtered set
+ HALT_EXEC: Used for debugging app controller
+ save_intermediate_files: Preserve all intermediate files created.
+ """
+
+    # Not sure if I feel comfortable using blast as a way to recapitulate
+ # original read ids....
+ if not output_filepath:
+ _, output_filepath = mkstemp(prefix='assign_reads_to_otus',
+ suffix='.uc')
+
+ log_filepath = join(working_dir, log_name)
+
+ params = {'--id': perc_id_blast,
+ '--global': global_alignment}
+
+ app = Usearch(params, WorkingDir=working_dir, HALT_EXEC=HALT_EXEC)
+
+ data = {'--query': original_fasta,
+ '--db': filtered_fasta,
+ '--uc': output_filepath
+ }
+
+ if not remove_usearch_logs:
+ data['--log'] = log_filepath
+
+ app_result = app(data)
+
+ return app_result, output_filepath
+
+
+def usearch_qf(
+ fasta_filepath,
+ refseqs_fp=None,
+ output_dir=None,
+ percent_id=0.97,
+ percent_id_err=0.97,
+ minsize=4,
+ abundance_skew=2.0,
+ db_filepath=None,
+ rev=False,
+ label_prefix="",
+ label_suffix="",
+ retain_label_as_comment=False,
+ count_start=0,
+ perc_id_blast=0.97,
+ save_intermediate_files=False,
+ HALT_EXEC=False,
+ global_alignment=True,
+ sizein=True,
+ sizeout=True,
+ w=64,
+ slots=16769023,
+ maxrejects=64,
+ minlen=64,
+ de_novo_chimera_detection=True,
+ derep_fullseq=False,
+ reference_chimera_detection=True,
+ cluster_size_filtering=True,
+ remove_usearch_logs=False,
+ usersort=True,
+ suppress_new_clusters=False,
+ chimeras_retention="union",
+ verbose=False
+):
+ """ Main convenience wrapper for using usearch to filter/cluster seqs
+
+ The complete 'usearch_qf' process is a multistep process with many calls
+ to usearch with various parameters. It is likely to change from the
+ original implementation. A lot.
+
+    fasta_filepath = fasta filepath to be filtered/clustered (e.g., the
+    output seqs.fna file from split_libraries.py)
+ refseqs_fp = fasta filepath for ref-based otu picking.
+ output_dir = directory to store the otu mapping file, as well logs and
+ the intermediate files created if save_intermediate_files is True.
+    percent_id = percent ID for clustering sequences.
+    percent_id_err = percent ID for filtering out chimeras
+ minsize = Minimum size of cluster for retention after chimera removal.
+ abundance_skew = threshold setting for chimera removal with de novo
+ chimera detection.
+ db_filepath = filepath of reference fasta sequence set for ref based
+ chimera detection.
+ rev = search plus and minus strands of sequences, used in ref based chimera
+ detection.
+ label_prefix = optional prefix added to filtered fasta file.
+ label_suffix = optional suffix added to filtered fasta file.
+ retain_label_as_comment = option to add usearch generated label to
+ enumerated fasta labels.
+ count_start = integer to begin counting at for sequence enumeration.
+ perc_id_blast = percent identity setting for using blast algorithm to
+ assign original sequence labels to filtered fasta.
+ global_alignment = Setting for assignment of original seq labels to filtered
+ seqs.
+ sizein = not defined in usearch helpstring
+ sizeout = not defined in usearch helpstring
+ w = Word length for U-sorting
+ slots = Size of compressed index table. Should be prime, e.g. 40000003.
+ Should also specify --w, typical is --w 16 or --w 32.
+ maxrejects = Max rejected targets, 0=ignore, default 32.
+ save_intermediate_files = retain all the intermediate files created during
+ this process.
+ minlen = (not specified in usearch helpstring), but seems like a good bet
+ that this refers to the minimum length of the sequences for dereplication.
+ HALT_EXEC = used to debug app controller problems.
+ de_novo_chimera_detection = If True, will detect chimeras de novo
+ reference_chimera_detection = If True, will detect chimeras ref based
+ cluster_size_filtering = If True, will filter OTUs according to seq counts.
+ remove_usearch_logs = If True, will not call the --log function for each
+ usearch call.
+ usersort = Used for specifying custom sorting (i.e., non-length based
+ sorting) with usearch/uclust.
+    suppress_new_clusters = with reference based OTU picking, if enabled,
+    prevents sequences that do not match the reference from forming new
+    clusters.
+ chimeras_retention = accepts either 'intersection' or 'union'. Will test
+ for chimeras against the full input error clustered sequence set, and
+ retain sequences flagged as non-chimeras by either (union) or
+ only those flagged as non-chimeras by both (intersection).
+ """
+
+ # Save a list of intermediate filepaths in case they are to be removed.
+ intermediate_files = []
+
+ # Need absolute paths to avoid problems with app controller
+ if output_dir:
+ output_dir = abspath(output_dir) + '/'
+
+ fasta_filepath = abspath(fasta_filepath)
+
+ try:
+
+ if verbose:
+ print "Sorting sequences by length..."
+ # Sort seqs by length
+ app_result, output_filepath_len_sorted =\
+ usearch_fasta_sort_from_filepath(fasta_filepath, output_filepath=
+ join(
+ output_dir,
+ 'len_sorted.fasta'),
+ save_intermediate_files=save_intermediate_files,
+ remove_usearch_logs=remove_usearch_logs,
+ working_dir=output_dir, HALT_EXEC=HALT_EXEC)
+
+ intermediate_files.append(output_filepath_len_sorted)
+
+ if verbose:
+ print "Dereplicating sequences..."
+ # Dereplicate sequences
+ app_result, output_filepath_dereplicated =\
+ usearch_dereplicate_exact_subseqs(output_filepath_len_sorted,
+ output_filepath=join(
+ output_dir,
+ 'dereplicated_seqs.fasta'),
+ minlen=minlen, w=w, slots=slots, sizeout=sizeout,
+ maxrejects=maxrejects, save_intermediate_files=save_intermediate_files,
+ remove_usearch_logs=remove_usearch_logs,
+ working_dir=output_dir, HALT_EXEC=HALT_EXEC)
+
+ intermediate_files.append(output_filepath_dereplicated)
+
+ if verbose:
+ print "Sorting by abundance..."
+ # Sort by abundance, initially no filter based on seqs/otu
+ app_result, output_fp =\
+ usearch_sort_by_abundance(output_filepath_dereplicated,
+ output_filepath=join(
+ output_dir,
+ 'abundance_sorted.fasta'),
+ usersort=True, sizein=sizein, sizeout=sizeout, minsize=0,
+ remove_usearch_logs=remove_usearch_logs, working_dir=output_dir,
+ HALT_EXEC=HALT_EXEC)
+
+ intermediate_files.append(output_fp)
+
+ if verbose:
+ print "Clustering sequences for error correction..."
+
+ # Create .uc file of clusters file, to identify original sequences
+ # later
+ output_uc_filepath = output_dir + 'err_corrected_clusters.uc'
+
+ app_result, error_clustered_output_fp =\
+ usearch_cluster_error_correction(output_fp,
+ output_filepath=join(output_dir,
+ 'clustered_error_corrected.fasta'),
+ output_uc_filepath=output_uc_filepath,
+ usersort=True, percent_id_err=percent_id_err, sizein=sizein,
+ sizeout=sizeout, w=w, slots=slots, maxrejects=maxrejects,
+ remove_usearch_logs=remove_usearch_logs,
+ save_intermediate_files=save_intermediate_files,
+ working_dir=output_dir, HALT_EXEC=HALT_EXEC)
+
+ intermediate_files.append(error_clustered_output_fp)
+ intermediate_files.append(output_uc_filepath)
+
+        # Series of conditional filtering steps; the generic 'output_fp'
+        # name is reused so it does not matter which filters, if any,
+        # are selected.
+ if de_novo_chimera_detection:
+
+ if verbose:
+ print "Performing de novo chimera detection..."
+ app_result, output_fp_de_novo_nonchimeras =\
+ usearch_chimera_filter_de_novo(error_clustered_output_fp,
+ abundance_skew=abundance_skew, output_chimera_filepath=
+ join(
+ output_dir,
+ 'de_novo_chimeras.fasta'),
+ output_non_chimera_filepath=join(
+ output_dir,
+ 'de_novo_non_chimeras.fasta'), usersort=True,
+ save_intermediate_files=save_intermediate_files,
+ remove_usearch_logs=remove_usearch_logs, working_dir=output_dir,
+ HALT_EXEC=HALT_EXEC)
+
+ intermediate_files.append(output_fp_de_novo_nonchimeras)
+
+ output_fp = output_fp_de_novo_nonchimeras
+
+ if reference_chimera_detection:
+ if verbose:
+ print "Performing reference based chimera detection..."
+
+ app_result, output_fp_ref_nonchimeras =\
+ usearch_chimera_filter_ref_based(error_clustered_output_fp,
+ db_filepath=db_filepath, output_chimera_filepath=
+ join(
+ output_dir,
+ 'reference_chimeras.fasta'),
+ output_non_chimera_filepath=
+ join(output_dir, 'reference_non_chimeras.fasta'), usersort=True,
+ save_intermediate_files=save_intermediate_files, rev=rev,
+ remove_usearch_logs=remove_usearch_logs, working_dir=output_dir,
+ HALT_EXEC=HALT_EXEC)
+
+ intermediate_files.append(output_fp_ref_nonchimeras)
+
+ output_fp = output_fp_ref_nonchimeras
+
+ # get intersection or union if both ref and de novo chimera detection
+ if de_novo_chimera_detection and reference_chimera_detection:
+ if verbose:
+ print "Finding %s of non-chimeras..." % chimeras_retention
+ output_fp = get_retained_chimeras(
+ output_fp_de_novo_nonchimeras, output_fp_ref_nonchimeras,
+ output_combined_fp=
+ join(output_dir, 'combined_non_chimeras.fasta'),
+ chimeras_retention=chimeras_retention)
+
+ intermediate_files.append(output_fp)
+
+ if cluster_size_filtering:
+ # Test for empty filepath following filters, raise error if all seqs
+ # have been removed
+ if verbose:
+ print "Filtering by cluster size..."
+            # If both chimera detection steps were not performed, use the
+            # error-corrected clustering output as input to filtering by
+            # cluster size
+ if not (reference_chimera_detection and de_novo_chimera_detection):
+ output_fp = error_clustered_output_fp
+ app_result, output_fp =\
+ usearch_sort_by_abundance(output_fp, output_filepath=
+ join(output_dir, 'abundance_sorted_minsize_' + str(minsize) +
+ '.fasta'),
+ minsize=minsize, sizein=sizein, sizeout=sizeout,
+ remove_usearch_logs=remove_usearch_logs, working_dir=output_dir,
+ HALT_EXEC=HALT_EXEC)
+
+ intermediate_files.append(output_fp)
+
+ # cluster seqs
+ # Should we add in option to use alternative OTU picking here?
+ # Seems like it will be a bit of a mess...maybe after we determine
+ # if usearch_qf should become standard.
+ if refseqs_fp:
+ if verbose:
+ print "Clustering against reference sequences..."
+ app_result, output_filepath =\
+ usearch_cluster_seqs_ref(output_fp, output_filepath=
+ join(
+ output_dir,
+ 'ref_clustered_seqs.uc'),
+ percent_id=percent_id, sizein=sizein,
+ sizeout=sizeout, w=w, slots=slots, maxrejects=maxrejects,
+ save_intermediate_files=save_intermediate_files,
+ remove_usearch_logs=remove_usearch_logs,
+ suppress_new_clusters=suppress_new_clusters, refseqs_fp=refseqs_fp,
+ output_dir=output_dir, working_dir=output_dir, rev=rev,
+ HALT_EXEC=HALT_EXEC
+ )
+
+ else:
+ if verbose:
+ print "De novo clustering sequences..."
+ app_result, output_filepath =\
+ usearch_cluster_seqs(output_fp, output_filepath=
+ join(output_dir, 'clustered_seqs.fasta'),
+ percent_id=percent_id, sizein=sizein,
+ sizeout=sizeout, w=w, slots=slots, maxrejects=maxrejects,
+ save_intermediate_files=save_intermediate_files,
+ remove_usearch_logs=remove_usearch_logs, working_dir=output_dir,
+ HALT_EXEC=HALT_EXEC)
+
+ intermediate_files.append(output_filepath)
+
+ # Enumerate the OTUs in the clusters
+ if not suppress_new_clusters:
+ if verbose:
+ print "Enumerating OTUs..."
+ output_filepath =\
+ enumerate_otus(output_filepath, output_filepath=
+ join(output_dir, 'enumerated_otus.fasta'),
+ label_prefix=label_prefix,
+ label_suffix=label_suffix, count_start=count_start,
+ retain_label_as_comment=retain_label_as_comment)
+
+ intermediate_files.append(output_filepath)
+
+ # Get original sequence label identities
+ if verbose:
+ print "Assigning sequences to clusters..."
+ app_result, clusters_file = assign_reads_to_otus(fasta_filepath,
+ filtered_fasta=output_filepath, output_filepath=join(
+ output_dir,
+ 'assign_reads_to_otus.uc'), perc_id_blast=percent_id,
+ global_alignment=global_alignment,
+ remove_usearch_logs=remove_usearch_logs, working_dir=output_dir,
+ HALT_EXEC=HALT_EXEC)
+
+ intermediate_files.append(clusters_file)
+
+ except ApplicationError:
+        raise ApplicationError('Error running usearch. Possible causes are '
+                               'an unsupported version (current supported '
+                               'version is usearch v5.2.236) or an '
+                               'improperly formatted input file')
+ except ApplicationNotFoundError:
+        remove_files(intermediate_files)
+ raise ApplicationNotFoundError('usearch not found, is it properly ' +
+ 'installed?')
+
+ # Get dict of clusters, list of failures
+ # Set OTU ID field to 9 for the case of closed reference OTU picking
+ if suppress_new_clusters:
+ otu_id_field = 9
+ else:
+ otu_id_field = 1
+ clusters, failures = clusters_from_blast_uc_file(open(clusters_file, "U"),
+ otu_id_field)
+
+ # Remove temp files unless user specifies output filepath
+ if not save_intermediate_files:
+ remove_files(intermediate_files)
+
+ return clusters, failures
+
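+# A minimal usage sketch of the full quality-filtering pipeline above
+# (hypothetical paths; db_filepath is only required when reference chimera
+# detection is enabled):
+#
+#     clusters, failures = usearch_qf(
+#         '/tmp/seqs.fna', output_dir='/tmp/usearch_qf_out/',
+#         db_filepath='/tmp/gold.fa', percent_id=0.97, minsize=4)
+#     # clusters: dict of OTU ID -> list of member sequence labels
+#     # failures: labels of sequences that could not be assigned to an OTU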
+
+def assign_dna_reads_to_database(query_fasta_fp,
+ database_fasta_fp,
+ output_fp,
+ temp_dir=gettempdir(),
+ params={},
+ blast6_fp=None,
+ HALT_EXEC=False):
+ _params = {'--id': 0.97}
+ _params.update(params)
+
+ if blast6_fp is None:
+ blast6_fp = splitext(output_fp)[0] + '.bl6'
+ data = {'--query': query_fasta_fp,
+ '--uc': output_fp,
+ '--db': database_fasta_fp,
+ '--blast6out': blast6_fp,
+ }
+ app = Usearch(_params,
+ WorkingDir=temp_dir,
+                  HALT_EXEC=HALT_EXEC)
+ app_result = app(data)
+
+assign_dna_reads_to_protein_database =\
+ assign_dna_reads_to_dna_database =\
+ assign_dna_reads_to_database
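+
+# A minimal usage sketch (hypothetical paths): assign reads to a reference
+# database at 97% identity, writing a .uc file plus a BLAST6 tabular file
+# (derived from output_fp unless blast6_fp is given):
+#
+#     assign_dna_reads_to_database('/tmp/reads.fna', '/tmp/refs.fna',
+#                                  '/tmp/assignments.uc')
+#     # also writes /tmp/assignments.bl6 via --blast6out
+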
+# End uclust convenience functions
+
+# Start usearch61 application controller
+
+
+class Usearch61(CommandLineApplication):
+
+ """ Usearch61 ApplicationController
+
+ """
+
+ _command = 'usearch61'
+ _input_handler = '_input_as_parameters'
+ _parameters = {
+
+ # IO filepaths specified by these values
+
+ # Output file, used by several difference functions
+ '--output': ValuedParameter('--', Name='output', Delimiter=' ',
+ IsPath=True),
+
+ # Output filename in uclust (.uc) format
+ '--uc': ValuedParameter('--', Name='uc', Delimiter=' ', IsPath=True),
+
+ # log filepath
+ '--log': ValuedParameter('--', Name='log', Delimiter=' ', IsPath=True),
+
+        # Used to specify input file for reference based clustering
+ '--usearch_global': ValuedParameter('--', Name='usearch_global',
+ Delimiter=' ', IsPath=True),
+
+ # Used to specify reference sequences to act as seeds
+ '--db': ValuedParameter('--', Name='db', Delimiter=' ', IsPath=True),
+
+ # Default de novo clustering input fasta filepath, memory efficient
+ '--cluster_smallmem': ValuedParameter('--', Name='cluster_smallmem',
+ Delimiter=' ', IsPath=True),
+
+ # Fast de novo clustering input fasta filepath
+ '--cluster_fast': ValuedParameter('--', Name='cluster_fast',
+ Delimiter=' ', IsPath=True),
+
+ # Specifies consensus fasta file output for a cluster
+ '--consout': ValuedParameter('--', Name='consout',
+ Delimiter=' ', IsPath=True),
+
+ # Specifies input consensus/abundance file for de novo chimeras
+ '--uchime_denovo': ValuedParameter('--', Name='uchime_denovo',
+ Delimiter=' ', IsPath=True),
+
+ # Specifies input consensus/abundance file for ref chimera detection
+ '--uchime_ref': ValuedParameter('--', Name='uchime_ref',
+ Delimiter=' ', IsPath=True),
+
+ # Specifies output uchime file for chimera results
+ '--uchimeout': ValuedParameter('--', Name='uchimeout',
+ Delimiter=' ', IsPath=True),
+
+ # Parameters for sorting raw fasta files
+ # specifies fasta filepath to sort by length
+ '--sortbylength': ValuedParameter('--', Name='sortbylength',
+ Delimiter=' ', IsPath=True),
+
+ # specifies fasta filepath to dereplicate, sort by abundance
+ '--derep_fulllength': ValuedParameter('--', Name='derep_fulllength',
+ Delimiter=' ', IsPath=True),
+
+ # Adds label showing abundance of dereplicated sequences
+ '--sizeout': FlagParameter('--', Name='sizeout'),
+
+ # Other parameters for clustering/sorting
+
+ # Needed to use data sorted by abundance and use sizeorder option
+ '--usersort': FlagParameter('--', Name='usersort'),
+
+ # specifies percent identity for clustering
+ '--id': ValuedParameter('--', Name='id', Delimiter=' ', IsPath=False),
+
+ # specifies minimum sequence length allowed
+ '--minseqlength': ValuedParameter('--', Name='minseqlength',
+ Delimiter=' ', IsPath=False),
+
+ # if set as --strand both will enable reverse strand matching
+ '--strand': ValuedParameter('--', Name='strand', Delimiter=' ',
+ IsPath=False),
+
+ # Word length to use, in base pairs
+ '--wordlength': ValuedParameter('--', Name='wordlength',
+ Delimiter=' ', IsPath=False),
+
+ # Max rejects, lower = more speed, higher=higher accuracy
+ '--maxrejects': ValuedParameter('--', Name='maxrejects',
+ Delimiter=' ', IsPath=False),
+
+ # Max accepts, should be greater than 1 for sizeorder option
+ '--maxaccepts': ValuedParameter('--', Name='maxaccepts',
+ Delimiter=' ', IsPath=False),
+
+ # Option to cluster to most abundant seed
+ '--sizeorder': FlagParameter('--', Name='sizeorder'),
+
+ # Chimera-specific parameters
+ # abundance skew for comparing parent/child putative clusters
+ '--abskew': ValuedParameter('--', Name='abskew', Delimiter=' ',
+ IsPath=False),
+
+ # min score to be classified as chimeric
+ '--minh': ValuedParameter('--', Name='minh', Delimiter=' ',
+ IsPath=False),
+
+ # weight of no vote
+ '--xn': ValuedParameter('--', Name='xn', Delimiter=' ',
+ IsPath=False),
+
+ # pseudo count prior for no votes
+ '--dn': ValuedParameter('--', Name='dn', Delimiter=' ',
+ IsPath=False),
+
+ # Minimum number of diffs in a segment
+ '--mindiffs': ValuedParameter('--', Name='mindiffs', Delimiter=' ',
+ IsPath=False),
+
+ # Minimum divergence between query and ref sequence
+ '--mindiv': ValuedParameter('--', Name='mindiv', Delimiter=' ',
+ IsPath=False),
+
+ # Threads allocated for multithreading calls.
+ '--threads': ValuedParameter('--', Name='threads',
+ Delimiter=' ', IsPath=False)
+ }
+
+ _suppress_stdout = False
+ _suppress_stderr = False
+
+ def _input_as_parameters(self, data):
+ """ Set the input path (a fasta filepath)
+ """
+ # The list of values which can be passed on a per-run basis
+ allowed_values = ['--uc', '--output', '--log',
+ '--sortbylength', '--derep_fulllength', '--sizeout',
+ '--minseqlength', '--strand', '--wordlength',
+ '--maxrejects', '--usearch_global', '--db',
+ '--cluster_smallmem', '--cluster_fast', '--id',
+ '--maxaccepts', '--sizeorder', '--usersort',
+ '--abskew', '--minh', '--xn', '--dn', '--mindiffs',
+ '--mindiv', '--uchime_denovo', '--uchimeout',
+ '--uchime_ref', '--threads'
+ ]
+
+ unsupported_parameters = set(data.keys()) - set(allowed_values)
+ if unsupported_parameters:
+ raise ApplicationError(
+ "Unsupported parameter(s) passed when calling %s: %s" %
+ (self._command, ' '.join(unsupported_parameters)))
+
+ for v in allowed_values:
+ # turn the parameter off so subsequent runs are not
+ # affected by parameter settings from previous runs
+ self.Parameters[v].off()
+ if v in data:
+ # turn the parameter on if specified by the user
+ self.Parameters[v].on(data[v])
+
+ return ''
+
+ def _get_result_paths(self, data):
+ """ Set the result paths """
+
+ result = {}
+
+ result['Output'] = ResultPath(
+ Path=self.Parameters['--output'].Value,
+ IsWritten=self.Parameters['--output'].isOn())
+
+ result['ClusterFile'] = ResultPath(
+ Path=self.Parameters['--uc'].Value,
+ IsWritten=self.Parameters['--uc'].isOn())
+
+ return result
+
+ def _accept_exit_status(self, exit_status):
+ """ Test for acceptable exit status
+
+ usearch can seg fault and still generate a parsable .uc file
+ so we explicitly check the exit status
+
+ """
+ return exit_status == 0
+
+ def getHelp(self):
+ """Method that points to documentation"""
+ help_str =\
+ """
+ USEARCH is hosted at:
+ http://www.drive5.com/usearch/
+
+ The following papers should be cited if this resource is used:
+
+ Edgar,RC, Haas,BJ, Clemente,JC, Quince,C, Knight,R (2011) UCHIME
+ improves sensitivity and speed of chimera detection, Bioinformatics
+ """
+ return help_str
+
+# Start Usearch61 convenience functions
+
+
+def usearch61_ref_cluster(seq_path,
+ refseqs_fp,
+ percent_id=0.97,
+ rev=False,
+ save_intermediate_files=True,
+ minlen=64,
+ output_dir='.',
+ remove_usearch_logs=False,
+ verbose=False,
+ wordlength=8,
+ usearch_fast_cluster=False,
+ usearch61_sort_method='abundance',
+ otu_prefix="denovo",
+ usearch61_maxrejects=32,
+ usearch61_maxaccepts=1,
+ sizeorder=False,
+ suppress_new_clusters=False,
+ threads=1.0,
+ HALT_EXEC=False
+ ):
+ """ Returns dictionary of cluster IDs:seq IDs
+
+ Overall function for reference-based clustering with usearch61
+
+ seq_path: fasta filepath to be clustered with usearch61
+ refseqs_fp: reference fasta filepath, used to cluster sequences against.
+ percent_id: percentage id to cluster at
+ rev: enable reverse strand matching for clustering
+ save_intermediate_files: Saves intermediate files created during clustering
+ minlen: minimum sequence length
+ output_dir: directory to output log, OTU mapping, and intermediate files
+    remove_usearch_logs: If True, suppresses creation of usearch log files
+ verbose: print current processing step to stdout
+ wordlength: word length to use for clustering
+ usearch_fast_cluster: Use usearch61 fast cluster option, not as memory
+ efficient as the default cluster_smallmem option, requires sorting by
+ length, and does not allow reverse strand matching.
+    usearch61_sort_method: sort sequences by 'abundance' or 'length' using
+    usearch61, or skip sorting by passing None.
+ otu_prefix: label to place in front of OTU IDs, used to prevent duplicate
+ IDs from appearing with reference based OTU picking.
+ usearch61_maxrejects: Number of rejects allowed by usearch61
+ usearch61_maxaccepts: Number of accepts allowed by usearch61
+ sizeorder: used for clustering based upon abundance of seeds (only applies
+ when doing open reference de novo clustering)
+    suppress_new_clusters: If True, prevents de novo clustering of sequences
+    that fail to hit the reference (closed-reference behavior).
+    threads: number of threads to allocate to usearch61 calls
+ HALT_EXEC: application controller option to halt execution.
+
+ Description of analysis workflows
+ ---------------------------------
+ closed-reference approach:
+ dereplicate sequences first, do reference based clustering,
+ merge clusters/failures and dereplicated data,
+ write OTU mapping and failures file.
+
+ open-reference approach:
+ dereplicate sequences first, do reference based clustering, parse failures,
+ sort failures fasta according to chosen method, cluster failures, merge
+ reference clustering results/de novo results/dereplicated data, write
+ OTU mapping file.
+
+ Dereplication should save processing time for large datasets.
+
+ """
+
+ files_to_remove = []
+
+ # Need absolute paths to avoid potential problems with app controller
+ if output_dir:
+ output_dir = join(abspath(output_dir), '')
+
+ seq_path = abspath(seq_path)
+
+ try:
+
+ if verbose:
+ print "Presorting sequences according to abundance..."
+ intermediate_fasta, dereplicated_uc, app_result =\
+ sort_by_abundance_usearch61(seq_path, output_dir, rev,
+ minlen, remove_usearch_logs, HALT_EXEC,
+ output_fna_filepath=join(
+ output_dir,
+ 'abundance_sorted.fna'),
+ output_uc_filepath=join(
+ output_dir,
+ 'abundance_sorted.uc'),
+ threads=threads)
+ if not save_intermediate_files:
+ files_to_remove.append(intermediate_fasta)
+ files_to_remove.append(dereplicated_uc)
+
+ if verbose:
+ print "Performing reference based clustering..."
+ clusters_fp, app_result = usearch61_cluster_ref(intermediate_fasta,
+ refseqs_fp, percent_id, rev, minlen, output_dir,
+ remove_usearch_logs, wordlength, usearch61_maxrejects,
+ usearch61_maxaccepts, HALT_EXEC,
+ output_uc_filepath=join(
+ output_dir,
+ 'ref_clustered.uc'),
+ threads=threads)
+ if not save_intermediate_files:
+ files_to_remove.append(clusters_fp)
+
+ clusters, failures =\
+ parse_usearch61_clusters(open(clusters_fp, "U"), otu_prefix="",
+ ref_clustered=True)
+ dereplicated_clusters =\
+ parse_dereplicated_uc(open(dereplicated_uc, "U"))
+ clusters = merge_clusters_dereplicated_seqs(clusters,
+ dereplicated_clusters)
+ failures = merge_failures_dereplicated_seqs(failures,
+ dereplicated_clusters)
+
+ if not suppress_new_clusters and failures:
+ if verbose:
+ print "Parsing out sequences that failed to cluster..."
+ failures_fasta = parse_usearch61_failures(seq_path, set(failures),
+ output_fasta_fp=join(output_dir, "failures_parsed.fna"))
+ if not save_intermediate_files:
+ files_to_remove.append(failures_fasta)
+ denovo_clusters = usearch61_denovo_cluster(failures_fasta,
+ percent_id, rev, save_intermediate_files, minlen, output_dir,
+ remove_usearch_logs, verbose, wordlength, usearch_fast_cluster,
+ usearch61_sort_method, otu_prefix, usearch61_maxrejects,
+ usearch61_maxaccepts, sizeorder, threads, HALT_EXEC)
+ failures = []
+
+ # Merge ref and denovo clusters
+ clusters.update(denovo_clusters)
+
+ except ApplicationError:
+ raise ApplicationError('Error running usearch61. Possible causes are '
+ 'an unsupported version (the currently supported version is '
+ 'usearch v6.1.544) or an improperly formatted input file')
+
+ except ApplicationNotFoundError:
+ remove_files(files_to_remove)
+ raise ApplicationNotFoundError('usearch61 not found, is it properly '
+ 'installed?')
+
+ if not save_intermediate_files:
+ remove_files(files_to_remove)
+
+ return clusters, failures
+
+
+def usearch61_denovo_cluster(seq_path,
+ percent_id=0.97,
+ rev=False,
+ save_intermediate_files=True,
+ minlen=64,
+ output_dir='.',
+ remove_usearch_logs=False,
+ verbose=False,
+ wordlength=8,
+ usearch_fast_cluster=False,
+ usearch61_sort_method='abundance',
+ otu_prefix="denovo",
+ usearch61_maxrejects=32,
+ usearch61_maxaccepts=1,
+ sizeorder=False,
+ threads=1.0,
+ HALT_EXEC=False,
+ file_prefix="denovo_"
+ ):
+ """ Returns dictionary of cluster IDs:seq IDs
+
+ Overall function for de novo clustering with usearch61
+
+ seq_path: fasta filepath to be clustered with usearch61
+ percent_id: percentage id to cluster at
+ rev: enable reverse strand matching for clustering
+ save_intermediate_files: Saves intermediate files created during clustering
+ minlen: minimum sequence length
+ output_dir: directory to output log, OTU mapping, and intermediate files
+ remove_usearch_logs: if True, suppresses creation of usearch log files
+ verbose: print current processing step to stdout
+ wordlength: word length to use for clustering
+ usearch_fast_cluster: Use the usearch61 cluster_fast option; it is less
+ memory efficient than the default cluster_smallmem option, sorts by
+ length automatically, and does not allow reverse strand matching.
+ usearch61_sort_method: Sort sequences by 'abundance' or 'length' using
+ usearch61, or skip sorting by passing None.
+ otu_prefix: label to place in front of OTU IDs, used to prevent duplicate
+ IDs from appearing with reference based OTU picking.
+ usearch61_maxrejects: Number of rejects allowed by usearch61
+ usearch61_maxaccepts: Number of accepts allowed by usearch61
+ sizeorder: used for clustering based upon abundance of seeds
+ threads: number of threads to use
+ HALT_EXEC: application controller option to halt execution.
+ file_prefix: prefix for intermediate filenames written by this function.
+ """
+
+ files_to_remove = []
+
+ # Need absolute paths to avoid potential problems with app controller
+ if output_dir:
+ output_dir = abspath(output_dir) + '/'
+ seq_path = abspath(seq_path)
+
+ try:
+ if verbose and usearch61_sort_method is not None and\
+ not usearch_fast_cluster:
+ print "Sorting sequences according to %s..." % usearch61_sort_method
+
+ # fast sorting option automatically performs length sorting
+ if usearch61_sort_method == 'abundance' and not usearch_fast_cluster:
+ intermediate_fasta, dereplicated_uc, app_result =\
+ sort_by_abundance_usearch61(seq_path, output_dir, rev,
+ minlen, remove_usearch_logs, HALT_EXEC,
+ output_fna_filepath=join(
+ output_dir,
+ file_prefix + 'abundance_sorted.fna'),
+ output_uc_filepath=join(output_dir,
+ file_prefix + 'abundance_sorted.uc'), threads=threads)
+ if not save_intermediate_files:
+ files_to_remove.append(intermediate_fasta)
+ files_to_remove.append(dereplicated_uc)
+ elif usearch61_sort_method == 'length' and not usearch_fast_cluster:
+ intermediate_fasta, app_result =\
+ sort_by_length_usearch61(seq_path, output_dir, minlen,
+ remove_usearch_logs, HALT_EXEC,
+ output_fna_filepath=join(output_dir,
+ file_prefix + 'length_sorted.fna'))
+ if not save_intermediate_files:
+ files_to_remove.append(intermediate_fasta)
+ else:
+ intermediate_fasta = seq_path
+
+ if verbose:
+ print "Clustering sequences de novo..."
+
+ if usearch_fast_cluster:
+ clusters_fp, app_result = usearch61_fast_cluster(
+ intermediate_fasta,
+ percent_id, minlen, output_dir, remove_usearch_logs, wordlength,
+ usearch61_maxrejects, usearch61_maxaccepts, HALT_EXEC,
+ output_uc_filepath=join(
+ output_dir,
+ file_prefix + 'fast_clustered.uc'), threads=threads)
+ if not save_intermediate_files:
+ files_to_remove.append(clusters_fp)
+ else:
+ clusters_fp, app_result =\
+ usearch61_smallmem_cluster(intermediate_fasta, percent_id,
+ minlen, rev, output_dir, remove_usearch_logs, wordlength,
+ usearch61_maxrejects, usearch61_maxaccepts, sizeorder, HALT_EXEC,
+ output_uc_filepath=join(output_dir,
+ file_prefix + 'smallmem_clustered.uc'))
+ if not save_intermediate_files:
+ files_to_remove.append(clusters_fp)
+
+ except ApplicationError:
+ raise ApplicationError('Error running usearch61. Possible causes are '
+ 'an unsupported version (the currently supported version is '
+ 'usearch v6.1.544) or an improperly formatted input file')
+
+ except ApplicationNotFoundError:
+ remove_files(files_to_remove)
+ raise ApplicationNotFoundError('usearch61 not found, is it properly ' +
+ 'installed?')
+
+ if usearch61_sort_method == 'abundance' and not usearch_fast_cluster:
+ de_novo_clusters, failures =\
+ parse_usearch61_clusters(open(clusters_fp, "U"), otu_prefix)
+ dereplicated_clusters =\
+ parse_dereplicated_uc(open(dereplicated_uc, "U"))
+ clusters = merge_clusters_dereplicated_seqs(de_novo_clusters,
+ dereplicated_clusters)
+
+ else:
+ clusters, failures =\
+ parse_usearch61_clusters(open(clusters_fp, "U"), otu_prefix)
+
+ if not save_intermediate_files:
+ remove_files(files_to_remove)
+
+ return clusters
+
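+# A minimal usage sketch for usearch61_denovo_cluster (hypothetical
+# filenames; assumes a usearch v6.1.544 binary named usearch61 on the PATH):
+#
+#   clusters = usearch61_denovo_cluster('seqs.fna', percent_id=0.97,
+#                                       output_dir='denovo_out/',
+#                                       save_intermediate_files=False)
+#   # clusters maps OTU IDs (e.g. 'denovo0') to lists of member seq IDs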
+
+# Start fasta sorting functions
+def sort_by_abundance_usearch61(seq_path,
+ output_dir='.',
+ rev=False,
+ minlen=64,
+ remove_usearch_logs=False,
+ HALT_EXEC=False,
+ output_fna_filepath=None,
+ output_uc_filepath=None,
+ log_name="abundance_sorted.log",
+ threads=1.0):
+ """ usearch61 application call to sort fasta file by abundance.
+
+ seq_path: fasta filepath to be clustered with usearch61
+ output_dir: directory to output log, OTU mapping, and intermediate files
+ rev: enable reverse strand matching for clustering/sorting
+ minlen: minimum sequence length
+ remove_usearch_logs: if True, suppresses creation of usearch log files
+ HALT_EXEC: application controller option to halt execution
+ output_fna_filepath: path to write the sorted fasta file
+ output_uc_filepath: path to write usearch61 generated .uc file
+ log_name: filename for the usearch61 generated log file (written to
+ output_dir)
+ threads: number of threads to use
+ """
+
+ if not output_fna_filepath:
+ _, output_fna_filepath = mkstemp(prefix='abundance_sorted',
+ suffix='.fna')
+
+ if not output_uc_filepath:
+ _, output_uc_filepath = mkstemp(prefix='abundance_sorted',
+ suffix='.uc')
+
+ log_filepath = join(output_dir, log_name)
+
+ params = {'--minseqlength': minlen,
+ '--sizeout': True,
+ '--derep_fulllength': seq_path,
+ '--output': output_fna_filepath,
+ '--uc': output_uc_filepath,
+ '--threads': threads
+ }
+
+ if rev:
+ params['--strand'] = 'both'
+ if not remove_usearch_logs:
+ params['--log'] = log_filepath
+
+ app = Usearch61(params, WorkingDir=output_dir, HALT_EXEC=HALT_EXEC)
+
+ app_result = app()
+
+ return output_fna_filepath, output_uc_filepath, app_result
+
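+# Sketch of a direct call (hypothetical filenames): this wraps usearch61's
+# --derep_fulllength command, so the returned triple is the dereplicated,
+# abundance-annotated fasta, the .uc file mapping replicates to seeds,
+# and the burrito result object:
+#
+#   fna, uc, res = sort_by_abundance_usearch61(
+#       'seqs.fna', output_dir='out/',
+#       output_fna_filepath='out/sorted.fna',
+#       output_uc_filepath='out/sorted.uc')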
+
+def sort_by_length_usearch61(seq_path,
+ output_dir=".",
+ minlen=64,
+ remove_usearch_logs=False,
+ HALT_EXEC=False,
+ output_fna_filepath=None,
+ log_name="length_sorted.log"):
+ """ usearch61 application call to sort fasta file by length.
+
+ seq_path: fasta filepath to be clustered with usearch61
+ output_dir: directory to output log, OTU mapping, and intermediate files
+ minlen: minimum sequence length
+ remove_usearch_logs: if True, suppresses creation of usearch log files
+ HALT_EXEC: application controller option to halt execution
+ output_fna_filepath: path to write the sorted fasta file
+ log_name: filename for the usearch61 generated log file (written to
+ output_dir)
+ """
+
+ if not output_fna_filepath:
+ _, output_fna_filepath = mkstemp(prefix='length_sorted', suffix='.fna')
+
+ log_filepath = join(output_dir, log_name)
+
+ params = {'--minseqlength': minlen,
+ '--sortbylength': seq_path,
+ '--output': output_fna_filepath
+ }
+ if not remove_usearch_logs:
+ params['--log'] = log_filepath
+
+ app = Usearch61(params, WorkingDir=output_dir, HALT_EXEC=HALT_EXEC)
+
+ app_result = app()
+
+ return output_fna_filepath, app_result
+
+# End fasta sorting functions
+
+# Start reference clustering functions
+
+
+def usearch61_cluster_ref(intermediate_fasta,
+ refseqs_fp,
+ percent_id=0.97,
+ rev=False,
+ minlen=64,
+ output_dir=".",
+ remove_usearch_logs=False,
+ wordlength=8,
+ usearch61_maxrejects=32,
+ usearch61_maxaccepts=1,
+ HALT_EXEC=False,
+ output_uc_filepath=None,
+ log_filepath="ref_clustered.log",
+ threads=1.0
+ ):
+ """ Cluster input fasta seqs against reference database
+
+ intermediate_fasta: fasta filepath to be clustered with usearch61
+ refseqs_fp: reference fasta filepath, used to cluster sequences against.
+ percent_id: percentage id to cluster at
+ rev: enable reverse strand matching for clustering
+ minlen: minimum sequence length
+ output_dir: directory to output log, OTU mapping, and intermediate files
+ remove_usearch_logs: if True, suppresses creation of usearch log files
+ wordlength: word length to use for clustering
+ usearch61_maxrejects: Number of rejects allowed by usearch61
+ usearch61_maxaccepts: Number of accepts allowed by usearch61
+ output_uc_filepath: path to write usearch61 generated .uc file
+ log_filepath: filename for the usearch61 generated log file (written to
+ output_dir)
+ threads: number of threads to use
+ HALT_EXEC: application controller option to halt execution.
+ """
+
+ log_filepath = join(output_dir, log_filepath)
+
+ params = {
+ '--usearch_global': intermediate_fasta,
+ '--db': refseqs_fp,
+ '--minseqlength': minlen,
+ '--id': percent_id,
+ '--uc': output_uc_filepath,
+ '--wordlength': wordlength,
+ '--maxrejects': usearch61_maxrejects,
+ '--maxaccepts': usearch61_maxaccepts,
+ '--threads': threads
+ }
+
+ if not remove_usearch_logs:
+ params['--log'] = log_filepath
+ if rev:
+ params['--strand'] = 'both'
+ else:
+ params['--strand'] = 'plus'
+
+ clusters_fp = output_uc_filepath
+
+ app = Usearch61(params, WorkingDir=output_dir, HALT_EXEC=HALT_EXEC)
+
+ app_result = app()
+
+ return clusters_fp, app_result
+
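+# Usage sketch (hypothetical paths): cluster dereplicated reads against a
+# reference database at 97% identity; hits and misses are recorded in the
+# .uc file, which parse_usearch61_clusters below can consume:
+#
+#   uc_fp, res = usearch61_cluster_ref(
+#       'out/sorted.fna', 'refs.fna', percent_id=0.97, output_dir='out/',
+#       output_uc_filepath='out/ref_clustered.uc')
+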
+# End reference clustering functions
+
+# Start de novo clustering functions
+
+
+def usearch61_fast_cluster(intermediate_fasta,
+ percent_id=0.97,
+ minlen=64,
+ output_dir=".",
+ remove_usearch_logs=False,
+ wordlength=8,
+ usearch61_maxrejects=8,
+ usearch61_maxaccepts=1,
+ HALT_EXEC=False,
+ output_uc_filepath=None,
+ log_name="fast_clustered.log",
+ threads=1.0):
+ """ Performs usearch61 de novo fast clustering via cluster_fast option
+
+ Sorts input by length automatically (no pre-sorting required) and does
+ not support reverse strand matching
+
+ intermediate_fasta: fasta filepath to be clustered with usearch61
+ percent_id: percentage id to cluster at
+ minlen: minimum sequence length
+ output_dir: directory to output log, OTU mapping, and intermediate files
+ remove_usearch_logs: if True, suppresses creation of usearch log files
+ wordlength: word length to use for initial high probability sequence matches
+ usearch61_maxrejects: Set to 'default' or an int value specifying max
+ rejects
+ usearch61_maxaccepts: Number of accepts allowed by usearch61
+ HALT_EXEC: application controller option to halt execution
+ output_uc_filepath: Path to write clusters (.uc) file.
+ log_name: filename for the usearch61 generated log file (written to
+ output_dir)
+ threads: number of threads to use
+ """
+
+ log_filepath = join(output_dir, log_name)
+
+ params = {'--minseqlength': minlen,
+ '--cluster_fast': intermediate_fasta,
+ '--id': percent_id,
+ '--uc': output_uc_filepath,
+ '--wordlength': wordlength,
+ '--maxrejects': usearch61_maxrejects,
+ '--maxaccepts': usearch61_maxaccepts,
+ '--usersort': True,
+ '--threads': threads
+ }
+
+ if not remove_usearch_logs:
+ params['--log'] = log_filepath
+
+ clusters_fp = output_uc_filepath
+
+ app = Usearch61(params, WorkingDir=output_dir, HALT_EXEC=HALT_EXEC)
+
+ app_result = app()
+
+ return clusters_fp, app_result
+
+
+def usearch61_smallmem_cluster(intermediate_fasta,
+ percent_id=0.97,
+ minlen=64,
+ rev=False,
+ output_dir=".",
+ remove_usearch_logs=False,
+ wordlength=8,
+ usearch61_maxrejects=32,
+ usearch61_maxaccepts=1,
+ sizeorder=False,
+ HALT_EXEC=False,
+ output_uc_filepath=None,
+ log_name="smallmem_clustered.log",
+ sizeout=False,
+ consout_filepath=None):
+ """ Performs usearch61 de novo clustering via cluster_smallmem option
+
+ Expects pre-sorted input (the --usersort flag is passed, so abundance-
+ or length-sorted data is accepted) and supports reverse strand matching
+ via the rev option
+
+ intermediate_fasta: fasta filepath to be clustered with usearch61
+ percent_id: percentage id to cluster at
+ minlen: minimum sequence length
+ rev: will enable reverse strand matching if True
+ output_dir: directory to output log, OTU mapping, and intermediate files
+ remove_usearch_logs: if True, suppresses creation of usearch log files
+ wordlength: word length to use for initial high probability sequence matches
+ usearch61_maxrejects: Set to 'default' or an int value specifying max
+ rejects
+ usearch61_maxaccepts: Number of accepts allowed by usearch61
+ HALT_EXEC: application controller option to halt execution
+ output_uc_filepath: Path to write clusters (.uc) file.
+ log_name: filename for the usearch61 generated log file (written to
+ output_dir)
+ sizeout: If True, will save abundance data in output fasta labels.
+ consout_filepath: Needs to be set to save clustered consensus fasta
+ filepath used for chimera checking.
+ """
+
+ log_filepath = join(output_dir, log_name)
+
+ params = {'--minseqlength': minlen,
+ '--cluster_smallmem': intermediate_fasta,
+ '--id': percent_id,
+ '--uc': output_uc_filepath,
+ '--wordlength': wordlength,
+ '--maxrejects': usearch61_maxrejects,
+ '--maxaccepts': usearch61_maxaccepts,
+ '--usersort': True
+ }
+
+ if sizeorder:
+ params['--sizeorder'] = True
+ if not remove_usearch_logs:
+ params['--log'] = log_filepath
+ if rev:
+ params['--strand'] = 'both'
+ else:
+ params['--strand'] = 'plus'
+ if sizeout:
+ params['--sizeout'] = True
+ if consout_filepath:
+ params['--consout'] = consout_filepath
+
+ clusters_fp = output_uc_filepath
+
+ app = Usearch61(params, WorkingDir=output_dir, HALT_EXEC=HALT_EXEC)
+
+ app_result = app()
+
+ return clusters_fp, app_result
+
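+# Sketch of the chimera-checking preparation path (hypothetical filenames):
+# passing consout_filepath makes usearch61 write per-cluster consensus
+# sequences, which usearch61_chimera_check_denovo below can consume:
+#
+#   uc_fp, res = usearch61_smallmem_cluster(
+#       'sorted.fna', sizeout=True,
+#       output_uc_filepath='clustered.uc',
+#       consout_filepath='consensus.fna')
+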
+# End de novo clustering functions
+
+# Start Chimera checking functions
+
+
+def usearch61_chimera_check_denovo(abundance_fp,
+ uchime_denovo_fp,
+ minlen=64,
+ output_dir=".",
+ remove_usearch_logs=False,
+ uchime_denovo_log_fp="uchime_denovo.log",
+ usearch61_minh=0.28,
+ usearch61_xn=8.0,
+ usearch61_dn=1.4,
+ usearch61_mindiffs=3,
+ usearch61_mindiv=0.8,
+ usearch61_abundance_skew=2.0,
+ HALT_EXEC=False):
+ """ Does de novo, abundance based chimera checking with usearch61
+
+ abundance_fp: input consensus fasta file with abundance information for
+ each cluster.
+ uchime_denovo_fp: output uchime file for chimera results.
+ minlen: minimum sequence length for usearch input fasta seqs.
+ output_dir: output directory
+ remove_usearch_logs: if True, suppresses creation of the log file.
+ uchime_denovo_log_fp: output filepath for log file.
+ usearch61_minh: Minimum score (h) to be classified as chimera.
+ Increasing this value tends to reduce the number of false positives
+ (and also sensitivity).
+ usearch61_xn: Weight of "no" vote. Increasing this value tends to
+ reduce the number of false positives (and also sensitivity).
+ usearch61_dn: Pseudo-count prior for "no" votes (n). Increasing this
+ value tends to reduce the number of false positives (and also
+ sensitivity).
+ usearch61_mindiffs: Minimum number of diffs in a segment. Increasing this
+ value tends to reduce the number of false positives while reducing
+ sensitivity to very low-divergence chimeras.
+ usearch61_mindiv: Minimum divergence, i.e. 100% - identity between the
+ query and closest reference database sequence. Expressed as a percentage,
+ so the default is 0.8%, which allows chimeras that are up to 99.2% similar
+ to a reference sequence.
+ usearch61_abundance_skew: abundance skew for de novo chimera comparisons.
+ HALT_EXEC: halt execution and return the command used by the app controller.
+ """
+
+ params = {'--minseqlength': minlen,
+ '--uchime_denovo': abundance_fp,
+ '--uchimeout': uchime_denovo_fp,
+ '--minh': usearch61_minh,
+ '--xn': usearch61_xn,
+ '--dn': usearch61_dn,
+ '--mindiffs': usearch61_mindiffs,
+ '--mindiv': usearch61_mindiv,
+ '--abskew': usearch61_abundance_skew
+ }
+
+ if not remove_usearch_logs:
+ params['--log'] = uchime_denovo_log_fp
+
+ app = Usearch61(params, WorkingDir=output_dir, HALT_EXEC=HALT_EXEC)
+
+ app_result = app()
+
+ return uchime_denovo_fp, app_result
+
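+# Usage sketch (hypothetical filenames; expects a consensus fasta carrying
+# ;size=N; annotations, e.g. from usearch61_smallmem_cluster with
+# sizeout=True and consout_filepath set):
+#
+#   uchime_fp, res = usearch61_chimera_check_denovo(
+#       'consensus.fna', 'chimeras_denovo.uchime', output_dir='out/')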
+
+def usearch61_chimera_check_ref(abundance_fp,
+ uchime_ref_fp,
+ reference_seqs_fp,
+ minlen=64,
+ output_dir=".",
+ remove_usearch_logs=False,
+ uchime_ref_log_fp="uchime_ref.log",
+ usearch61_minh=0.28,
+ usearch61_xn=8.0,
+ usearch61_dn=1.4,
+ usearch61_mindiffs=3,
+ usearch61_mindiv=0.8,
+ threads=1.0,
+ HALT_EXEC=False):
+ """ Does reference based chimera checking with usearch61
+
+ abundance_fp: input consensus fasta file with abundance information for
+ each cluster.
+ uchime_ref_fp: output uchime filepath for reference results
+ reference_seqs_fp: reference fasta database for chimera checking.
+ minlen: minimum sequence length for usearch input fasta seqs.
+ output_dir: output directory
+ remove_usearch_logs: if True, suppresses creation of the log file.
+ uchime_ref_log_fp: output filepath for log file.
+ usearch61_minh: Minimum score (h) to be classified as chimera.
+ Increasing this value tends to reduce the number of false positives
+ (and also sensitivity).
+ usearch61_xn: Weight of "no" vote. Increasing this value tends to
+ reduce the number of false positives (and also sensitivity).
+ usearch61_dn: Pseudo-count prior for "no" votes (n). Increasing this
+ value tends to reduce the number of false positives (and also
+ sensitivity).
+ usearch61_mindiffs: Minimum number of diffs in a segment. Increasing this
+ value tends to reduce the number of false positives while reducing
+ sensitivity to very low-divergence chimeras.
+ usearch61_mindiv: Minimum divergence, i.e. 100% - identity between the
+ query and closest reference database sequence. Expressed as a percentage,
+ so the default is 0.8%, which allows chimeras that are up to 99.2% similar
+ to a reference sequence.
+ threads: number of threads to use
+ HALT_EXEC: halt execution and return the command used by the app controller.
+ """
+
+ params = {'--minseqlength': minlen,
+ '--uchime_ref': abundance_fp,
+ '--uchimeout': uchime_ref_fp,
+ '--db': reference_seqs_fp,
+ '--minh': usearch61_minh,
+ '--xn': usearch61_xn,
+ '--dn': usearch61_dn,
+ '--mindiffs': usearch61_mindiffs,
+ '--mindiv': usearch61_mindiv,
+ # uchime_ref only works on the plus strand, per the usearch docs
+ '--strand': 'plus',
+ '--threads': threads
+ }
+
+ if not remove_usearch_logs:
+ params['--log'] = uchime_ref_log_fp
+
+ app = Usearch61(params, WorkingDir=output_dir, HALT_EXEC=HALT_EXEC)
+
+ app_result = app()
+
+ return uchime_ref_fp, app_result
+
+# End chimera checking functions
+
+# Start parsing functions
+
+
+def parse_dereplicated_uc(dereplicated_uc_lines):
+ """ Return dict of seq ID:dereplicated seq IDs from dereplicated .uc lines
+
+ dereplicated_uc_lines: list of lines of .uc file from dereplicated seqs from
+ usearch61 (i.e. open file of abundance sorted .uc data)
+ """
+
+ dereplicated_clusters = {}
+
+ seed_hit_ix = 0
+ seq_id_ix = 8
+ seed_id_ix = 9
+
+ for line in dereplicated_uc_lines:
+ if line.startswith("#") or len(line.strip()) == 0:
+ continue
+ curr_line = line.strip().split('\t')
+ if curr_line[seed_hit_ix] == "S":
+ dereplicated_clusters[curr_line[seq_id_ix]] = []
+ if curr_line[seed_hit_ix] == "H":
+ curr_seq_id = curr_line[seq_id_ix]
+ dereplicated_clusters[curr_line[seed_id_ix]].append(curr_seq_id)
+
+ return dereplicated_clusters
+
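+# Worked example: only the record type (column 0), query label (column 8)
+# and target label (column 9) of the tab-separated .uc records are used.
+# For a seed record S labelled 'seq1' followed by a hit record H with
+# query 'seq2' and target 'seq1', this function returns
+# {'seq1': ['seq2']}.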
+
+def parse_usearch61_clusters(clustered_uc_lines,
+ otu_prefix='denovo',
+ ref_clustered=False):
+ """ Returns dict of cluster ID:seq IDs
+
+ clustered_uc_lines: lines from .uc file resulting from de novo clustering
+ otu_prefix: string added to beginning of OTU ID.
+ ref_clustered: If True, will attempt to create dict keys for clusters as
+ they are read from the .uc file, rather than from seed lines.
+ """
+
+ clusters = {}
+ failures = []
+
+ seed_hit_ix = 0
+ otu_id_ix = 1
+ seq_id_ix = 8
+ ref_id_ix = 9
+
+ for line in clustered_uc_lines:
+ if line.startswith("#") or len(line.strip()) == 0:
+ continue
+ curr_line = line.strip().split('\t')
+ if curr_line[seed_hit_ix] == "S":
+ # Need to split on semicolons for sequence IDs to handle case of
+ # abundance sorted data
+ clusters[otu_prefix + curr_line[otu_id_ix]] =\
+ [curr_line[seq_id_ix].split(';')[0].split()[0]]
+ if curr_line[seed_hit_ix] == "H":
+ curr_id = curr_line[seq_id_ix].split(';')[0].split()[0]
+ if ref_clustered:
+ try:
+ clusters[otu_prefix + curr_line[ref_id_ix]].append(curr_id)
+ except KeyError:
+ clusters[otu_prefix + curr_line[ref_id_ix]] = [curr_id]
+ else:
+ clusters[otu_prefix +
+ curr_line[otu_id_ix]].append(curr_id)
+ if curr_line[seed_hit_ix] == "N":
+ failures.append(curr_line[seq_id_ix].split(';')[0])
+
+ return clusters, failures
+
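+# Worked example: labels are truncated at the first ';' so abundance
+# annotations are dropped. Given a seed record S (cluster 0, label
+# 'seq1;size=5;'), a hit record H (cluster 0, query 'seq3;size=2;') and
+# a no-hit record N (query 'seq9;size=1;'), calling this function with
+# otu_prefix='denovo' returns ({'denovo0': ['seq1', 'seq3']}, ['seq9']).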
+
+def merge_clusters_dereplicated_seqs(de_novo_clusters,
+ dereplicated_clusters):
+ """ combines de novo clusters and dereplicated seqs to OTU id:seqs dict
+
+ de_novo_clusters: dict of OTU ID:clustered sequences
+ dereplicated_clusters: dict of seq IDs: dereplicated seq IDs
+ """
+
+ clusters = {}
+
+ for curr_denovo_key in de_novo_clusters.keys():
+ clusters[curr_denovo_key] = de_novo_clusters[curr_denovo_key]
+ curr_clusters = []
+ for curr_denovo_id in de_novo_clusters[curr_denovo_key]:
+ curr_clusters += dereplicated_clusters[curr_denovo_id]
+ clusters[curr_denovo_key] += curr_clusters
+
+ return clusters
+
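+# Worked example: each clustered seed is expanded with its dereplicated
+# replicates, e.g.
+#
+#   merge_clusters_dereplicated_seqs({'denovo0': ['seq1']},
+#                                    {'seq1': ['seq2', 'seq4']})
+#   # -> {'denovo0': ['seq1', 'seq2', 'seq4']}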
+
+def merge_failures_dereplicated_seqs(failures,
+ dereplicated_clusters):
+ """ Appends failures from dereplicated seqs to failures list
+
+ failures: list of failures
+ dereplicated_clusters: dict of seq IDs: dereplicated seq IDs
+ """
+
+ curr_failures = set(failures)
+ dereplicated_ids = set(dereplicated_clusters)
+
+ for curr_failure in curr_failures:
+ if curr_failure in dereplicated_ids:
+ failures += dereplicated_clusters[curr_failure]
+
+ return failures
+
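+# Worked example: failures that were dereplication seeds drag in their
+# replicates, e.g.
+#
+#   merge_failures_dereplicated_seqs(['seq9'], {'seq9': ['seq12']})
+#   # -> ['seq9', 'seq12']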
+
+def parse_usearch61_failures(seq_path,
+ failures,
+ output_fasta_fp):
+ """ Parses seq IDs from failures list, writes to output_fasta_fp
+
+ seq_path: filepath of original input fasta file.
+ failures: list/set of failure seq IDs
+ output_fasta_fp: path to write parsed sequences
+ """
+
+ parsed_out = open(output_fasta_fp, "w")
+
+ for label, seq in parse_fasta(open(seq_path, "U")):
+ curr_label = label.split()[0]
+ if curr_label in failures:
+ parsed_out.write(">%s\n%s\n" % (label, seq))
+ parsed_out.close()
+ return output_fasta_fp
+
+# End parsing functions
diff --git a/bfillings/vsearch.py b/bfillings/vsearch.py
new file mode 100644
index 0000000..5b1969b
--- /dev/null
+++ b/bfillings/vsearch.py
@@ -0,0 +1,575 @@
+# -----------------------------------------------------------------------------
+# Copyright (c) 2015--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+# -----------------------------------------------------------------------------
+
+""" Application controller for vsearch v1.1.1 """
+
+from os.path import abspath, join, dirname
+
+from burrito.parameters import ValuedParameter, FlagParameter
+from burrito.util import (CommandLineApplication, ResultPath,
+ ApplicationError)
+
+
+class Vsearch(CommandLineApplication):
+
+ """ Vsearch ApplicationController """
+
+ _command = 'vsearch'
+ _input_handler = '_input_as_parameters'
+ _parameters = {
+ # Output to specified FASTA file
+ '--output': ValuedParameter('--', Name='output', Delimiter=' ',
+ IsPath=True),
+
+ # Filename for UCLUST-like output
+ '--uc': ValuedParameter('--', Name='uc', Delimiter=' ',
+ IsPath=True),
+
+ # Filename for BLAST-like tab-separated output
+ '--blast6out': ValuedParameter('--', Name='blast6out', Delimiter=' ',
+ IsPath=True),
+
+ # Identity threshold for clustering (typically 0.97, i.e. 97%)
+ '--id': ValuedParameter('--', Name='id', Delimiter=' ',
+ IsPath=False, Value=None),
+
+ # ID definition, 0-4=CD-HIT,all,int,MBL,BLAST (default vsearch: 2)
+ '--iddef': ValuedParameter('--', Name='iddef',
+ Delimiter=' ', IsPath=False,
+ Value=None),
+
+ # Number of hits to accept and show per strand (default vsearch: 1)
+ '--maxaccepts':
+ ValuedParameter('--', Name='maxaccepts', Delimiter=' ', Value=None),
+
+ # Number of non-matching hits to consider (default vsearch: 32)
+ '--maxrejects':
+ ValuedParameter('--', Name='maxrejects', Delimiter=' ', Value=None),
+
+ # Indicate that input sequences are presorted
+ '--usersort': FlagParameter('--', Name='usersort'),
+
+ # Take into account the abundance annotations present
+ # in the input fasta file
+ '--sizein': FlagParameter('--', Name='sizein'),
+
+ # Add abundance annotations to the output fasta files
+ '--sizeout': FlagParameter('--', Name='sizeout'),
+
+ # Dereplicate exact sequences in the given FASTA file
+ '--derep_fulllength': ValuedParameter('--', Name='derep_fulllength',
+ Delimiter=' ', IsPath=True),
+
+ # Dereplicate plus or both strands (default vsearch: plus)
+ '--strand': ValuedParameter('--', Name='strand', Delimiter=' ',
+ IsPath=False),
+
+ # Discard sequences with an abundance value greater than integer
+ '--maxuniquesize': ValuedParameter('--', Name='maxuniquesize',
+ Delimiter=' ', IsPath=False),
+
+ # Discard sequences with an abundance value smaller than integer
+ '--minuniquesize': ValuedParameter('--', Name='minuniquesize',
+ Delimiter=' ',
+ IsPath=False),
+
+ # Abundance sort sequences in given FASTA file
+ '--sortbysize': ValuedParameter('--', Name='sortbysize', Delimiter=' ',
+ IsPath=True),
+
+ # When using --sortbysize, discard sequences
+ # with an abundance value greater than maxsize
+ '--maxsize': ValuedParameter('--', Name='maxsize', Delimiter=' ',
+ IsPath=False),
+
+ # When using --sortbysize, discard sequences
+ # with an abundance value smaller than minsize
+ '--minsize': ValuedParameter('--', Name='minsize', Delimiter=' ',
+ IsPath=False),
+
+ # Output cluster consensus sequences to FASTA file
+ '--consout': ValuedParameter('--', Name='consout', Delimiter=' ',
+ IsPath=True),
+
+ # Chimera detection: min abundance ratio of parent vs chimera
+ # (default vsearch: 2.0)
+ '--abskew': ValuedParameter('--', Name='abskew', Delimiter=' ',
+ IsPath=False, Value=None),
+ # Detect chimeras de novo
+ '--uchime_denovo': ValuedParameter('--', Name='uchime_denovo',
+ Delimiter=' ', IsPath=True),
+
+ # Detect chimeras using a reference database
+ '--uchime_ref': ValuedParameter('--', Name='uchime_ref',
+ Delimiter=' ', IsPath=True),
+
+ # Output chimera alignments to 3-way alignment file (filepath)
+ '--uchimealns': ValuedParameter('--', Name='uchimealns', Delimiter=' ',
+ IsPath=True),
+
+ # Output chimeric sequences to file (filepath)
+ '--chimeras': ValuedParameter('--', Name='chimeras',
+ Delimiter=' ', IsPath=True),
+
+ # Output non-chimera filepath
+ '--nonchimeras': ValuedParameter('--', Name='nonchimeras',
+ Delimiter=' ', IsPath=True),
+
+ # Reference database for --uchime_ref
+ '--db': ValuedParameter('--', Name='db', Delimiter=' ', IsPath=True),
+
+ # Output chimera info to a tab-separated file
+ '--uchimeout': ValuedParameter('--', Name='uchimeout', Delimiter=' ',
+ IsPath=True),
+
+ # Number of computation threads to use (1 to 256)
+ # note: by default, keep the value set to 1 for all commands
+ # since otherwise (if no other value is given) VSEARCH will use
+ # all available cores
+ '--threads': ValuedParameter('--', Name='threads', Delimiter=' ',
+ IsPath=False, Value="1"),
+
+ # Write messages, timing and memory info to file
+ '--log': ValuedParameter('--', Name='log', Delimiter=' ',
+ IsPath=True)
+ }
+
+ _suppress_stdout = False
+ _suppress_stderr = False
+
+ def _input_as_parameters(self, data):
+ """ Set the input path (a fasta filepath)
+ """
+ # The list of values which can be passed on a per-run basis
+ allowed_values = ['--uc', '--output', '--sortbysize',
+ '--consout', '--uchime_denovo',
+ '--derep_fulllength', '--maxuniquesize',
+ '--minuniquesize', '--sizein',
+ '--sizeout', '--strand', '--threads',
+ '--uchime_ref', '--chimeras',
+ '--nonchimeras', '--db', '--uchimeout',
+ '--blast6out', '--abskew',
+ '--maxsize', '--minsize']
+
+ unsupported_parameters = set(data.keys()) - set(allowed_values)
+ if unsupported_parameters:
+ raise ApplicationError(
+ "Unsupported parameter(s) passed when calling vsearch: %s" %
+ ' '.join(unsupported_parameters))
+
+ for v in allowed_values:
+ # turn the parameter off so subsequent runs are not
+ # affected by parameter settings from previous runs
+ self.Parameters[v].off()
+ if v in data:
+ # turn the parameter on if specified by the user
+ self.Parameters[v].on(data[v])
+
+ return ''
+
+ def _get_result_paths(self, data):
+ """ Set the result paths """
+
+ result = {}
+
+ result['Output'] = ResultPath(
+ Path=self.Parameters['--output'].Value,
+ IsWritten=self.Parameters['--output'].isOn())
+
+ result['ClusterFile'] = ResultPath(
+ Path=self.Parameters['--uc'].Value,
+ IsWritten=self.Parameters['--uc'].isOn())
+
+ # uchime 3-way global alignments
+ result['Output_aln'] = ResultPath(
+ Path=self.Parameters['--uchimealns'].Value,
+ IsWritten=self.Parameters['--uchimealns'].isOn())
+
+ # uchime tab-separated format
+ result['Output_tabular'] = ResultPath(
+ Path=self.Parameters['--uchimeout'].Value,
+ IsWritten=self.Parameters['--uchimeout'].isOn())
+
+ # chimeras fasta file output
+ result['Output_chimeras'] = ResultPath(
+ Path=self.Parameters['--chimeras'].Value,
+ IsWritten=self.Parameters['--chimeras'].isOn())
+
+ # nonchimeras fasta file output
+ result['Output_nonchimeras'] = ResultPath(
+ Path=self.Parameters['--nonchimeras'].Value,
+ IsWritten=self.Parameters['--nonchimeras'].isOn())
+
+ # log file
+ result['LogFile'] = ResultPath(
+ Path=self.Parameters['--log'].Value,
+ IsWritten=self.Parameters['--log'].isOn())
+
+ return result
+
+ def getHelp(self):
+ """Method that points to documentation"""
+ help_str = """
+ VSEARCH is hosted at:
+ https://github.com/torognes/vsearch
+ Please cite the above URL if this wrapper is used in published work.
+ """
+ return help_str
+
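+# A minimal sketch of driving the controller directly (hypothetical paths;
+# the convenience functions below are the intended entry points):
+#
+#   app = Vsearch(WorkingDir='/tmp')
+#   app.Parameters['--derep_fulllength'].on('seqs.fna')
+#   app.Parameters['--output'].on('derep.fna')
+#   result = app()  # invokes the vsearch binary with the enabled parameters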
+
+def vsearch_dereplicate_exact_seqs(
+ fasta_filepath,
+ output_filepath,
+ output_uc=False,
+ working_dir=None,
+ strand="both",
+ maxuniquesize=None,
+ minuniquesize=None,
+ sizein=False,
+ sizeout=True,
+ log_name="derep.log",
+ HALT_EXEC=False):
+ """ Generates clusters and fasta file of
+ dereplicated subsequences
+
+ Parameters
+ ----------
+
+ fasta_filepath : string
+ input filepath of fasta file to be dereplicated
+ output_filepath : string
+ write the dereplicated sequences to output_filepath
+ output_uc : boolean, optional
+ output dereplication results in a file using a
+ uclust-like format
+ working_dir : string, optional
+ directory path for storing intermediate output
+ strand : string, optional
+ when searching for strictly identical sequences,
+ check both strands (default) or the plus
+ strand only
+ maxuniquesize : integer, optional
+ discard sequences with an abundance value greater
+ than maxuniquesize
+ minuniquesize : integer, optional
+ discard sequences with an abundance value smaller
+ than minuniquesize
+ sizein : boolean, optional
+ take into account the abundance annotations present in
+ the input fasta file, (search for the pattern
+ "[>;]size=integer[;]" in sequence headers)
+ sizeout : boolean, optional
+ add abundance annotations to the output fasta file
+ (add the pattern ";size=integer;" to sequence headers)
+ log_name : string, optional
+ specifies log filename
+ HALT_EXEC : boolean, optional
+ used for debugging app controller
+
+ Returns
+ -------
+
+ output_filepath : string
+ filepath to dereplicated fasta file
+ uc_filepath : string
+ filepath to dereplication results in uclust-like format
+ log_filepath : string
+ filepath to log file
+ """
+
+ # write all vsearch output files to same directory
+ # as output_filepath if working_dir is not specified
+ if not working_dir:
+ working_dir = dirname(abspath(output_filepath))
+
+ app = Vsearch(WorkingDir=working_dir, HALT_EXEC=HALT_EXEC)
+
+ log_filepath = join(working_dir, log_name)
+ uc_filepath = None
+ if output_uc:
+ uc_filepath = join(working_dir, 'vsearch_uc_dereplicated.uc')
+ app.Parameters['--uc'].on(uc_filepath)
+
+ if maxuniquesize:
+ app.Parameters['--maxuniquesize'].on(maxuniquesize)
+ if minuniquesize:
+ app.Parameters['--minuniquesize'].on(minuniquesize)
+ if sizein:
+ app.Parameters['--sizein'].on()
+ if sizeout:
+ app.Parameters['--sizeout'].on()
+ if (strand == "both" or strand == "plus"):
+ app.Parameters['--strand'].on(strand)
+ else:
+ raise ValueError("Option --strand accepts only 'both'"
+ "or 'plus' values")
+ app.Parameters['--derep_fulllength'].on(fasta_filepath)
+ app.Parameters['--output'].on(output_filepath)
+ app.Parameters['--log'].on(log_filepath)
+
+ app_result = app()
+
+ return output_filepath, uc_filepath, log_filepath
+
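+# Usage sketch (hypothetical filenames; assumes a vsearch >= 1.1.1 binary
+# on the PATH):
+#
+#   out_fp, uc_fp, log_fp = vsearch_dereplicate_exact_seqs(
+#       'seqs.fna', 'derep.fna', output_uc=True, minuniquesize=2)
+#   # derep.fna carries ";size=N;" annotations since sizeout is on by default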
+
+def vsearch_sort_by_abundance(
+ fasta_filepath,
+ output_filepath,
+ working_dir=None,
+ minsize=None,
+ maxsize=None,
+ log_name="abundance_sort.log",
+ HALT_EXEC=False):
+ """ Fasta entries are sorted by decreasing abundance
+ (Fasta entries are assumed to be dereplicated with
+ the pattern "[>;]size=integer[;]" present in the
+ read label, ex. use function vsearch_dereplicate_exact_seqs
+ prior to calling this function)
+
+ Parameters
+ ----------
+
+ fasta_filepath : string
+ input fasta file (dereplicated fasta)
+ output_filepath : string
+ output filepath for the sorted sequences in fasta format
+ working_dir : string, optional
+ working directory to store intermediate files
+ minsize : integer, optional
+ discard sequences with an abundance value smaller than
+ minsize
+ maxsize : integer, optional
+ discard sequences with an abundance value greater than
+ maxsize
+ log_name : string, optional
+ log filename
+ HALT_EXEC : boolean, optional
+ used for debugging app controller
+
+ Returns
+ -------
+
+ output_filepath : string
+ filepath to sorted fasta file
+ log_filepath : string
+ filepath to log file
+ """
+
+ # set working dir to same directory as the output
+ # file (if not provided)
+ if not working_dir:
+ working_dir = dirname(abspath(output_filepath))
+
+ app = Vsearch(WorkingDir=working_dir, HALT_EXEC=HALT_EXEC)
+
+ log_filepath = join(working_dir, log_name)
+
+ if minsize:
+ app.Parameters['--minsize'].on(minsize)
+
+ if maxsize:
+ app.Parameters['--maxsize'].on(maxsize)
+
+ app.Parameters['--sortbysize'].on(fasta_filepath)
+ app.Parameters['--output'].on(output_filepath)
+ app.Parameters['--log'].on(log_filepath)
+
+ app_result = app()
+
+ return output_filepath, log_filepath
+
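+# Usage sketch (hypothetical filenames): sort dereplicated reads by
+# decreasing abundance, discarding singletons:
+#
+#   sorted_fp, log_fp = vsearch_sort_by_abundance('derep.fna', 'sorted.fna',
+#                                                 minsize=2)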
+
+def vsearch_chimera_filter_de_novo(
+ fasta_filepath,
+ working_dir,
+ output_chimeras=True,
+ output_nonchimeras=True,
+ output_alns=False,
+ output_tabular=False,
+ log_name="vsearch_uchime_de_novo_chimera_filtering.log",
+ HALT_EXEC=False):
+ """ Detect chimeras present in the fasta-formatted filename,
+ without external references (i.e. de novo). Automatically
+ sort the sequences in filename by decreasing abundance
+ beforehand. Output chimeras and non-chimeras to FASTA files
+ and/or 3-way global alignments and/or tabular output.
+
+ Parameters
+ ----------
+
+ fasta_filepath : string
+ input fasta file (dereplicated fasta with pattern
+ [>;]size=integer[;] in the fasta header)
+ working_dir : string
+ directory path for all output files
+ output_chimeras : boolean, optional
+ output chimeric sequences to file, in fasta format
+ output_nonchimeras : boolean, optional
+ output nonchimeric sequences to file, in fasta format
+ output_alns : boolean, optional
+ output 3-way global alignments (parentA, parentB, chimera)
+ in human readable format to file
+ output_tabular : boolean, optional
+ output results using the uchime tab-separated format of
+ 18 fields (see Vsearch user manual)
+ HALT_EXEC : boolean, optional
+ used for debugging app controller
+
+ Returns
+ -------
+
+ output_chimera_filepath : string
+ filepath to chimeric fasta sequences
+ output_non_chimera_filepath : string
+ filepath to nonchimeric fasta sequences
+ output_alns_filepath : string
+ filepath to chimeric sequences alignment
+ file
+ output_tabular_filepath : string
+ filepath to chimeric sequences tabular
+ output file
+ log_filepath : string
+ filepath to log file
+ """
+
+ app = Vsearch(WorkingDir=working_dir, HALT_EXEC=HALT_EXEC)
+
+ if not (output_chimeras or
+ output_nonchimeras or
+ output_alns or
+ output_tabular):
+ raise ValueError("At least one output format (output_chimeras,"
+ "output_nonchimeras, output_alns, output_tabular)"
+ "must be selected")
+
+ output_chimera_filepath = None
+ output_non_chimera_filepath = None
+ output_alns_filepath = None
+ output_tabular_filepath = None
+
+ # set output filepaths
+ if output_chimeras:
+ output_chimera_filepath = join(working_dir, 'uchime_chimeras.fasta')
+ app.Parameters['--chimeras'].on(output_chimera_filepath)
+ if output_nonchimeras:
+ output_non_chimera_filepath = join(working_dir,
+ 'uchime_non_chimeras.fasta')
+ app.Parameters['--nonchimeras'].on(output_non_chimera_filepath)
+ if output_alns:
+ output_alns_filepath = join(working_dir, 'uchime_alignments.txt')
+ app.Parameters['--uchimealns'].on(output_alns_filepath)
+ if output_tabular:
+ output_tabular_filepath = join(working_dir, 'uchime_tabular.txt')
+ app.Parameters['--uchimeout'].on(output_tabular_filepath)
+ log_filepath = join(working_dir, log_name)
+
+ app.Parameters['--uchime_denovo'].on(fasta_filepath)
+ app.Parameters['--log'].on(log_filepath)
+
+ app_result = app()
+
+ return output_chimera_filepath, output_non_chimera_filepath,\
+ output_alns_filepath, output_tabular_filepath, log_filepath
+
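+# Usage sketch (hypothetical paths): screen dereplicated, size-annotated
+# reads for chimeras, keeping only the fasta outputs:
+#
+#   chim_fp, nonchim_fp, alns_fp, tab_fp, log_fp = \
+#       vsearch_chimera_filter_de_novo('derep.fna', '/tmp/out')
+#   # alns_fp and tab_fp are None unless output_alns/output_tabular are True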
+
+def vsearch_chimera_filter_ref(
+ fasta_filepath,
+ working_dir,
+ db_filepath,
+ output_chimeras=True,
+ output_nonchimeras=True,
+ output_alns=False,
+ output_tabular=False,
+ log_name="vsearch_uchime_ref_chimera_filtering.log",
+ threads=1,
+ HALT_EXEC=False):
+ """ Detect chimeras present in the fasta-formatted filename,
+ with an external reference (i.e. database). Output
+ chimeras and non-chimeras to FASTA files and/or 3-way
+ global alignments and/or tabular output.
+
+ Parameters
+ ----------
+
+ fasta_filepath : string
+ input fasta file (dereplicated fasta)
+ working_dir : string
+ directory path for all output files
+ db_filepath : string
+ filepath to reference database
+ output_chimeras : boolean, optional
+ output chimeric sequences to file, in fasta format
+ output_nonchimeras : boolean, optional
+ output nonchimeric sequences to file, in fasta format
+ output_alns : boolean, optional
+ output 3-way global alignments (parentA, parentB, chimera)
+ in human readable format to file
+ output_tabular : boolean, optional
+ output results using the uchime tab-separated format of
+ 18 fields (see Vsearch user manual)
+ threads : integer, optional
+ number of computation threads to use (1 to 256)
+ HALT_EXEC : boolean, optional
+ used for debugging app controller
+
+ Returns
+ -------
+
+ output_chimera_filepath : string
+ filepath to chimeric fasta sequences
+ output_non_chimera_filepath : string
+ filepath to nonchimeric fasta sequences
+ output_alns_filepath : string
+ filepath to chimeric sequences alignment
+ file
+ output_tabular_filepath : string
+ filepath to chimeric sequences tabular
+ output file
+ log_filepath : string
+ filepath to log file
+ """
+
+ app = Vsearch(WorkingDir=working_dir, HALT_EXEC=HALT_EXEC)
+
+ if not (output_chimeras or
+ output_nonchimeras or
+ output_alns or
+ output_tabular):
+ raise ValueError("At least one output format (output_chimeras,"
+ "output_nonchimeras, output_alns, output_tabular)"
+ "must be selected")
+
+ output_chimera_filepath = None
+ output_non_chimera_filepath = None
+ output_alns_filepath = None
+ output_tabular_filepath = None
+
+ # set output filepaths
+ if output_chimeras:
+ output_chimera_filepath = join(working_dir, 'uchime_chimeras.fasta')
+ app.Parameters['--chimeras'].on(output_chimera_filepath)
+ if output_nonchimeras:
+ output_non_chimera_filepath = join(working_dir,
+ 'uchime_non_chimeras.fasta')
+ app.Parameters['--nonchimeras'].on(output_non_chimera_filepath)
+ if output_alns:
+ output_alns_filepath = join(working_dir, 'uchime_alignments.txt')
+ app.Parameters['--uchimealns'].on(output_alns_filepath)
+ if output_tabular:
+ output_tabular_filepath = join(working_dir, 'uchime_tabular.txt')
+ app.Parameters['--uchimeout'].on(output_tabular_filepath)
+ log_filepath = join(working_dir, log_name)
+
+ app.Parameters['--db'].on(db_filepath)
+ app.Parameters['--uchime_ref'].on(fasta_filepath)
+ app.Parameters['--log'].on(log_filepath)
+
+ app_result = app()
+
+ return output_chimera_filepath, output_non_chimera_filepath,\
+ output_alns_filepath, output_tabular_filepath, log_filepath
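+
+# Usage sketch (hypothetical paths): reference-based screening mirrors the
+# de novo variant but additionally requires db_filepath:
+#
+#   results = vsearch_chimera_filter_ref('derep.fna', '/tmp/out',
+#                                        'gold.fasta', output_tabular=True)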
diff --git a/debian/changelog b/debian/changelog
deleted file mode 100644
index e3bc501..0000000
--- a/debian/changelog
+++ /dev/null
@@ -1,35 +0,0 @@
-python-burrito-fillings (0.1.1-1) UNRELEASED; urgency=medium
-
- * Initial upload to Debian (Closes: #800739)
- * Enhance long description
- * cme fix dpkg-control (found need to s/clustal-w/clustalw/ !)
-
- -- Andreas Tille <tille at debian.org> Sat, 03 Oct 2015 07:21:52 +0200
-
-python-burrito-fillings (0.1.1-0biolinux1) trusty; urgency=medium
-
- * Bugfix release to go with burrito 0.9.1
- * Refreshed patches
- * Patched RDP Classifier runner - see notes in the patch
- * Patched out 3 internal tests which now fail and we don't need
- them
-
- -- Tim Booth <tbooth at ceh.ac.uk> Wed, 29 Jul 2015 14:35:48 +0100
-
-python-burrito-fillings (0.1.0-0biolinux4) trusty; urgency=medium
-
- * Limit to 64-bit architectures. Some deps are 64-bit only.
- * Insist on newer rdp-classifier, and modify tests to match
- * Disable BLAST tests as they fail on launchpad and I have no
- way to debug the failure
-
- -- Tim Booth <tbooth at ceh.ac.uk> Thu, 05 Mar 2015 16:25:18 +0000
-
-python-burrito-fillings (0.1.0-0biolinux1) trusty; urgency=medium
-
- * Initial release for QIIME 1.9
- * Remove many tests - see comments in rules
- * Lots of patches
- * Build for Python2, as QIIME is Python2 only.
-
- -- Tim Booth <tbooth at ceh.ac.uk> Thu, 05 Mar 2015 18:10:56 +0000
diff --git a/debian/compat b/debian/compat
deleted file mode 100644
index ec63514..0000000
--- a/debian/compat
+++ /dev/null
@@ -1 +0,0 @@
-9
diff --git a/debian/control b/debian/control
deleted file mode 100644
index 435bfc6..0000000
--- a/debian/control
+++ /dev/null
@@ -1,67 +0,0 @@
-Source: python-burrito-fillings
-Maintainer: Debian Med Packaging Team <debian-med-packaging at lists.alioth.debian.org>
-Uploaders: Tim Booth <tbooth at ceh.ac.uk>,
- Andreas Tille <tille at debian.org>
-Section: python
-Priority: optional
-Build-Depends: debhelper (>= 9),
- python-all (>= 2.7),
- dh-python,
- python-burrito,
- python-skbio,
- python-cogent,
- python-lockfile,
- python-setuptools,
- python-tk,
- blast2,
- bwa,
- clearcut,
- muscle,
- parsinsert,
- raxml,
- rdp-classifier,
- sortmerna,
- sumatra,
- sumaclust,
- swarm,
- vsearch (>= 1.1.3)
-Standards-Version: 3.9.6
-Vcs-Browser: http://anonscm.debian.org/viewvc/debian-med/trunk/packages/python-burrito-fillings/trunk/
-Vcs-Svn: svn://anonscm.debian.org/debian-med/trunk/packages/python-burrito-fillings/trunk/
-Homepage: https://github.com/biocore/burrito-fillings
-
-Package: python-burrito-fillings
-Architecture: amd64 kfreebsd-amd64
-Depends: ${shlibs:Depends},
- ${misc:Depends},
- ${python:Depends}
-Recommends: blast2,
- bwa,
- cd-hit,
- clearcut,
- clustalw,
- ea-utils,
- fasttree,
- infernal,
- mafft,
- mothur,
- muscle,
- parsinsert,
- raxml,
- rdp-classifier,
-# rtax,
- seqprep,
- sortmerna,
- sumatra,
- swarm,
- vsearch
-Description: burrito application controllers for bioinformatics
- The burrito-fillings project provides wrappers for bioinformatics tools
- using the burrito framework.
- .
- burrito-fillings (canonically pronounced boar-ee-toe phil-ings; python
- package name bfillings) contains burrito CommandLineApplication
- subclasses (i.e., application controllers) for bioinformatics
- applications. This is intended to be a temporary package for the
- application controllers that are used in QIIME as we figure out which of
- these we will continue to support.
diff --git a/debian/copyright b/debian/copyright
deleted file mode 100644
index b5507e5..0000000
--- a/debian/copyright
+++ /dev/null
@@ -1,35 +0,0 @@
-Format: http://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
-Upstream-Name: burrito-fillings
-Upstream-Contact: gregcaporaso at gmail.com
-Source: https://github.com/biocore/burrito/
-
-Files: *
-Copyright: © burrito development team <gregcaporaso at gmail.com>
-License:
- Copyright (c) 2014, burrito development team.
- All rights reserved.
- .
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
- .
- * Redistributions of source code must retain the above copyright notice, this
- list of conditions and the following disclaimer.
- .
- * Redistributions in binary form must reproduce the above copyright notice,
- this list of conditions and the following disclaimer in the documentation
- and/or other materials provided with the distribution.
- .
- * Neither the names burrito or biocore nor the names of its
- contributors may be used to endorse or promote products derived from
- this software without specific prior written permission.
- .
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/debian/patches/cd_hit_leaves_no_bak_file b/debian/patches/cd_hit_leaves_no_bak_file
deleted file mode 100644
index c55c563..0000000
--- a/debian/patches/cd_hit_leaves_no_bak_file
+++ /dev/null
@@ -1,32 +0,0 @@
-This is a port of the accept_newer_cdhit patch from python-cogent.
-The code in question seems to be copied from Cogent in the first place.
-
---- a/bfillings/cd_hit.py
-+++ b/bfillings/cd_hit.py
-@@ -269,7 +269,12 @@
- # perform cleanup
- res.cleanUp()
- shutil.rmtree(working_dir)
-- remove(params['-o'] + '.bak.clstr')
-+ try:
-+ remove(params['-o'] + '.bak.clstr')
-+ except:
-+ #No file to clean up from later CD-HIT
-+ pass
-+
-
- return remapped_clusters
-
-@@ -311,7 +316,11 @@
- # perform cleanup
- res.cleanUp()
- shutil.rmtree(working_dir)
-- remove(params['-o'] + '.bak.clstr')
-+ try:
-+ remove(params['-o'] + '.bak.clstr')
-+ except:
-+ #No file to clean up from later CD-HIT
-+ pass
-
- return SequenceCollection(new_seqs, MolType=moltype)
-
diff --git a/debian/patches/handle_renamed_binaries b/debian/patches/handle_renamed_binaries
deleted file mode 100644
index a7ea520..0000000
--- a/debian/patches/handle_renamed_binaries
+++ /dev/null
@@ -1,168 +0,0 @@
-# Some binaries are renamed when packaging, mostly to satisfy the guideline
-# that there should be no capitalization.
-# Also explicit call to /usr/lib/mafft/bin/mafft-profile
-# I was also going to use the rdp_classifier wrapper in Debian but it's
-# too much faff. Instead look for /usr/share/java/rdp_classifier.jar
-
---- a/bfillings/parsinsert.py
-+++ b/bfillings/parsinsert.py
-@@ -29,7 +29,7 @@
- class ParsInsert(CommandLineApplication):
- """ParsInsert application Controller"""
-
-- _command = 'ParsInsert'
-+ _command = 'parsinsert'
- _input_handler = '_input_as_multiline_string'
- _parameters = {
- # read mask from this file
---- a/bfillings/seqprep.py
-+++ b/bfillings/seqprep.py
-@@ -30,7 +30,7 @@
- class SeqPrep(CommandLineApplication):
-
- """SeqPrep application controller for joining paired-end reads"""
-- _command = 'SeqPrep'
-+ _command = 'seqprep'
- _parameters = {
- # Required Arguments
- # -f <first read input fastq filename>
-@@ -232,7 +232,7 @@
- """seqprep help"""
- help_str = """
- For basic help, type the following at the command line:
-- 'SeqPrep -h'
-+ 'seqprep -h'
-
- Website:
- https://github.com/jstjohn/SeqPrep
---- a/bfillings/fasttree.py
-+++ b/bfillings/fasttree.py
-@@ -27,7 +27,7 @@
- class FastTree(CommandLineApplication):
- """FastTree application Controller"""
-
-- _command = 'FastTree'
-+ _command = 'fasttree'
- _input_handler = '_input_as_multiline_string'
- _parameters = {
- '-quiet':FlagParameter('-',Name='quiet'),
---- a/bfillings/mafft.py
-+++ b/bfillings/mafft.py
-@@ -439,7 +439,7 @@
- app = Mafft(InputHandler='_input_as_paths',\
- params=params,
- SuppressStderr=False)
-- app._command = 'mafft-profile'
-+ app._command = '/usr/lib/mafft/bin/mafft-profile'
-
- aln1_path = app._tempfile_as_multiline_string(aln1_int_map.toFasta())
- aln2_path = app._tempfile_as_multiline_string(aln2_int_map.toFasta())
---- a/bfillings/tests/test_parsinsert.py
-+++ b/bfillings/tests/test_parsinsert.py
-@@ -64,7 +64,7 @@
-
- app = ParsInsert()
- self.assertEqual(app.BaseCommand, \
-- ''.join(['cd "',getcwd(),'/"; ','ParsInsert']))
-+ ''.join(['cd "',getcwd(),'/"; ','parsinsert']))
-
- def test_change_working_dir(self):
- """Change working dir"""
-@@ -72,7 +72,7 @@
- app = ParsInsert(WorkingDir='/tmp/ParsInsertTest')
- self.assertEqual(app.BaseCommand, \
- ''.join(['cd "','/tmp/ParsInsertTest',\
-- '/"; ','ParsInsert']))
-+ '/"; ','parsinsert']))
-
- rmtree('/tmp/ParsInsertTest')
-
---- a/bfillings/rdp_classifier.py
-+++ b/bfillings/rdp_classifier.py
-@@ -32,12 +32,10 @@
- """RDP Classifier application controller
-
- The RDP Classifier program is distributed as a java archive (.jar)
-- file. If the file 'rdp_classifier-2.2.jar' is not found in the
-- current directory, the app controller uses the JAR file specified
-+ file. If set, the app controller uses the JAR file specified
- by the environment variable RDP_JAR_PATH. If this variable is not
-- set, and 'rdp_classifier-2.2.jar' is not found in the current
-- directory, the application controller raises an
-- ApplicationNotFoundError.
-+ set, and '/usr/share/java/rdp_classifier.jar' is not found,
-+ the application controller raises an ApplicationNotFoundError.
-
- The RDP Classifier often requires memory in excess of Java's
- default 64M. To correct this situation, the authors recommend
-@@ -51,7 +49,7 @@
- '-training-data'.
- """
- _input_handler = '_input_as_lines'
-- _command = "rdp_classifier-2.2.jar"
-+ _command = "rdp_classifier.jar"
- _options = {
- # output file name for classification assignment
- '-o': ValuedParameter('-', Name='o', Delimiter=' ', IsPath=True),
-@@ -140,7 +138,7 @@
- jar_fp = self._get_jar_fp()
- if jar_fp is None:
- raise ApplicationNotFoundError(
-- "JAR file not found in current directory and the RDP_JAR_PATH "
-+ "JAR file not found in /usr/share/java and the RDP_JAR_PATH "
- "environment variable is not set. Please set RDP_JAR_PATH to "
- "the full pathname of the JAR file.")
- if not os.path.exists(jar_fp):
-@@ -150,19 +148,9 @@
- def _get_jar_fp(self):
- """Returns the full path to the JAR file.
-
-- If the JAR file cannot be found in the current directory and
-- the environment variable RDP_JAR_PATH is not set, returns
-- None.
-+ If the RDP_JAR_PATH is not set, returns /usr/share/java/rdp_classifier.jar
- """
-- # handles case where the jar file is in the current working directory
-- if os.path.exists(self._command):
-- return self._command
-- # handles the case where the user has specified the location via
-- # an environment variable
-- elif 'RDP_JAR_PATH' in environ:
-- return getenv('RDP_JAR_PATH')
-- else:
-- return None
-+ return getenv('RDP_JAR_PATH', '/usr/share/java/rdp_classifier.jar')
-
- # Overridden to pull out JVM-specific command-line arguments.
- def _get_base_command(self):
---- a/bfillings/tests/test_rdp_classifier.py
-+++ b/bfillings/tests/test_rdp_classifier.py
-@@ -27,7 +27,7 @@
- if 'RDP_JAR_PATH' in environ:
- self.user_rdp_jar_path = environ['RDP_JAR_PATH']
- else:
-- self.user_rdp_jar_path = 'rdp_classifier-2.2.jar'
-+ self.user_rdp_jar_path = '/usr/share/java/rdp_classifier.jar'
- self.output_file = tempfile.NamedTemporaryFile()
-
- def test_default_java_vm_parameters(self):
---- a/bfillings/swarm_v127.py
-+++ b/bfillings/swarm_v127.py
-@@ -106,7 +106,7 @@
-
- Return: clusters, a list of lists
- """
-- swarm_breaker_command = ["swarm_breaker.py",
-+ swarm_breaker_command = ["/usr/share/swarm/scripts/swarm_breaker.py",
- "-f",
- seq_path,
- "-s",
-@@ -140,7 +140,7 @@
- clusters.append(seq_ids)
- except OSError:
- raise ApplicationNotFoundError("Cannot find swarm_breaker.py "
-- "in the $PATH directories.")
-+ "in the expected location /usr/share/swarm/scripts.")
-
- return clusters
-
diff --git a/debian/patches/mothur_skip_list_header b/debian/patches/mothur_skip_list_header
deleted file mode 100644
index 3efe1e8..0000000
--- a/debian/patches/mothur_skip_list_header
+++ /dev/null
@@ -1,63 +0,0 @@
-This fixes the main error revealed by the tests, but they still fail as the output is
-not byte identical.
---- a/bfillings/mothur.py
-+++ b/bfillings/mothur.py
-@@ -52,7 +52,10 @@
- tokens = line.strip().split('\t')
-
- distance_str = tokens.pop(0)
-- if distance_str.lstrip().lower().startswith('u'):
-+ if distance_str.lstrip().lower().startswith('l'):
-+ #This is the header line
-+ continue
-+ elif distance_str.lstrip().lower().startswith('u'):
- distance = 0.0
- elif distance_str == '0.0':
- distance = float(precision)
---- a/bfillings/tests/test_mothur.py
-+++ b/bfillings/tests/test_mothur.py
-@@ -121,7 +121,7 @@
- """Mothur.__call__() should return correct otu's for input as single string"""
- app = Mothur()
- result = app(self.small_fasta)
-- observed_otus = result['otu list'].read()
-+ observed_otus = result['otu list'].read().split('\n',1)[1]
- self.assertEquals(observed_otus, self.small_otus)
- result.cleanUp()
-
-@@ -130,7 +130,7 @@
- lines = self.small_fasta.split('\n')
- app = Mothur(InputHandler='_input_as_lines')
- result = app(lines)
-- observed_otus = result['otu list'].read()
-+ observed_otus = result['otu list'].read().split('\n',1)[1]
- self.assertEquals(observed_otus, self.small_otus)
- result.cleanUp()
-
-@@ -142,7 +142,7 @@
- f.write(self.small_fasta)
- app = Mothur(InputHandler='_input_as_path', WorkingDir=working_dir)
- result = app(filename)
-- observed_otus = result['otu list'].read()
-+ observed_otus = result['otu list'].read().split('\n',1)[1]
- self.assertEquals(observed_otus, self.small_otus)
- remove(filename)
- result.cleanUp()
-@@ -153,7 +153,7 @@
- working_dir = mkdtemp()
- app = Mothur(WorkingDir=working_dir)
- result = app(self.small_fasta)
-- observed_otus = result['otu list'].read()
-+ observed_otus = result['otu list'].read().split('\n',1)[1]
- self.assertEquals(observed_otus, self.small_otus)
- result.cleanUp()
- rmdir(working_dir)
-@@ -162,7 +162,7 @@
- """Mothur.__call__() should return correct otu's for input sequences which are reverse complements"""
- app = Mothur()
- result = app(self.complement_fasta)
-- observed_otus = result['otu list'].read()
-+ observed_otus = result['otu list'].read().split('\n',1)[1]
- self.assertEquals(observed_otus, self.complement_otus)
- result.cleanUp()
-
diff --git a/debian/patches/no_set_blastmat b/debian/patches/no_set_blastmat
deleted file mode 100644
index 3a48d58..0000000
--- a/debian/patches/no_set_blastmat
+++ /dev/null
@@ -1,12 +0,0 @@
-BLAST on Debian does not need this variable set, so suppress the error.
---- a/bfillings/blast.py
-+++ b/bfillings/blast.py
-@@ -168,7 +168,7 @@
- access(path.expanduser("~/.ncbirc"), F_OK) or \
- access(".ncbirc", F_OK)):
- ## SHOULD THIS BE CHANGED TO RAISE AN ApplicationError?
-- raise RuntimeError, blastmat_error_message
-+ pass
- self._command = command
-
- super(Blast, self).__init__(params=params,
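
For context, the check being neutered looks for a BLASTMAT environment
variable or an .ncbirc file before legacy BLAST is run; Debian's blast
packages locate their scoring matrices without either, hence the pass. A
rough reconstruction of the surrounding check (the message text is
illustrative):

    from os import environ, path, access, F_OK

    blastmat_error_message = (
        "BLASTMAT is not set and no .ncbirc file was found; legacy BLAST "
        "may be unable to locate its scoring matrices.")  # illustrative

    def check_blast_matrices():
        if not ('BLASTMAT' in environ or
                access(path.expanduser("~/.ncbirc"), F_OK) or
                access(".ncbirc", F_OK)):
            # Upstream raised RuntimeError here (old Python 2 raise syntax);
            # the Debian patch turned the branch into a no-op.
            pass
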
diff --git a/debian/patches/rdp_classifier_2.10 b/debian/patches/rdp_classifier_2.10
deleted file mode 100644
index fc5fe82..0000000
--- a/debian/patches/rdp_classifier_2.10
+++ /dev/null
@@ -1,106 +0,0 @@
-The newer RDP classifier takes the same params but needs parameter names
-to precede them. The whole way Burrito handles this is broken, so
-this is a crude patch-up.
-
-Also, the new RDP Classifier JAR uses the entry point:
-edu.msu.cme.rdp.classifier.cli.ClassifierMain
-and not:
-edu.msu.cme.rdp.classifier.ClassifierCmd
-
-On cursory inspection, the default behaviour of the new entry point
-looks the same as that of the old one, but for Burrito it isn't.
-This change was made in RDP Classifier ages ago but I only just
-fixed the entry point in the DEB and thus triggered the bug. The patch
-calls the entry point explicitly.
-
---- a/bfillings/rdp_classifier.py
-+++ b/bfillings/rdp_classifier.py
-@@ -162,7 +162,7 @@
- jvm_command = "java"
- jvm_arguments = self._commandline_join(
- [self.Parameters[k] for k in self._jvm_parameters])
-- jar_arguments = '-jar "%s"' % self._get_jar_fp()
-+ jar_arguments = '-cp "%s" edu.msu.cme.rdp.classifier.ClassifierCmd' % self._get_jar_fp()
- rdp_arguments = self._commandline_join(
- [self.Parameters[k] for k in self._options])
-
-@@ -197,11 +197,11 @@
- PropertiesFile = 'RdpClassifier.properties'
-
- _parameters = {
-- 'taxonomy_file': ValuedParameter(None, None, IsPath=True),
-- 'model_output_dir': ValuedParameter(None, None, IsPath=True),
-- 'training_set_id': ValuedParameter(None, None, Value='1'),
-- 'taxonomy_version': ValuedParameter(None, None, Value='version1'),
-- 'modification_info': ValuedParameter(None, None, Value='cogent'),
-+ 'taxonomy_file': ValuedParameter('-', Name='t', IsPath=True),
-+ 'model_output_dir': ValuedParameter('-', Name='o', IsPath=True),
-+ 'training_set_id': ValuedParameter('-', Name='n', Value='1'),
-+ 'taxonomy_version': ValuedParameter('-', Name='v', Value='version1'),
-+ 'modification_info': ValuedParameter('-', Name='m', Value='cogent'),
- }
- _jvm_parameters = {
- # Maximum heap size for JVM.
-@@ -253,11 +253,11 @@
- input_handler = getattr(self, self.__InputHandler)
- input_parts = [
- self.Parameters['taxonomy_file'],
-- input_handler(data),
-+ '-s ' + input_handler(data),
- self.Parameters['training_set_id'],
- self.Parameters['taxonomy_version'],
- self.Parameters['modification_info'],
-- self.ModelDir,
-+ '-o ' + self.ModelDir,
- ]
- return self._commandline_join(input_parts)
-
---- a/bfillings/tests/test_rdp_classifier.py
-+++ b/bfillings/tests/test_rdp_classifier.py
-@@ -13,7 +13,7 @@
- from os import getcwd, environ, remove, listdir
- from shutil import rmtree
- import tempfile
--from unittest import TestCase, main
-+from unittest import TestCase, main, expectedFailure
-
- from bfillings.rdp_classifier import (RdpClassifier, RdpTrainer, assign_taxonomy,
- train_rdp_classifier,
-@@ -42,6 +42,7 @@
- parameters.sort()
- self.assertEqual(parameters, ['-Xmx', '-f', '-o', '-t'])
-
-+ @expectedFailure
- def test_assign_jvm_parameters(self):
- """RdpCalssifier should pass alternate parameters to Java VM."""
- app = RdpClassifier()
-@@ -56,6 +57,7 @@
- app = RdpClassifier()
- self.assertEqual(app.BaseCommand, app._get_base_command())
-
-+ @expectedFailure
- def test_base_command(self):
- """RdpClassifier should return expected shell command."""
- app = RdpClassifier()
-@@ -64,6 +66,7 @@
- self.user_rdp_jar_path, '" -q'])
- self.assertEqual(app.BaseCommand, exp)
-
-+ @expectedFailure
- def test_change_working_dir(self):
- """RdpClassifier should run program in expected working directory."""
- test_dir = '/tmp/RdpTest'
-@@ -387,10 +390,10 @@
- rdp_expected_out = {
- 'AY800210 description field': 'Archaea;Euryarchaeota',
- 'EU883771': 'Archaea;Euryarchaeota;Methanomicrobia;Methanomicrobiales;Methanomicrobiaceae;Methanomicrobium',
-- 'EF503699': 'Archaea;Crenarchaeota;Thermoprotei',
-+ 'EF503699': 'Archaea;Thaumarchaeota;Nitrososphaerales;Nitrososphaerales;Nitrososphaeraceae;Nitrososphaera',
- 'random_seq': 'Bacteria',
- 'DQ260310': 'Archaea;Euryarchaeota;Methanobacteria;Methanobacteriales;Methanobacteriaceae;Methanosphaera',
-- 'EF503697': 'Archaea;Crenarchaeota;Thermoprotei',
-+ 'EF503697': 'Archaea;Thaumarchaeota;Nitrososphaerales;Nitrososphaerales;Nitrososphaeraceae;Nitrososphaera',
- 'short_seq': 'Unassignable',
- }
-
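
The central trick is swapping "java -jar", which uses the jar manifest's
Main-Class (ClassifierMain in 2.10), for "java -cp" plus the legacy
ClassifierCmd entry point whose argument handling burrito expects.
Schematically, with an illustrative jar path and heap setting:

    jar_fp = "/usr/share/java/rdp_classifier.jar"  # illustrative path

    # Old invocation: the jar manifest chooses the entry point.
    old_cmd = 'java -Xmx1000m -jar "%s"' % jar_fp

    # Patched invocation: name the legacy entry point explicitly.
    new_cmd = ('java -Xmx1000m -cp "%s" '
               'edu.msu.cme.rdp.classifier.ClassifierCmd') % jar_fp
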
diff --git a/debian/patches/series b/debian/patches/series
deleted file mode 100644
index 80fd631..0000000
--- a/debian/patches/series
+++ /dev/null
@@ -1,7 +0,0 @@
-handle_renamed_binaries
-no_set_blastmat
-cd_hit_leaves_no_bak_file
-mothur_skip_list_header
-test_raxml_accept_new_version
-rdp_classifier_2.10
-test_usearch_known_failures
diff --git a/debian/patches/test_raxml_accept_new_version b/debian/patches/test_raxml_accept_new_version
deleted file mode 100644
index b949cdc..0000000
--- a/debian/patches/test_raxml_accept_new_version
+++ /dev/null
@@ -1,32 +0,0 @@
-Modify the tests to pass with the newer RAxML.
---- a/bfillings/tests/test_raxml_v730.py
-+++ b/bfillings/tests/test_raxml_v730.py
-@@ -38,7 +38,9 @@
- version_string = stdout.strip().split(' ')[4].strip()
- try:
- version = tuple(map(int,version_string.split('.')))
-- pass_test = version == acceptable_version
-+ # This is a stupid thing to do and a stupid place to do it.
-+ # Bypassed check for DEB build.
-+ pass_test = True
- except ValueError:
- pass_test = False
- version_string = stdout
-@@ -199,7 +201,7 @@
- node.Name = align_map[new_node_name]
-
- self.assertTrue(isinstance(tree, PhyloNode))
-- self.assertEqual(tree.getNewick(with_distances=True),RESULT_TREE)
-+ self.assertTrue(re.match(RESULT_TREE, tree.getNewick(with_distances=True)))
- self.assertEqual(len(tree.tips()), 7)
- self.assertRaises(NotImplementedError, build_tree_from_alignment, \
- self.align1, RNA, True)
-@@ -230,7 +232,7 @@
- REF_TREE="""((seq0000004:0.08408,seq0000005:0.13713)0.609:0.00215,seq0000003:0.02032,(seq0000001:0.00014,seq0000002:0.00014)0.766:0.00015);
- """
-
--RESULT_TREE="""(Species003:0.0194919169324,(Species001:4.34281710439e-07,Species002:4.34281710439e-07):4.34281710439e-07,(((Species006:0.0,Species007:0.0):0.0,Species004:0.0438017433031):0.0438017433031,Species005:0.171345128781):0.00331197405878);"""
-+RESULT_TREE=r"""\(Species003:0\.019[0-9]*,\(Species001:4\.34[0-9]*e-07,Species002:4\.34[0-9]*e-07\):4\.34[0-9]*e-07,\(\(\(Species006:0\.0,Species007:0\.0\):0\.0,Species004:0\.043[0-9]*\):0\.043[0-9]*,Species005:0\.171[0-9]*\):0.00331[0-9]*\);$"""
-
- if __name__ == '__main__':
- main()
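
Turning the expected tree into a regex pins the topology and the leading
digits of each branch length while tolerating the low-order digits that
differ between RAxML builds. A toy version of the same comparison, with
shortened, illustrative tree strings:

    import re

    expected = r"\(A:0\.019[0-9]*,B:4\.34[0-9]*e-07\);$"
    observed = "(A:0.0194919169324,B:4.34281710439e-07);"

    assert re.match(expected, observed)  # passes for any trailing digits
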
diff --git a/debian/patches/test_usearch_known_failures b/debian/patches/test_usearch_known_failures
deleted file mode 100644
index 0eb019c..0000000
--- a/debian/patches/test_usearch_known_failures
+++ /dev/null
@@ -1,39 +0,0 @@
-I've replaced uSearch with vSearch. It mostly seems to work as expected.
-The main oddity is the ref_open_ref test, which seems to return a very
-different number of clusters. Possibly a vSearch bug?
---- a/bfillings/tests/test_usearch.py
-+++ b/bfillings/tests/test_usearch.py
-@@ -16,7 +16,7 @@
- from os.path import basename, join, exists
- from shutil import rmtree
- from glob import glob
--from unittest import TestCase, main
-+from unittest import TestCase, main, expectedFailure, skip
- from tempfile import mkstemp, mkdtemp
-
- from skbio.util import remove_files
-@@ -202,6 +202,7 @@
- self.assertEqual(clusters, expected_clusters)
- self.assertEqual(failures, expected_failures)
-
-+ @expectedFailure
- def test_usearch61_ref_open_ref(self):
- """ usearch61 does open reference OTU picking """
-
-@@ -540,6 +541,7 @@
-
- self._files_to_remove.append(uchime_fp)
-
-+ @expectedFailure
- def test_usearch61_ref_chimera_detection(self):
- """ usearch61 ref chimera detection correctly flags chimeras """
-
-@@ -562,7 +564,7 @@
-
- self._files_to_remove.append(uchime_fp)
-
--
-+@skip("no usearch in Debian")
- class UsearchTests(TestCase):
-
- def setUp(self):
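
The decorators used here are stock unittest machinery: @expectedFailure
records a known-bad test without failing the run, and @skip disables a
whole class when the tool is simply unavailable. In miniature:

    from unittest import TestCase, main, expectedFailure, skip

    class VsearchQuirkTests(TestCase):
        @expectedFailure
        def test_open_ref_cluster_count(self):
            # Known usearch/vsearch difference: reported as an expected
            # failure rather than a hard error.
            self.assertEqual(1, 2)

    @skip("no usearch in Debian")
    class UsearchTests(TestCase):
        def test_never_runs(self):
            pass

    if __name__ == '__main__':
        main()
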
diff --git a/debian/rules b/debian/rules
deleted file mode 100755
index bc5dcfc..0000000
--- a/debian/rules
+++ /dev/null
@@ -1,47 +0,0 @@
-#!/usr/bin/make -f
-# -*- makefile -*-
-
-# Uncomment this to turn on verbose mode.
-#export DH_VERBOSE=1
-
-PKG := $(shell dpkg-parsechangelog | sed -n 's/^Source: //p')
-
-# At the moment only build for Py2 because QIIME only works with Py2
-
-%:
- dh $@ --with python2 --buildsystem=pybuild
-
-
-override_dh_clean:
- dh_clean
- rm -f *.log
- #rm -rf *.egg-info/*
-
-override_dh_auto_test:
- # Eliminate expected failures. We have no blat
- rm -f .pybuild/*/build/bfillings/tests/test_blat.*
- # BLAST/formatdb works fine in pbuilder, but on launchpad.net the later
- # formatdb tests fail if the blast tests run first. No idea why. Cleanup? Disk space?
- #rm -f .pybuild/*/build/bfillings/tests/test_blast*
- # cd_hit no longer returns byte-identical results to the ancient 3.1.1
- rm -f .pybuild/*/build/bfillings/tests/test_cd_hit.*
- # clustalw is broken but not normally used by QIIME in any case
- rm -f .pybuild/*/build/bfillings/tests/test_clustalw.*
- # fasttree tests are borked and test nothing useful anyway
- rm -f .pybuild/*/build/bfillings/tests/test_fasttree*
- # Infernal tests are for old 1.0, we have 1.1
- rm -f .pybuild/*/build/bfillings/tests/test_infernal*
- # Mafft tests are for some ancient version
- rm -f .pybuild/*/build/bfillings/tests/test_mafft*
- # Mothur produces equivalent but not byte-identical output following
- # my little patch.
- rm -f .pybuild/*/build/bfillings/tests/test_mothur*
- # pplacer is a big job to package! For now users can grab the binary
- # if they want it.
- rm -f .pybuild/*/build/bfillings/tests/test_pplacer*
- # rtax is awful and seems totally wedded to uSearch - as in, it relies on specific
- # I/O buffering behaviour, not just the parameters and output formats.
- rm -f .pybuild/*/build/bfillings/tests/test_rtax*
- # We don't have uClust, though it is in the bio-linux-qiime package
- rm -f .pybuild/*/build/bfillings/tests/test_uclust*
- dh_auto_test
diff --git a/debian/source/format b/debian/source/format
deleted file mode 100644
index 163aaf8..0000000
--- a/debian/source/format
+++ /dev/null
@@ -1 +0,0 @@
-3.0 (quilt)
diff --git a/debian/watch b/debian/watch
deleted file mode 100644
index 665b356..0000000
--- a/debian/watch
+++ /dev/null
@@ -1,3 +0,0 @@
-version=4
-
-https://github.com/biocore/burrito-fillings/releases .*/archive/@ANY_VERSION@@ARCHIVE_EXT@
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..1de6f57
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,46 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2013--, biocore development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+
+__version__ = '0.1.1'
+
+from setuptools import find_packages, setup
+from distutils.command.build_py import build_py
+
+classes = """
+ Development Status :: 1 - Planning
+ License :: OSI Approved :: BSD License
+ Topic :: Software Development :: Libraries
+ Topic :: Scientific/Engineering
+ Topic :: Scientific/Engineering :: Bio-Informatics
+ Programming Language :: Python
+ Programming Language :: Python :: 2.7
+ Operating System :: Unix
+ Operating System :: POSIX
+ Operating System :: MacOS :: MacOS X
+"""
+classifiers = [s.strip() for s in classes.split('\n') if s]
+
+long_description = """The burrito-fillings project"""
+
+setup(name='burrito-fillings',
+ cmdclass={'build_py': build_py},
+ version=__version__,
+ license='BSD',
+ description=\
+ 'burrito-fillings: burrito application controllers for bioinformatics',
+ long_description=long_description,
+ author="biocore",
+ author_email="gregcaporaso@gmail.com",
+ maintainer="biocore",
+ maintainer_email="gregcaporaso@gmail.com",
+ url='https://github.com/biocore/burrito-fillings',
+ packages=find_packages(),
+ install_requires=['scikit-bio >= 0.2.1, < 0.3.0', 'burrito < 1.0.0'],
+ classifiers=classifiers)
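
The install_requires pins above are enforced by pip/setuptools at install
time; they can also be checked at runtime with pkg_resources (shipped with
setuptools), for example:

    import pkg_resources

    for req in ['scikit-bio >= 0.2.1, < 0.3.0', 'burrito < 1.0.0']:
        try:
            pkg_resources.require(req)
            print('%s: satisfied' % req)
        except pkg_resources.ResolutionError as err:
            print('%s: NOT satisfied (%s)' % (req, err))
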
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/python-burrito-fillings.git