[med-svn] [vsearch] 01/10: New upstream version 2.3.0

Andreas Tille tille at debian.org
Mon Dec 5 11:49:38 UTC 2016


This is an automated email from the git hooks/post-receive script.

tille pushed a commit to branch master
in repository vsearch.

commit cc668cc8e864e24ba9129966aed1ffd39ab45183
Author: Andreas Tille <tille at debian.org>
Date:   Sat Oct 15 15:56:23 2016 +0200

    New upstream version 2.3.0
---
 configure.ac                   |    2 +-
 man/vsearch.1                  | 1136 +++++++++++++++++++++++++++-------------
 src/Makefile.am                |    2 +
 src/align_simd.cc              |   30 +-
 src/allpairs.cc                |    6 +-
 src/chimera.cc                 |    4 +-
 src/cluster.cc                 |  129 ++++-
 src/derep.cc                   |   26 +-
 src/fasta.cc                   |    2 +
 src/fastq.cc                   |    4 +
 src/fastqops.cc                |  113 ++--
 src/fastqops.h                 |    3 +-
 src/linmemalign.cc             |   17 +-
 src/linmemalign.h              |    2 +
 src/mergepairs.cc              |    4 +-
 src/otutable.cc                |  405 ++++++++++++++
 src/{fastqops.h => otutable.h} |   13 +-
 src/results.cc                 |   13 +-
 src/results.h                  |    3 +-
 src/search.cc                  |   67 ++-
 src/searchcore.cc              |   66 ++-
 src/searchcore.h               |    4 +-
 src/searchexact.cc             |   73 ++-
 src/vsearch.cc                 |  346 +++++++-----
 src/vsearch.h                  |    8 +
 test/unclassified.sh           |  314 +++++++++++
 26 files changed, 2153 insertions(+), 639 deletions(-)

diff --git a/configure.ac b/configure.ac
index a55a50f..ff32971 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2,7 +2,7 @@
 # Process this file with autoconf to produce a configure script.
 
 AC_PREREQ([2.63])
-AC_INIT([vsearch], [2.0.5], [torognes at ifi.uio.no])
+AC_INIT([vsearch], [2.3.0], [torognes at ifi.uio.no])
 AM_INIT_AUTOMAKE([subdir-objects])
 AC_LANG([C++])
 AC_CONFIG_SRCDIR([src/vsearch.cc])
diff --git a/man/vsearch.1 b/man/vsearch.1
index 3c9260f..41cdac6 100644
--- a/man/vsearch.1
+++ b/man/vsearch.1
@@ -1,11 +1,11 @@
 .\" ============================================================================
-.TH vsearch 1 "September 1, 2016" "version 2.0.4" "USER COMMANDS"
+.TH vsearch 1 "October 10, 2016" "version 2.3.0" "USER COMMANDS"
 .\" ============================================================================
 .SH NAME
 vsearch \(em chimera detection, clustering, dereplication and
 rereplication, FASTA/FASTQ file processing, masking, pairwise
 alignment, searching, shuffling, sorting and subsampling of amplicons
-from metagenomic projects.
+for metagenomics, genomics, and population genetics.
 .\" ============================================================================
 .SH SYNOPSIS
 .\" left justified, ragged right
@@ -24,10 +24,10 @@ Chimera detection:
 Clustering:
 .RS
 \fBvsearch\fR (\-\-cluster_fast | \-\-cluster_size |
-\-\-cluster_smallmem) \fIfastafile\fR (\-\-alnout | \-\-blast6out |
-\-\-centroids | \-\-clusters | \-\-msaout | \-\-profile | 
-\-\-samout | \-\-uc | \-\-userout) \fIoutputfile\fR
-\-\-id \fIreal\fR [\fIoptions\fR]
+\-\-cluster_smallmem) \fIfastafile\fR (\-\-alnout | \-\-biomout |
+\-\-blast6out | \-\-centroids | \-\-clusters | \-\-mothur_shared_out |
+\-\-msaout | \-\-otutabout | \-\-profile | \-\-samout | \-\-uc |
+\-\-userout) \fIoutputfile\fR \-\-id \fIreal\fR [\fIoptions\fR]
 .PP
 .RE
 Dereplication and rereplication:
@@ -85,12 +85,14 @@ Pairwise alignment:
 .RE
 Searching:
 .RS
-\fBvsearch\fR \-\-search_exact \fIfastafile\fR \-\-db
-\fIfastafile\fR (\-\-alnout | \-\-blast6out | \-\-samout | \-\-uc |
-\-\-userout) \fIoutputfile\fR [\fIoptions\fR]
+\fBvsearch\fR \-\-search_exact \fIfastafile\fR \-\-db \fIfastafile\fR
+(\-\-alnout | \-\-biomout | \-\-blast6out | \-\-mothur_shared_out |
+\-\-otutabout | \-\-samout | \-\-uc | \-\-userout) \fIoutputfile\fR
+[\fIoptions\fR]
 .PP
 \fBvsearch\fR \-\-usearch_global \fIfastafile\fR \-\-db
-\fIfastafile\fR (\-\-alnout | \-\-blast6out | \-\-samout | \-\-uc |
+\fIfastafile\fR (\-\-alnout | \-\-biomout | \-\-blast6out |
+\-\-mothur_shared_out | \-\-otutabout | \-\-samout | \-\-uc |
 \-\-userout) \fIoutputfile\fR \-\-id \fIreal\fR [\fIoptions\fR]
 .PP
 .RE
@@ -102,10 +104,9 @@ Shuffling and sorting:
 .RE
 Subsampling:
 .RS
-\fBvsearch\fR \-\-fastx_subsample \fIfastafile\fR
-\-\-fastaout \fIoutputfile\fR
-(\-\-sample_pct \fIreal\fR | \-\-sample_size \fIpositive integer\fR)
-[\fIoptions\fR]
+\fBvsearch\fR \-\-fastx_subsample \fIfastafile\fR (\-\-fastaout |
+\-\-fastqout) \fIoutputfile\fR (\-\-sample_pct \fIreal\fR |
+\-\-sample_size \fIpositive integer\fR) [\fIoptions\fR]
 .PP
 .RE
 .\" left and right justified (default)
@@ -113,75 +114,94 @@ Subsampling:
 .\" ============================================================================
 .SH DESCRIPTION
 Environmental or clinical molecular diversity studies generate large
-volumes of amplicons (e.g. SSU-rRNA sequences) that need to be checked
-for chimeras, dereplicated, masked, sorted, searched, clustered or
-compared to reference sequences. The aim of \fBvsearch\fR is to offer
-a all-in-one open source tool to perform these tasks, using optimized
-algorithm implementations and harvesting the full potential of modern
-computers, thus providing fast and accurate data processing.
+volumes of amplicons (e.g. SSU-rRNA sequences) that need to be
+checked for chimeras, dereplicated, masked, sorted, searched,
+clustered or compared to reference sequences. The aim of \fBvsearch\fR
+is to offer an all-in-one open source tool to perform these tasks,
+using optimized algorithm implementations and harvesting the full
+potential of modern computers, thus providing fast and accurate data
+processing.
 .PP
 Comparing nucleotide sequences is at the core of \fBvsearch\fR. To
 speed up comparisons, \fBvsearch\fR implements an extremely fast
 Needleman-Wunsch algorithm, making use of the Streaming SIMD
-Extensions (SSE2) of modern x86-64 CPUs. If SSE2 instructions are not
-available, \fBvsearch\fR exits with an error message. Memory usage
+Extensions (SSE2) of post-2003 x86-64 CPUs. If SSE2 instructions are
+not available, \fBvsearch\fR exits with an error message. Memory usage
 increases rapidly with sequence length: for example comparing two
 sequences of length 1 kb requires 8 MB of memory per thread, and
 comparing two 10 kb sequences requires 800 MB of memory per
 thread. For comparisons involving sequences with a length product
-greater than 25,000,000, \fBvsearch\fR uses a slower alignment method
-described by Hirschberg (1975) and Myers and Miller (1988), with
-smaller memory requirements.
+greater than 25 million (for example two sequences of length 5 kb),
+\fBvsearch\fR uses a slower alignment method described by Hirschberg
+(1975) and Myers and Miller (1988), with much smaller memory
+requirements.
 .\" ----------------------------------------------------------------------------
 .SS Input
-\fBvsearch\fR input is a fasta (or fastq) file containing one or
-several nucleotide sequences. For each sequence, the sequence
-identifier is defined as the string comprised between the ">" (or "@")
-symbol and the first space, tab or the end of the line, whichever
-comes first. Additionally, if the fasta header line matches
-">[;]size=\fIinteger\fR;label", ">label;size=\fIinteger\fR;label" or
-">label;size=\fIinteger\fR[;]", \fBvsearch\fR will remove the pattern
-[;]size=\fIinteger\fR[;] from the header and interpret \fIinteger\fR
-as the number of occurrences (or abundance) of the sequence in the
-study. That abundance information is used or created during chimera
-detection, clustering, dereplication, sorting and searching.
-.PP
-The nucleotide sequence is defined as a string of IUPAC symbols
+\fBvsearch\fR accepts as input fasta or fastq files containing one or
+several nucleotidic entries. In fasta files, each nucleotidic entry is
+made of a header and a sequence. The header is defined as the string
+comprised between the '>' symbol and the first space, tab or the end
+of the line, whichever comes first. Additionally, if the header
+matches
+'>[;]size=\fIinteger\fR;label', '>label;size=\fIinteger\fR;label' or
+'>label;size=\fIinteger\fR[;]', \fBvsearch\fR interprets
+\fIinteger\fR as the number of occurrences (or abundance) of the
+sequence in the study. That abundance information is used or created
+during chimera detection, clustering, dereplication, sorting and
+searching.
+.PP
+The sequence is defined as a string of IUPAC symbols
 (ACGTURYSWKMDBHVN), starting after the end of the identifier line and
 ending before the next identifier line, or the file end. \fBvsearch\fR
 silently ignores ascii characters 9 to 13, and exits with an error
-message if ascii characters 0 to 8, 14 to 31, "." or "-" are
+message if ascii characters 0 to 8, 14 to 31, '.' or '-' are
 present. All other ascii or non-ascii characters are stripped and
 complained about in a non-blocking warning message.
 .PP
+In fastq files, each entry is made of a sequence header starting with a
+symbol '@', a nucleotidic sequence (same rules as for fasta
+sequences), a quality header starting with a symbol '+' and a string
+of ASCII characters (offset 33 or 64), each one encoding the quality
+value of the corresponding position in the nucleotidic sequence.
+.PP
 \fBvsearch\fR operations are case insensitive, except when soft
-masking is activated. Masking is performed by the commands for chimera
+masking is activated. Masking is automatically applied during chimera
 detection, clustering, masking, pairwise alignment and searching. Soft
-masking is specified with the options "\-\-dbmask soft" (for searching
-and chimera detection with a reference) or "\-\-qmask soft" (for
-searching, de novo chimera detection, clustering and masking). When
-using soft masking, lower case letters indicate masked symbols, while
-upper case letters indicate regular symbols. Masked symbols are never
-included in the unique index words used for searching, otherwise they
-are treated as normal symbols.
+masking is specified with the options '\-\-dbmask soft' (for searching
+and chimera detection with a reference) or '\-\-qmask soft' (for
+searching, \fIde novo\fR chimera detection, clustering and
+masking). When using soft masking, lower case letters indicate masked
+symbols, while upper case letters indicate regular symbols. Masked
+symbols are never included in the unique index words used for sequence
+comparisons, otherwise they are treated as normal symbols.
 .PP
 When comparing sequences during chimera detection, dereplication,
 searching and clustering, T and U are considered identical, regardless
-of their case. If two symbols are not identical, their alignment will
-result in the negative mismatch score (default -4), except if one or
+of their case. If two symbols are not identical, their alignment
+results in a negative mismatch score (default -4), except if one or
 both of the symbols are ambiguous (RYSWKMDBHVN) in which case the
-score is zero. Alignment of two identical ambiguous symbols (e.g. R vs
-R) also receives a score of zero.
-.PP
-Input files compressed with gzip or bzip2 are automatically
-detected and decompressed if vsearch was compiled with the appropriate
-libraries. Input from pipes is supported, but then compressed input
-must be indicated using the \-\-gzip_decompress or
-\-\-bzip2_decompress options. For input files the name '\-'
-represents standard input (/dev/stdin). Multiple FASTA or FASTQ files
-may be piped into vsearch for dereplication or other operations.  When
-reading from a pipe, the progress indicator is not updated.  For
-output files the name '\-' represents standard output (/dev/stdout).
+score is zero. Alignment of two identical ambiguous symbols (for
+example, R vs R) also receives a score of zero.
+.PP
+\fBvsearch\fR can read data from standard files and write to standard
+files, but it can also read from pipes and write to pipes! For
+example, multiple fasta files can be piped into vsearch for
+dereplication. To do so, file names can be replaced with:
+.RS
+.IP - 2
+the symbol '-', representing '/dev/stdin' for input files
+or '/dev/stdout' for output files,
+.IP -
+a named pipe created with the command mkfifo,
+.IP -
+a process substitution '<(command)' as input or '>(command)' as output.
+.RE
+\fBvsearch\fR can automatically read compressed gzip or bzip2 files if
+the appropriate libraries are present during the
+compilation. \fBvsearch\fR can also read pipes streaming compressed
+gzip or bzip2 data if the options \-\-gzip_decompress or
+\-\-bzip2_decompress are selected. When reading from a pipe, the
+progress indicator is not updated.
 .\" ----------------------------------------------------------------------------
 .SS Options
 \fBvsearch\fR recognizes a large number of command-line options. For
@@ -197,17 +217,19 @@ General options:
 .RS
 .TP 9
 .B \-\-bzip2_decompress
-Decompress input using bzip2. The option is required only when reading
-from a pipe, otherwise compression is automatically detected.
+When reading from a pipe streaming bzip2-compressed data, decompress
+the data. That option is not needed when reading from a standard
+bzip2-compressed file.
 .TP
 .BI \-\-fasta_width\~ "positive integer"
 Fasta files produced by \fBvsearch\fR are wrapped (sequences are
 written on lines of \fIinteger\fR nucleotides, 80 by default). Set
-that value to 0 to eliminate the wrapping.
+that value to zero to eliminate the wrapping.
 .TP
 .B \-\-gzip_decompress
-Decompress input using gzip. The option is required only when reading
-from a pipe, otherwise compression is automatically detected.
+When reading from a pipe streaming gzip-compressed data, decompress
+the data. That option is not needed when reading from a standard
+gzip-compressed file.
 .TP
 .B \-\-help | \-h
 Display help text and exit.
@@ -218,34 +240,35 @@ program version, amount of memory available, number of cores and
 command line options, and if need be, informational messages, warnings
 and fatal errors. The start and finish times are also recorded as well
 as the elapsed time and the maximum amount of memory consumed. The
-different commands will usually also write some information about
-their results.
+different \fBvsearch\fR commands can also write additional
+information to the log file.
 .TP
 .BI \-\-maxseqlength\~ "positive integer"
-All \fBvsearch\fR operations will discard sequences of length equal or
+All \fBvsearch\fR operations discard sequences of length equal or
 greater than \fIinteger\fR (50,000 nucleotides by default).
 .TP
 .BI \-\-minseqlength\~ "positive integer"
-All \fBvsearch\fR operations will discard sequences of length smaller
-than \fIinteger\fR (1 nucleotide by default for sorting or shuffling,
-32 nucleotides for clustering, dereplication or searching).
+All \fBvsearch\fR operations discard sequences of length smaller than
+\fIinteger\fR: 1 nucleotide by default for sorting or shuffling, 32
+nucleotides for clustering, dereplication or searching.
 .TP
 .B \-\-notrunclabels
 Do not truncate sequence labels at first space or tab, use the full
 header in output files.
 .TP
 .B \-\-quiet
-Suppress all output to stdout and stderr except for warnings and fatal
-error messages.
+Suppress all messages to stdout and stderr except for warnings and
+fatal error messages.
 .TP
 .BI \-\-threads\~ "positive integer"
 Number of computation threads to use (1 to 256). The number of threads
 should be lesser or equal to the number of available CPU cores. The
-default is to use all available resources and to launch one thread
-per logical core. The following commands are multi-threaded:
-uchime_ref, cluster_fast, cluster_size, cluster_smallmem, maskfasta,
-allpairs_global, usearch_global.
-Only one thread is used for the other commands.
+default is to use all available resources and to launch one thread per
+logical core. The following commands are multi-threaded: uchime_ref,
+cluster_fast, cluster_size, cluster_smallmem, fastq_mergepairs,
+maskfasta, allpairs_global, usearch_global. Only one thread is used
+for the other commands.
+.\" fastq_mergepairs is not multithreaded in v2.1.0
 .TP
 .B \-\-version | \-v
 Output version information and exit.
@@ -257,8 +280,8 @@ Chimera detection options:
 .RS
 Chimera detection is based on a scoring function controlled by five
 options (\-\-dn, \-\-mindiffs, \-\-mindiv, \-\-minh,
-\-\-xn). Sequences are first sorted by decreasing abundance (if
-available), and compared on their \fIplus\fR strand only (case
+\-\-xn). Sequences are first sorted by decreasing abundance, if
+available, and compared on their \fIplus\fR strand only (case
 insensitive).
 .PP
 Input sequences are masked as specified with the \-\-qmask and
@@ -266,30 +289,30 @@ Input sequences are masked as specified with the \-\-qmask and
 chimera detection is specified with the \-\-dbmask option.
 .PP
 In \fIde novo\fR mode, input fasta file should present abundance
-annotations (pattern [;]size=\fIinteger\fR[;] in the fasta
-header). The input order influences the chimera detection, so we
-recommend to sort sequences by decreasing abundance (default of
+annotations (i.e. a pattern [;]size=\fIinteger\fR[;] in the fasta
+header). Input order matters for chimera detection, so we recommend to
+sort sequences by decreasing abundance (default of
 \-\-derep_fulllength command). If your sequence set needs to be
 sorted, please see the \-\-sortbysize command in the sorting section.
 .PP
 .TP 9
 .BI \-\-abskew \0real
 When using \-\-uchime_denovo, the abundance skew is used to
-distinguish in a 3-way alignment which sequence is the chimera and
+distinguish in a three-way alignment which sequence is the chimera and
 which are the parents. The assumption is that chimeras appear later in
 the PCR amplification process and are therefore less abundant than
 their parents. The default value is 2.0, which means that the parents
 should be at least 2 times more abundant than their chimera. Any
-positive value greater than 1.0 can be used.
+positive value equal or greater than 1.0 can be used.
 .TP
 .BI \-\-alignwidth\~ "positive integer"
-When using \-\-uchimealns, set the width of the 3-way alignments. The
-default value is 80 characters. Set to 0 to eliminate wrapping.
+When using \-\-uchimealns, set the width of the three-way alignments
+(80 nucleotides by default). Set to zero to eliminate wrapping.
 .TP
 .BI \-\-borderline \0filename
 Output borderline chimeric sequences to \fIfilename\fR, in fasta
 format.  Borderline chimeric sequences are sequences that have a high
-enough score but which are not sufficiently diverged from their
+enough score but which are not sufficiently different from their
 closest parent.
 .TP
 .BI \-\-chimeras \0filename
@@ -299,18 +322,18 @@ order may vary when using multiple threads.
 .BI \-\-db \0filename
 When using \-\-uchime_ref, detect chimeras using the fasta-formatted
 reference sequences contained in \fIfilename\fR. Reference sequences
-are assumed to be chimera-free. Chimeras will not be detected if their
-parents (or sufficiently close relatives) are not present in the
+are assumed to be chimera-free. Chimeras cannot be detected if their
+parents, or sufficiently close relatives, are not present in the
 database.
 .TP
 .BI \-\-dn \0real
-No vote pseudo-count (parameter \fIn\fR in the chimera scoring
-function) (default value is 1.4).
+No vote pseudo-count, corresponding to the parameter \fIn\fR in the
+chimera scoring function (default value is 1.4).
 .TP
 .B \-\-fasta_score
 Add the chimera score to the headers in the fasta output files for
-chimeras, non-chimeras and borderline sequences. A string similar to
-";uchime_denovo=0.1234;" or ";uchime_ref=5.6789;" will be added.
+chimeras, non-chimeras and borderline sequences, using the format
+';uchime_denovo=\fIfloat\fR;'.
 .TP
 .BI \-\-mindiffs\~ "positive integer"
 Minimum number of differences per segment (default value is 3).
@@ -319,9 +342,9 @@ Minimum number of differences per segment (default value is 3).
 Minimum divergence from closest parent (default value is 0.8).
 .TP
 .BI \-\-minh \0real
-Minimum score (h). Increasing this value tends to reduce the number of
-false positives and to decrease sensitivity. Default value is 0.28,
-and values ranging from 0.0 to 1.0 included are accepted.
+Minimum score (\fIh\fR). Increasing this value tends to reduce the
+number of false positives and to decrease sensitivity. Default value
+is 0.28, and values ranging from 0.0 to 1.0 included are accepted.
 .TP
 .BI \-\-nonchimeras \0filename
 Output non-chimeric sequences to \fIfilename\fR, in fasta
@@ -338,37 +361,41 @@ When relabelling, keep the old identifier in the header after a space.
 .B \-\-relabel_md5
 Relabel sequences using the MD5 message digest algorithm applied to
 each sequence. Former sequence headers are discarded. The sequence is
-converted to upper case and U is replaced by T before the digest is
-computed.  The MD5 digest is a cryptographic hash function designed to
-minimize the probability that two different inputs will give the same
-output, even for very similar, but non-identical inputs. Still, there
-will always be a very small, but non-zero, probability that two
-different inputs will give the same result. The MD5 digest generates a
-128-bit (16-byte) digest that is represented by 16 hexadecimal numbers
-(using 32 symbols among 0123456789abcdef). Use \-\-sizeout to conserve
-the abundance annotations.
+converted to upper case and each 'U' is replaced by a 'T' before
+computation of the digest. The MD5 digest is a cryptographic hash
+function designed to minimize the probability that two different
+inputs give the same output, even for very similar, but non-identical
+inputs. Still, there is a very small, but non-zero, probability that
+two different inputs give the same digest (i.e. a collision). MD5
+generates a 128-bit (16-byte) digest that is represented by 16
+hexadecimal numbers (using 32 symbols among 0123456789abcdef). Use
+\-\-sizeout to conserve the abundance annotations.
+.\" The probability of collision for two sequences is 1/2^128
 .TP
 .B \-\-relabel_sha1
 Relabel sequences using the SHA1 message digest algorithm applied to
 each sequence. It is similar to the \-\-relabel_md5 option but uses
-the SHA1 algorithm instead of the MD5 algorithm. The SHA1 digest
-generates a 160-bit (20-byte) result that is represented by 20
-hexadecimal numbers (40 symbols). The probability of a collision (two
-non-identical sequences having the same digest) is smaller for the
-SHA1 algorithm than it is for the MD5 algorithm.
+the SHA1 algorithm instead of the MD5 algorithm. SHA1 generates a
+160-bit (20-byte) digest that is represented by 20 hexadecimal numbers
+(40 symbols). The probability of a collision (two non-identical
+sequences resulting in the same digest) is smaller for the SHA1
+algorithm than it is for the MD5 algorithm.
+.\" The probability of collision for two sequences is 1/2^160
 .TP
 .B \-\-self
 When using \-\-uchime_ref, ignore a reference sequence when its label
 matches the label of the query sequence (useful to estimate
 false-positive rate in reference sequences).
+.\" I am not sure the statement above is true.
 .TP
 .B \-\-selfid
 When using \-\-uchime_ref, ignore a reference sequence when its
-nucleotide sequence is strictly identical with the query sequence.
+nucleotide sequence is strictly identical to the nucleotidic sequence
+of the query.
 .TP
 .B \-\-sizeout
-When relabelling, add abundance annotations to the output fasta file
-(with the pattern ";size=\fIinteger\fR;" to sequence headers).
+When relabelling, add abundance annotations to fasta headers (using
+the format ';size=\fIinteger\fR;').
 .TP
 .BI \-\-uchime_denovo \0filename
 Detect chimeras present in the fasta-formatted \fIfilename\fR, without
@@ -382,17 +409,17 @@ comparing them with reference sequences (option
 \-\-db). Multithreading is supported.
 .TP
 .BI \-\-uchimealns \0filename
-Write the 3-way global alignments (parentA, parentB, chimera) to
+Write the three-way global alignments (parentA, parentB, chimera) to
 \fIfilename\fR using a human-readable format. Use \-\-alignwidth to
 modify alignment length. Output order may vary when using multiple
 threads. All sequences are converted to upper case before
 alignment. Lower case letters indicate disagreement in the alignment.
 .TP
 .BI \-\-uchimeout \0filename
-Write chimera detection results to \fIfilename\fR using the uchime
-tab-separated format of 18 fields (see the list below). Use
-\-\-uchimeout5 to use a format compatible with usearch v5 and earlier
-versions. Rows output order may vary when using multiple threads.
+Write chimera detection results to \fIfilename\fR using an 18-field,
+tab\-separated uchime\-like format. Use \-\-uchimeout5 to use a format
+compatible with usearch v5 and earlier versions. Rows output order may
+vary when using multiple threads.
 .RS
 .RS
 .nr step 1 1
@@ -439,13 +466,13 @@ YN: query is chimeric (Y), or not (N), or is a borderline case (?).
 .TP
 .B \-\-uchimeout5
 When using \-\-uchimeout, write chimera detection results using a
-tab-separated format of 17 fields (drop the 5th field of
+17\-field, tab\-separated uchime\-like format (drop the 5th field of
 \-\-uchimeout), compatible with usearch version 5 and earlier
 versions.
 .TP
 .BI \-\-xn \0real
-No vote weight (parameter beta in the scoring function) (default value
-is 8.0).
+No vote weight, corresponding to the parameter \fIbeta\fR in the
+scoring function (default value is 8.0).
 .TP
 .B \-\-xsize
 Strip abundance information from the headers when writing the output
@@ -465,6 +492,44 @@ clustering threshold (\-\-id) and the pairwise identity definition
 Input sequences are masked as specified with the \-\-qmask and
 \-\-hardmask options.
 .TP 9
+.BI \-\-biomout \0filename
+Generate an OTU table in the biom version 1.0 JSON file format as
+specified at
+http://biom-format.org/documentation/format_versions/biom-1.0.html.
+The format describes how to store a sparse matrix containing the
+abundances of the OTUs in the different samples. This format is much
+more efficient than the classic and mothur OTU table formats available
+with the \-\-otutabout and \-\-mothur_shared_out options,
+respectively, and is recommended at least for large tables. The OTUs
+are represented by the cluster centroids. Taxonomy information will be
+included for the OTUs if available. Sample identifiers will be
+extracted from the headers of all sequences in the input file. If the
+header contains ";sample=abc123;" or ";barcodelabel=abc123;" or a
+similar string somewhere, then the given sample identifier (here
+"abc123") will be used. The semicolon is not mandatory at the
+beginning or end of the header. The sample identifier may contain any
+printable character except semicolons. If no such sample label is
+found, the identifier in the initial part of the header will be used,
+but only letters, digits and underscores are allowed. OTU identifiers
+will be extracted from the headers of the cluster centroid
+sequences. If the header contains ";otu=def789;" or a similar string
+somewhere, then the given OTU identifier (here "def789") will be
+used. The semicolon is not mandatory at the beginning or end of the
+header. The OTU identifier may contain any printable character except
+semicolons. If no such OTU label is found, the identifier in the
+initial part of the header will be used, and all characters except
+semicolons are allowed. Alternatively, OTU identifiers can be generated
+using the relabelling options (\-\-relabel, \-\-relabel_sha1 or
+\-\-relabel_md5). Taxonomy information, if present, will also be
+extracted from the headers of the centroid sequences. If the header
+contains ";tax=Homo_sapiens;" or a similar string somewhere, then the
+given taxonomy information (here "Homo_sapiens") will be used. The
+semicolon is not mandatory at the beginning or end of the header. The
+taxonomy information may contain any printable character except
+semicolons. If an OTU table in the biom version 2.1 HDF5 file format
+is required, the biom utility may be used as described at
+http://biom-format.org/documentation/biom_conversion.html.
+.TP
 .BI \-\-centroids \0filename
 Output cluster centroid sequences to \fIfilename\fR, in fasta
 format. The centroid is the sequence that seeded the cluster (i.e. the
@@ -479,12 +544,12 @@ Sort output files by decreasing abundance
 when using the \-\-consout, \-\-msaout and \-\-profile options.
 .TP
 .BI \-\-cluster_fast \0filename
-Clusterize the fasta sequences in \fIfilename\fR, automatically
-perform a sorting by decreasing sequence length beforehand.
+Clusterize the fasta sequences in \fIfilename\fR, automatically sort
+by decreasing sequence length beforehand.
 .TP
 .BI \-\-cluster_size \0filename
-Clusterize the fasta sequences in \fIfilename\fR, automatically
-perform a sorting by decreasing sequence abundance beforehand.
+Clusterize the fasta sequences in \fIfilename\fR, automatically sort
+by decreasing sequence abundance beforehand.
 .TP
 .BI \-\-cluster_smallmem \0filename
 Clusterize the fasta sequences in \fIfilename\fR without automatically
@@ -551,6 +616,35 @@ the majority symbol (nucleotide or gap) from each column of the
 alignment. Columns containing a majority of gaps are skipped, except
 for terminal gaps.
 .TP
+.BI \-\-mothur_shared_out \0filename
+Output an OTU table in the mothur "shared" tab-separated plain text
+format as described at http://www.mothur.org/wiki/Shared_file. The
+format describes how a matrix containing the abundances of the OTUs in
+the different samples is stored. The first line will start with the
+strings "label", "group" and "numOtus" and is followed by a list of
+all OTU identifiers. The following lines, one for each sample, start
+with the string "vsearch" followed by the sample identifier, the total
+number of OTUs, and a list of abundances for each OTU in that sample,
+in the order given on the first line. The OTU and sample identifiers
+are extracted from the FASTA headers of the sequences. The OTUs are
+represented by the cluster centroids. See the \-\-biomout option for
+further details.
+.TP
+.BI \-\-otutabout \0filename
+Output an OTU table in the classic tab-separated plain text format as
+a matrix containing the abundances of the OTUs in the different
+samples. The first line will start with the string "#OTU ID" and is
+followed by a tab-separated list of all sample identifiers. The
+following lines, one for each OTU, start with the OTU identifier and
+are followed by a tab-separated list of abundances for that OTU in each
+sample, in the order given on the first line. The OTU and sample
+identifiers are extracted from the FASTA headers of the sequences. The
+OTUs are represented by the cluster centroids. An extra column is
+added to the right of the table if taxonomy information is available
+for at least one of the OTUs. This column will be labelled "taxonomy"
+and each row will then contain the taxonomy information extracted for
+that OTU. See the \-\-biomout option for further details.
+.TP
 .BI \-\-profile \0filename
 Output a sequence profile to a text file with the frequency of each
 nucleotide in each position in the multiple alignment for each
@@ -560,7 +654,7 @@ position (1-based), consensus nucleotide, number of A's, number of
 C's, number of G's, number of Ts or Us, and finally the number of
 gaps. If ambiguous nucleotide symbols are present, the numbers may be
 floating point numbers, otherwise they are integers. For instance,
-an 'R' will count 0.5 towards an A and 0.5 towards a G.
+an 'R' counts 0.5 towards an A and 0.5 towards a G.
 .TP
 .BI \-\-qmask\~ "none|dust|soft"
 Mask regions in sequences using the
@@ -616,9 +710,43 @@ When comparing sequences with the cluster seed, check the \fIplus\fR
 strand only (default) or check \fIboth\fR strands.
 .TP
 .BI \-\-uc \0filename
-Output clustering results in \fIfilename\fR using a uclust-like
-format. For a description of the format, see
-<http://www.drive5.com/usearch/manual/ucout.html>.
+Output clustering results in \fIfilename\fR using a tab-separated
+uclust-like format with 10 columns and 3 different types of entries (S,
+H or C). Each fasta sequence in the input file can be either a cluster
+centroid (S) or a hit (H) assigned to a cluster. Cluster records (C)
+summarize information (size, centroid label) for each cluster. In the
+context of clustering, the option \-\-uc_allhits has no effect on the
+\-\-uc output. Column content varies with the type of entry (S, H or
+C):
+.RS
+.RS
+.nr step 1 1
+.IP \n[step]. 4
+Record type: S, H, or C.
+.IP \n+[step].
+Cluster number (0-based).
+.IP \n+[step].
+Centroid length (S), query length (H), or cluster size (C).
+.IP \n+[step].
+Percentage of similarity with the centroid sequence (H), or set to "*"
+(S, C).
+.IP \n+[step].
+Match orientation + or - (H), or set to "*" (S, C).
+.IP \n+[step].
+Not used, always set to "*" (S, C) or to zero (H).
+.IP \n+[step].
+Not used, always set to "*" (S, C) or to zero (H).
+.IP \n+[step].
+Compact representation of the pairwise alignment using the CIGAR
+format (Compact Idiosyncratic Gapped Alignment Report): M (match), D
+(deletion) and I (insertion). The equal sign "=" indicates that the
+query is identical to the centroid sequence.
+.IP \n+[step].
+Label of the query sequence (H), or of the centroid sequence (S, C).
+.IP \n+[step].
+Label of the centroid sequence (H), or set to "*" (S, C).
+.RE
+.RE
 .TP
 .B \-\-usersort
 When using \-\-cluster_smallmem, allow any sequence input order, not
@@ -628,11 +756,10 @@ just a decreasing length ordering.
 Strip abundance information from the headers when writing the output
 file.
 .TP
+.B ...
 Most searching options as well as score filtering, gap penalties and
 masking also apply to clustering (see the Searching section for
-definitions):
-.br
-\-\-alnout, \-\-blast6out, \-\-fastapairs, \-\-matched,
+definitions): \-\-alnout, \-\-blast6out, \-\-fastapairs, \-\-matched,
 \-\-notmatched, \-\-maxaccept, \-\-maxreject, \-\-samout, \-\-userout,
 \-\-userfields
 .RE
@@ -652,11 +779,10 @@ Merge sequences with identical prefixes contained in \fIfilename\fR.
 A short sequence identical to an initial segment (prefix) of another
 sequence is considered a replicate of the longer sequence. If a
 sequence is identical to the prefix of two or more longer sequences,
-it will be clustered with the shortest of them. If they are equally
-long, it will be clustered with the most abundant. Remaining ties are
-solved using sequence headers and sequence input order. Sequence
-comparisons are case insensitive, and T and U are considered
-identical.
+it is clustered with the shortest of them. If they are equally long,
+it is clustered with the most abundant. Remaining ties are solved
+using sequence headers and sequence input order. Sequence comparisons
+are case insensitive, and T and U are considered identical.
 .TP
 .BI \-\-maxuniquesize\~ "positive integer"
 Discard sequences with an abundance value greater than \fIinteger\fR.
@@ -689,12 +815,12 @@ for details.
 .TP
 .BI \-\-rereplicate \0filename
 Duplicate each sequence the number of times indicated by the abundance
-of each sequence in the specified file. The sequence labels will be
+of each sequence in the specified file. The sequence labels are
 identical for the same sequence, unless \-\-relabel, \-\-relabel_sha1
-or \-\-relabel_md5 is used to create unique labels. Output will be
-written to the file specified with the \-\-output option, in FASTA
-format. The output file will not contain abundance information unless
-\-\-sizeout is specified, in which case an abundance of 1 is used.
+or \-\-relabel_md5 is used to create unique labels. Output is written
+to the file specified with the \-\-output option, in FASTA format. The
+output file does not contain abundance information unless \-\-sizeout
+is specified, in which case an abundance of 1 is used.
 .TP
 .B \-\-sizein
 Take into account the abundance annotations present in the input fasta
@@ -718,11 +844,40 @@ strand only (default) or check \fIboth\fR strands.
 Output only the top \fIinteger\fR sequences (i.e. the most abundant).
 .TP
 .BI \-\-uc \0filename
-Output dereplication results in \fIfilename\fR using a uclust-like
-format. For a description of the format, see
-<http://www.drive5.com/usearch/manual/ucout.html>. In the context of
-dereplication, the option \-\-uc_allhits has no effect on the \-\-uc
-output.
+Output full-length or prefix-dereplication results in \fIfilename\fR
+using a tab-separated uclust-like format with 10 columns and 3
+different types of entries (S, H or C). Each fasta sequence in the
+input file can be either a cluster centroid (S) or a hit (H) assigned
+to a cluster. Cluster records (C) summarize information (size,
+centroid label) for each cluster. In the context of dereplication, the
+option \-\-uc_allhits has no effect on the \-\-uc output. Column
+content varies with the type of entry (S, H or C):
+.RS
+.RS
+.nr step 1 1
+.IP \n[step]. 4
+Record type: S, H, or C.
+.IP \n+[step].
+Cluster number (0-based).
+.IP \n+[step].
+Sequence length (S, H), or cluster size (C).
+.IP \n+[step].
+Percentage of similarity with the centroid sequence (H), or set to "*"
+(S, C).
+.IP \n+[step].
+Match orientation + or - (H), or set to "*" (S, C).
+.IP \n+[step].
+Not used, always set to "*" (S, C) or 0 (H).
+.IP \n+[step].
+Not used, always set to "*" (S, C) or 0 (H).
+.IP \n+[step].
+Not used, always set to "*".
+.IP \n+[step].
+Label of the query sequence (H), or of the centroid sequence (S, C).
+.IP \n+[step].
+Label of the centroid sequence (H), or set to "*" (S, C).
+.RE
+.RE
 .TP
 .B \-\-xsize
 Strip abundance information from the headers when writing the output
@@ -736,15 +891,15 @@ FASTA/FASTQ file processing options:
 Analyse, shorten, filter, convert or merge sequences in FASTQ files,
 or reverse complement sequences in FASTA or FASTQ files. The
 \-\-fastq_chars command can be used to analyse FASTQ files to identify
-the type of FASTQ file and the range of quality score values used. To
+the quality encoding and the range of quality score values used. To
 convert between different FASTQ file variants, use the
 \-\-fastq_convert command. Statistical analysis of the quality and
 length of the sequences in a FASTQ file may be performed with the
 \-\-fastq_stats and \-\-fastq_eestats commands. Sequences may be
-shortened, filtered and converted by the \-\-fastq_filter
-command. Paired-end reads can be merged using the \-\-fastq_mergepairs
-command. Finally, the \-\-fastx_revcomp command will reverse
-complement sequences.
+shortened, filtered and converted by the \-\-fastq_filter or
+\-\-fastx_filter commands. Paired-end reads can be merged using the
+\-\-fastq_mergepairs command. Finally, the \-\-fastx_revcomp command
+reverse-complements sequences.
 .PP
 .TP 9
 .B \-\-eeout
@@ -763,9 +918,9 @@ of errors are the number of differences in the overlap region of the
 merged sequence relative to each of the reads in the pair.
 .TP
 .BI \-\-fastaout \0filename
-When using \-\-fastq_filter or \-\-fastq_mergepairs, write to the
-given FASTA-formatted file the sequences passing the filter, or the
-merged sequences.
+When using \-\-fastq_filter, \-\-fastq_mergepairs or \-\-fastx_filter,
+write to the given FASTA-formatted file the sequences passing the
+filter, or the merged sequences.
 .TP
 .BI \-\-fastaout_notmerged_fwd \0filename
 When using \-\-fastq_mergepairs, write forward reads not merged to the
@@ -776,8 +931,8 @@ When using \-\-fastq_mergepairs, write reverse reads not merged to the
 specified FASTA file.
 .TP
 .BI \-\-fastaout_discarded \0filename
-Write sequences that do not pass the filter of the \-\-fastq_filter
-command to the given FASTA-formatted file.
+Write sequences that do not pass the filter of the \-\-fastq_filter or
+\-\-fastx_filter command to the given FASTA-formatted file.
 .TP
 .B \-\-fastq_allowmergestagger
 When using \-\-fastq_mergepairs, allow to merge staggered read
@@ -809,21 +964,21 @@ letter. For each character present in the quality strings,
 frequency, and the number of times a \fIk\fR-mer of that character
 appears at the end of quality strings. The length of the \fIk\fR-mer
 can be set using \-\-fastq_tail (4 by default). The command
-\-\-fastq_chars tries to automatically detect the type of FASTQ file
-given (Solexa, Illumina 1.3+, Illumina 1.5+ or Illumina 1.8+/Sanger)
-by analyzing the range of observed quality score values. In case of
+\-\-fastq_chars tries to automatically detect the quality encoding
+(Solexa, Illumina 1.3+, Illumina 1.5+ or Illumina 1.8+/Sanger) by
+analyzing the range of observed quality score values. In case of
 success, \-\-fastq_chars suggests values for the \-\-fastq_ascii (33
 or 64), \-\-fastq_qmin and \-\-fastq_qmax options to be used with the
 other commands that require a FASTQ input file.
 .TP
 .BI \-\-fastq_convert \0filename
 Convert between the different variants of the FASTQ file format. The
-type of input file must be specified with the \-\-fastq_ascii option
-(either 33 or 64, the default is 33), and the type of output file must
-be specified with the \-\-fastq_asciiout option (default 33). The
-mimimum and maximum output quality scores may be limited using the
-\-\-fastq_qminout and \-\-fastq_qmaxout options. The output file is
-specified with the \-\-fastqout option.
+quality encoding of the input file must be specified with the
+\-\-fastq_ascii option (either 33 or 64, the default is 33), and the
+output quality encoding must be specified with the \-\-fastq_asciiout
+option (default 33). The minimum and maximum output quality scores may
+be limited using the \-\-fastq_qminout and \-\-fastq_qmaxout
+options. The output file is specified with the \-\-fastqout option.
 .TP
 .B \-\-fastq_eeout
 When using \-\-fastq_filter or \-\-fastq_mergepairs, include the
@@ -844,27 +999,14 @@ the expected number of accumulated errors from the beginning of the
 reads and until the current position (EE). For each of the Q, Pe and
 EE distributions, the following statistics are included: minimum value
 (Min), lower quartile (Low), median (Med), mean (Mean), upper quartile
-(Hi), and maximum value (Max). The type of FASTQ file may be specified
-with \-\-fastq_ascii \-\-fastq_qmin and \-\-fastq_qmax.
+(Hi), and maximum value (Max). The quality encoding and the range of
+quality values may be specified with \-\-fastq_ascii, \-\-fastq_qmin
+and \-\-fastq_qmax.
 .TP
 .BI \-\-fastq_filter \0filename
-Shorten and/or filter the sequences in the given FASTQ file and output
-the remaining sequences to the FASTQ file specified with the
-\-\-fastqout option and, after conversion, to the FASTA file specified
-with the \-\-fastaout option. The discarded sequences will be written
-to the files specified with the \-\-fastaout_discarded and
-\-\-fastqout_discarded options. Sequences may be shortened using the
-options \-\-fastq_stripleft, \-\-fastq_trunclen, and
-\-\-fastq_truncqual. The sequences may be filtered using the options
-\-\-fastq_maxee, \-\-fastq_maxee_rate, \-\-fastq_maxns,
-\-\-fastq_minlen and \-\-fastq_trunclen. If shortening results in an
-empty sequence, it will be discarded. The sequences are first
-shortened and then filtered based on the remaining bases. If no
-shortening or filtering options are given, all sequences will be
-written to the output files, possibly after conversion from FASTQ to
-FASTA format. The \-\-relabel option may be used to relabel the output
-sequences. The \-\-eeout may be used to output the expected number of
-errors in each sequence.
+Shorten and/or filter sequences in the given FASTQ file. Similar to
+the \-\-fastx_filter command, but works only on FASTQ files. See
+\-\-fastx_filter for details.
 .TP
 .BI \-\-fastq_maxdiffs\~ "positive integer"
 When using \-\-fastq_mergepairs, specify the maximum number of
@@ -873,20 +1015,25 @@ has a strong influence on the merging success rate. The default
 value is 5.
 .TP
 .BI \-\-fastq_maxee\~ real
-When using \-\-fastq_filter or \-\-fastq_mergepairs, discard sequences
-with more than the specified number of expected errors.
+When using \-\-fastq_filter, \-\-fastq_mergepairs or \-\-fastx_filter,
+discard sequences with more than the specified number of expected
+errors.
 .TP
 .BI \-\-fastq_maxee_rate\~ real
-When using \-\-fastq_filter, discard sequences with more than the
-specified number of expected errors per base.
+When using \-\-fastq_filter or \-\-fastx_filter, discard sequences
+with more than the specified number of expected errors per base.
+.TP
+.BI \-\-fastq_maxlen\~ "positive integer"
+When using \-\-fastq_filter, \-\-fastq_mergepairs or \-\-fastx_filter,
+discard sequences with more than the specified number of bases.
 .TP
 .BI \-\-fastq_maxmergelen\~ "positive integer"
 When using \-\-fastq_mergepairs, specify the maximum length of the
 merged sequence. By default there is no limit.
 .TP
 .BI \-\-fastq_maxns\~ "positive integer"
-When using \-\-fastq_filter or \-\-fastq_mergepairs, discard sequences
-with more than the specified number of N's.
+When using \-\-fastq_filter, \-\-fastq_mergepairs or \-\-fastx_filter,
+discard sequences with more than the specified number of N's.
 .TP
 .BI \-\-fastq_mergepairs\0 filename
 Merge paired-end sequence reads into one sequence. The method has some
@@ -898,27 +1045,28 @@ the \-\-fastaout or \-\-fastqout options. The non-merged reads can be
 output to the files specified with the \-\-fastaout_notmerged_fwd,
 \-\-fastaout_notmerged_rev, \-\-fastqout_notmerged_fwd and
 \-\-fastqout_notmerged_rev options. Statistics may be output to the
-file specified with the \-\-eetabbedout option. Sequences will be
+file specified with the \-\-eetabbedout option. Sequences are
 truncated as specified with the \-\-fastq_truncqual option to remove
 low-quality bases in the 3' end. Sequences shorter than specified with
-\-\-fastq_minlen (after truncation) will be discarded (1 by
+\-\-fastq_minlen (after truncation) are discarded (1 by
 default). Sequences with too many ambiguous bases (N's), as specified
-with the \-\-fastq_maxns will also be discarded (no limit by
-default). Staggered reads will not be merged unless the
+with the \-\-fastq_maxns are also discarded (no limit by
+default). Staggered reads are not merged unless the
 \-\-fastq_allowmergestagger option is specified. The minimum length of
 the overlap region between the reads may be specified with the
 \-\-minovlen option (default 10), and the overlap region may not
-include more mismatches than specified with the \-\-maxdiffs option
-(5 by default), otherwise the read pair will be discarded. The
-mimimum and maximum length of the merged sequence may be specified
-with the \-\-fastq_minmergelen and \-\-fastq_maxmergelen options,
+include more mismatches than specified with the \-\-fastq_maxdiffs
+option (5 by default), otherwise the read pair is discarded. The
+minimum and maximum length of the merged sequence may be specified
+with the \-\-fastq_minmergelen and \-\-fastq_maxmergelen options,
 respectively. Other relevant options are: \-\-fastq_ascii,
 \-\-fastq_maxee, \-\-fastq_nostagger, \-\-fastq_qmax, \-\-fastq_qmin,
 and \-\-label_suffix.
 .TP
 .BI \-\-fastq_minlen\~ "positive integer"
-When using \-\-fastq_filter or \-\-fastq_mergepairs, discard sequences
-with less than the specified number of bases (default 1).
+When using \-\-fastq_filter, \-\-fastq_mergepairs or \-\-fastx_filter,
+discard sequences with less than the specified number of bases
+(default 1).
 .TP
 .BI \-\-fastq_minmergelen\~ "positive integer"
 When using \-\-fastq_mergepairs, specify the minimum length of the
@@ -957,11 +1105,11 @@ scores between -5 and 2.
 .TP
 .BI \-\-fastq_stats \0filename
 Analyze a FASTQ file and report the number of reads it contains. The
-type of FASTQ file may be specified with \-\-fastq_ascii
-\-\-fastq_qmin and \-\-fastq_qmax. That command requires the \-\-log
-option and outputs the following detailed statistics on read length,
-quality score, length vs. quality distributions, and length / quality
-filtering:
+quality encoding and the range of quality values may be specified with
+\-\-fastq_ascii, \-\-fastq_qmin and \-\-fastq_qmax. That command
+requires the \-\-log option and outputs the following detailed
+statistics on read length, quality score, length vs. quality
+distributions, and length / quality filtering:
 .RS
 .TP
 Read length distribution:
@@ -1039,31 +1187,40 @@ position with a quality \fIQ\fR below 5, 10, 15 or 20 (option
 .RE
 .TP
 .BI \-\-fastq_stripleft\~ "positive integer"
-When using \-\-fastq_filter, strip the specified number of bases from
-the beginning of the reads.
+When using \-\-fastq_filter or \-\-fastx_filter, strip the specified
+number of bases from the beginning of the reads.
 .TP
 .BI \-\-fastq_tail\~ "positive integer"
 When using \-\-fastq_chars, count the number of times a series of
 characters of length \fIk\fR appears at the end of quality strings. By
 default, \fIk\fR = 4.
 .TP
+.BI \-\-fastq_truncee\~ real
+When using \-\-fastq_filter or \-\-fastx_filter, truncate sequences so
+that their total expected error is not higher than the specified
+value.
+.TP
 .BI \-\-fastq_trunclen\~ "positive integer"
-When using \-\-fastq_filter, truncate sequences to the specified
-length. Sequences that are shorter will be discarded.
+When using \-\-fastq_filter or \-\-fastx_filter, truncate sequences to
+the specified length. Shorter sequences are discarded.
+.TP
+.BI \-\-fastq_trunclen_keep\~ "positive integer"
+When using \-\-fastq_filter or \-\-fastx_filter, truncate sequences to
+the specified length. Shorter sequences are not discarded.
 .TP
 .BI \-\-fastq_truncqual\~ "positive integer"
-When using \-\-fastq_filter, truncate sequences starting from the
-first base with the specified base quality score value or lower. Empty
-sequences will be discarded.
+When using \-\-fastq_filter or \-\-fastx_filter, truncate sequences
+starting from the first base with the specified base quality score
+value or lower.
 .TP
 .BI \-\-fastqout \0filename
-When using \-\-fastq_filter or \-\-fastq_mergepairs, write to the
-given FASTQ-formatted file the sequences passing the filter, or the
-merged sequences.
+When using \-\-fastq_filter, \-\-fastq_mergepairs or \-\-fastx_filter,
+write to the given FASTQ-formatted file the sequences passing the
+filter, or the merged sequences.
 .TP
 .BI \-\-fastqout_discarded \0filename
-When using \-\-fastq_filter, write sequences that do not pass the
-filter to the given FASTQ-formatted file.
+When using \-\-fastq_filter or \-\-fastx_filter, write sequences that
+do not pass the filter to the given FASTQ-formatted file.
 .TP
 .BI \-\-fastqout_notmerged_fwd \0filename
 When using \-\-fastq_mergepairs, write forward reads not merged to the
@@ -1073,6 +1230,27 @@ specified FASTQ file.
 When using \-\-fastq_mergepairs, write reverse reads not merged to the
 specified FASTQ file.
 .TP
+.BI \-\-fastx_filter \0filename
+Shorten and/or filter the sequences in the given FASTA or FASTQ file
+and output the remaining sequences to the FASTQ file specified with
+the \-\-fastqout option and to the FASTA file specified with the
+\-\-fastaout option. The discarded sequences are written to the files
+specified with the \-\-fastaout_discarded and \-\-fastqout_discarded
+options. The input format (FASTA or FASTQ) is automatically
+detected. Output cannot be written to FASTQ files if the input is in
+FASTA format. Sequences may be shortened using the options
+\-\-fastq_stripleft, \-\-fastq_truncee, \-\-fastq_trunclen and
+\-\-fastq_truncqual. The sequences may be filtered using the options
+\-\-fastq_maxee, \-\-fastq_maxee_rate, \-\-fastq_maxlen,
+\-\-fastq_maxns, \-\-fastq_minlen and \-\-fastq_trunclen. If
+shortening results in an empty sequence, it is discarded. The
+sequences are first shortened and then filtered based on the remaining
+bases. If no shortening or filtering options are given, all sequences
+are written to the output files, possibly after conversion from FASTQ
+to FASTA format. The \-\-relabel option may be used to relabel the
+output sequences. The \-\-eeout option may be used to output the
+expected number of errors in each sequence.
+.TP
 .BI \-\-fastx_revcomp \0filename
 Reverse-complement the sequences in the given FASTA or FASTQ file to a
 file specified with the \-\-fastaout and/or \-\-fastqout options. If
@@ -1134,32 +1312,31 @@ the \-\-db option with the \-\-usearch_global,
 
 The argument to the \-\-qmask and \-\-dbmask option may be none, soft
 or dust. If the argument is none, the no masking is performed. If the
-argument is soft the lower case symbols will be masked. Finally, if
-the argument is dust, the sequence will be masked using the DUST
-algorithm by Tatusov and Lipman to mask low-complexity regions.
+argument is soft the lower case symbols are masked. Finally, if the
+argument is dust, the sequence is masked using the DUST algorithm by
+Tatusov and Lipman to mask low-complexity regions.
 
-If the \-\-hardmask option is specified, all masked regions will be
-converted to N's, otherwise masked regions will be indicated by lower
-case letters.
+If the \-\-hardmask option is specified, all masked regions are
+converted to N's, otherwise masked regions are indicated by lower case
+letters.
 
 If any sequence is masked, the masked version of the sequence (with
-lower case letters or N's) will be used in all output files. Otherwise
-the sequence will be unmodified. The exception is the sequences in the
-output file specified with the \-\-uchimealns option, where the input
+lower case letters or N's) is used in all output files. Otherwise the
+sequence is unmodified. The exception is the sequences in the output
+file specified with the \-\-uchimealns option, where the input
 sequences are converted to upper case first and lower case letters
 indicate disagreement between the aligned sequences.
 
-When a sequence region is masked, words in the region will not be
-included in the indicies used in the heuristic search algorithm. In
-all other aspects the region is treated as other regions.
+When a sequence region is masked, words in the region are not included
+in the indices used in the heuristic search algorithm. In all other
+aspects, the region is treated as other regions.
 
-Regions in sequences that are hardmasked (with N's) will have a zero
+Regions in sequences that are hardmasked (with N's) have a zero
 alignment score and do not contribute to an alignment.
 
-Here are the
-results of combined masking options \-\-qmask (or \-\-dbmask for
-database sequences) and \-\-hardmask, assuming each input sequence
-contains both lower and uppercase nucleotides:
+Here are the results of combined masking options \-\-qmask (or
+\-\-dbmask for database sequences) and \-\-hardmask, assuming each
+input sequence contains both lower and uppercase nucleotides:
 .PP
 .ce 10 \# center the table (10 lines)
 .TS
@@ -1257,6 +1434,42 @@ Number of computation threads to use (1 to 256). The number of threads
 should be lesser or equal to the number of available CPU cores. The
 default is to use all available resources and to launch one thread
 per logical core.
+.TP
+.BI \-\-uc \0filename
+Output pairwise alignment results in \fIfilename\fR using a
+tab-separated uclust-like format with 10 columns. Each sequence is
+compared to all other sequences, and all hits (\-\-acceptall) or only
+some hits (\-\-id \fIfloat\fR) are reported, with one pairwise
+comparison per line:
+.RS
+.RS
+.nr step 1 1
+.IP \n[step]. 4
+Record type, always set to "H".
+.IP \n+[step].
+Ordinal number of the target sequence (based on input order, starting
+from zero).
+.IP \n+[step].
+Sequence length.
+.IP \n+[step].
+Percentage of similarity with the target sequence.
+.IP \n+[step].
+Match orientation, always set to "+".
+.IP \n+[step].
+Not used, always set to zero.
+.IP \n+[step].
+Not used, always set to zero.
+.IP \n+[step].
+Compact representation of the pairwise alignment using the CIGAR
+format (Compact Idiosyncratic Gapped Alignment Report): M (match), D
+(deletion) and I (insertion). The equal sign "=" indicates that the
+query is identical to the target sequence.
+.IP \n+[step].
+Label of the query sequence.
+.IP \n+[step].
+Label of the target sequence.
+.RE
+.RE
 .RE
 .PP
 .\" ----------------------------------------------------------------------------
@@ -1268,6 +1481,13 @@ Write pairwise global alignments to \fIfilename\fR using a
 human-readable format. Use \-\-rowlen to modify alignment
 length. Output order may vary when using multiple threads.
 .TP
+.BI \-\-biomout \0filename
+Write search results to an OTU table in the biom version 1.0 file
+format. The query file contains the samples, while the database file
+contains the OTUs. Sample and OTU identifiers are extracted from the
+header of these sequences. See the \-\-biomout option in the
+Clustering section for further details.
+.TP
 .BI \-\-blast6out \0filename
 Write search results to \fIfilename\fR using a blast-like
 tab-separated format of twelve fields (listed below), with one line
@@ -1410,9 +1630,9 @@ Reject the sequence match if the pairwise identity is lower than
 \fIreal\fR (value ranging from 0.0 to 1.0 included). The search
 process sorts target sequences by decreasing number of \fIk\fR-mers
 they have in common with the query sequence, using that information as
-a proxy for sequence similarity. That efficient pre-filtering will
-also prevent pairwise alignments with weakly matching targets, as
-there needs to be at least 6 shared \fIk\fR-mers to start the pairwise
+a proxy for sequence similarity. That efficient pre-filtering also
+prevents pairwise alignments with weakly matching targets, as there
+needs to be at least 6 shared \fIk\fR-mers to start the pairwise
 alignment, and at least one out of every 16 \fIk\fR-mers from the
 query needs to match the target. Consequently, using values lower than
 \-\-id 0.5 is not likely to capture more weakly matching targets. The
@@ -1551,25 +1771,38 @@ Reject target sequences with an abundance lower than \fIinteger\fR.
 .TP
 .BI \-\-minwordmatches\~ "non-negative integer"
 Minimum number of word matches required for a sequence to be
-considered further. Default value is 10 for the default word length
-8. For word lengths 7-15, the default minimum word matches is 14, 10,
-9, 8, 6, 5, 4, 3 and 2, respectively. If the query sequence has fewer
-unique words than the number specified, all words must match.
+considered further. Default value is 12 for the default word length
+8. For word lengths 3-15, the default minimum word matches are 18, 17,
+16, 15, 14, 12, 11, 10, 9, 8, 7, 5 and 3, respectively. If the query
+sequence has fewer unique words than the number specified, all words
+in the query must match.
 .TP
 .BI \-\-mismatch\~ "integer"
 Score assigned to a mismatch (i.e. different nucleotides) in the
 pairwise alignment. The default value is -4.
 .TP
+.BI \-\-mothur_shared_out \0filename
+Write search results to an OTU table in the mothur "shared"
+tab-separated plain text file format. The query file contains the
+samples, while the database file contains the OTUs. Sample and OTU
+identifiers are extracted from the header of these sequences. See the
+\-\-mothur_shared_out option in the Clustering section for further details.
+.TP
 .BI \-\-notmatched \0filename
 Write query sequences not matching database target sequences to
 \fIfilename\fR, in fasta format.
 .TP
+.BI \-\-otutabout \0filename
+Write search results to an OTU table in the classic tab-separated plain text
+format. The query file contains the samples, while the database file
+contains the OTUs. Sample and OTU identifiers are extracted from the
+header of these sequences. See the \-\-otutabout option in the
+Clustering section for further details.
+.TP
 .B \-\-output_no_hits
 Write both matching and non-matching queries to \-\-alnout,
-\-\-blast6out, \-\-samout or \-\-userout output files (\-\-uc and
-\-\-uc_allhits output files always feature non-matching
-queries). Non-matching queries are labelled "No hits" in \-\-alnout
-files.
+\-\-blast6out, \-\-samout or \-\-userout output files. Non-matching
+queries are labelled "No hits" in \-\-alnout files.
 .TP
 .B \-\-pattern \fIstring\fR
 This option is ignored. It is provided for compatibility with usearch.
@@ -1595,7 +1828,7 @@ Width of alignment lines in \-\-alnout output. The default value is
 .TP
 .B \-\-samheader
 Include header lines to the sam file when \-\-samout is specified. The
-header will include @HD, @SQ and @PG lines, but no read group (@RG)
+header includes @HD, @SQ and @PG lines, but no read group (@RG)
 information. By default no header lines are written.
 .TP
 .BI \-\-samout \0filename
@@ -1636,17 +1869,55 @@ coverage is computed as (matches + mismatches) / target sequence
 length.  Internal or terminal gaps are not taken into account.
 .TP
 .B \-\-top_hits_only
-Output only the hits with the highest percentage of identity with the
-query. That option modifies the output of the options \-\-alnout,
+Only the top hits between the query and database sequence sets are
+written to the output specified with the options \-\-alnout,
 \-\-samout, \-\-userout, \-\-blast6out, \-\-uc, \-\-fastapairs,
-\-\-matched and \-\-notmatched, but not \-\-dbmatched and
-\-\-dbnotmatched
+\-\-matched or \-\-notmatched (but not \-\-dbmatched and
+\-\-dbnotmatched). For each query, the top hit is the one presenting
+the highest percentage of identity (see the \-\-iddef option to change
+the way identity is measured). For a given query, if several top hits
+present exactly the same percentage of identity, the number of hits
+reported is controlled by the \-\-maxaccepts value (1 by default).
 .TP
 .BI \-\-uc \0filename
-Output searching results in \fIfilename\fR using a uclust-like
-format. For a description of the format, see
-<http://www.drive5.com/usearch/manual/ucout.html>. Output order may
-vary when using multiple threads.
+Output searching results in \fIfilename\fR using a tab-separated
+uclust-like format with 10 columns. When using the \-\-search_exact
+command, the table layout is the same as with the
+\-\-allpairs_global command. When using the \-\-usearch_global command,
+the table presents 2 different types of entries: hit (H) or no hit (N). Each
+query sequence is compared to all other sequences, and the best hit
+(\-\-maxaccepts 1) or several hits (\-\-maxaccepts >1) are reported
+(H). Output order may vary when using multiple threads. Column content
+varies with the type of entry (H or N):
+.RS
+.RS
+.nr step 1 1
+.IP \n[step]. 4
+Record type: H or N.
+.IP \n+[step].
+Ordinal number of the target sequence (based on input order, starting
+from zero). Set to "*" for N.
+.IP \n+[step].
+Sequence length. Set to "*" for N.
+.IP \n+[step].
+Percentage of similarity with the target sequence. Set to "*" for N.
+.IP \n+[step].
+Match orientation + or -. Set to "." for N.
+.IP \n+[step].
+Not used, always set to zero for H, or "*" for N.
+.IP \n+[step].
+Not used, always set to zero for H, or "*" for N.
+.IP \n+[step].
+Compact representation of the pairwise alignment using the CIGAR
+format (Compact Idiosyncratic Gapped Alignment Report): M (match), D
+(deletion) and I (insertion). The equal sign "=" indicates that the
+query is identical to the centroid sequence. Set to "*" for N.
+.IP \n+[step].
+Label of the query sequence.
+.IP \n+[step].
+Label of the target centroid sequence. Set to "*" for N.
+.RE
+.RE
 .TP
 .B \-\-uc_allhits
 When using the \-\-uc option, show all hits, not just the top hit for
@@ -1679,12 +1950,12 @@ value indicated by \-\-id.
 .TP
 .BI \-\-wordlength\~ "positive integer"
 Length of words (i.e. \fIk\fR-mers) for database indexing. The range
-of possible values goes from 7 to 15, but values near 8 or 9 are
+of possible values goes from 3 to 15, but values near 8 or 9 are
 generally recommended. Longer words may reduce the sensitivity/recall
 for weak similarities, but can increase precision. On the other hand,
 shorter words may increase sensitivity or recall, but may reduce
-precision. Computation time will generally increase with shorter words
-and decrease with longer words, but will increase again for very long
+precision. Computation time generally increases with shorter words and
+decreases with longer words, but it increases again for very long
 words. Memory requirements for a part of the index increase with a
 factor of 4 each time word length increases by one nucleotide, and
 this generally becomes significant for long words (12 or more). The
@@ -1694,22 +1965,60 @@ default value is 8.
 .\" ----------------------------------------------------------------------------
 Shuffling options:
 .RS
+Fasta entries in the input file are outputted in a pseudo-random
+order.
 .TP 9
 .BI \-\-output \0filename
 Write the shuffled sequences to \fIfilename\fR, in fasta format.
 .TP
 .BI \-\-randseed\~ "positive integer"
 When shuffling sequence order, use \fIinteger\fR as seed. A given seed
-will always produce the same output order (useful for
-replicability). Set to 0 to use a pseudo-random seed (default
-behavior).
+always produces the same output order (useful for replicability). Set
+to 0 to use a pseudo-random seed (default behavior).
+.TP
+.BI \-\-relabel \0string
+Relabel sequences using the prefix \fIstring\fR and a ticker (1, 2, 3,
+etc.) to construct the new headers. Use \-\-sizeout to conserve the
+abundance annotations.
+.TP
+.B \-\-relabel_keep
+When relabelling, keep the old identifier in the header after a space.
+.TP
+.B \-\-relabel_md5
+Relabel sequences using the MD5 message digest algorithm applied to
+each sequence. Former sequence headers are discarded. The sequence is
+converted to upper case and U is replaced by T before the digest is
+computed. The MD5 digest is a cryptographic hash function designed to
+minimize the probability that two different inputs give the same
+output, even for very similar, but non-identical inputs. Still, there
+is always a very small, but non-zero probability that two different
+inputs give the same result. The MD5 digest generates a 128-bit
+(16-byte) digest that is represented by 16 hexadecimal numbers (using
+32 symbols among 0123456789abcdef). Use \-\-sizeout to conserve the
+abundance annotations.
+.TP
+.B \-\-relabel_sha1
+Relabel sequences using the SHA1 message digest algorithm applied to
+each sequence. It is similar to the \-\-relabel_md5 option but uses
+the SHA1 algorithm instead of the MD5 algorithm. The SHA1 digest
+generates a 160-bit (20-byte) result that is represented by 20
+hexadecimal numbers (40 symbols). The probability of a collision (two
+non-identical sequences having the same digest) is smaller for the
+SHA1 algorithm than it is for the MD5 algorithm. Use \-\-sizeout to
+conserve the abundance annotations.
+.TP
+.B \-\-sizeout
+When using \-\-relabel, \-\-relabel_md5 or \-\-relabel_sha1, preserve
+and report abundance annotations to the output fasta file (using the
+pattern ";size=\fIinteger\fR;").
 .TP
 .BI \-\-shuffle \0filename
 Pseudo-randomly shuffle the order of sequences contained in
 \fIfilename\fR.
 .TP
 .BI \-\-topn\~ "positive integer"
-Output only the top \fIinteger\fR sequences.
+Output only the first \fIinteger\fR sequences after pseudo-random
+reordering.
 .TP
 .B \-\-xsize
 Strip abundance information from the headers when writing the output
@@ -1783,20 +2092,21 @@ file.
 .\" ----------------------------------------------------------------------------
 Subsampling options:
 .RS
-Subsampling will randomly extract a certain number or a certain
-percentage of the sequences in the input file. If the \-\-sizein
-option is in effect, the abundances of the input sequences will be
-taken into account and the sampling will be performed from the input
-sequences as if they had not been dereplicated. The extraction is
-performed as a random sampling with a uniform distribution among the
-input sequences and is performed without replacement. The input file
-is specified with \-\-fastx_subsample option, the output files are
-specified with the \-\-fastaout and \-\-fastqout options and the
-amount of sequences to be sampled is specified with the \-\-sample_pct
-or \-\-sample_size options. The sequences not sampled may be written
-to files specified with the options \-\-fasta_discarded and
-\-\-fastq_discarded. The \-\-fastq_ascii, \-\-fastq_qmin and
-\-\-fastq_qmax options are also available.
+Subsampling randomly extracts a certain number or a certain percentage
+of the sequences in the input file. If the \-\-sizein option is in
+effect, the abundances of the input sequences are taken into account
+and the sampling is performed as if the input sequences were
+rereplicated, subsampled and dereplicated before being written to the
+output file. The extraction is performed as a random sampling with a
+uniform distribution among the input sequences and is performed
+without replacement. The input file is specified with the
+\-\-fastx_subsample option, the output files are specified with the
+\-\-fastaout and \-\-fastqout options and the amount of sequences to
+be sampled is specified with the \-\-sample_pct or \-\-sample_size
+options. The sequences not sampled may be written to files specified
+with the options \-\-fastaout_discarded and \-\-fastqout_discarded. The
+\-\-fastq_ascii, \-\-fastq_qmin and \-\-fastq_qmax options are also
+available.
 .PP
 .TP 9
 .BI \-\-fastaout \0filename
@@ -1805,6 +2115,22 @@ Write the sampled sequences to \fIfilename\fR, in fasta format.
 .BI \-\-fastaout_discarded \0filename
 Write the sequences not sampled to \fIfilename\fR, in fasta format.
 .TP
+.BI \-\-fastq_ascii\~ "positive integer"
+Define the ASCII character number used as the basis for the FASTQ
+quality score. The default is 33, which is used by the Sanger /
+Illumina 1.8+ FASTQ format (phred+33). The value 64 is used by the
+Solexa, Illumina 1.3+ and Illumina 1.5+ formats (phred+64).
+.TP
+.BI \-\-fastq_qmax\~ "positive integer"
+Specify the maximum quality score accepted when reading FASTQ
+files. The default is 41, which is usual for recent Sanger/Illumina
+1.8+ files.
+.TP
+.BI \-\-fastq_qmin\~ "positive integer"
+Specify the minimum quality score accepted for FASTQ files. The
+default is 0, which is usual for recent Sanger/Illumina 1.8+
+files. Older formats may use scores between -5 and 2.
+.TP
 .BI \-\-fastqout \0filename
 Write the sampled sequences to \fIfilename\fR, in fastq
 format. Requires input in fastq format.
@@ -1819,10 +2145,41 @@ that is in FASTA or FASTQ format.
 .TP
 .BI \-\-randseed\~ "positive integer"
 Use \fIinteger\fR as a seed for the pseudo-random generator. A given
-seed will always produce the same output, which is useful for
+seed always produces the same output, which is useful for
 replicability. Set to 0 to use a pseudo-random seed (default
 behavior).
 .TP
+.BI \-\-relabel \0string
+Relabel sequences using the prefix \fIstring\fR and a ticker (1, 2, 3,
+etc.) to construct the new headers. Use \-\-sizeout to conserve the
+abundance annotations.
+.TP
+.B \-\-relabel_keep
+When relabelling, keep the old identifier in the header after a space.
+.TP
+.B \-\-relabel_md5
+Relabel sequences using the MD5 message digest algorithm applied to
+each sequence. Former sequence headers are discarded. The sequence is
+converted to upper case and U is replaced by T before the digest is
+computed. The MD5 digest is a cryptographic hash function designed to
+minimize the probability that two different inputs give the same
+output, even for very similar, but non-identical inputs. Still, there
+is always a very small, but non-zero probability that two different
+inputs give the same result. The MD5 digest generates a 128-bit
+(16-byte) digest that is represented by 16 hexadecimal numbers (using
+32 symbols among 0123456789abcdef). Use \-\-sizeout to conserve the
+abundance annotations.
+.TP
+.B \-\-relabel_sha1
+Relabel sequences using the SHA1 message digest algorithm applied to
+each sequence. It is similar to the \-\-relabel_md5 option but uses
+the SHA1 algorithm instead of the MD5 algorithm. The SHA1 digest
+generates a 160-bit (20-byte) result that is represented by 20
+hexadecimal numbers (40 symbols). The probability of a collision (two
+non-identical sequences having the same digest) is smaller for the
+SHA1 algorithm than it is for the MD5 algorithm. Use \-\-sizeout to
+conserve the abundance annotations.
+.TP
 .BI \-\-sample_pct\~ "real"
 Subsample the given percentage of the input sequences. Accepted values
 range from 0.0 to 100.0.
@@ -2067,7 +2424,7 @@ The fields qlo, qhi, tlo, thi now have counterparts (qilo, qihi, tilo,
 tihi) reporting alignment coordinates ignoring terminal gaps.
 .PP
 In usearch, when using the option \-\-output_no_hits, queries that
-receive no match are reported in blast6out file, but not in the
+receive no match are reported in the \-\-blast6out file, but not in the
 alignment output file. This is corrected in \fBvsearch\fR.
 .PP
 \fBvsearch\fR introduces a new \-\-cluster_size command that sorts
@@ -2093,32 +2450,32 @@ be more consistent.
 .PP
 .\" ============================================================================
 .SH NOVELTIES
-\fBvsearch\fR introduces new options not present in usearch 7. They
-are described in the "Options" section of this manual. Here is a short
-list:
+\fBvsearch\fR introduces new commands and new options not present in
+usearch 7. They are described in the "Options" section of this
+manual. Here is a short list:
+.RS
 .IP - 2
-alignwidth (chimera checking)
-.IP -
-borderline (chimera checking)
-.IP -
-cluster_size (clustering)
+alignwidth, borderline, fasta_score (chimera checking)
 .IP -
-clusterout_id (clustering)
+cluster_size, clusterout_id, clusterout_sort, profile (clustering)
 .IP -
-clusterout_sort (clustering)
-.IP -
-fasta_width (general option)
+fasta_width, gzip_decompress, bzip2_decompress (general option)
 .IP -
 iddef (clustering, pairwise alignment, searching)
 .IP -
 maxuniquesize (dereplication)
 .IP -
-profile (clustering)
-.IP -
 relabel_md5 and relabel_sha1 (chimera detection, dereplication, FASTQ
 processing, shuffling, sorting)
 .IP -
 shuffle (shuffling)
+.IP -
+fastq_eestats, fastq_maxlen, fastq_truncee (FASTQ processing)
+.IP -
+fastaout_discarded, fastqout_discarded (subsampling)
+.IP -
+rereplicate (dereplication/rereplication)
+.RE
 .PP
 .\" ============================================================================
 .SH EXAMPLES
@@ -2136,8 +2493,8 @@ at least 1.5 times more abundant than chimeras. Output non-chimeric
 sequences in fasta format (no wrapping):
 .PP
 .RS
-\fBvsearch\fR \-\-uchime_denovo \fIqueries.fas\fR \-\-nonchimeras
-\fIresults.fas\fR \-\-fasta_width 0 \-\-abskew 1.5
+\fBvsearch\fR \-\-uchime_denovo \fIqueries.fas\fR \-\-abskew 1.5
+\-\-nonchimeras \fIresults.fas\fR \-\-fasta_width 0
 .RE
 .PP
 Cluster with a 97% similarity threshold, collect cluster centroids,
@@ -2148,43 +2505,45 @@ and write cluster descriptions using a uclust-like format:
 \-\-centroids \fIcentroids.fas\fR \-\-uc \fIclusters.uc\fR
 .RE
 .PP
-Dereplicate the sequences contained in queries.fas, take into account
-the abundance information already present, write unwrapped sequences
-to output with the new abundance information, discard all sequences
-with an abundance of 1:
+Dereplicate the sequences contained in \fIqueries.fas\fR, take into
+account the abundance information already present, write unwrapped
+fasta sequences to \fIqueries_unique.fas\fR with the new abundance
+information, discard all sequences with an abundance of 1:
 .PP
 .RS
-\fBvsearch\fR \-\-derep_fulllength \fIqueries.fas\fR \-\-output
-\fIqueries_masked.fas\fR \-\-sizein \-\-sizeout \-\-fasta_width 0
+\fBvsearch\fR \-\-derep_fulllength \fIqueries.fas\fR \-\-sizein
+\-\-fasta_width 0 \-\-sizeout \-\-output \fIqueries_unique.fas\fR
 \-\-minuniquesize 2
 .RE
 .PP
 Mask simple repeats and low complexity regions in the input fasta file
-(masked regions are lowercased), and write the results to the output
-file:
+with the DUST algorithm (masked regions are lowercased), and write the
+results to the output file:
 .PP
 .RS
-\fBvsearch\fR \-\-maskfasta \fIqueries.fas\fR \-\-output
-\fIqueries_masked.fas\fR \-\-qmask dust
+\fBvsearch\fR \-\-maskfasta \fIqueries.fas\fR \-\-qmask dust
+\-\-output \fIqueries_masked.fas\fR
 .RE
 .PP
 Search queries in a reference database, with a 80%-similarity
 threshold, take terminal gaps into account when calculating pairwise
-similarities:
+similarities, output pairwise alignments:
 .PP
 .RS
 \fBvsearch\fR \-\-usearch_global \fIqueries.fas\fR \-\-db
-\fIreferences.fas\fR \-\-alnout \fIresults.aln\fR \-\-id 0.8 \-\-iddef
-1
+\fIreferences.fas\fR \-\-id 0.8 \-\-iddef 1 \-\-alnout
+\fIresults.aln\fR
 .RE
 .PP
 Search a sequence dataset against itself (ignore self hits), get all
-matches with at least 60% identity, and collect results in a
-blast-like tab-separated format:
+matches with at least 60% similarity, and collect results in a
+blast-like tab-separated format. Accept an unlimited number of hits
+(\-\-maxaccepts 0), and compare each query to all other sequences,
+including unlikely candidates (\-\-maxrejects 0):
 .PP
 .RS
 \fBvsearch\fR \-\-usearch_global \fIqueries.fas\fR \-\-db
-\fIqueries.fas\fR \-\-id 0.6 \-\-self \-\-blast6out
+\fIqueries.fas\fR \-\-self \-\-id 0.6 \-\-blast6out
 \fIresults.blast6\fR \-\-maxaccepts 0 \-\-maxrejects 0
 .RE
 .PP
@@ -2197,10 +2556,11 @@ to the output file:
 \fIqueries_shuffled.fas\fR \-\-randseed 13 \-\-fasta_width 0
 .RE
 .PP
-Sort by decreasing abundance the sequences contained in queries.fas
-(using the "size=\fIinteger\fR" information), relabel the sequences
-while preserving the abundance information (with \-\-sizeout), keep
-only sequences with an abundance equal to or greater than 2:
+Sort by decreasing abundance the sequences contained in
+\fIqueries.fas\fR (using the "size=\fIinteger\fR" information),
+relabel the sequences while preserving the abundance information (with
+\-\-sizeout), keep only sequences with an abundance equal to or
+greater than 2:
 .PP
 .RS
 \fBvsearch\fR \-\-sortbysize \fIqueries.fas\fR \-\-output
@@ -2290,31 +2650,33 @@ LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 POSSIBILITY OF SUCH DAMAGE.
 .PP
-\fBvsearch\fR includes code from several other projects. We thank the
-authors for making their source code available.
-.PP
+We would like to thank the authors of the following projects for
+making their source code available:
+.RS
+.IP - 2
 \fBvsearch\fR includes code from Google's CityHash project by Geoff
 Pike and Jyrki Alakuijala, providing some excellent hash functions
 available under a MIT license.
-.PP
+.IP -
 \fBvsearch\fR includes code derived from Tatusov and Lipman's DUST
 program that is in the public domain.
-.PP
+.IP -
 \fBvsearch\fR includes public domain code written by Alexander Peslyak
 for the MD5 message digest algorithm.
-.PP
+.IP -
 \fBvsearch\fR includes public domain code written by Steve Reid and
 others for the SHA1 message digest algorithm.
-.PP
+.IP -
 \fBvsearch\fR includes statistical data from PEAR
 <https://github.com/xflouris/PEAR> by Zhang, Kobert, Flouri and
 Stamatakis. Used with permission.
-.PP
+.IP -
 \fBvsearch\fR binaries may include code from the zlib library,
 copyright Jean-Loup Gailly and Mark Adler.
-.PP
+.IP -
 \fBvsearch\fR binaries may include code from the bzip2 library,
 copyright Julian R. Seward.
+.RE
 .PP
 .\" ============================================================================
 .SH SEE ALSO
@@ -2338,8 +2700,8 @@ First public release.
 .BR v1.0.1\~ "released December 1st, 2014"
 Bug fixes (sortbysize, semicolon after size annotation in headers) and
 minor changes (labels as secondary sort key for most sorts, treat T
-and U as identical for dereplication, only output size in dbmatched
-file if sizeout specified).
+and U as identical for dereplication, only output size in
+\-\-dbmatched file if \-\-sizeout specified).
 .TP
 .BR v1.0.2\~ "released December 6th, 2014"
 Bug fixes (ssse3/sse4.1 requirement, memory leak).
@@ -2348,8 +2710,8 @@ Bug fixes (ssse3/sse4.1 requirement, memory leak).
 Bug fix (now writes help to stdout instead of stderr).
 .TP
 .BR v1.0.4\~ "released December 8th, 2014"
-Added \-\-allpairs_global option. Reduced memory requirements
-slightly. Removed memory leaks.
+Added \-\-allpairs_global option. Reduced memory requirements slightly
+and eliminated memory leaks.
 .TP
 .BR v1.0.5\~ "released December 9th, 2014"
 Fixes a minor bug with \-\-allpairs_global and \-\-acceptall options.
@@ -2375,15 +2737,14 @@ abundance before clustering,
 meaning of userfields qlo, qhi, tlo, thi changed for compatibility
 with usearch,
 .IP -
-new userfields qilo, qihi, tilo, tihi gives alignment coordinates
+new userfields qilo, qihi, tilo, tihi give alignment coordinates
 ignoring terminal gaps,
 .IP -
 in \-\-uc output files, a perfect alignment is indicated with a "="
 sign,
 .IP -
-the option \-\-cluster_fast will now sort sequences by decreasing
-length, then by decreasing abundance and finally by sequence
-identifier,
+the option \-\-cluster_fast now sorts sequences by decreasing length,
+then by decreasing abundance and finally by sequence identifier,
 .IP -
 default \-\-maxseqlength value set to 50,000 nucleotides,
 .IP -
@@ -2397,8 +2758,8 @@ Fixes a bug in the function sorting sequences by decreasing abundance
 (\-\-sortbysize).
 .TP
 .BR v1.0.10\~ "released January 23rd, 2015"
-Fixes a bug where the sizein option was ignored and always treated as
-on, affecting clustering and dereplication commands.
+Fixes a bug where the \-\-sizein option was ignored and always treated
+as on, affecting clustering and dereplication commands.
 .TP
 .BR v1.0.11\~ "released February 5th, 2015"
 Introduces the possibility to output results in SAM format (for
@@ -2415,7 +2776,7 @@ memory leak.  Also increased line buffer for reading FASTA files to
 .TP
 .BR v1.0.14\~ "released February 17th, 2015"
 Fix a bug where the multiple alignment and consensus sequence computed
-after clustering ignored the strand of the sequences.  Also decreased
+after clustering ignored the strand of the sequences. Also decreased
 size of line buffer for reading FASTA files to 1MB again due to
 excessive stack memory usage.
 .TP
@@ -2428,9 +2789,9 @@ Integrated patches from Debian for increased compatibility with
 various architectures.
 .TP
 .BR v1.1.0\~ "released February 20th, 2015"
-Added the \-\-quiet option to suppress all output to stdout and stdout
-except for warnings and fatal errors.
-Added the \-\-log option to write messages to a log file.
+Added the \-\-quiet option to suppress all output to stdout and stderr
+except for warnings and fatal errors. Added the \-\-log option to
+write messages to a log file.
 .TP
 .BR v1.1.1\~ "released February 20th, 2015"
 Added info about \-\-log and \-\-quiet options to help text.
@@ -2463,7 +2824,7 @@ Fixed bug in hexadecimal digits of MD5 and SHA1 digests. Added
 Fixed compilation problems with zlib and bzip2lib.
 .TP
 .BR v1.3.5\~ "released September 17th, 2015"
-Minor configuration/makefile changes to compile to native cpu and
+Minor configuration/makefile changes to compile to native CPU and
 simplify makefile.
 .TP
 .BR v1.4.0\~ "released September 25th, 2015"
@@ -2485,17 +2846,17 @@ OS X.
 .BR v1.4.4\~ "released October 3rd, 2015"
 Remove debug message.
 .TP
-.BR v1.4.5\~ "released October 6rd, 2015"
+.BR v1.4.5\~ "released October 6th, 2015"
 Fix memory allocation bug when reading long FASTA sequences.
 .TP
-.BR v1.4.6\~ "released October 6rd, 2015"
+.BR v1.4.6\~ "released October 6th, 2015"
 Fix subtle bug in SIMD alignment code that reduced accuracy.
 .TP
 .BR v1.4.7\~ "released October 7th, 2015"
 Fixes a problem with searching for or clustering sequences with
-repeats. In this new version, vsearch will look at all words occurring
-at least once in the sequences in the initial step. Previously only
-words occurring exactly once were considered. In addition, vsearch now
+repeats. In this new version, vsearch looks at all words occurring at
+least once in the sequences in the initial step. Previously only words
+occurring exactly once were considered. In addition, vsearch now
 requires at least 10 words to be shared by the sequences, previously
 only 6 were required. If the query contains less than 10 words, all
 words must be present for a match. This change seems to lead to
@@ -2541,8 +2902,8 @@ Fixed a bug in detection of the file format (FASTA/FASTQ) of a gzip
 compressed input file.
 .TP
 .BR v1.9.1\~ "released November 13th, 2015"
-Fixed memory leak and a bug in score computation in fastq_mergepairs,
-and improved speed.
+Fixed memory leak and a bug in score computation in
+\-\-fastq_mergepairs, and improved speed.
 .TP
 .BR v1.9.2\~ "released November 17th, 2015"
 Fixed a bug in the computation of some values with \-\-fastq_stats.
@@ -2574,38 +2935,40 @@ extremely short sequences.
 Adjusted default minimum number of word matches during searches for
 improved performance.
 .TP
-.BR v1.9.10\~ "released January 25nd, 2016"
+.BR v1.9.10\~ "released January 25th, 2016"
 Fixed bug related to masking and lower case database sequences.
 .TP
 .BR v1.10.0\~ "released February 11th, 2016"
 Parallelized and improved merging of paired-end reads and adjusted
 some defaults. Removed progress indicator when stderr is not a
 terminal. Added \-\-fasta_score option to report chimera scores in
-FASTA files. Added rereplicate and fastq_eestats commands. Fixed
-typos. Added relabelling to files produced with \-\-consout and
-\-\-profile options.
-.TP
-.BR v1.10.1\~ "released February 23, 2016"
-Fixed a bug affecting the fastq_mergepairs command causing FASTQ
-headers to be truncated at first space. Full headers are now included
-in the output (no matter if --notrunclabels is in effect or not).
-.TP
-.BR v1.10.2\~ "released March 18, 2016"
-Fixed a bug causing a segmentation fault when running usearch_global
-with an empty query sequence. Also fixed a bug causing imperfect
-alignments to be reported with an alignment string of "=" in uc output
-files. Fixed typos in man file. Fixed fasta/fastq processing code
-regarding presence or absence of compression library header files.
-.TP
-.BR v1.11.1\~ "released April 13, 2016"
-Added strand information in UC file for derep_fulllength and
-derep_prefix. Added expected errors (ee) to header of FASTA files
+FASTA files. Added \-\-rereplicate and \-\-fastq_eestats
+commands. Fixed typos. Added relabelling to files produced with
+\-\-consout and \-\-profile options.
+.TP
+.BR v1.10.1\~ "released February 23rd, 2016"
+Fixed a bug affecting the \-\-fastq_mergepairs command causing FASTQ
+headers to be truncated at first space (despite the bug fix release
+1.9.0 of November 12th, 2015). Full headers are now included in the
+output (no matter if \-\-notrunclabels is in effect or not).
+.TP
+.BR v1.10.2\~ "released March 18th, 2016"
+Fixed a bug causing a segmentation fault when running
+\-\-usearch_global with an empty query sequence. Also fixed a bug
+causing imperfect alignments to be reported with an alignment string
+of "=" in uc output files. Fixed typos in man file. Fixed fasta/fastq
+processing code regarding presence or absence of compression library
+header files.
+.TP
+.BR v1.11.1\~ "released April 13th, 2016"
+Added strand information in UC file for \-\-derep_fulllength and
+\-\-derep_prefix. Added expected errors (ee) to header of FASTA files
 specified with \-\-fastaout and \-\-fastaout_discarded when \-\-eeout
 or \-\-fastq_eeout option is in effect for fastq_filter and
 fastq_mergepairs. The options \-\-eeout and \-\-fastq_eeout are now
 equivalent.
 .TP
-.BR v1.11.2\~ "released June 21, 2016"
+.BR v1.11.2\~ "released June 21st, 2016"
 Two bugs were fixed. The first issue was related to the \-\-query_cov
 option that used a different coverage definition than the qcov
 userfield. The coverage is now defined as the fraction of the whole
@@ -2617,7 +2980,7 @@ converted to A's in the consensus. The behaviour is changed so that
 N's are produced in the consensus, and it should now be more
 compatible with usearch.
 .TP
-.BR v2.0.0\~ "released June 24, 2016"
+.BR v2.0.0\~ "released June 24th, 2016"
 This major new version supports reading from pipes. Two new options
 are added: \-\-gzip_decompress and \-\-bzip2_decompress. One of these
 options must be specified if reading compressed input from a pipe, but
@@ -2628,22 +2991,43 @@ now represent standard input (/dev/stdin) or standard output
 (/dev/stdout) when reading or writing files, respectively. Code for
 reading FASTA and FASTQ files has been refactored.
 .TP
-.BR v2.0.1\~ "released June 30, 2016"
+.BR v2.0.1\~ "released June 30th, 2016"
 Avoid segmentation fault when masking very long sequences.
 .TP
-.BR v2.0.2\~ "released July 5, 2016"
+.BR v2.0.2\~ "released July 5th, 2016"
 Avoid warnings when compiling with GCC 6.
 .TP
-.BR v2.0.3\~ "released August 2, 2016"
+.BR v2.0.3\~ "released August 2nd, 2016"
 Fixed bad compiler options resulting in Illegal instruction errors
 when running precompiled binaries.
 .TP
-.BR v2.0.4\~ "released September 1, 2016"
+.BR v2.0.4\~ "released September 1st, 2016"
 Improved error message for bad FASTQ quality values. Improved manual.
 .TP
-.BR v2.0.5\~ "released September 9, 2016"
-Added options to output discarded sequences from subsampling to
-separate files. Updated manual.
+.BR v2.0.5\~ "released September 9th, 2016"
+Added options \-\-fastaout_discarded and \-\-fastqout_discarded to
+output discarded sequences from subsampling to separate files. Updated
+manual.
+.TP
+.BR v2.1.0\~ "released September 16th, 2016"
+New command: \-\-fastx_filter. New options: \-\-fastq_maxlen,
+\-\-fastq_truncee. Allow \-\-minwordmatches down to 3.
+.TP
+.BR v2.1.1\~ "released September 23rd, 2016"
+Fixed bugs in output to UC-files. Improved help text and manual.
+.TP
+.BR v2.1.2\~ "released September 28th, 2016"
+Fixed incorrect abundance output from fastx_filter and fastq_filter
+when relabelling.
+.TP
+.BR v2.2.0\~ "released October 7th, 2016"
+Added OTU table generation options \-\-biomout, \-\-mothur_shared_out
+and \-\-otutabout to the clustering and searching commands.
+.TP
+.BR v2.3.0\~ "released October 10th, 2016"
+Allowed zero-length sequences in FASTA and FASTQ files. Added
+\-\-fastq_trunclen_keep option. Fixed bug with output of OTU tables to
+pipes.
 .RE
 .LP
 .\" ============================================================================
diff --git a/src/Makefile.am b/src/Makefile.am
index 7732165..02417b1 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -34,6 +34,7 @@ md5.h \
 mergepairs.h \
 minheap.h \
 msa.h \
+otutable.h \
 rerep.h \
 results.h \
 search.h \
@@ -90,6 +91,7 @@ md5.c \
 mergepairs.cc \
 minheap.cc \
 msa.cc \
+otutable.cc \
 rerep.cc \
 results.cc \
 search.cc \
diff --git a/src/align_simd.cc b/src/align_simd.cc
index 498236a..6d71004 100644
--- a/src/align_simd.cc
+++ b/src/align_simd.cc
@@ -830,18 +830,34 @@ void search16(s16info_s * s,
         {
           unsigned int seqno = seqnos[cand_id];
           long length = db_getsequencelen(seqno);
-          pscores[cand_id] = 0;
-          paligned[cand_id] = 0;
+
+          paligned[cand_id] = length;
           pmatches[cand_id] = 0;
-          pmismatches[cand_id] = length;
+          pmismatches[cand_id] = 0;
           pgaps[cand_id] = length;
 
+          if (length == 0)
+            pscores[cand_id] = 0;
+          else
+            pscores[cand_id] =
+              MAX(- s->penalty_gap_open_target_left -
+                  length * s->penalty_gap_extension_target_left,
+                  - s->penalty_gap_open_target_right -
+                  length * s->penalty_gap_extension_target_right);
+
           char * cigar = 0;
-          int ret = asprintf(&cigar, "%ldI", length);
+          if (length > 0)
+            {
+              int ret = asprintf(&cigar, "%ldI", length);
+              if ((ret < 2) || !cigar)
+                fatal("Unable to allocate enough memory.");
+            }
+          else
+            {
+              cigar = (char *) xmalloc(1);
+              cigar[0] = 0;
+            }
           pcigar[cand_id] = cigar;
-
-          if ((ret < 2) || !cigar)
-            fatal("Unable to allocate enough memory.");
         }
       return;
     }
diff --git a/src/allpairs.cc b/src/allpairs.cc
index 97dd97b..808dc4b 100644
--- a/src/allpairs.cc
+++ b/src/allpairs.cc
@@ -160,7 +160,8 @@ void allpairs_output_results(int hit_count,
                                   query_head,
                                   qsequence,
                                   qseqlen,
-                                  qsequence_rc);
+                                  qsequence_rc,
+                                  hp->target);
               
           if (fp_userout)
             results_show_userout_one(fp_userout,
@@ -187,7 +188,8 @@ void allpairs_output_results(int hit_count,
                             query_head,
                             qsequence,
                             qseqlen,
-                            qsequence_rc);
+                            qsequence_rc,
+                            0);
       
       if (fp_userout)
         results_show_userout_one(fp_userout,
diff --git a/src/chimera.cc b/src/chimera.cc
index c750dd9..4e76fc3 100644
--- a/src/chimera.cc
+++ b/src/chimera.cc
@@ -157,7 +157,7 @@ static struct chimera_info_s * cia;
 
 void realloc_arrays(struct chimera_info_s * ci)
 {
-  int maxhlen = ci->query_head_len;
+  int maxhlen = MAX(ci->query_head_len,1);
   if (maxhlen > ci->head_alloc)
     {
       ci->head_alloc = maxhlen;
@@ -166,7 +166,7 @@ void realloc_arrays(struct chimera_info_s * ci)
 
   /* realloc arrays based on query length */
 
-  int maxqlen = ci->query_len;
+  int maxqlen = MAX(ci->query_len,1);
   if (maxqlen > ci->query_alloc)
     {
       ci->query_alloc = maxqlen;
diff --git a/src/cluster.cc b/src/cluster.cc
index 8d201f4..4ce9694 100644
--- a/src/cluster.cc
+++ b/src/cluster.cc
@@ -85,7 +85,10 @@ static FILE * fp_blast6out = 0;
 static FILE * fp_fastapairs = 0;
 static FILE * fp_matched = 0;
 static FILE * fp_notmatched = 0;
-  
+static FILE * fp_otutabout = 0;
+static FILE * fp_mothur_shared_out = 0;
+static FILE * fp_biomout = 0;
+
 static pthread_attr_t attr;
 
 static struct searchinfo_s * si_plus;
@@ -324,24 +327,57 @@ void cluster_query_exit(struct searchinfo_s * si)
     free(si->kmers);
 }
 
+char * relabel_otu(int clusterno, char * sequence, int seqlen)
+{ /* Build a replacement OTU label per the active relabel option; returns 0 if none is set. */
+  char * label = 0;
+  if (opt_relabel)
+    {
+      label = (char*) xmalloc(strlen(opt_relabel) + 21); /* prefix + up to 20 chars of number + NUL */
+      sprintf(label, "%s%d", opt_relabel, clusterno+1); /* appended number is clusterno + 1 */
+    }
+  else if (opt_relabel_sha1)
+    {
+      label = (char*) xmalloc(LEN_HEX_DIG_SHA1); /* presumably sized incl. terminating NUL - confirm */
+      get_hex_seq_digest_sha1(label, sequence, seqlen);
+    }
+  else if (opt_relabel_md5)
+    {
+      label = (char*) xmalloc(LEN_HEX_DIG_MD5); /* presumably sized incl. terminating NUL - confirm */
+      get_hex_seq_digest_md5(label, sequence, seqlen);
+    }
+  return label; /* xmalloc'ed; the callers in this file free() it after use */
+}
+
 void cluster_core_results_hit(struct hit * best,
                               int clusterno,
                               char * query_head,
                               int qseqlen,
                               char * qsequence,
-                              char * qsequence_rc)
+                              char * qsequence_rc,
+                              int qsize)
 {
-  if (opt_uc)
+
+  if (opt_otutabout || opt_mothur_shared_out || opt_biomout)
     {
-      fprintf(fp_uc, "H\t%d\t%d\t%.1f\t%c\t0\t0\t%s\t%s\t%s\n",
-              clusterno,
-              qseqlen,
-              best->id,
-              best->strand ? '-' : '+',
-              best->nwalignment,
-              query_head,
-              db_getheader(best->target));
+      if (opt_relabel || opt_relabel_sha1 || opt_relabel_md5)
+        {
+          char * label = relabel_otu(clusterno,
+                                     db_getsequence(best->target),
+                                     db_getsequencelen(best->target));
+          otutable_add(query_head, label, qsize);
+          free(label);
+        }
+      else
+        otutable_add(query_head,
+                     db_getheader(best->target),
+                     qsize);
     }
+
+  if (fp_uc)
+    results_show_uc_one(fp_uc,
+                        best, query_head,
+                        qsequence, qseqlen, qsequence_rc,
+                        clusterno);
   
   if (fp_alnout)
     results_show_alnout(fp_alnout,
@@ -377,8 +413,22 @@ void cluster_core_results_nohit(int clusterno,
                                 char * query_head,
                                 int qseqlen,
                                 char * qsequence,
-                                char * qsequence_rc)
+                                char * qsequence_rc,
+                                int qsize)
 {
+
+  if (opt_otutabout || opt_mothur_shared_out || opt_biomout)
+    {
+      if (opt_relabel || opt_relabel_sha1 || opt_relabel_md5)
+        {
+          char * label = relabel_otu(clusterno, qsequence, qseqlen);
+          otutable_add(query_head, label, qsize);
+          free(label);
+        }
+      else
+        otutable_add(query_head, query_head, qsize);
+    }
+
   if (opt_uc)
     {
       fprintf(fp_uc, "S\t%d\t%d\t*\t*\t*\t*\t*\t%s\t*\n",
@@ -763,7 +813,8 @@ void cluster_core_parallel()
                                        si_p->query_head,
                                        si_p->qseqlen,
                                        si_p->qsequence,
-                                       best->strand ? si_m->qsequence : 0);
+                                       best->strand ? si_m->qsequence : 0,
+                                       si_p->qsize);
 
               /* update cluster info about this sequence */
               clusterinfo[myseqno].seqno = myseqno;
@@ -793,7 +844,8 @@ void cluster_core_parallel()
                                          si_p->query_head,
                                          si_p->qseqlen,
                                          si_p->qsequence,
-                                         0);
+                                         0,
+                                         si_p->qsize);
               clusters++;
             }
           
@@ -887,7 +939,8 @@ void cluster_core_serial()
                                    si_p->query_head,
                                    si_p->qseqlen,
                                    si_p->qsequence,
-                                   best->strand ? si_m->qsequence : 0);
+                                   best->strand ? si_m->qsequence : 0,
+                                   si_p->qsize);
           clusterinfo[seqno].seqno = seqno;
           clusterinfo[seqno].clusterno = clusterinfo[target].clusterno;
           clusterinfo[seqno].cigar = best->nwalignment;
@@ -905,7 +958,8 @@ void cluster_core_serial()
                                      si_p->query_head,
                                      si_p->qseqlen,
                                      si_p->qsequence,
-                                     0);
+                                     0,
+                                     si_p->qsize);
           clusters++;
         }
       
@@ -999,8 +1053,31 @@ void cluster(char * dbname,
         fatal("Unable to open notmatched output file for writing");
     }
 
+  if (opt_otutabout)
+    {
+      fp_otutabout = fopen(opt_otutabout, "w");
+      if (! fp_otutabout)
+        fatal("Unable to open OTU table (text format) output file for writing");
+    }
+
+  if (opt_mothur_shared_out)
+    {
+      fp_mothur_shared_out = fopen(opt_mothur_shared_out, "w");
+      if (! fp_mothur_shared_out)
+        fatal("Unable to open OTU table (mothur format) output file for writing");
+    }
+
+  if (opt_biomout)
+    {
+      fp_biomout = fopen(opt_biomout, "w");
+      if (! fp_biomout)
+        fatal("Unable to open OTU table (biom 1.0 format) output file for writing");
+    }
+
   db_read(dbname, 0);
 
+  otutable_init();
+
   results_show_samheader(fp_samout, cmdline, dbname);
 
   if (opt_qmask == MASK_DUST)
@@ -1309,6 +1386,26 @@ void cluster(char * dbname,
 
   free(clusterinfo);
 
+  if (fp_biomout)
+    {
+      otutable_print_biomout(fp_biomout);
+      fclose(fp_biomout);
+    }
+
+  if (fp_otutabout)
+    {
+      otutable_print_otutabout(fp_otutabout);
+      fclose(fp_otutabout);
+    }
+
+  if (fp_mothur_shared_out)
+    {
+      otutable_print_mothur_shared_out(fp_mothur_shared_out);
+      fclose(fp_mothur_shared_out);
+    }
+
+  otutable_done();
+
   if (opt_matched)
     fclose(fp_matched);
   if (opt_notmatched)
diff --git a/src/derep.cc b/src/derep.cc
index 336d3d6..793631a 100644
--- a/src/derep.cc
+++ b/src/derep.cc
@@ -198,7 +198,8 @@ void derep_fulllength()
   /* alloc and init table of links to other sequences in cluster */
 
   unsigned int * nextseqtab = (unsigned int*) xmalloc(sizeof(unsigned int) * dbsequencecount);
-  memset(nextseqtab, 0, sizeof(unsigned int) * dbsequencecount);
+  const unsigned int terminal = (unsigned int)(-1);
+  memset(nextseqtab, -1, sizeof(unsigned int) * dbsequencecount);
 
   char * match_strand = (char *) xmalloc(dbsequencecount);
   memset(match_strand, 0, dbsequencecount);
@@ -418,8 +419,8 @@ void derep_fulllength()
           fprintf(fp_uc, "S\t%ld\t%ld\t*\t*\t*\t*\t*\t%s\t*\n",
                   i, len, h);
           
-          for (unsigned long next = nextseqtab[bp->seqno_first];
-               next;
+          for (unsigned int next = nextseqtab[bp->seqno_first];
+               next != terminal;
                next = nextseqtab[next])
             fprintf(fp_uc,
                     "H\t%ld\t%ld\t%.1f\t%s\t0\t0\t*\t%s\t%s\n",
@@ -519,7 +520,8 @@ void derep_prefix()
   /* alloc and init table of links to other sequences in cluster */
 
   unsigned int * nextseqtab = (unsigned int*) xmalloc(sizeof(unsigned int) * dbsequencecount);
-  memset(nextseqtab, 0, sizeof(unsigned int) * dbsequencecount);
+  const unsigned int terminal = (unsigned int)(-1);
+  memset(nextseqtab, -1, sizeof(unsigned int) * dbsequencecount);
 
   char * seq_up = (char*) xmalloc(db_getlongestsequence() + 1);
 
@@ -565,18 +567,19 @@ void derep_prefix()
       /* compute hashes of all prefixes */
 
       unsigned long fnv1a_hash = 14695981039346656037UL;
+      prefix_hashes[0] = fnv1a_hash;
       for(unsigned int j = 0; j < seqlen; j++)
         {
           fnv1a_hash ^= seq_up[j];
           fnv1a_hash *= 1099511628211UL;
-          prefix_hashes[j] = fnv1a_hash;
+          prefix_hashes[j+1] = fnv1a_hash;
         }
 
       /* first, look for an identical match */
 
       unsigned int prefix_len = seqlen;
 
-      unsigned long hash = prefix_hashes[prefix_len-1];
+      unsigned long hash = prefix_hashes[prefix_len];
       struct bucket * bp = hashtable + (hash & hash_mask);
       
       while ((bp->size) &&
@@ -611,9 +614,10 @@ void derep_prefix()
         {
           /* look for prefix match */
           
-          while((! bp->size) && (prefix_len-- >= len_shortest))
+          while((! bp->size) && (prefix_len > len_shortest))
             {
-              hash = prefix_hashes[prefix_len-1];
+              prefix_len--;
+              hash = prefix_hashes[prefix_len];
               bp = hashtable + (hash & hash_mask);
               
               while ((bp->size) &&
@@ -630,7 +634,7 @@ void derep_prefix()
                 }
             }
           
-          if ((bp->size) && (prefix_len >= len_shortest))
+          if (bp->size)
             {
               /* prefix match */
 
@@ -763,8 +767,8 @@ void derep_prefix()
           fprintf(fp_uc, "S\t%ld\t%ld\t*\t*\t*\t*\t*\t%s\t*\n",
                   i, len, h);
           
-          for (unsigned long next = nextseqtab[bp->seqno_first];
-               next;
+          for (unsigned int next = nextseqtab[bp->seqno_first];
+               next != terminal;
                next = nextseqtab[next])
             fprintf(fp_uc,
                     "H\t%ld\t%lu\t%.1f\t+\t0\t0\t*\t%s\t%s\n",
diff --git a/src/fasta.cc b/src/fasta.cc
index d618135..84b3a60 100644
--- a/src/fasta.cc
+++ b/src/fasta.cc
@@ -158,7 +158,9 @@ bool fasta_next(fastx_handle h,
   h->lineno_start = h->lineno;
 
   h->header_buffer.length = 0;
+  h->header_buffer.data[0] = 0;
   h->sequence_buffer.length = 0;
+  h->sequence_buffer.data[0] = 0;
 
   unsigned long rest = fastx_file_fill_buffer(h);
 
diff --git a/src/fastq.cc b/src/fastq.cc
index 43bd488..69bb3d7 100644
--- a/src/fastq.cc
+++ b/src/fastq.cc
@@ -258,8 +258,10 @@ bool fastq_next(fastx_handle h,
       rest -= len;
     }
 
+#if 0
   if (h->sequence_buffer.length == 0)
     fastq_fatal(lineno_seq, "Empty sequence line");
+#endif
 
   unsigned long lineno_plus = h->lineno;
 
@@ -363,8 +365,10 @@ bool fastq_next(fastx_handle h,
       rest -= len;
     }
 
+#if 0
   if (h->quality_buffer.length == 0)
     fastq_fatal(lineno_seq, "Empty quality line");
+#endif
 
   if (h->sequence_buffer.length != h->quality_buffer.length)
     fastq_fatal(lineno_qual,
diff --git a/src/fastqops.cc b/src/fastqops.cc
index fdf9950..93a81ac 100644
--- a/src/fastqops.cc
+++ b/src/fastqops.cc
@@ -80,11 +80,20 @@ int fastq_get_qual(char q)
   return qual;
 }
 
-void fastq_filter()
+void filter(bool fastq_only, char * filename)
 {
-  fastx_handle h = fastq_open(opt_fastq_filter);
+  fastx_handle h = fastx_open(filename);
 
-  unsigned long filesize = fastq_get_size(h);
+  if (!h)
+    fatal("Unrecognized file type (not proper FASTA or FASTQ format)");
+
+  if (fastq_only && ! h->is_fastq)
+    fatal("FASTA input files not allowed with fastq_filter, consider using fastx_filter command instead");
+
+  if ((opt_fastqout || opt_fastqout_discarded) && ! h->is_fastq)
+    fatal("Cannot write FASTQ output with a FASTA input file, lacking quality scores");
+
+  unsigned long filesize = fastx_get_size(h);
 
   FILE * fp_fastaout = 0;
   FILE * fp_fastqout = 0;
@@ -127,7 +136,7 @@ void fastq_filter()
       header = (char*) xmalloc(header_alloc);
     }
 
-  progress_init("Reading fastq file", filesize);
+  progress_init("Reading input file", filesize);
 
   long kept = 0;
   long discarded = 0;
@@ -136,13 +145,14 @@ void fastq_filter()
   char hex_md5[LEN_HEX_DIG_MD5];
   char hex_sha1[LEN_HEX_DIG_SHA1];
 
-  while(fastq_next(h, 0, chrmap_no_change))
+  while(fastx_next(h, 0, chrmap_no_change))
     {
-      long length = fastq_get_sequence_length(h);
-      char * d = fastq_get_header(h);
-      char * p = fastq_get_sequence(h);
-      char * q = fastq_get_quality(h);
-
+      long length = fastx_get_sequence_length(h);
+      char * d = fastx_get_header(h);
+      char * p = fastx_get_sequence(h);
+      char * q = fastx_get_quality(h);
+      long abundance = fastx_get_abundance(h);
+      
       /* strip initial part */
       if (opt_fastq_stripleft > 0)
         {
@@ -161,7 +171,7 @@ void fastq_filter()
         }
       
       /* truncate trailing part */
-      if (opt_fastq_trunclen > 0)
+      if (opt_fastq_trunclen >= 0)
         {
           if (length >= opt_fastq_trunclen)
             length = opt_fastq_trunclen;
@@ -169,15 +179,26 @@ void fastq_filter()
             length = 0;
         }
       
-      /* quality truncation */
-      for (long i = 0; i < length; i++)
-        {
-          int qual = fastq_get_qual(q[i]);
+      /* truncate trailing part, but keep if short */
+      if ((opt_fastq_trunclen_keep >= 0) && (length > opt_fastq_trunclen_keep))
+        length = opt_fastq_trunclen_keep;
 
-          if (qual <= opt_fastq_truncqual)
+      /* quality and ee truncation */
+      double ee = 0.0;
+      if (h->is_fastq)
+        {
+          for (long i = 0; i < length; i++)
             {
-              length = i;
-              break;
+              int qual = fastq_get_qual(q[i]);
+              ee += exp10(- qual / 10.0);
+              
+              if ((qual <= opt_fastq_truncqual) ||
+                  (ee > opt_fastq_truncee))
+                {
+                  ee -= exp10(- qual / 10.0);
+                  length = i;
+                  break;
+                }
             }
         }
 
@@ -190,29 +211,23 @@ void fastq_filter()
             ncount++;
         }
 
-      /* compute ee */
-      double ee = 0.0;
-      for (long i = 0; i < length; i++)
-        {
-          int qual = fastq_get_qual(q[i]);
-          ee += exp10(- qual / 10.0);
-        }
-
       if ((length >= opt_fastq_minlen) &&
-          ((opt_fastq_trunclen == 0) || (length >= opt_fastq_trunclen)) &&
+          (length <= opt_fastq_maxlen) &&
+          ((opt_fastq_trunclen < 0) || (length >= opt_fastq_trunclen)) &&
+          (ncount <= opt_fastq_maxns) &&
           (ee <= opt_fastq_maxee) &&
-          (ee / length <= opt_fastq_maxee_rate) &&
-          (ncount <= opt_fastq_maxns))
+          ((length == 0) || (ee / length <= opt_fastq_maxee_rate)))
         {
           /* keep the sequence */
 
           kept++;
 
-          if ((unsigned long)(length) < fastq_get_sequence_length(h))
+          if ((unsigned long)(length) < fastx_get_sequence_length(h))
             {
               truncated++;
               p[length] = 0;
-              q[length] = 0;
+              if (h->is_fastq)
+                q[length] = 0;
             }
 
           if (opt_fastaout)
@@ -220,14 +235,14 @@ void fastq_filter()
               if (opt_eeout || opt_fastq_eeout)
                 fasta_print_relabel_ee(fp_fastaout,
                                        p, length,
-                                       d, fastq_get_header_length(h),
-                                       1, kept,
+                                       d, fastx_get_header_length(h),
+                                       abundance, kept,
                                        ee);
               else
                 fasta_print_relabel(fp_fastaout,
                                     p, length,
-                                    d, fastq_get_header_length(h),
-                                    1, kept);
+                                    d, fastx_get_header_length(h),
+                                    abundance, kept);
             }
           if (opt_fastqout)
             {
@@ -260,22 +275,22 @@ void fastq_filter()
 
           discarded++;
 
-          p = fastq_get_sequence(h);
-          q = fastq_get_quality(h);
+          p = fastx_get_sequence(h);
+          q = fastx_get_quality(h);
 
           if (opt_fastaout_discarded)
             {
               if (opt_eeout || opt_fastq_eeout)
                 fasta_print_relabel_ee(fp_fastaout_discarded,
                                        p, length,
-                                       d, fastq_get_header_length(h),
-                                       1, discarded,
+                                       d, fastx_get_header_length(h),
+                                       abundance, discarded,
                                        ee);
               else
                 fasta_print_relabel(fp_fastaout_discarded,
                                     p, length,
-                                    d, fastq_get_header_length(h),
-                                    1, discarded);
+                                    d, fastx_get_header_length(h),
+                                    abundance, discarded);
             }
 
           if (opt_fastqout_discarded)
@@ -303,7 +318,7 @@ void fastq_filter()
             }
         }
 
-      progress_update(fastq_get_position(h));
+      progress_update(fastx_get_position(h));
     }
   progress_done();
 
@@ -328,7 +343,17 @@ void fastq_filter()
   if (opt_fastqout_discarded)
     fclose(fp_fastqout_discarded);
 
-  fastq_close(h);
+  fastx_close(h);
+}
+
+void fastq_filter()
+{ /* fastq_filter command: run filter() with fastq_only set, so FASTA input is a fatal error */
+  filter(1, opt_fastq_filter);
+}
+
+void fastx_filter()
+{ /* fastx_filter command: run filter() with fastq_only cleared, so FASTA input is accepted too */
+  filter(0, opt_fastx_filter);
 }
 
 void fastq_chars()
@@ -624,7 +649,7 @@ void fastq_stats()
             {
               char * msg;
               if (asprintf(& msg,
-"FASTQ quality value (%d) out of range (%d-%d).\n"
+"FASTQ quality value (%d) out of range (%ld-%ld).\n"
 "Please adjust the FASTQ quality base character or range with the\n"
 "--fastq_ascii, --fastq_qmin or --fastq_qmax options. For a complete\n"
 "diagnosis with suggested values, please run vsearch --fastq_chars file.",
diff --git a/src/fastqops.h b/src/fastqops.h
index 525155f..8936c0d 100644
--- a/src/fastqops.h
+++ b/src/fastqops.h
@@ -59,7 +59,8 @@
 */
 
 void fastq_chars();
+void fastq_convert();
 void fastq_filter();
 void fastq_stats();
+void fastx_filter();
 void fastx_revcomp();
-void fastq_convert();
diff --git a/src/linmemalign.cc b/src/linmemalign.cc
index 2711ef0..146227c 100644
--- a/src/linmemalign.cc
+++ b/src/linmemalign.cc
@@ -152,6 +152,19 @@ void LinearMemoryAligner::alloc_vectors(size_t x)
     }
 }
 
+void LinearMemoryAligner::cigar_reset()
+{ /* Reset the CIGAR builder to an empty string; align() calls this before each alignment. */
+  if (cigar_alloc < 1)
+    {
+      cigar_alloc = 64; /* no buffer recorded yet: allocate a small initial one */
+      cigar_string = (char*) xrealloc(cigar_string, cigar_alloc);
+    }
+  cigar_string[0] = 0; /* empty, NUL-terminated string */
+  cigar_length = 0;
+  op = 0; /* no pending operation ... */
+  op_run = 0; /* ... and no pending run length */
+}
+
 void LinearMemoryAligner::cigar_flush()
 {
   if (op_run > 0)
@@ -604,9 +617,7 @@ char * LinearMemoryAligner::align(char * _a_seq,
   b_seq = _b_seq;
 
   /* init cigar operations */
-  op = 0;
-  op_run = 0;
-  cigar_length = 0;
+  cigar_reset();
 
   /* allocate enough memory for vectors */
   alloc_vectors(b_len+1);
diff --git a/src/linmemalign.h b/src/linmemalign.h
index 753b0ad..438f118 100644
--- a/src/linmemalign.h
+++ b/src/linmemalign.h
@@ -95,6 +95,8 @@ class LinearMemoryAligner
   long * XX;
   long * YY;
   
+  void cigar_reset();
+
   void cigar_flush();
 
   void cigar_add(char _op, long run);
diff --git a/src/mergepairs.cc b/src/mergepairs.cc
index 1d5fbd0..6fa1973 100644
--- a/src/mergepairs.cc
+++ b/src/mergepairs.cc
@@ -508,7 +508,9 @@ void process(merge_data_t * ip)
   /* check length */
 
   if ((ip->fwd_length < opt_fastq_minlen) ||
-      (ip->rev_length < opt_fastq_minlen))
+      (ip->rev_length < opt_fastq_minlen) ||
+      (ip->fwd_length > opt_fastq_maxlen) ||
+      (ip->rev_length > opt_fastq_maxlen))
     skip = 1;
 
   /* truncate sequences by quality */
diff --git a/src/otutable.cc b/src/otutable.cc
new file mode 100644
index 0000000..5075a0c
--- /dev/null
+++ b/src/otutable.cc
@@ -0,0 +1,405 @@
+/*
+
+  VSEARCH: a versatile open source tool for metagenomics
+
+  Copyright (C) 2014-2015, Torbjorn Rognes, Frederic Mahe and Tomas Flouri
+  All rights reserved.
+
+  Contact: Torbjorn Rognes <torognes at ifi.uio.no>,
+  Department of Informatics, University of Oslo,
+  PO Box 1080 Blindern, NO-0316 Oslo, Norway
+
+  This software is dual-licensed and available under a choice
+  of one of two licenses, either under the terms of the GNU
+  General Public License version 3 or the BSD 2-Clause License.
+
+
+  GNU General Public License version 3
+
+  This program is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+
+  The BSD 2-Clause License
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+  1. Redistributions of source code must retain the above copyright
+  notice, this list of conditions and the following disclaimer.
+
+  2. Redistributions in binary form must reproduce the above copyright
+  notice, this list of conditions and the following disclaimer in the
+  documentation and/or other materials provided with the distribution.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+  FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+  COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+  LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+  ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+  POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "vsearch.h"
+
+#include <string>
+#include <set>
+#include <map>
+
+
+/*
+
+  Identify sample and otu identifiers in headers, and count
+  abundance of the samples in different OTUs.
+
+  http://www.drive5.com/usearch/manual/upp_labels_sample.html
+  http://www.drive5.com/usearch/manual/upp_labels_otus.html
+
+  TODO:
+  - add relabel @
+
+*/
+
+typedef std::set<std::string> string_set_t;
+typedef std::pair<std::string, std::string> string_pair_t;
+typedef std::map<string_pair_t, unsigned long> string_pair_map_t; /* (name, name) -> count */
+typedef std::map<std::string, std::string> otu_tax_map_t; /* OTU name -> taxonomy string */
+typedef std::map<std::string, unsigned long> string_no_map_t; /* name -> index number */
+
+struct otutable_s /* everything accumulated for the OTU table outputs */
+{
+  regex_t regex_sample; /* compiled matcher for sample= / barcodelabel= annotations */
+  regex_t regex_otu; /* compiled matcher for otu= annotations */
+  regex_t regex_tax; /* compiled matcher for tax= annotations */
+  
+  string_set_t otu_set; /* distinct OTU names seen */
+  string_set_t sample_set; /* distinct sample names seen */
+  string_pair_map_t sample_otu_count; /* (sample, OTU) -> summed abundance */
+  string_pair_map_t otu_sample_count; /* (OTU, sample) -> summed abundance */
+  otu_tax_map_t otu_tax_map; /* OTU name -> taxonomy annotation, when one was found */
+};
+
+static otutable_s otutable; /* single file-local instance used by all otutable_* functions */
+
+void otutable_init()
+{
+  /* compile the three header-annotation matchers; any failure is fatal */
+  
+  if (regcomp(&otutable.regex_sample,
+              "(^|;)(sample|barcodelabel)=([^;]*)($|;)", /* value is group 3 */
+              REG_EXTENDED))
+    fatal("Compilation of regular expression for sample annotation failed");
+  
+  if (regcomp(&otutable.regex_otu,
+              "(^|;)otu=([^;]*)($|;)", /* value is group 2 */
+              REG_EXTENDED))
+    fatal("Compilation of regular expression for otu annotation failed");
+  
+  if (regcomp(&otutable.regex_tax,
+              "(^|;)tax=([^;]*)($|;)", /* value is group 2 */
+              REG_EXTENDED))
+    fatal("Compilation of regular expression for taxonomy annotation failed");
+}
+
+void otutable_done()
+{ /* Tear down: free the compiled regexes and drop ALL accumulated table state. */
+  regfree(&otutable.regex_sample);
+  regfree(&otutable.regex_otu);
+  regfree(&otutable.regex_tax);
+  otutable.otu_set.clear();
+  otutable.sample_set.clear();
+  otutable.sample_otu_count.clear();
+  otutable.otu_sample_count.clear();
+  otutable.otu_tax_map.clear(); /* was left populated; stale entries would add a taxonomy column later */
+}
+
+void otutable_add(char * query_header, char * target_header, long abundance)
+{ /* Add `abundance` to the count for (sample from query header, OTU from target header). */
+  /* read sample annotation in query */
+
+  regmatch_t pmatch_sample[5]; /* whole match + the 4 groups of regex_sample */
+  int len_sample;
+  char * start_sample = query_header;
+  if (!regexec(&otutable.regex_sample, query_header, 5, pmatch_sample, 0))
+    {
+      /* match: use the matching sample name */
+      len_sample = pmatch_sample[3].rm_eo - pmatch_sample[3].rm_so; /* group 3 = annotation value */
+      start_sample += pmatch_sample[3].rm_so;
+    }
+  else
+    {
+      /* no match: use first name in header with A-Za-z0-9_ */
+      len_sample = strspn(query_header,
+                          "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+                          "abcdefghijklmnopqrstuvwxyz"
+                          "_"
+                          "0123456789");
+    }
+  char * sample_name = (char *) xmalloc(len_sample+1);
+  strncpy(sample_name, start_sample, len_sample);
+  sample_name[len_sample] = 0; /* strncpy does not terminate when it copies exactly len_sample chars */
+
+
+  /* read OTU annotation in target */
+
+  regmatch_t pmatch_otu[4]; /* whole match + the 3 groups of regex_otu */
+  int len_otu;
+  char * start_otu = target_header;
+  if (!regexec(&otutable.regex_otu, target_header, 4, pmatch_otu, 0))
+    {
+      /* match: use the matching otu name */
+      len_otu = pmatch_otu[2].rm_eo - pmatch_otu[2].rm_so; /* group 2 = annotation value */
+      start_otu += pmatch_otu[2].rm_so;
+    }
+  else
+    {
+      /* no match: use first name in header up to ; */
+      len_otu = strcspn(target_header, ";");
+    }
+  char * otu_name = (char *) xmalloc(len_otu+1);
+  strncpy(otu_name, start_otu, len_otu);
+  otu_name[len_otu] = 0;
+  
+
+  /* read tax annotation in target */
+
+  regmatch_t pmatch_tax[4]; /* whole match + the 3 groups of regex_tax */
+  int len_tax;
+  char * start_tax = target_header;
+
+  if (!regexec(&otutable.regex_tax, target_header, 4, pmatch_tax, 0))
+    {
+      /* match: use the matching tax name */
+      len_tax = pmatch_tax[2].rm_eo - pmatch_tax[2].rm_so;
+      start_tax += pmatch_tax[2].rm_so;
+
+      char * tax_name = (char *) xmalloc(len_tax+1);
+      strncpy(tax_name, start_tax, len_tax);
+      tax_name[len_tax] = 0;
+      otutable.otu_tax_map[otu_name] = tax_name; /* overwrites any taxonomy stored earlier for this OTU */
+      free(tax_name);
+    }
+
+  /* store data */
+
+  otutable.sample_set.insert(sample_name);
+  otutable.otu_set.insert(otu_name);
+  otutable.sample_otu_count[string_pair_t(sample_name,otu_name)]
+    += abundance; /* operator[] creates the entry as 0 on first use */
+  otutable.otu_sample_count[string_pair_t(otu_name,sample_name)]
+    += abundance; /* same count, keyed the other way round for OTU-major output */
+
+  free(otu_name); /* the containers hold their own std::string copies */
+  free(sample_name);
+}
+
+void otutable_print_otutabout(FILE * fp)
+{ /* Write the table as tab-separated text: one row per OTU, one column per sample. */
+  long progress = 0;
+  progress_init("Writing OTU table (classic)", otutable.otu_set.size());
+
+  fprintf(fp, "#OTU ID"); /* header row: sample names, then an optional taxonomy column */
+  for (string_set_t::iterator it_sample = otutable.sample_set.begin();
+       it_sample != otutable.sample_set.end();
+       it_sample++)
+    fprintf(fp, "\t%s", it_sample->c_str());
+  if (! otutable.otu_tax_map.empty())
+    fprintf(fp, "\ttaxonomy");
+  fprintf(fp, "\n");
+
+  string_pair_map_t::iterator it_map = otutable.otu_sample_count.begin(); /* (OTU, sample) keys, in sorted order */
+  for (string_set_t::iterator it_otu = otutable.otu_set.begin();
+       it_otu != otutable.otu_set.end();
+       it_otu++)
+    {
+      fprintf(fp, "%s", it_otu->c_str());
+
+      for (string_set_t::iterator it_sample = otutable.sample_set.begin();
+           it_sample != otutable.sample_set.end();
+           it_sample++)
+        { /* the sets and the map iterate in the same order, so one forward pass suffices */
+          unsigned long a = 0; /* stays 0 when this (OTU, sample) pair has no entry */
+          if ((it_map != otutable.otu_sample_count.end()) &&
+              (it_map->first.first == *it_otu) &&
+              (it_map->first.second == *it_sample))
+            {
+              a = it_map->second;
+              it_map++;
+            }
+          fprintf(fp, "\t%lu", a); /* a is unsigned long: %lu, not %ld */
+        }
+      if (! otutable.otu_tax_map.empty())
+        {
+          fprintf(fp, "\t"); /* keep the column even when this OTU has no taxonomy */
+          otu_tax_map_t::iterator it
+            = otutable.otu_tax_map.find(*it_otu);
+          if (it != otutable.otu_tax_map.end())
+            fprintf(fp, "%s", it->second.c_str());
+        }
+      fprintf(fp, "\n");
+      progress_update(++progress);
+    }
+  progress_done();
+}
+
+void otutable_print_mothur_shared_out(FILE * fp)
+{ /* Write the table in mothur "shared" layout: one row per sample, one column per OTU. */
+  long progress = 0;
+  progress_init("Writing OTU table (mothur)", otutable.sample_set.size());
+
+  fprintf(fp, "label\tGroup\tnumOtus"); /* header row, followed by every OTU name */
+  long numotus = 0;
+  for (string_set_t::iterator it_otu = otutable.otu_set.begin();
+       it_otu != otutable.otu_set.end();
+       it_otu++)
+    {
+      const char * otu_name = it_otu->c_str();
+      fprintf(fp, "\t%s", otu_name);
+      numotus++;
+    }
+  fprintf(fp, "\n");
+
+  string_pair_map_t::iterator it_map = otutable.sample_otu_count.begin(); /* (sample, OTU) keys, in sorted order */
+
+  for (string_set_t::iterator it_sample = otutable.sample_set.begin();
+       it_sample != otutable.sample_set.end();
+       it_sample++)
+    {
+      fprintf(fp, "vsearch\t%s\t%ld", it_sample->c_str(), numotus);
+      /* the sets and the map iterate in the same order, so one forward pass suffices */
+      for (string_set_t::iterator it_otu = otutable.otu_set.begin();
+           it_otu != otutable.otu_set.end();
+           it_otu++)
+        {
+          unsigned long a = 0; /* stays 0 when this (sample, OTU) pair has no entry */
+          if ((it_map != otutable.sample_otu_count.end()) &&
+              (it_map->first.first == *it_sample) &&
+              (it_map->first.second == *it_otu))
+            {
+              a = it_map->second;
+              it_map++;
+            }
+          fprintf(fp, "\t%lu", a); /* a is unsigned long: %lu, not %ld */
+        }
+
+      fprintf(fp, "\n");
+      progress_update(++progress);
+    }
+  progress_done();
+}
+
+void otutable_print_biomout(FILE * fp)
+{ /* Write the table as a sparse biom 1.0 JSON document: rows are OTUs, columns are samples. */
+  long progress = 0;
+  progress_init("Writing OTU table (biom 1.0)", otutable.otu_sample_count.size());
+
+  long rows = otutable.otu_set.size();
+  long columns = otutable.sample_set.size();
+
+  static time_t time_now = time(0); /* static: initialised once, on the first call */
+  struct tm tm_now;
+  localtime_r(& time_now, & tm_now);
+  char date[50];
+  strftime(date, 50, "%Y-%m-%dT%H:%M:%S", & tm_now);
+  /* NOTE(review): file name, OTU/sample names and taxonomy are written verbatim, without JSON escaping */
+  fprintf(fp,
+          "{\n"
+          "\t\"id\":\"%s\",\n"
+          "\t\"format\": \"Biological Observation Matrix 1.0\",\n"
+          "\t\"format_url\": \"http://biom-format.org/documentation/format_versions/biom-1.0.html\",\n"
+          "\t\"type\": \"OTU table\",\n"
+          "\t\"generated_by\": \"%s %s\",\n"
+          "\t\"date\": \"%s\",\n"
+          "\t\"matrix_type\": \"sparse\",\n"
+          "\t\"matrix_element_type\": \"int\",\n"
+          "\t\"shape\": [%ld,%ld],\n",
+          opt_biomout,
+          PROG_NAME, PROG_VERSION,
+          date,
+          rows,
+          columns);
+  
+  string_no_map_t otu_no_map; /* OTU name -> row index used in "data" */
+  unsigned long otu_no = 0;
+
+  fprintf(fp, "\t\"rows\":[");
+  for (string_set_t::iterator it_otu = otutable.otu_set.begin();
+       it_otu != otutable.otu_set.end();
+       it_otu++)
+    {
+      if (it_otu != otutable.otu_set.begin())
+        fprintf(fp, ","); /* separator before every element but the first */
+      const char * otu_name = it_otu->c_str();
+      fprintf(fp, "\n\t\t{\"id\":\"%s\", \"metadata\":", otu_name);
+      if (otutable.otu_tax_map.empty())
+        fprintf(fp, "null");
+      else
+        {
+          fprintf(fp, "{\"taxonomy\":\"");
+          otu_tax_map_t::iterator it
+            = otutable.otu_tax_map.find(otu_name);
+          if (it != otutable.otu_tax_map.end())
+            fprintf(fp, "%s", it->second.c_str());
+          fprintf(fp, "\"}");
+        }
+      fprintf(fp, "}");
+      otu_no_map[*it_otu] = otu_no++;
+    }
+  fprintf(fp, "\n");
+  fprintf(fp, "\t],\n");
+
+  string_no_map_t sample_no_map; /* sample name -> column index used in "data" */
+  unsigned long sample_no = 0;
+
+  fprintf(fp, "\t\"columns\":[");
+  for (string_set_t::iterator it_sample = otutable.sample_set.begin();
+       it_sample != otutable.sample_set.end();
+       it_sample++)
+    {
+      if (it_sample != otutable.sample_set.begin())
+        fprintf(fp, ",");
+      fprintf(fp, "\n\t\t{\"id\":\"%s\", \"metadata\":null}", it_sample->c_str());
+      sample_no_map[*it_sample] = sample_no++;
+    }
+  fprintf(fp, "\n\t],\n");
+
+  bool first = true;
+  fprintf(fp, "\t\"data\": ["); /* sparse triples: [row, column, count] */
+
+  for (string_pair_map_t::iterator it_map = otutable.otu_sample_count.begin();
+       it_map != otutable.otu_sample_count.end();
+       it_map++)
+    {
+      if (!first)
+        fprintf(fp, ",");
+
+      otu_no = otu_no_map[it_map->first.first];
+      sample_no = sample_no_map[it_map->first.second];
+
+      fprintf(fp, "\n\t\t[%lu,%lu,%lu]", otu_no, sample_no, it_map->second); /* all three are unsigned long */
+      first = false;
+      progress_update(++progress);
+    }
+  fprintf(fp, "\n\t]\n");
+
+  fprintf(fp, "}\n");
+  progress_done();
+}
diff --git a/src/fastqops.h b/src/otutable.h
similarity index 88%
copy from src/fastqops.h
copy to src/otutable.h
index 525155f..4a8e866 100644
--- a/src/fastqops.h
+++ b/src/otutable.h
@@ -2,7 +2,7 @@
 
   VSEARCH: a versatile open source tool for metagenomics
 
-  Copyright (C) 2014-2015, Torbjorn Rognes, Frederic Mahe and Tomas Flouri
+  Copyright (C) 2014-2016, Torbjorn Rognes, Frederic Mahe and Tomas Flouri
   All rights reserved.
 
   Contact: Torbjorn Rognes <torognes at ifi.uio.no>,
@@ -58,8 +58,9 @@
 
 */
 
-void fastq_chars();
-void fastq_filter();
-void fastq_stats();
-void fastx_revcomp();
-void fastq_convert();
+void otutable_init();
+void otutable_done();
+void otutable_add(char * query_header, char * target_header, long abundance);
+void otutable_print_otutabout(FILE * fp);
+void otutable_print_mothur_shared_out(FILE * fp);
+void otutable_print_biomout(FILE * fp);
diff --git a/src/results.cc b/src/results.cc
index 342c2a4..82b1c79 100644
--- a/src/results.cc
+++ b/src/results.cc
@@ -167,7 +167,8 @@ void results_show_uc_one(FILE * fp,
                          char * query_head,
                          char * qsequence,
                          long qseqlen,
-                         char * rc)
+                         char * rc,
+                         int clusterno)
 {
   /*
     http://www.drive5.com/usearch/manual/ucout.html
@@ -180,7 +181,7 @@ void results_show_uc_one(FILE * fp,
     strand: + or -
     0
     0
-    compressed alignment, e.g. 9I92M14D
+    compressed alignment, e.g. 9I92M14D, or "=" if perfect alignment
     query label
     target label
   */
@@ -485,9 +486,13 @@ void results_show_alnout(FILE * fp,
                   0.0);
 
 #if 0
-          fprintf(fp, "%d kmers, %d score, %d gap opens. %s\n",
+          fprintf(fp, "%d kmers, %d score, %d gap opens. %s %s %d %d %d %d %d\n",
                   hp->count, hp->nwscore, hp->nwgaps,
-                  hp->accepted ? "accepted" : "not accepted");
+                  hp->accepted ? "accepted" : "not accepted",
+                  hp->nwalignment, hp->nwalignmentlength,
+                  hp->trim_q_left, hp->trim_q_right,
+                  hp->trim_t_left, hp->trim_t_right
+                  );
 #endif
         }
     }
diff --git a/src/results.h b/src/results.h
index 7eaa764..48c1cfd 100644
--- a/src/results.h
+++ b/src/results.h
@@ -78,7 +78,8 @@ void results_show_uc_one(FILE * fp,
                          char * query_head,
                          char * qsequence,
                          long qseqlen,
-                         char * rc);
+                         char * rc,
+                         int clusterno);
 
 void results_show_userout_one(FILE * fp,
                               struct hit * hp,
diff --git a/src/search.cc b/src/search.cc
index 8b5dcee..fef3586 100644
--- a/src/search.cc
+++ b/src/search.cc
@@ -86,13 +86,17 @@ static FILE * fp_matched = 0;
 static FILE * fp_notmatched = 0;
 static FILE * fp_dbmatched = 0;
 static FILE * fp_dbnotmatched = 0;
+static FILE * fp_otutabout = 0;
+static FILE * fp_mothur_shared_out = 0;
+static FILE * fp_biomout = 0;
 
 void search_output_results(int hit_count,
                            struct hit * hits,
                            char * query_head,
                            int qseqlen,
                            char * qsequence,
-                           char * qsequence_rc)
+                           char * qsequence_rc,
+                           int qsize)
 {
   pthread_mutex_lock(&mutex_output);
 
@@ -120,7 +124,12 @@ void search_output_results(int hit_count,
   if (toreport)
     {
       double top_hit_id = hits[0].id;
-      
+
+      if (opt_otutabout || opt_mothur_shared_out || opt_biomout)
+        otutable_add(query_head,
+                     db_getheader(hits[0].target),
+                     qsize);
+
       for(int t = 0; t < toreport; t++)
         {
           struct hit * hp = hits + t;
@@ -143,7 +152,8 @@ void search_output_results(int hit_count,
                                   query_head,
                                   qsequence,
                                   qseqlen,
-                                  qsequence_rc);
+                                  qsequence_rc,
+                                  hp->target);
               
           if (fp_userout)
             results_show_userout_one(fp_userout,
@@ -170,7 +180,8 @@ void search_output_results(int hit_count,
                             query_head,
                             qsequence,
                             qseqlen,
-                            qsequence_rc);
+                            qsequence_rc,
+                            0);
       
       if (fp_userout)
         results_show_userout_one(fp_userout,
@@ -247,7 +258,8 @@ int search_query(long t)
                         si_plus[t].query_head,
                         si_plus[t].qseqlen,
                         si_plus[t].qsequence,
-                        opt_strand > 1 ? si_minus[t].qsequence : 0);
+                        opt_strand > 1 ? si_minus[t].qsequence : 0,
+                        si_plus[t].qsize);
 
   /* free memory for alignment strings */
   for(int i=0; i<hit_count; i++)
@@ -502,6 +514,27 @@ void search_prep(char * cmdline, char * progheader)
         fatal("Unable to open notmatched output file for writing");
     }
 
+  if (opt_otutabout)
+    {
+      fp_otutabout = fopen(opt_otutabout, "w");
+      if (! fp_otutabout)
+        fatal("Unable to open OTU table (text format) output file for writing");
+    }
+
+  if (opt_mothur_shared_out)
+    {
+      fp_mothur_shared_out = fopen(opt_mothur_shared_out, "w");
+      if (! fp_mothur_shared_out)
+        fatal("Unable to open OTU table (mothur format) output file for writing");
+    }
+
+  if (opt_biomout)
+    {
+      fp_biomout = fopen(opt_biomout, "w");
+      if (! fp_biomout)
+        fatal("Unable to open OTU table (biom 1.0 format) output file for writing");
+    }
+
   db_read(opt_db, 0);
 
   results_show_samheader(fp_samout, cmdline, opt_db);
@@ -535,8 +568,10 @@ void search_prep(char * cmdline, char * progheader)
 void search_done()
 {
   /* clean up, global */
+
   dbindex_free();
   db_free();
+
   if (opt_matched)
     fclose(fp_matched);
   if (opt_notmatched)
@@ -577,6 +612,8 @@ void usearch_global(char * cmdline, char * progheader)
   dbmatched = (int*) xmalloc(seqcount * sizeof(int*));
   memset(dbmatched, 0, seqcount * sizeof(int*));
 
+  otutable_init();
+
   /* prepare reading of queries */
   qmatches = 0;
   queries = 0;
@@ -619,6 +656,26 @@ void usearch_global(char * cmdline, char * progheader)
     fprintf(fp_log, "Matching query sequences: %d of %d (%.2f%%)\n", 
             qmatches, queries, 100.0 * qmatches / queries);
 
+  if (opt_biomout)
+    {
+      otutable_print_biomout(fp_biomout);
+      fclose(fp_biomout);
+    }
+
+  if (opt_otutabout)
+    {
+      otutable_print_otutabout(fp_otutabout);
+      fclose(fp_otutabout);
+    }
+
+  if (opt_mothur_shared_out)
+    {
+      otutable_print_mothur_shared_out(fp_mothur_shared_out);
+      fclose(fp_mothur_shared_out);
+    }
+
+  otutable_done();
+
   if (opt_dbmatched || opt_dbnotmatched)
     {
       for(long i=0; i<seqcount; i++)
diff --git a/src/searchcore.cc b/src/searchcore.cc
index c4f3e23..2500db1 100644
--- a/src/searchcore.cc
+++ b/src/searchcore.cc
@@ -266,37 +266,51 @@ void align_trim(struct hit * hit)
   /* left trim alignment */
   
   char * p = hit->nwalignment;
-  long run = 1;
-  int scanlength = 0;
-  sscanf(p, "%ld%n", &run, &scanlength);
-  char op = *(p+scanlength);
-  if (op != 'M')
+  char op;
+  long run;
+  if (*p)
     {
-      hit->trim_aln_left = 1 + scanlength;
-      if (op == 'D')
-        hit->trim_q_left = run;
-      else
-        hit->trim_t_left = run;
+      run = 1;
+      int scanlength = 0;
+      sscanf(p, "%ld%n", &run, &scanlength);
+      op = *(p+scanlength);
+      if (op != 'M')
+        {
+          hit->trim_aln_left = 1 + scanlength;
+          if (op == 'D')
+            hit->trim_q_left = run;
+          else
+            hit->trim_t_left = run;
+        }
     }
   
   /* right trim alignment */
   
   char * e = hit->nwalignment + strlen(hit->nwalignment);
-  p = e - 1;
-  op = *p;
-  if (op != 'M')
+  if (e > hit->nwalignment)
     {
-      while (*(p-1) <= '9')
-        p--;
-      run = 1;
-      sscanf(p, "%ld", &run);
-      hit->trim_aln_right = e - p;
-      if (op == 'D')
-        hit->trim_q_right = run;
-      else
-        hit->trim_t_right = run;
+      p = e - 1;
+      op = *p;
+      if (op != 'M')
+        {
+          while ((p > hit->nwalignment) && (*(p-1) <= '9'))
+            p--;
+          run = 1;
+          sscanf(p, "%ld", &run);
+          hit->trim_aln_right = e - p;
+          if (op == 'D')
+            hit->trim_q_right = run;
+          else
+            hit->trim_t_right = run;
+        }
     }
-  
+
+  if (hit->trim_q_left >= hit->nwalignmentlength)
+    hit->trim_q_right = 0;
+
+  if (hit->trim_t_left >= hit->nwalignmentlength)
+    hit->trim_t_right = 0;
+
   hit->internal_alignmentlength = hit->nwalignmentlength
     - hit->trim_q_left - hit->trim_t_left
     - hit->trim_q_right - hit->trim_t_right;
@@ -306,9 +320,9 @@ void align_trim(struct hit * hit)
     - hit->trim_q_right - hit->trim_t_right;
 
   hit->internal_gaps = hit->nwgaps
-    - (hit->trim_q_left  + hit->trim_t_left  > 0 ? 1 : 0)
-    - (hit->trim_q_right + hit->trim_t_right > 0 ? 1 : 0);
-  
+    - ((hit->trim_q_left  + hit->trim_t_left)  > 0 ? 1 : 0)
+    - ((hit->trim_q_right + hit->trim_t_right) > 0 ? 1 : 0);
+
   /* CD-HIT */
   hit->id0 = hit->shortest > 0 ? 100.0 * hit->matches / hit->shortest : 0.0;
   /* all diffs */
diff --git a/src/searchcore.h b/src/searchcore.h
index a8e0863..82a405a 100644
--- a/src/searchcore.h
+++ b/src/searchcore.h
@@ -63,9 +63,9 @@
 /* the number of alignments that can be delayed */
 #define MAXDELAYED 8
 
-/* Default minimum number of word matches for word lengths 7-15 */
+/* Default minimum number of word matches for word lengths 3-15 */
 const int minwordmatches_defaults[] =
-  { -1, -1, -1, -1, -1, -1, -1, 14, 12, 11, 10,  9,  8,  7,  5,  3 };
+  { -1, -1, -1, 18, 17, 16, 15, 14, 12, 11, 10,  9,  8,  7,  5,  3 };
 
 struct hit
 {
diff --git a/src/searchexact.cc b/src/searchexact.cc
index 739a46d..09f7a18 100644
--- a/src/searchexact.cc
+++ b/src/searchexact.cc
@@ -86,6 +86,9 @@ static FILE * fp_matched = 0;
 static FILE * fp_notmatched = 0;
 static FILE * fp_dbmatched = 0;
 static FILE * fp_dbnotmatched = 0;
+static FILE * fp_otutabout = 0;
+static FILE * fp_mothur_shared_out = 0;
+static FILE * fp_biomout = 0;
 
 void add_hit(struct searchinfo_s * si, unsigned long seqno)
 {
@@ -162,11 +165,12 @@ void search_exact_onequery(struct searchinfo_s * si)
 }
 
 void search_exact_output_results(int hit_count,
-                           struct hit * hits,
-                           char * query_head,
-                           int qseqlen,
-                           char * qsequence,
-                           char * qsequence_rc)
+                                 struct hit * hits,
+                                 char * query_head,
+                                 int qseqlen,
+                                 char * qsequence,
+                                 char * qsequence_rc,
+                                 int qsize)
 {
   pthread_mutex_lock(&mutex_output);
 
@@ -194,7 +198,12 @@ void search_exact_output_results(int hit_count,
   if (toreport)
     {
       double top_hit_id = hits[0].id;
-      
+
+      if (opt_otutabout || opt_mothur_shared_out || opt_biomout)
+        otutable_add(query_head,
+                     db_getheader(hits[0].target),
+                     qsize);
+
       for(int t = 0; t < toreport; t++)
         {
           struct hit * hp = hits + t;
@@ -217,7 +226,8 @@ void search_exact_output_results(int hit_count,
                                   query_head,
                                   qsequence,
                                   qseqlen,
-                                  qsequence_rc);
+                                  qsequence_rc,
+                                  hp->target);
               
           if (fp_userout)
             results_show_userout_one(fp_userout,
@@ -244,7 +254,8 @@ void search_exact_output_results(int hit_count,
                             query_head,
                             qsequence,
                             qseqlen,
-                            qsequence_rc);
+                            qsequence_rc,
+                            0);
       
       if (fp_userout)
         results_show_userout_one(fp_userout,
@@ -321,7 +332,8 @@ int search_exact_query(long t)
                               si_plus[t].query_head,
                               si_plus[t].qseqlen,
                               si_plus[t].qsequence,
-                              opt_strand > 1 ? si_minus[t].qsequence : 0);
+                              opt_strand > 1 ? si_minus[t].qsequence : 0,
+                              si_plus[t].qsize);
 
   /* free memory for alignment strings */
   for(int i=0; i<hit_count; i++)
@@ -560,6 +572,27 @@ void search_exact_prep(char * cmdline, char * progheader)
         fatal("Unable to open dbnotmatched output file for writing");
     }
 
+  if (opt_otutabout)
+    {
+      fp_otutabout = fopen(opt_otutabout, "w");
+      if (! fp_otutabout)
+        fatal("Unable to open OTU table (text format) output file for writing");
+    }
+
+  if (opt_mothur_shared_out)
+    {
+      fp_mothur_shared_out = fopen(opt_mothur_shared_out, "w");
+      if (! fp_mothur_shared_out)
+        fatal("Unable to open OTU table (mothur format) output file for writing");
+    }
+
+  if (opt_biomout)
+    {
+      fp_biomout = fopen(opt_biomout, "w");
+      if (! fp_biomout)
+        fatal("Unable to open OTU table (biom 1.0 format) output file for writing");
+    }
+
   db_read(opt_db, 0);
 
   results_show_samheader(fp_samout, cmdline, opt_db);
@@ -621,6 +654,8 @@ void search_exact(char * cmdline, char * progheader)
 
   search_exact_prep(cmdline, progheader);
 
+  otutable_init();
+
   /* prepare reading of queries */
   qmatches = 0;
   queries = 0;
@@ -663,6 +698,26 @@ void search_exact(char * cmdline, char * progheader)
     fprintf(fp_log, "Matching query sequences: %d of %d (%.2f%%)\n", 
             qmatches, queries, 100.0 * qmatches / queries);
 
+  if (fp_biomout)
+    {
+      otutable_print_biomout(fp_biomout);
+      fclose(fp_biomout);
+    }
+
+  if (fp_otutabout)
+    {
+      otutable_print_otutabout(fp_otutabout);
+      fclose(fp_otutabout);
+    }
+
+  if (fp_mothur_shared_out)
+    {
+      otutable_print_mothur_shared_out(fp_mothur_shared_out);
+      fclose(fp_mothur_shared_out);
+    }
+
+  otutable_done();
+
   if (opt_dbmatched || opt_dbnotmatched)
     {
       for(long i=0; i<seqcount; i++)
diff --git a/src/vsearch.cc b/src/vsearch.cc
index 8cfdc10..2b0962b 100644
--- a/src/vsearch.cc
+++ b/src/vsearch.cc
@@ -80,6 +80,7 @@ bool opt_sizeorder;
 bool opt_xsize;
 char * opt_allpairs_global;
 char * opt_alnout;
+char * opt_biomout;
 char * opt_blast6out;
 char * opt_borderline;
 char * opt_centroids;
@@ -110,6 +111,7 @@ char * opt_fastqout;
 char * opt_fastqout_discarded;
 char * opt_fastqout_notmerged_fwd;
 char * opt_fastqout_notmerged_rev;
+char * opt_fastx_filter;
 char * opt_fastx_mask;
 char * opt_fastx_revcomp;
 char * opt_fastx_subsample;
@@ -117,9 +119,11 @@ char * opt_label_suffix;
 char * opt_log;
 char * opt_maskfasta;
 char * opt_matched;
+char * opt_mothur_shared_out;
 char * opt_msaout;
 char * opt_nonchimeras;
 char * opt_notmatched;
+char * opt_otutabout;
 char * opt_output;
 char * opt_pattern;
 char * opt_profile;
@@ -142,6 +146,7 @@ double opt_abskew;
 double opt_dn;
 double opt_fastq_maxee;
 double opt_fastq_maxee_rate;
+double opt_fastq_truncee;
 double opt_id;
 double opt_max_unmasked_pct;
 double opt_maxid;
@@ -186,6 +191,7 @@ long opt_fasta_width;
 long opt_fastq_ascii;
 long opt_fastq_asciiout;
 long opt_fastq_maxdiffs;
+long opt_fastq_maxlen;
 long opt_fastq_maxmergelen;
 long opt_fastq_maxns;
 long opt_fastq_minlen;
@@ -198,6 +204,7 @@ long opt_fastq_qminout;
 long opt_fastq_stripleft;
 long opt_fastq_tail;
 long opt_fastq_trunclen;
+long opt_fastq_trunclen_keep;
 long opt_fastq_truncqual;
 long opt_fulldp;
 long opt_hardmask;
@@ -518,6 +525,7 @@ void args_init(int argc, char **argv)
   opt_allpairs_global = 0;
   opt_alnout = 0;
   opt_blast6out = 0;
+  opt_biomout = 0;
   opt_borderline = 0;
   opt_bzip2_decompress = 0;
   opt_centroids = 0;
@@ -557,6 +565,7 @@ void args_init(int argc, char **argv)
   opt_fastq_maxdiffs = 5;
   opt_fastq_maxee = DBL_MAX;
   opt_fastq_maxee_rate = DBL_MAX;
+  opt_fastq_maxlen = LONG_MAX;
   opt_fastq_maxmergelen  = 1000000;
   opt_fastq_maxns = LONG_MAX;
   opt_fastq_mergepairs = 0;
@@ -573,10 +582,13 @@ void args_init(int argc, char **argv)
   opt_fastq_stats = 0;
   opt_fastq_stripleft = 0;
   opt_fastq_tail = 4;
-  opt_fastq_trunclen = 0;
+  opt_fastq_truncee = DBL_MAX;
+  opt_fastq_trunclen = -1;
+  opt_fastq_trunclen_keep = -1;
   opt_fastq_truncqual = LONG_MIN;
   opt_fastqout = 0;
   opt_fastqout_discarded = 0;
+  opt_fastx_filter = 0;
   opt_fastx_mask = 0;
   opt_fastx_revcomp = 0;
   opt_fastx_subsample = 0;
@@ -628,7 +640,7 @@ void args_init(int argc, char **argv)
   opt_mindiv = 0.8;
   opt_minh = 0.28;
   opt_minqt = 0.0;
-  opt_minseqlength = 0;
+  opt_minseqlength = -1;
   opt_minsize = 0;
   opt_minsizeratio = 0.0;
   opt_minsl = 0.0;
@@ -636,10 +648,12 @@ void args_init(int argc, char **argv)
   opt_minuniquesize = 0;
   opt_minwordmatches = 0;
   opt_mismatch = -4;
+  opt_mothur_shared_out = 0;
   opt_msaout = 0;
   opt_nonchimeras = 0;
   opt_notmatched = 0;
   opt_notrunclabels = 0;
+  opt_otutabout = 0;
   opt_output = 0;
   opt_output_no_hits = 0;
   opt_pattern = 0;
@@ -871,6 +885,13 @@ void args_init(int argc, char **argv)
     {"hspw",                  required_argument, 0, 0 },
     {"gzip_decompress",       no_argument,       0, 0 },
     {"bzip2_decompress",      no_argument,       0, 0 },
+    {"fastq_maxlen",          required_argument, 0, 0 },
+    {"fastq_truncee",         required_argument, 0, 0 },
+    {"fastx_filter",          required_argument, 0, 0 },
+    {"otutabout",             required_argument, 0, 0 },
+    {"mothur_shared_out",     required_argument, 0, 0 },
+    {"biomout",               required_argument, 0, 0 },
+    {"fastq_trunclen_keep",   required_argument, 0, 0 },
     { 0, 0, 0, 0 }
   };
 
@@ -1610,15 +1631,41 @@ void args_init(int argc, char **argv)
           break;
 
         case 173:
-          /* gzip_decompress */
           opt_gzip_decompress = 1;
           break;
 
         case 174:
-          /* bzip2_decompress */
           opt_bzip2_decompress = 1;
           break;
 
+        case 175:
+          opt_fastq_maxlen = args_getlong(optarg);
+          break;
+
+        case 176:
+          opt_fastq_truncee = args_getdouble(optarg);
+          break;
+
+        case 177:
+          opt_fastx_filter = optarg;
+          break;
+
+        case 178:
+          opt_otutabout = optarg;
+          break;
+
+        case 179:
+          opt_mothur_shared_out = optarg;
+          break;
+
+        case 180:
+          opt_biomout = optarg;
+          break;
+
+        case 181:
+          opt_fastq_trunclen_keep = args_getlong(optarg);
+          break;
+
         default:
           fatal("Internal error in option parsing");
         }
@@ -1676,6 +1723,8 @@ void args_init(int argc, char **argv)
     commands++;
   if (opt_search_exact)
     commands++;
+  if (opt_fastx_filter)
+    commands++;
   if (opt_fastx_mask)
     commands++;
   if (opt_fastq_convert)
@@ -1701,8 +1750,8 @@ void args_init(int argc, char **argv)
         opt_maxrejects = 32;
     }
 
-  if (opt_minseqlength < 0)
-    fatal("The argument to --minseqlength must be positive");
+  if (opt_minseqlength < -1)
+    fatal("The argument to --minseqlength must not be negative");
 
   if (opt_maxaccepts < 0)
     fatal("The argument to --maxaccepts must not be negative");
@@ -1713,8 +1762,8 @@ void args_init(int argc, char **argv)
   if ((opt_threads < 0) || (opt_threads > 1024))
     fatal("The argument to --threads must be in the range 0 (default) to 1024");
 
-  if ((opt_wordlength < 7) || (opt_wordlength > 15))
-    fatal("The argument to --wordlength must be in the range 7 to 15");
+  if ((opt_wordlength < 3) || (opt_wordlength > 15))
+    fatal("The argument to --wordlength must be in the range 3 to 15");
 
   if ((opt_iddef < 0) || (opt_iddef > 4))
     fatal("The argument to --iddef must in the range 0 to 4");
@@ -1808,7 +1857,7 @@ void args_init(int argc, char **argv)
 
   /* set default opt_minseqlength depending on command */
 
-  if (opt_minseqlength == 0)
+  if (opt_minseqlength < 0)
     {
       if (opt_cluster_smallmem || opt_cluster_fast || opt_cluster_size ||
           opt_usearch_global || opt_derep_fulllength || opt_derep_prefix )
@@ -1834,6 +1883,7 @@ void args_init(int argc, char **argv)
       & opt_fastq_filter,
       & opt_fastq_mergepairs,
       & opt_fastq_stats,
+      & opt_fastx_filter,
       & opt_fastx_mask,
       & opt_fastx_revcomp,
       & opt_fastx_subsample,
@@ -1861,6 +1911,7 @@ void args_init(int argc, char **argv)
   char * * stdout_options[] =
     {
       & opt_alnout,
+      & opt_biomout,
       & opt_blast6out,
       & opt_borderline,
       & opt_centroids,
@@ -1880,7 +1931,9 @@ void args_init(int argc, char **argv)
       & opt_fastqout_notmerged_rev,
       & opt_log,
       & opt_matched,
+      & opt_mothur_shared_out,
       & opt_msaout,
+      & opt_otutabout,
       & opt_nonchimeras,
       & opt_notmatched,
       & opt_output,
@@ -1913,215 +1966,234 @@ void cmd_help()
       fprintf(stdout,
               "\n"
               "General options\n"
-              "  --bzip2_decompress          decompress input with bzip2 (required for pipes)\n"
+              "  --bzip2_decompress          decompress input with bzip2 (required if pipe)\n"
               "  --fasta_width INT           width of FASTA seq lines, 0 for no wrap (80)\n"
-              "  --gzip_decompress           decompress input with gzip (required for pipes)\n"
-              "  --help | --h                display help information\n"
+              "  --gzip_decompress           decompress input with gzip (required if pipe)\n"
+              "  --help | -h                 display help information\n"
               "  --log FILENAME              write messages, timing and memory info to file\n"
               "  --maxseqlength INT          maximum sequence length (50000)\n"
               "  --minseqlength INT          min seq length (clust/derep/search: 32, other:1)\n"
               "  --notrunclabels             do not truncate labels at first space\n"
               "  --quiet                     output just warnings and fatal errors to stderr\n"
               "  --threads INT               number of threads to use, zero for all cores (0)\n"
-              "  --version                   display version information\n"
+              "  --version | -v              display version information\n"
               "\n"
               "Chimera detection\n"
               "  --uchime_denovo FILENAME    detect chimeras de novo\n"
               "  --uchime_ref FILENAME       detect chimeras using a reference database\n"
-              "Options\n"
-              "  --abskew REAL               min abundance ratio of parent vs chimera (2.0)\n"
-              "  --alignwidth INT            width of alignment in uchimealn output (80)\n"
-              "  --borderline FILENAME       output borderline chimeric sequences to file\n"
-              "  --chimeras FILENAME         output chimeric sequences to file\n"
+              " Data\n"
               "  --db FILENAME               reference database for --uchime_ref\n"
+              " Parameters\n"
+              "  --abskew REAL               min abundance ratio of parent vs chimera (2.0)\n"
               "  --dn REAL                   'no' vote pseudo-count (1.4)\n"
-              "  --fasta_score               include chimera score in fasta output\n"
               "  --mindiffs INT              minimum number of differences in segment (3)\n"
               "  --mindiv REAL               minimum divergence from closest parent (0.8)\n"
               "  --minh REAL                 minimum score (0.28)\n"
+              "  --sizein                    propagate abundance annotation from input\n"
+              "  --self                      exclude identical labels for --uchime_ref\n"
+              "  --selfid                    exclude identical sequences for --uchime_ref\n"
+              "  --xn REAL                   'no' vote weight (8.0)\n"
+              " Output\n"
+              "  --alignwidth INT            width of alignment in uchimealn output (80)\n"
+              "  --borderline FILENAME       output borderline chimeric sequences to file\n"
+              "  --chimeras FILENAME         output chimeric sequences to file\n"
+              "  --fasta_score               include chimera score in fasta output\n"
               "  --nonchimeras FILENAME      output non-chimeric sequences to file\n"
               "  --relabel STRING            relabel nonchimeras with this prefix string\n"
               "  --relabel_keep              keep the old label after the new when relabelling\n"
               "  --relabel_md5               relabel with md5 digest of normalized sequence\n"
               "  --relabel_sha1              relabel with sha1 digest of normalized sequence\n"
-              "  --self                      exclude identical labels for --uchime_ref\n"
-              "  --selfid                    exclude identical sequences for --uchime_ref\n"
               "  --sizeout                   include abundance information when relabelling\n"
               "  --uchimealns FILENAME       output chimera alignments to file\n"
               "  --uchimeout FILENAME        output to chimera info to tab-separated file\n"
               "  --uchimeout5                make output compatible with uchime version 5\n"
-              "  --xn REAL                   'no' vote weight (8.0)\n"
               "  --xsize                     strip abundance information in output\n"
               "\n"
               "Clustering\n"
               "  --cluster_fast FILENAME     cluster sequences after sorting by length\n"
               "  --cluster_size FILENAME     cluster sequences after sorting by abundance\n"
               "  --cluster_smallmem FILENAME cluster already sorted sequences (see -usersort)\n"
-              "Options (most searching options also apply)\n"
+              " Parameters (most searching options also apply)\n"
+              "  --cons_truncate             do not ignore terminal gaps in MSA for consensus\n"
+              "  --id REAL                   reject if identity lower, accepted values: 0-1.0\n"
+              "  --iddef INT                 id definition, 0-4=CD-HIT,all,int,MBL,BLAST (2)\n"
+              "  --qmask none|dust|soft      mask seqs with dust, soft or no method (dust)\n"
+              "  --sizein                    propagate abundance annotation from input\n"
+              "  --strand plus|both          cluster using plus or both strands (plus)\n"
+              "  --usersort                  indicate sequences not pre-sorted by length\n"
+              " Output\n"
+              "  --biomout FILENAME          filename for OTU table output in biom 1.0 format\n"
               "  --centroids FILENAME        output centroid sequences to FASTA file\n"
               "  --clusterout_id             add cluster id info to consout and profile files\n"
               "  --clusterout_sort           order msaout, consout, profile by decr abundance\n"
               "  --clusters STRING           output each cluster to a separate FASTA file\n"
               "  --consout FILENAME          output cluster consensus sequences to FASTA file\n"
-              "  --cons_truncate             do not ignore terminal gaps in MSA for consensus\n"
-              "  --id REAL                   reject if identity lower\n"
-              "  --iddef INT                 id definition, 0-4=CD-HIT,all,int,MBL,BLAST (2)\n"
+              "  --mothur_shared_out FN      filename for OTU table output in mothur format\n"
               "  --msaout FILENAME           output multiple seq. alignments to FASTA file\n"
+              "  --otutabout FILENAME        filename for OTU table output in classic format\n"
               "  --profile FILENAME          output sequence profile of each cluster to file\n"
-              "  --qmask none|dust|soft      mask seqs with dust, soft or no method (dust)\n"
               "  --relabel STRING            relabel centroids with this prefix string\n"
               "  --relabel_keep              keep the old label after the new when relabelling\n"
               "  --relabel_md5               relabel with md5 digest of normalized sequence\n"
               "  --relabel_sha1              relabel with sha1 digest of normalized sequence\n"
-              "  --sizein                    propagate abundance annotation from input\n"
               "  --sizeorder                 sort accepted centroids by abundance (AGC)\n"
               "  --sizeout                   write cluster abundances to centroid file\n"
-              "  --strand plus|both          cluster using plus or both strands (plus)\n"
               "  --uc FILENAME               specify filename for UCLUST-like output\n"
-              "  --usersort                  indicate sequences not pre-sorted by length\n"
               "  --xsize                     strip abundance information in output\n"
               "\n"
               "Dereplication and rereplication\n"
               "  --derep_fulllength FILENAME dereplicate sequences in the given FASTA file\n"
               "  --derep_prefix FILENAME     dereplicate sequences in file based on prefixes\n"
               "  --rereplicate FILENAME      rereplicate sequences in the given FASTA file\n"
-              "Options\n"
+              " Parameters\n"
               "  --maxuniquesize INT         maximum abundance for output from dereplication\n"
               "  --minuniquesize INT         minimum abundance for output from dereplication\n"
+              "  --sizein                    propagate abundance annotation from input\n"
+              "  --strand plus|both          dereplicate plus or both strands (plus)\n"
+              " Output\n"
               "  --output FILENAME           output FASTA file\n"
               "  --relabel STRING            relabel with this prefix string\n"
               "  --relabel_keep              keep the old label after the new when relabelling\n"
               "  --relabel_md5               relabel with md5 digest of normalized sequence\n"
               "  --relabel_sha1              relabel with sha1 digest of normalized sequence\n"
-              "  --sizein                    propagate abundance annotation from input\n"
               "  --sizeout                   write abundance annotation to output\n"
-              "  --strand plus|both          dereplicate plus or both strands (plus)\n"
               "  --topn INT                  output only n most abundant sequences after derep\n"
               "  --uc FILENAME               filename for UCLUST-like dereplication output\n"
               "  --xsize                     strip abundance information in derep output\n"
               "\n"
-              "FASTQ filtering\n"
-              "  --fastq_filter FILENAME     filter FASTQ file, output to FASTQ or FASTA file\n"
-              "Options\n"
-              "  --eeout                     include expected errors in FASTQ filter output\n"
-              "  --fastaout FILENAME         FASTA output filename for passed sequences\n"
-              "  --fastaout_discarded FNAME  FASTA filename for discarded sequences\n"
-              "  --fastqout FILENAME         FASTQ output filename for passed sequences\n"
-              "  --fastqout_discarded FNAME  FASTQ filename for discarded sequences\n"
-              "  --fastq_ascii INT           FASTQ input quality score ASCII base char (33)\n"
-              "  --fastq_maxee REAL          maximum expected error value for FASTQ filter\n"
-              "  --fastq_maxee_rate REAL     maximum expected error rate for FASTQ filter\n"
-              "  --fastq_maxns INT           maximum number of N's for FASTQ filter\n"
-              "  --fastq_minlen INT          minimum length for FASTQ filter\n"
-              "  --fastq_stripleft INT       bases on the left to delete for FASTQ filter\n"
-              "  --fastq_trunclen INT        read length for FASTQ filter truncation\n"
-              "  --fastq_truncqual INT       base quality value for FASTQ filter truncation\n"
-              "  --relabel STRING            relabel filtered sequences with given prefix\n"
-              "  --relabel_keep              keep the old label after the new when relabelling\n"
-              "  --relabel_md5               relabel filtered sequences with md5 digest\n"
-              "  --relabel_sha1              relabel filtered sequences with sha1 digest\n"
-              "  --sizeout                   include abundance information when relabelling\n"
-              "  --xsize                     strip abundance information in output\n"
-              "\n"
               "FASTQ format conversion\n"
               "  --fastq_convert FILENAME    convert between FASTQ file formats\n"
-              "Options\n"
+              " Parameters\n"
               "  --fastq_ascii INT           FASTQ input quality score ASCII base char (33)\n"
               "  --fastq_asciiout INT        FASTQ output quality score ASCII base char (33)\n"
               "  --fastq_qmax INT            maximum base quality value for FASTQ input (41)\n"
               "  --fastq_qmaxout INT         maximum base quality value for FASTQ output (41)\n"
               "  --fastq_qmin INT            minimum base quality value for FASTQ input (0)\n"
               "  --fastq_qminout INT         minimum base quality value for FASTQ output (0)\n"
+              " Output\n"
+              "  --fastqout FILENAME         FASTQ output filename for converted sequences\n"
               "\n"
               "FASTQ format detection and quality analysis\n"
               "  --fastq_chars FILENAME      analyse FASTQ file for version and quality range\n"
-              "Options\n"
+              " Parameters\n"
               "  --fastq_tail INT            min length of tails to count for fastq_chars (4)\n"
               "\n"
-              "FASTQ paired-end reads merging\n"
-              "  --fastq_mergepairs FILENAME merge paired-end reads into one sequence\n"
-              "Options:\n"
-              "  --eetabbedout FILENAME      output error statistics to specified file\n"
-              "  --fastaout FILENAME         FASTA output filename for merged sequences\n"
-              "  --fastaout_notmerged_fwd FN FASTA filename for non-merged forward sequences\n"
-              "  --fastaout_notmerged_rev FN FASTA filename for non-merged reverse sequences\n"
-              "  --fastq_allowmergestagger   Allow merging of staggered reads\n"
+              "FASTQ quality statistics\n"
+              "  --fastq_stats FILENAME      report statistics on FASTQ file\n"
+              "  --fastq_eestats FILENAME    quality score and expected error statistics\n"
+              " Parameters\n"
               "  --fastq_ascii INT           FASTQ input quality score ASCII base char (33)\n"
-              "  --fastq_eeout               include expected errors in FASTQ output\n"
-              "  --fastq_maxdiffs            maximum number of different bases in overlap (5)\n"
-              "  --fastq_maxee REAL          maximum expected error value for merged sequence\n"
-              "  --fastq_maxmergelen         maximum length of entire merged sequence\n"
-              "  --fastq_maxns INT           maximum number of N's\n"
-              "  --fastq_minlen INT          minimum input read length after truncation (1)\n"
-              "  --fastq_minmergelen         minimum length of entire merged sequence\n"
-              "  --fastq_minovlen            minimum length of overlap between reads (16)\n"
-              "  --fastq_nostagger           disallow merging of staggered reads (default)\n"
               "  --fastq_qmax INT            maximum base quality value for FASTQ input (41)\n"
-              "  --fastq_qmaxout INT         maximum base quality value for FASTQ output (41)\n"
               "  --fastq_qmin INT            minimum base quality value for FASTQ input (0)\n"
-              "  --fastq_qminout INT         minimum base quality value for FASTQ output (0)\n"
-              "  --fastq_truncqual INT       base quality value for truncation\n"
-              "  --fastqout FILENAME         FASTQ output filename for merged sequences\n"
-              "  --fastqout_notmerged_fwd  F FASTQ filename for non-merged forward sequences\n"
-              "  --fastqout_notmerged_rev  F FASTQ filename for non-merged reverse sequences\n"
-              "  --label_suffix              suffix to append to label of merged sequences\n"
-              "  --reverse FILENAME          specify FASTQ file with reverse reads\n"
+              " Output\n"
+              "  --log FILENAME              output statistics\n"
               "\n"
-              "FASTQ quality statistics\n"
-              "  --fastq_stats FILENAME      report FASTQ file statistics\n"
-              "  --fastq_eestats FILENAME    quality score and expected error statistics\n"
-              "Options\n"
+              "Filtering\n"
+              "  --fastx_filter FILENAME     filter and truncate sequences in FASTA/FASTQ file\n"
+              "  --fastq_filter FILENAME     filter and truncate sequences in FASTQ file\n"
+              " Parameters\n"
               "  --fastq_ascii INT           FASTQ input quality score ASCII base char (33)\n"
+              "  --fastq_maxee REAL          maximum expected error value for filter\n"
+              "  --fastq_maxee_rate REAL     maximum expected error rate for filter\n"
+              "  --fastq_maxlen INT          maximum length of sequence for filter\n"
+              "  --fastq_maxns INT           maximum number of N's for filter\n"
+              "  --fastq_minlen INT          minimum length of sequence for filter\n"
               "  --fastq_qmax INT            maximum base quality value for FASTQ input (41)\n"
               "  --fastq_qmin INT            minimum base quality value for FASTQ input (0)\n"
+              "  --fastq_stripleft INT       bases on the left to delete\n"
+              "  --fastq_truncee REAL        maximum total expected error for truncation\n"
+              "  --fastq_trunclen INT        read length for sequence truncation\n"
+              "  --fastq_truncqual INT       minimum base quality value for truncation\n"
+              " Output\n"
+              "  --eeout                     include expected errors in output\n"
+              "  --fastaout FILENAME         FASTA output filename for passed sequences\n"
+              "  --fastaout_discarded FNAME  FASTA filename for discarded sequences\n"
+              "  --fastqout FILENAME         FASTQ output filename for passed sequences\n"
+              "  --fastqout_discarded FNAME  FASTQ filename for discarded sequences\n"
+              "  --relabel STRING            relabel filtered sequences with given prefix\n"
+              "  --relabel_keep              keep the old label after the new when relabelling\n"
+              "  --relabel_md5               relabel filtered sequences with md5 digest\n"
+              "  --relabel_sha1              relabel filtered sequences with sha1 digest\n"
+              "  --sizeout                   include abundance information when relabelling\n"
+              "  --xsize                     strip abundance information in output\n"
               "\n"
               "Masking (new)\n"
               "  --fastx_mask FILENAME       mask sequences in the given FASTA or FASTQ file\n"
-              "Options\n"
+              " Parameters\n"
               "  --fastq_ascii INT           FASTQ input quality score ASCII base char (33)\n"
               "  --fastq_qmax INT            maximum base quality value for FASTQ input (41)\n"
               "  --fastq_qmin INT            minimum base quality value for FASTQ input (0)\n"
-              "  --fastaout FILENAME         output to specified FASTA file\n"
-              "  --fastqout FILENAME         output to specified FASTQ file\n"
               "  --hardmask                  mask by replacing with N instead of lower case\n"
               "  --max_unmasked_pct          max unmasked %% of sequences to keep (100.0)\n"
               "  --min_unmasked_pct          min unmasked %% of sequences to keep (0.0)\n"
               "  --qmask none|dust|soft      mask seqs with dust, soft or no method (dust)\n"
+              " Output\n"
+              "  --fastaout FILENAME         output to specified FASTA file\n"
+              "  --fastqout FILENAME         output to specified FASTQ file\n"
               "\n"
               "Masking (old)\n"
               "  --maskfasta FILENAME        mask sequences in the given FASTA file\n"
-              "Options\n"
+              " Parameters\n"
               "  --hardmask                  mask by replacing with N instead of lower case\n"
-              "  --output FILENAME           output to specified FASTA file\n"
               "  --qmask none|dust|soft      mask seqs with dust, soft or no method (dust)\n"
+              " Output\n"
+              "  --output FILENAME           output to specified FASTA file\n"
+              "\n"
+              "Paired-end reads merging\n"
+              "  --fastq_mergepairs FILENAME merge paired-end reads into one sequence\n"
+              " Data\n"
+              "  --reverse FILENAME          specify FASTQ file with reverse reads\n"
+              " Parameters\n"
+              "  --fastq_allowmergestagger   Allow merging of staggered reads\n"
+              "  --fastq_ascii INT           FASTQ input quality score ASCII base char (33)\n"
+              "  --fastq_maxdiffs INT        maximum number of different bases in overlap (5)\n"
+              "  --fastq_maxee REAL          maximum expected error value for merged sequence\n"
+              "  --fastq_maxmergelen INT     maximum length of entire merged sequence\n"
+              "  --fastq_maxns INT           maximum number of N's\n"
+              "  --fastq_minlen INT          minimum input read length after truncation (1)\n"
+              "  --fastq_minmergelen INT     minimum length of entire merged sequence\n"
+              "  --fastq_minovlen INT        minimum length of overlap between reads (16)\n"
+              "  --fastq_nostagger           disallow merging of staggered reads (default)\n"
+              "  --fastq_qmax INT            maximum base quality value for FASTQ input (41)\n"
+              "  --fastq_qmaxout INT         maximum base quality value for FASTQ output (41)\n"
+              "  --fastq_qmin INT            minimum base quality value for FASTQ input (0)\n"
+              "  --fastq_qminout INT         minimum base quality value for FASTQ output (0)\n"
+              "  --fastq_truncqual INT       base quality value for truncation\n"
+              " Output\n"
+              "  --eetabbedout FILENAME      output error statistics to specified file\n"
+              "  --fastaout FILENAME         FASTA output filename for merged sequences\n"
+              "  --fastaout_notmerged_fwd FN FASTA filename for non-merged forward sequences\n"
+              "  --fastaout_notmerged_rev FN FASTA filename for non-merged reverse sequences\n"
+              "  --fastq_eeout               include expected errors in FASTQ output\n"
+              "  --fastqout FILENAME         FASTQ output filename for merged sequences\n"
+              "  --fastqout_notmerged_fwd FN FASTQ filename for non-merged forward sequences\n"
+              "  --fastqout_notmerged_rev FN FASTQ filename for non-merged reverse sequences\n"
+              "  --label_suffix STRING       suffix to append to label of merged sequences\n"
               "\n"
               "Pairwise alignment\n"
               "  --allpairs_global FILENAME  perform global alignment of all sequence pairs\n"
-              "Options (most searching options also apply)\n"
+              " Output (most searching options also apply)\n"
               "  --alnout FILENAME           filename for human-readable alignment output\n"
               "  --acceptall                 output all pairwise alignments\n"
               "\n"
               "Reverse complementation\n"
               "  --fastx_revcomp FILENAME    Reverse-complement seqs in FASTA or FASTQ file\n"
-              "Options\n"
-              "  --fastaout FILENAME         FASTA output filename\n"
+              " Parameters\n"
               "  --fastq_ascii INT           FASTQ input quality score ASCII base char (33)\n"
               "  --fastq_qmax INT            maximum base quality value for FASTQ input (41)\n"
               "  --fastq_qmin INT            minimum base quality value for FASTQ input (0)\n"
+              " Output\n"
+              "  --fastaout FILENAME         FASTA output filename\n"
               "  --fastqout FILENAME         FASTQ output filename\n"
               "  --label_suffix STRING       Label to append to identifier in the output\n"
               "\n"
               "Searching\n"
               "  --search_exact FILENAME     filename of queries for exact match search\n"
               "  --usearch_global FILENAME   filename of queries for global alignment search\n"
-              "Options\n"
-              "  --alnout FILENAME           filename for human-readable alignment output\n"
-              "  --blast6out FILENAME        filename for blast-like tab-separated output\n"
+              " Data\n"
               "  --db FILENAME               filename for FASTA formatted database for search\n"
+              " Parameters\n"
               "  --dbmask none|dust|soft     mask db with dust, soft or no method (dust)\n"
-              "  --dbmatched FILENAME        FASTA file for matching database sequences\n"
-              "  --dbnotmatched FILENAME     FASTA file for non-matching database sequences\n"
-              "  --fastapairs FILENAME       FASTA file with pairs of query and target\n"
               "  --fulldp                    full dynamic programming alignment (always on)\n"
               "  --gapext STRING             penalties for gap extension (2I/1E)\n"
               "  --gapopen STRING            penalties for gap opening (20I/2E)\n"
@@ -2132,7 +2204,6 @@ void cmd_help()
               "  --idsuffix INT              reject if last n nucleotides do not match\n"
               "  --leftjust                  reject if terminal gaps at alignment left end\n"
               "  --match INT                 score for match (2)\n"
-              "  --matched FILENAME          FASTA file for matching query sequences\n"
               "  --maxaccepts INT            number of hits to accept and show per strand (1)\n"
               "  --maxdiffs INT              reject if more substitutions or indels\n"
               "  --maxgaps INT               reject if more indels\n"
@@ -2150,40 +2221,54 @@ void cmd_help()
               "  --minsizeratio REAL         reject if query/target abundance ratio lower\n"
               "  --minsl REAL                reject if shorter/longer length ratio lower\n"
               "  --mintsize INT              reject if target abundance lower\n"
-              "  --minwordmatches INT        minimum number of word matches required (10)\n"
+              "  --minwordmatches INT        minimum number of word matches required (12)\n"
               "  --mismatch INT              score for mismatch (-4)\n"
-              "  --notmatched FILENAME       FASTA file for non-matching query sequences\n"
-              "  --output_no_hits            output non-matching queries to output files\n"
               "  --pattern STRING            option is ignored\n"
               "  --qmask none|dust|soft      mask query with dust, soft or no method (dust)\n"
               "  --query_cov REAL            reject if fraction of query seq. aligned lower\n"
               "  --rightjust                 reject if terminal gaps at alignment right end\n"
-              "  --rowlen INT                width of alignment lines in alnout output (64)\n"
-              "  --samheader                 include a header in the SAM output file\n"
-              "  --samout FILENAME           filename for SAM format output\n"
+              "  --sizein                    propagate abundance annotation from input\n"
               "  --self                      reject if labels identical\n"
               "  --selfid                    reject if sequences identical\n"
-              "  --sizeout                   write abundance annotation to dbmatched file\n"
               "  --slots INT                 option is ignored\n"
               "  --strand plus|both          search plus or both strands (plus)\n"
               "  --target_cov REAL           reject if fraction of target seq. aligned lower\n"
+              "  --weak_id REAL              include aligned hits with >= id; continue search\n"
+              "  --wordlength INT            length of words for database index 3-15 (8)\n"
+              " Output\n"
+              "  --alnout FILENAME           filename for human-readable alignment output\n"
+              "  --biomout FILENAME          filename for OTU table output in biom 1.0 format\n"
+              "  --blast6out FILENAME        filename for blast-like tab-separated output\n"
+              "  --dbmatched FILENAME        FASTA file for matching database sequences\n"
+              "  --dbnotmatched FILENAME     FASTA file for non-matching database sequences\n"
+              "  --fastapairs FILENAME       FASTA file with pairs of query and target\n"
+              "  --matched FILENAME          FASTA file for matching query sequences\n"
+              "  --mothur_shared_out FN      filename for OTU table output in mothur format\n"
+              "  --notmatched FILENAME       FASTA file for non-matching query sequences\n"
+              "  --otutabout FILENAME        filename for OTU table output in classic format\n"
+              "  --output_no_hits            output non-matching queries to output files\n"
+              "  --rowlen INT                width of alignment lines in alnout output (64)\n"
+              "  --samheader                 include a header in the SAM output file\n"
+              "  --samout FILENAME           filename for SAM format output\n"
+              "  --sizeout                   write abundance annotation to dbmatched file\n"
               "  --top_hits_only             output only hits with identity equal to the best\n"
               "  --uc FILENAME               filename for UCLUST-like output\n"
               "  --uc_allhits                show all, not just top hit with uc output\n"
               "  --userfields STRING         fields to output in userout file\n"
               "  --userout FILENAME          filename for user-defined tab-separated output\n"
-              "  --weak_id REAL              include aligned hits with >= id; continue search\n"
               "  --wordlength INT            length of words for database index 3-15 (8)\n"
               "\n"
               "Shuffling and sorting\n"
               "  --shuffle FILENAME          shuffle order of sequences in FASTA file randomly\n"
               "  --sortbylength FILENAME     sort sequences by length in given FASTA file\n"
               "  --sortbysize FILENAME       abundance sort sequences in given FASTA file\n"
-              "Options\n"
+              " Parameters\n"
               "  --maxsize INT               maximum abundance for sortbysize\n"
               "  --minsize INT               minimum abundance for sortbysize\n"
-              "  --output FILENAME           output to specified FASTA file\n"
               "  --randseed INT              seed for PRNG, zero to use random data source (0)\n"
+              "  --sizein                    propagate abundance annotation from input\n"
+              " Output\n"
+              "  --output FILENAME           output to specified FASTA file\n"
               "  --relabel STRING            relabel sequences with this prefix string\n"
               "  --relabel_keep              keep the old label after the new when relabelling\n"
               "  --relabel_md5               relabel with md5 digest of normalized sequence\n"
@@ -2194,20 +2279,23 @@ void cmd_help()
               "\n"
               "Subsampling\n"
               "  --fastx_subsample FILENAME  subsample sequences from given FASTA/FASTQ file\n"
-              "Options\n"
-              "  --fastaout FILENAME         output FASTA file for subsamples\n"
+              " Parameters\n"
               "  --fastq_ascii INT           FASTQ input quality score ASCII base char (33)\n"
               "  --fastq_qmax INT            maximum base quality value for FASTQ input (41)\n"
               "  --fastq_qmin INT            minimum base quality value for FASTQ input (0)\n"
-              "  --fastqout FILENAME         output FASTQ file for subsamples\n"
               "  --randseed INT              seed for PRNG, zero to use random data source (0)\n"
+              "  --sample_pct REAL           sampling percentage between 0.0 and 100.0\n"
+              "  --sample_size INT           sampling size\n"
+              "  --sizein                    consider abundance info from input, do not ignore\n"
+              " Output\n"
+              "  --fastaout FILENAME         output subsampled sequences to FASTA file\n"
+              "  --fastaout_discarded FILE   output non-subsampled sequences to FASTA file\n"
+              "  --fastqout FILENAME         output subsampled sequences to FASTQ file\n"
+              "  --fastqout_discarded FILE   output non-subsampled sequences to FASTQ file\n"
               "  --relabel STRING            relabel sequences with this prefix string\n"
               "  --relabel_keep              keep the old label after the new when relabelling\n"
               "  --relabel_md5               relabel with md5 digest of normalized sequence\n"
               "  --relabel_sha1              relabel with sha1 digest of normalized sequence\n"
-              "  --sample_pct REAL           sampling percentage between 0.0 and 100.0\n"
-              "  --sample_size INT           sampling size\n"
-              "  --sizein                    consider abundance info from input, do not ignore\n"
               "  --sizeout                   update abundance information in output\n"
               "  --xsize                     strip abundance information in output\n"
           );
@@ -2238,7 +2326,8 @@ void cmd_usearch_global()
       (!opt_uc) && (!opt_blast6out) &&
       (!opt_matched) && (!opt_notmatched) &&
       (!opt_dbmatched) && (!opt_dbnotmatched) &&
-      (!opt_samout))
+      (!opt_samout) && (!opt_otutabout) &&
+      (!opt_biomout) && (!opt_mothur_shared_out))
     fatal("No output files specified");
 
   if (!opt_db)
@@ -2258,7 +2347,8 @@ void cmd_search_exact()
       (!opt_uc) && (!opt_blast6out) &&
       (!opt_matched) && (!opt_notmatched) &&
       (!opt_dbmatched) && (!opt_dbnotmatched) &&
-      (!opt_samout))
+      (!opt_samout) && (!opt_otutabout) &&
+      (!opt_biomout) && (!opt_mothur_shared_out))
     fatal("No output files specified");
 
   if (!opt_db)
@@ -2369,9 +2459,9 @@ void cmd_none()
             "vsearch --fastq_chars FILENAME\n"
             "vsearch --fastq_convert FILENAME --fastqout FILENAME --fastq_ascii 64\n"
             "vsearch --fastq_eestats FILENAME --output FILENAME\n"
-            "vsearch --fastq_filter FILENAME --fastqout FILENAME --fastq_truncqual 20\n"
             "vsearch --fastq_mergepairs FILENAME --reverse FILENAME --fastqout FILENAME\n"
             "vsearch --fastq_stats FILENAME --log FILENAME\n"
+            "vsearch --fastx_filter FILENAME --fastaout FILENAME --fastq_trunclen 100\n"
             "vsearch --fastx_mask FILENAME --fastaout FILENAME\n"
             "vsearch --fastx_revcomp FILENAME --fastqout FILENAME\n"
             "vsearch --fastx_subsample FILENAME --fastaout FILENAME --sample_pct 1\n"
@@ -2410,7 +2500,9 @@ void cmd_cluster()
       (!opt_matched) && (!opt_notmatched) &&
       (!opt_centroids) && (!opt_clusters) &&
       (!opt_consout) && (!opt_msaout) &&
-      (!opt_samout) && (!opt_profile))
+      (!opt_samout) && (!opt_profile) &&
+      (!opt_otutabout) && (!opt_biomout) &&
+      (!opt_mothur_shared_out))
     fatal("No output files specified");
 
   if ((opt_id < 0.0) || (opt_id > 1.0))
@@ -2464,6 +2556,14 @@ void cmd_fastq_filter()
   fastq_filter();
 }
 
+void cmd_fastx_filter()
+{
+  if ((!opt_fastqout) && (!opt_fastaout) &&
+      (!opt_fastqout_discarded) && (!opt_fastaout_discarded))
+    fatal("No output files specified");
+  fastx_filter();
+}
+
 void cmd_fastq_mergepairs()
 {
   if (!opt_reverse)
@@ -2576,6 +2676,8 @@ int main(int argc, char** argv)
     fastq_stats();
   else if (opt_fastq_filter)
     cmd_fastq_filter();
+  else if (opt_fastx_filter)
+    cmd_fastx_filter();
   else if (opt_fastx_revcomp)
     cmd_fastx_revcomp();
   else if (opt_search_exact)
diff --git a/src/vsearch.h b/src/vsearch.h
index 43ba02c..d13d4a6 100644
--- a/src/vsearch.h
+++ b/src/vsearch.h
@@ -147,6 +147,7 @@
 #include "mergepairs.h"
 #include "eestats.h"
 #include "rerep.h"
+#include "otutable.h"
 
 #define PROG_NAME PACKAGE
 #define PROG_VERSION PACKAGE_VERSION
@@ -178,6 +179,7 @@ extern bool opt_xsize;
 extern char * opt_allpairs_global;
 extern char * opt_alnout;
 extern char * opt_blast6out;
+extern char * opt_biomout;
 extern char * opt_borderline;
 extern char * opt_centroids;
 extern char * opt_chimeras;
@@ -207,6 +209,7 @@ extern char * opt_fastqout;
 extern char * opt_fastqout_discarded;
 extern char * opt_fastqout_notmerged_fwd;
 extern char * opt_fastqout_notmerged_rev;
+extern char * opt_fastx_filter;
 extern char * opt_fastx_mask;
 extern char * opt_fastx_revcomp;
 extern char * opt_fastx_subsample;
@@ -214,9 +217,11 @@ extern char * opt_label_suffix;
 extern char * opt_log;
 extern char * opt_maskfasta;
 extern char * opt_matched;
+extern char * opt_mothur_shared_out;
 extern char * opt_msaout;
 extern char * opt_nonchimeras;
 extern char * opt_notmatched;
+extern char * opt_otutabout;
 extern char * opt_output;
 extern char * opt_pattern;
 extern char * opt_profile;
@@ -239,6 +244,7 @@ extern double opt_abskew;
 extern double opt_dn;
 extern double opt_fastq_maxee;
 extern double opt_fastq_maxee_rate;
+extern double opt_fastq_truncee;
 extern double opt_id;
 extern double opt_max_unmasked_pct;
 extern double opt_maxid;
@@ -283,6 +289,7 @@ extern long opt_fasta_width;
 extern long opt_fastq_ascii;
 extern long opt_fastq_asciiout;
 extern long opt_fastq_maxdiffs;
+extern long opt_fastq_maxlen;
 extern long opt_fastq_maxmergelen;
 extern long opt_fastq_maxns;
 extern long opt_fastq_minlen;
@@ -295,6 +302,7 @@ extern long opt_fastq_qminout;
 extern long opt_fastq_stripleft;
 extern long opt_fastq_tail;
 extern long opt_fastq_trunclen;
+extern long opt_fastq_trunclen_keep;
 extern long opt_fastq_truncqual;
 extern long opt_fulldp;
 extern long opt_hardmask;
diff --git a/test/unclassified.sh b/test/unclassified.sh
new file mode 100644
index 0000000..f9196b9
--- /dev/null
+++ b/test/unclassified.sh
@@ -0,0 +1,314 @@
+#!/bin/bash -
+
+## Print a header
+SCRIPT_NAME="Unclassified tests"
+LINE=$(printf "%076s\n" | tr " " "-")
+printf "# %s %s\n" "${LINE:${#SCRIPT_NAME}}" "${SCRIPT_NAME}"
+
+## Declare a color code for test results
+RED="\033[1;31m"
+GREEN="\033[1;32m"
+NO_COLOR="\033[0m"
+
+failure () {
+    printf "${RED}FAIL${NO_COLOR}: %s\n" "${1}"  # description is data, not part of the format
+    # exit -1
+}
+
+success () {
+    printf "${GREEN}PASS${NO_COLOR}: %s\n" "${1}"  # description is data, not part of the format
+}
+
+
+## Is vsearch installed?
+VSEARCH=$(which vsearch)
+DESCRIPTION="check if vsearch is in the PATH"
+[[ "${VSEARCH}" ]] && success "${DESCRIPTION}" || failure "${DESCRIPTION}"
+
+#*****************************************************************************#
+#                                                                             #
+#                    Clustering UC format CIGAR alignment                     #
+#                                                                             #
+#*****************************************************************************#
+
+## usearch 6, 7 and 8 output a "=" when the sequences are identical
+DESCRIPTION="CIGAR alignment is \"=\" when the sequences are identical"
+UC_OUT=$("${VSEARCH}" \
+             --cluster_fast <(printf ">seq1\nACGT\n>seq2\nACGT\n") \
+             --id 0.97 \
+             --quiet \
+             --minseqlength 1 \
+             --uc - | grep "^H" | cut -f 8)
+
+[[ "${UC_OUT}" == "=" ]] && \
+    success  "${DESCRIPTION}" || \
+        failure "${DESCRIPTION}"
+
+## clean
+unset UC_OUT
+
+
+## is the 3rd column of H the query length or the alignment length?
+DESCRIPTION="3rd column of H is the query length"
+UC_OUT=$("${VSEARCH}" \
+             --cluster_fast <(printf ">seq1\nACGT\n>seq2\nACAGT\n") \
+             --id 0.5 \
+             --quiet \
+             --minseqlength 1 \
+             --uc - | grep "^H")
+## awk exits 0 only if an H record matches (a bare comparison inside {} is discarded)
+awk 'BEGIN {FS = "\t"} $3 == 4 && $9 == "seq1" {ok = 1} END {exit !ok}' <<< "${UC_OUT}" && \
+    success  "${DESCRIPTION}" || \
+        failure "${DESCRIPTION}"
+
+## clean
+unset UC_OUT
+
+
+#*****************************************************************************#
+#                                                                             #
+#                        UC format when dereplicating                         #
+#                                                                             #
+#*****************************************************************************#
+
+## sizein is taken into account
+DESCRIPTION="when prefix dereplicating, --uc output accounts for --sizein"
+s=$(printf ">seq1;size=3;\nACGT\n>seq2;size=1;\nACGT\n" | \
+           "${VSEARCH}" \
+               --derep_prefix - \
+               --quiet \
+               --sizein \
+               --minseqlength 1 \
+               --uc - | grep "^C" | cut -f 3)
+
+(( ${s} == 4 )) && \
+    success  "${DESCRIPTION}" || \
+        failure "${DESCRIPTION}"
+
+# clean
+unset s
+
+
+## vsearch reports H record when sequences have the same length
+DESCRIPTION="when prefix dereplicating same length sequences, --uc reports H record"
+H=$(printf ">seq1\nACGT\n>seq2\nACGT\n" | \
+           "${VSEARCH}" \
+               --derep_prefix - \
+               --quiet \
+               --minseqlength 1 \
+               --uc - | grep "^H")
+
+[[ -n ${H} ]] && \
+    success  "${DESCRIPTION}" || \
+        failure "${DESCRIPTION}"
+
+# clean
+unset H
+
+## vsearch reports H record when sequences have different lengths
+DESCRIPTION="when prefix dereplicating a shorter sequence, --uc reports H record"
+H=$(printf ">seq1\nACGTA\n>seq2\nACGT\n" | \
+           "${VSEARCH}" \
+               --derep_prefix - \
+               --quiet \
+               --minseqlength 1 \
+               --uc - | grep "^H")
+
+[[ -n ${H} ]] && \
+    success  "${DESCRIPTION}" || \
+        failure "${DESCRIPTION}"
+
+## clean
+unset H
+
+
+## --derep_prefix does not support the option --strand
+DESCRIPTION="--derep_prefix does not support the option --strand"
+printf ">seq1\nAATT\n>seq2\nTTAA\n" | \
+    "${VSEARCH}" \
+        --derep_prefix - \
+        --quiet \
+        --strand both \
+        --minseqlength 1 \
+        --uc - 2> /dev/null && \
+    failure "${DESCRIPTION}" || \
+        success  "${DESCRIPTION}"
+
+# clean
+unset H
+
+
+## --derep_fulllength accepts the option --strand
+DESCRIPTION="--derep_fulllength accepts the option --strand"
+printf ">seq1\nAATT\n>seq2\nTTAA\n" | \
+    "${VSEARCH}" \
+        --derep_fulllength - \
+        --quiet \
+        --strand both \
+        --minseqlength 1 \
+        --uc /dev/null 2> /dev/null && \
+    success  "${DESCRIPTION}" || \
+        failure "${DESCRIPTION}"
+
+# clean
+unset H
+
+
+## --derep_fulllength searches both strands
+DESCRIPTION="--derep_fulllength searches both strands"
+C=$(printf ">seq1\nAACC\n>seq2\nGGTT\n" | \
+           "${VSEARCH}" \
+               --derep_fulllength - \
+               --quiet \
+               --strand both \
+               --minseqlength 1 \
+               --uc - | grep -c "^C")
+
+# There should be only one cluster
+(( ${C} == 1 )) && \
+    success  "${DESCRIPTION}" || \
+        failure "${DESCRIPTION}"
+
+# clean
+unset C
+
+#*****************************************************************************#
+#                                                                             #
+#         fastq_trunclen and discarded short sequences (issue 203)            #
+#                                                                             #
+#*****************************************************************************#
+
+DESCRIPTION="entries shorter than the --fastq_trunclen value are discarded"
+"${VSEARCH}" \
+    --fastq_filter <(printf "@seq1\nACGT\n+\nIIII\n") \
+    --fastq_trunclen 5 \
+    --quiet \
+    --fastqout - \
+    2> /dev/null | \
+    grep -q "seq1" && \
+    failure "${DESCRIPTION}" || \
+        success  "${DESCRIPTION}"
+
+DESCRIPTION="entries equal or longer than the --fastq_trunclen value are kept"
+"${VSEARCH}" \
+    --fastq_filter <(printf "@seq1\nACGT\n+\nIIII\n") \
+    --fastq_trunclen 4 \
+    --quiet \
+    --fastqout - \
+    2> /dev/null | \
+    grep -q "seq1" && \
+    success  "${DESCRIPTION}" || \
+        failure "${DESCRIPTION}"
+
+
+#*****************************************************************************#
+#                                                                             #
+#     fastx_filter ignores sizein when relabeling fasta input (issue #204)    #
+#                                                                             #
+#*****************************************************************************#
+
+# https://github.com/torognes/vsearch/issues/204
+#
+# --fastx_filter ignores input sequence abundances when relabeling
+# with fasta input, --sizein and --sizeout options
+DESCRIPTION="fastx_filter reports sizein when relabeling fasta (issue #204)"
+"${VSEARCH}" \
+    --fastx_filter <(printf ">seq1;size=5;\nACGT\n") \
+    --sizein \
+    --relabel_md5 \
+    --sizeout \
+    --quiet \
+    --fastaout - \
+    2> /dev/null | \
+    grep -q ";size=5;" && \
+    success  "${DESCRIPTION}" || \
+        failure "${DESCRIPTION}"
+
+
+#*****************************************************************************#
+#                                                                             #
+#         check pairwise alignment correctness (Flouri et al., 2015)          #
+#                                                                             #
+#*****************************************************************************#
+
+# http://biorxiv.org/content/early/2015/11/12/031500
+
+# In USEARCH and VSEARCH the gap opening penalty includes the gap
+# extension penalty of the first residue, while in other programs it
+# does not. So if the gap open penalty is 40 and the gap extension
+# penalty is 1, then a single nucleotide gap will get a penalty of 40
+# in USEARCH and VSEARCH, and 41 in other programs.
+
+# In Flouri's tests, the gap opening penalty does not include the gap
+# extension penalty, and the optimal alignments contain two
+# independent gaps. Therefore, USEARCH and VSEARCH should return score
+# values equal to the scores indicated by Flouri, minus twice the gap
+# extension penalty (e.g., a score of -72 reported by Flouri
+# corresponds to a score of -70 with USEARCH and VSEARCH). The
+# expected score values in the tests below take that into account.
+
+# test 1 requires the possibility to set independent match/mismatch
+# scores for the different pairs of nucleotides. Not possible to
+# replicate in vsearch: ">seq1\nGGTGTGA\n>seq2\nTCGCGT\n"
+
+# test 2 uses a match score of zero, not possible with vsearch (Fatal
+# error: The argument to --match must be positive)
+# ">seq1\nAAAGGG\n>seq2\nTTAAAAGGGGTT\n"
+
+# test 3 (score should be -70 in USEARCH/VSEARCH)
+DESCRIPTION="Flouri 2015 pairwise alignment correctness tests (test 3)"
+score=$("${VSEARCH}" \
+            --allpairs_global <(printf ">seq1\nAAATTTGC\n>seq2\nCGCCTTAC\n") \
+            --acceptall \
+            --gapopen 40 \
+            --gapext 1\
+            --match 10 \
+            --mismatch -30 \
+            --qmask none \
+            --quiet \
+            --userfields raw \
+            --userout -)
+
+(( ${score} == -70 )) && \
+    success  "${DESCRIPTION}" || \
+        failure "${DESCRIPTION}"
+
+# test 4 (score should be -60 in USEARCH/VSEARCH)
+DESCRIPTION="Flouri 2015 pairwise alignment correctness tests (test 4)"
+score=$("${VSEARCH}" \
+            --allpairs_global <(printf ">seq1\nTAAATTTGC\n>seq2\nTCGCCTTAC\n") \
+            --acceptall \
+            --gapopen 40 \
+            --gapext 1\
+            --match 10 \
+            --mismatch -30 \
+            --qmask none \
+            --quiet \
+            --userfields raw \
+            --userout -)
+
+(( ${score} == -60 )) && \
+    success  "${DESCRIPTION}" || \
+        failure "${DESCRIPTION}"
+
+# test 5 (identical to test 3)
+
+# test 6 (score should be -44 in USEARCH/VSEARCH)
+DESCRIPTION="Flouri 2015 pairwise alignment correctness tests (test 6)"
+score=$("${VSEARCH}" \
+            --allpairs_global <(printf ">seq1\nAGAT\n>seq2\nCTCT\n") \
+            --acceptall \
+            --gapopen 25 \
+            --gapext 1\
+            --match 10 \
+            --mismatch -30 \
+            --qmask none \
+            --quiet \
+            --userfields raw \
+            --userout -)
+
+(( ${score} == -44 )) && \
+    success  "${DESCRIPTION}" || \
+        failure "${DESCRIPTION}"
+
+exit 0

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/vsearch.git



More information about the debian-med-commit mailing list