[med-svn] [vsearch] 01/04: Imported Upstream version 2.0.0
Andreas Tille
tille at debian.org
Thu Jun 30 08:33:39 UTC 2016
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository vsearch.
commit ecb9cf5f8bb758b6c534bf550b1ba065df752163
Author: Andreas Tille <tille at debian.org>
Date: Thu Jun 30 10:05:17 2016 +0200
Imported Upstream version 2.0.0
---
configure.ac | 2 +-
man/vsearch.1 | 87 +++++++---
src/chimera.cc | 2 +-
src/dynlibs.cc | 17 +-
src/dynlibs.h | 4 +
src/eestats.cc | 2 +-
src/fasta.cc | 316 +++-------------------------------
src/fasta.h | 63 ++-----
src/fastq.cc | 368 ++++-----------------------------------
src/fastq.h | 71 ++------
src/fastqops.cc | 8 +-
src/fastx.cc | 491 ++++++++++++++++++++++++++++++++++++++---------------
src/fastx.h | 53 +++++-
src/maps.cc | 53 ++++++
src/maps.h | 1 +
src/mergepairs.cc | 4 +-
src/msa.cc | 82 +++------
src/rerep.cc | 2 +-
src/search.cc | 2 +-
src/searchcore.cc | 4 +-
src/searchexact.cc | 2 +-
src/util.cc | 2 +-
src/vsearch.cc | 113 +++++++++++-
src/vsearch.h | 4 +-
24 files changed, 783 insertions(+), 970 deletions(-)
diff --git a/configure.ac b/configure.ac
index 79d3c5c..6260dd7 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2,7 +2,7 @@
# Process this file with autoconf to produce a configure script.
AC_PREREQ([2.63])
-AC_INIT([vsearch], [1.11.1], [torognes at ifi.uio.no])
+AC_INIT([vsearch], [2.0.0], [torognes at ifi.uio.no])
AM_INIT_AUTOMAKE([subdir-objects])
AC_LANG([C++])
AC_CONFIG_SRCDIR([src/vsearch.cc])
diff --git a/man/vsearch.1 b/man/vsearch.1
index 4572afb..672d911 100644
--- a/man/vsearch.1
+++ b/man/vsearch.1
@@ -1,5 +1,5 @@
.\" ============================================================================
-.TH vsearch 1 "April 13, 2016" "version 1.11.1" "USER COMMANDS"
+.TH vsearch 1 "June 24, 2016" "version 2.0.0" "USER COMMANDS"
.\" ============================================================================
.SH NAME
vsearch \(em chimera detection, clustering, dereplication and
@@ -169,12 +169,15 @@ both of the symbols are ambiguous (RYSWKMDBHVN) in which case the
score is zero. Alignment of two identical ambiguous symbols (e.g. R vs
R) also receives a score of zero.
.PP
-\fBvsearch\fR can be compiled to accepted compressed fasta files as
-input (gz and bzip2 formats). On the other hand, special files like
-pipes, named pipes, or sockets cannot be used as input. To present a
-progress indicator, \fBvsearch\fR needs to seek to the end of
-\fIfilename\fR to find its length. Consequently, \fIfilename\fR must
-be a regular file, not a stream.
+Input files compressed with gzip or bzip2 are automatically
+detected and decompressed if vsearch was compiled with the appropriate
+libraries. Input from pipes is supported, but then compressed input
+must be indicated using the \-\-gzip_decompress or
+\-\-bzip2_decompress options. For input files the name '\-'
+represents standard input (/dev/stdin). Multiple FASTA or FASTQ files
+may be piped into vsearch for dereplication or other operations. When
+reading from a pipe, the progress indicator is not updated. For
+output files the name '\-' represents standard output (/dev/stdout).
.\" ----------------------------------------------------------------------------
.SS Options
\fBvsearch\fR recognizes a large number of command-line options. For
@@ -182,16 +185,26 @@ easier navigation, options are grouped below by theme (chimera
detection, clustering, dereplication and rereplication, FASTA/FASTQ
file processing, masking, pairwise alignment, searching, shuffling,
sorting, and subsampling). We start with the general options that
-apply to all themes.
+apply to all themes. Options may start with a single (\-) or double
+dash (\-\-). Option names may be shortened as long as they are not
+ambiguous (e.g. \-\-derep_f).
.PP
General options:
.RS
.TP 9
+.B \-\-bzip2_decompress
+Decompress input using bzip2. The option is required only when reading
+from a pipe, otherwise compression is automatically detected.
+.TP
.BI \-\-fasta_width\~ "positive integer"
Fasta files produced by \fBvsearch\fR are wrapped (sequences are
written on lines of \fIinteger\fR nucleotides, 80 by default). Set
that value to 0 to eliminate the wrapping.
.TP
+.B \-\-gzip_decompress
+Decompress input using gzip. The option is required only when reading
+from a pipe, otherwise compression is automatically detected.
+.TP
.B \-\-help | \-\-h
Display help text and exit.
.TP
@@ -1163,8 +1176,11 @@ length. Output order may vary when using multiple threads.
Write search results to \fIfilename\fR using a blast-like
tab-separated format of twelve fields (listed below), with one line
per query-target matching (or lack of matching if \-\-output_no_hits
-is used). Output order may vary when using multiple threads. A similar
-output can be obtain with \-\-userout \fIfilename\fR and
+is used). Warning, vsearch uses global pairwise alignments, not
+blast's seed-and-extend algorithm. Therefore, some common blast output
+values (alignment start and end, evalue, bit score) are reported
+differently. Output order may vary when using multiple threads. A
+similar output can be obtain with \-\-userout \fIfilename\fR and
\-\-userfields
query+target+id+alnlen+mism+opens+qlo+qhi+tlo+thi+evalue+bits. A
complete list and description is available in the section "Userfields"
@@ -1193,18 +1209,20 @@ integer value).
positive integer value).
.IP \n+[step].
\fIqlo\fR: first nucleotide of the query aligned with the
-target. Always equal to 1 if there is an alignment, 0 otherwise.
+target. Always equal to 1 if there is an alignment, 0 otherwise (see
+\fIqilo\fR to ignore initial gaps).
.IP \n+[step].
\fIqhi\fR: last nucleotide of the query aligned with the
-target. Always equal to the length of the pairwise alignment. The
-field is set to 0 if there is no alignment.
+target. Always equal to the length of the pairwise alignment, 0
+otherwise (see \fIqihi\fR to ignore terminal gaps).
.IP \n+[step].
\fItlo\fR: irst nucleotide of the target aligned with the
-query. Always equal to 1 if there is an alignment, 0 otherwise.
+query. Always equal to 1 if there is an alignment, 0 otherwise (see
+\fItilo\fR to ignore initial gaps).
.IP \n+[step].
\fIthi\fR: last nucleotide of the target aligned with the
-query. Always equal to the length of the pairwise alignment. The field
-is set to 0 if there is no alignment.
+query. Always equal to the length of the pairwise alignment, 0
+otherwise (see \fItihi\fR to ignore terminal gaps).
.IP \n+[step].
\fIevalue\fR: expectancy-value (not computed for nucleotide
alignments). Always set to -1.
@@ -1814,8 +1832,8 @@ is not computed by \fBvsearch\fR. Always set to +0.
.TP
.B qhi
Last nucleotide of the query aligned with the target. Always equal to
-the length of the pairwise alignment. The field is set to 0 if there
-is no alignment.
+the length of the pairwise alignment, 0 otherwise (see \fIqihi\fR to
+ignore terminal gaps).
.TP
.B qihi
Last nucleotide of the query aligned with the target (ignoring
@@ -1833,7 +1851,8 @@ if there is no alignment.
.TP
.B qlo
First nucleotide of the query aligned with the target. Always equal to
-1 if there is an alignment, 0 otherwise.
+1 if there is an alignment, 0 otherwise (see \fIqilo\fR to ignore
+initial gaps).
.TP
.B qrow
Print the sequence of the query segment as seen in the pairwise
@@ -1873,8 +1892,8 @@ is not computed by \fBvsearch\fR. Always set to +0.
.TP
.B thi
Last nucleotide of the target aligned with the query. Always equal to
-the length of the pairwise alignment. The field is set to 0 if there
-is no alignment.
+the length of the pairwise alignment, 0 otherwise (see \fItihi\fR to
+ignore terminal gaps).
.TP
.B tihi
Last nucleotide of the target aligned with the query (ignoring
@@ -1892,7 +1911,8 @@ if there is no alignment.
.TP
.B tlo
First nucleotide of the target aligned with the query. Always equal to
-1 if there is an alignment, 0 otherwise.
+1 if there is an alignment, 0 otherwise (see \fItilo\fR to ignore
+initial gaps).
.TP
.B trow
Print the sequence of the target segment as seen in the pairwise
@@ -2452,6 +2472,29 @@ specified with \-\-fastaout and \-\-fastaout_discarded when \-\-eeout
or \-\-fastq_eeout option is in effect for fastq_filter and
fastq_mergepairs. The options \-\-eeout and \-\-fastq_eeout are now
equivalent.
+.TP
+.BR v1.11.2\~ "released June 21, 2016"
+Two bugs were fixed. The first issue was related to the \-\-query_cov
+option that used a different coverage definition than the qcov
+userfield. The coverage is now defined as the fraction of the whole
+query sequence length that is aligned with matching or mismatching
+residues in the target. All gaps are ignored. The other issue was
+related to the consensus sequences produced during clustering when
+only N's were present in some positions. Previously these would be
+converted to A's in the consensus. The behaviour is changed so that
+N's are produced in the consensus, and it should now be more
+compatible with usearch.
+.TP
+.BR v2.0.0\~ "released June 24, 2016"
+This major new version supports reading from pipes. Two new options
+are added: \-\-gzip_decompress and \-\-bzip2_decompress. One of these
+options must be specified if reading compressed input from a pipe, but
+are not required when reading from ordinary files. The vsearch header
+that was previously written to stdout is now written to stderr. This
+enables piping of results for further processing. The file name '\-'
+now represent standard input (/dev/stdin) or standard outout
+(/dev/stdout) when reading or writing files, respectively. Code for
+reading FASTA and FASTQ files have been refactored.
.RE
.LP
.\" ============================================================================
diff --git a/src/chimera.cc b/src/chimera.cc
index a01fd50..c750dd9 100644
--- a/src/chimera.cc
+++ b/src/chimera.cc
@@ -79,7 +79,7 @@ const double chimera_id = 0.55;
static int tophits;
static pthread_attr_t attr;
static pthread_t * pthread;
-static fasta_handle query_fasta_h;
+static fastx_handle query_fasta_h;
/* mutexes and global data protected by mutex */
static pthread_mutex_t mutex_input;
diff --git a/src/dynlibs.cc b/src/dynlibs.cc
index 2cc804c..4f6cc2e 100644
--- a/src/dynlibs.cc
+++ b/src/dynlibs.cc
@@ -70,6 +70,10 @@ void * gz_lib;
gzFile (*gzdopen_p)(int, const char *);
int (*gzclose_p)(gzFile);
int (*gzread_p)(gzFile, void *, unsigned);
+int (*gzgetc_p)(gzFile);
+int (*gzrewind_p)(gzFile);
+int (*gzungetc_p)(int, gzFile);
+const char * (*gzerror_p)(gzFile, int*);
#endif
#ifdef HAVE_BZLIB_H
@@ -90,12 +94,13 @@ void dynlibs_open()
gz_lib = dlopen(gz_libname, RTLD_LAZY);
if (gz_lib)
{
- gzdopen_p = (gzFile (*)(int, const char*))
- dlsym(gz_lib, "gzdopen");
- gzclose_p = (int (*)(gzFile))
- dlsym(gz_lib, "gzclose");
- gzread_p = (int (*)(gzFile, void*, unsigned))
- dlsym(gz_lib, "gzread");
+ gzdopen_p = (gzFile (*)(int, const char*)) dlsym(gz_lib, "gzdopen");
+ gzclose_p = (int (*)(gzFile)) dlsym(gz_lib, "gzclose");
+ gzread_p = (int (*)(gzFile, void*, unsigned)) dlsym(gz_lib, "gzread");
+ gzgetc_p = (int (*)(gzFile)) dlsym(gz_lib, "gzgetc");
+ gzrewind_p = (int (*)(gzFile)) dlsym(gz_lib, "gzrewind");
+ gzerror_p = (const char * (*)(gzFile, int*)) dlsym(gz_lib, "gzerror");
+ gzungetc_p = (int (*)(int, gzFile)) dlsym(gz_lib, "gzungetc");
}
#endif
diff --git a/src/dynlibs.h b/src/dynlibs.h
index 711b570..0d13cbc 100644
--- a/src/dynlibs.h
+++ b/src/dynlibs.h
@@ -63,6 +63,10 @@ extern void * gz_lib;
extern gzFile (*gzdopen_p)(int, const char *);
extern int (*gzclose_p)(gzFile);
extern int (*gzread_p)(gzFile, void*, unsigned);
+extern int (*gzgetc_p)(gzFile);
+extern int (*gzrewind_p)(gzFile);
+extern int (*gzungetc_p)(int, gzFile);
+extern const char * (*gzerror_p)(gzFile, int*);
#endif
#ifdef HAVE_BZLIB_H
diff --git a/src/eestats.cc b/src/eestats.cc
index ff2aad4..bb6ea6b 100644
--- a/src/eestats.cc
+++ b/src/eestats.cc
@@ -72,7 +72,7 @@ long ee_start(int pos, int resolution)
void fastq_eestats()
{
- fastq_handle h = fastq_open(opt_fastq_eestats);
+ fastx_handle h = fastq_open(opt_fastq_eestats);
unsigned long filesize = fastq_get_size(h);
diff --git a/src/fasta.cc b/src/fasta.cc
index 6a80abf..d788c4b 100644
--- a/src/fasta.cc
+++ b/src/fasta.cc
@@ -60,279 +60,22 @@
#include "vsearch.h"
-#define FASTA_BUFFER_ALLOC 8192
-
-#ifdef HAVE_BZLIB_H
-#define BZ_VERBOSE_0 0
-#define BZ_VERBOSE_1 1
-#define BZ_VERBOSE_2 2
-#define BZ_VERBOSE_3 3
-#define BZ_VERBOSE_4 4
-#define BZ_MORE_MEM 0 /* faster decompression using more memory */
-#define BZ_LESS_MEM 1 /* slower decompression but requires less memory */
-#endif
-
-#define FORMAT_PLAIN 1
-#define FORMAT_BZIP 2
-#define FORMAT_GZIP 3
-
-static unsigned char MAGIC_GZIP[] = "\x1f\x8b";
-static unsigned char MAGIC_BZIP[] = "BZ";
-
-void buffer_init(struct fasta_buffer_s * buffer)
+fastx_handle fasta_open(const char * filename)
{
- buffer->alloc = FASTA_BUFFER_ALLOC;
- buffer->data = (char*) xmalloc(buffer->alloc);
- buffer->data[0] = 0;
- buffer->length = 0;
- buffer->position = 0;
-}
-
-void buffer_free(struct fasta_buffer_s * buffer)
-{
- if (buffer->data)
- free(buffer->data);
- buffer->data = 0;
- buffer->alloc = 0;
- buffer->length = 0;
- buffer->position = 0;
-}
-
-fasta_handle fasta_open(const char * filename)
-{
- fasta_handle h = (fasta_handle) xmalloc(sizeof(struct fasta_s));
-
- h->fp = NULL;
- h->fp = fopen(filename, "rb");
- if(!h->fp)
- fatal("Error: Unable to open fasta file for reading (%s)", filename);
-
- /* detect compression (plain, gzipped or bzipped) */
-
- unsigned char magic[2];
- h->format = FORMAT_PLAIN;
- if (fread(&magic, 1, 2, h->fp) >= 2)
- {
- if (!memcmp(magic, MAGIC_GZIP, 2))
- h->format = FORMAT_GZIP;
- else if (!memcmp(magic, MAGIC_BZIP, 2))
- h->format = FORMAT_BZIP;
- }
-
- /* Get size of original (uncompressed) file */
-
- if (fseek(h->fp, 0, SEEK_END))
- fatal("Error: Unable to seek in fasta file (%s)", filename);
- h->file_size = ftell(h->fp);
- rewind(h->fp);
-
- if (h->format == FORMAT_GZIP)
- {
- /* GZIP: Keep original file open, then open as gzipped file as well */
-#ifdef HAVE_ZLIB_H
- if (!gz_lib)
- fatal("Files compressed with gzip are not supported");
- if (! (h->fp_gz = (*gzdopen_p)(fileno(h->fp), "rb")))
- fatal("Unable to open gzip compressed fasta file (%s)", filename);
-#else
- fatal("Files compressed with gzip are not supported");
-#endif
- }
-
- if (h->format == FORMAT_BZIP)
- {
- /* BZIP2: Keep original file open, then open as bzipped file as well */
-#ifdef HAVE_BZLIB_H
- if (!bz2_lib)
- fatal("Files compressed with bzip2 are not supported");
- int bzError;
- if (! (h->fp_bz = (*BZ2_bzReadOpen_p)(& bzError, h->fp,
- BZ_VERBOSE_0, BZ_MORE_MEM,
- NULL, 0)))
- fatal("Unable to open bzip2 compressed fasta file (%s)", filename);
-#else
- fatal("Files compressed with bzip2 are not supported");
-#endif
- }
+ fastx_handle h = fastx_open(filename);
-
-
- h->stripped_all = 0;
-
- for(int i=0; i<256; i++)
- h->stripped[i] = 0;
-
- h->file_position = 0;
-
- buffer_init(& h->file_buffer);
- buffer_init(& h->header_buffer);
- buffer_init(& h->sequence_buffer);
-
- h->lineno = 1;
- h->lineno_start = 1;
- h->seqno = -1;
+ if (fastx_is_fastq(h))
+ fatal("FASTA file expected, FASTQ file found (%s)", filename);
return h;
}
-void fasta_close(fasta_handle h)
-{
- /* Warn about stripped chars */
-
- if (h->stripped_all)
- {
- fprintf(stderr, "WARNING: invalid characters stripped from fasta file:");
- for (int i=0; i<256;i++)
- if (h->stripped[i])
- fprintf(stderr, " %c(%lu)", i, h->stripped[i]);
- fprintf(stderr, "\n");
-
- if (opt_log)
- {
- fprintf(fp_log, "WARNING: invalid characters stripped from fasta file:");
- for (int i=0; i<256;i++)
- if (h->stripped[i])
- fprintf(fp_log, " %c(%lu)", i, h->stripped[i]);
- fprintf(fp_log, "\n");
- }
- }
-
-#ifdef HAVE_BZLIB_H
- int bz_error;
-#endif
-
- switch(h->format)
- {
- case FORMAT_PLAIN:
- fclose(h->fp);
- h->fp = 0;
- break;
-
- case FORMAT_GZIP:
-#ifdef HAVE_ZLIB_H
- (*gzclose_p)(h->fp_gz);
- h->fp_gz = 0;
- h->fp = 0;
- break;
-#endif
-
- case FORMAT_BZIP:
-#ifdef HAVE_BZLIB_H
- (*BZ2_bzReadClose_p)(&bz_error, h->fp_bz);
- h->fp_bz = 0;
- h->fp = 0;
- break;
-#endif
-
- default:
- fatal("Internal error");
- }
-
- buffer_free(& h->file_buffer);
- buffer_free(& h->header_buffer);
- buffer_free(& h->sequence_buffer);
-
- h->file_size = 0;
- h->file_position = 0;
-
- h->lineno = 0;
- h->seqno = -1;
-
- free(h);
-}
-
-
-unsigned long fasta_file_fill_buffer(fasta_handle h)
-{
- /* read more data if necessary */
- unsigned long rest = h->file_buffer.length - h->file_buffer.position;
-
- if (rest > 0)
- return rest;
- else
- {
- unsigned long space = h->file_buffer.alloc - h->file_buffer.length;
-
- if (space == 0)
- {
- /* back to beginning of buffer */
- h->file_buffer.position = 0;
- h->file_buffer.length = 0;
- space = h->file_buffer.alloc;
- }
-
- int bytes_read = 0;
-
-#ifdef HAVE_BZLIB_H
- int bzError = 0;
-#endif
-
- switch(h->format)
- {
- case FORMAT_PLAIN:
- bytes_read = fread(h->file_buffer.data
- + h->file_buffer.position,
- 1,
- space,
- h->fp);
- break;
-
- case FORMAT_GZIP:
-#ifdef HAVE_ZLIB_H
- bytes_read = (*gzread_p)(h->fp_gz,
- h->file_buffer.data
- + h->file_buffer.position,
- space);
- if (bytes_read < 0)
- fatal("Error reading gzip compressed fasta file");
- break;
-#endif
-
- case FORMAT_BZIP:
-#ifdef HAVE_BZLIB_H
- bytes_read = (*BZ2_bzRead_p)(& bzError,
- h->fp_bz,
- h->file_buffer.data
- + h->file_buffer.position,
- space);
- if ((bytes_read < 0) ||
- ! ((bzError == BZ_OK) ||
- (bzError == BZ_STREAM_END) ||
- (bzError == BZ_SEQUENCE_ERROR)))
- fatal("Error reading bzip2 compressed fasta file");
- break;
-#endif
-
- default:
- fatal("Internal error");
- }
-
- h->file_buffer.length += bytes_read;
- return bytes_read;
- }
-}
-
-void buffer_extend(struct fasta_buffer_s * buffer, char * buf, unsigned long len)
+void fasta_close(fastx_handle h)
{
- if (buffer->length + len + 1 > buffer->alloc)
- {
- /* alloc space for len more characters + terminating zero,
- but round up to nearest block size */
- buffer->alloc =
- FASTA_BUFFER_ALLOC *
- (((buffer->length + len) / FASTA_BUFFER_ALLOC) + 1);
- buffer->data = (char*) xrealloc(buffer->data, buffer->alloc);
- }
-
- /* copy string */
- memcpy(buffer->data + buffer->length, buf, len);
- buffer->length += len;
-
- /* add terminator */
- buffer->data[buffer->length] = 0;
+ fastx_close(h);
}
-void fasta_truncate_header(fasta_handle h, bool truncateatspace)
+void fasta_truncate_header(fastx_handle h, bool truncateatspace)
{
/* Truncate the zero-terminated header string by inserting a new
terminator (zero byte) at the first space/tab character
@@ -346,7 +89,7 @@ void fasta_truncate_header(fasta_handle h, bool truncateatspace)
h->header_buffer.data[h->header_buffer.length] = 0;
}
-void fasta_filter_sequence(fasta_handle h,
+void fasta_filter_sequence(fastx_handle h,
unsigned int * char_action,
char * char_mapping)
{
@@ -408,7 +151,7 @@ void fasta_filter_sequence(fasta_handle h,
h->sequence_buffer.length = q - h->sequence_buffer.data;
}
-bool fasta_next(fasta_handle h,
+bool fasta_next(fastx_handle h,
bool truncateatspace,
char * char_mapping)
{
@@ -417,7 +160,7 @@ bool fasta_next(fasta_handle h,
h->header_buffer.length = 0;
h->sequence_buffer.length = 0;
- unsigned long rest = fasta_file_fill_buffer(h);
+ unsigned long rest = fastx_file_fill_buffer(h);
if (rest == 0)
return 0;
@@ -427,7 +170,10 @@ bool fasta_next(fasta_handle h,
/* check initial > character */
if (h->file_buffer.data[h->file_buffer.position] != '>')
- fatal("Invalid FASTA - header must start with > character");
+ {
+ fprintf(stderr, "Found character %02x\n", (unsigned char)(h->file_buffer.data[h->file_buffer.position]));
+ fatal("Invalid FASTA - header must start with > character");
+ }
h->file_buffer.position++;
rest--;
@@ -435,7 +181,7 @@ bool fasta_next(fasta_handle h,
while (lf == 0)
{
/* get more data if buffer empty*/
- rest = fasta_file_fill_buffer(h);
+ rest = fastx_file_fill_buffer(h);
if (rest == 0)
fatal("Invalid FASTA - header must be terminated with newline");
@@ -464,7 +210,7 @@ bool fasta_next(fasta_handle h,
while (1)
{
/* get more data, if necessary */
- rest = fasta_file_fill_buffer(h);
+ rest = fastx_file_fill_buffer(h);
/* end if no more data */
if (rest == 0)
@@ -496,62 +242,50 @@ bool fasta_next(fasta_handle h,
fasta_truncate_header(h, truncateatspace);
fasta_filter_sequence(h, char_fasta_action, char_mapping);
-#ifdef HAVE_ZLIB_H
- if (h->format == FORMAT_GZIP)
- {
- /* Circumvent the missing gzoffset function in zlib 1.2.3 and earlier */
- int fd = dup(fileno(h->fp));
- h->file_position = lseek(fd, 0, SEEK_CUR);
- close(fd);
- }
- else
-#endif
- h->file_position = ftell(h->fp);
-
return 1;
}
-long fasta_get_abundance(fasta_handle h)
+long fasta_get_abundance(fastx_handle h)
{
return abundance_get(global_abundance, h->header_buffer.data);
}
-unsigned long fasta_get_position(fasta_handle h)
+unsigned long fasta_get_position(fastx_handle h)
{
return h->file_position;
}
-unsigned long fasta_get_size(fasta_handle h)
+unsigned long fasta_get_size(fastx_handle h)
{
return h->file_size;
}
-unsigned long fasta_get_lineno(fasta_handle h)
+unsigned long fasta_get_lineno(fastx_handle h)
{
return h->lineno_start;
}
-unsigned long fasta_get_seqno(fasta_handle h)
+unsigned long fasta_get_seqno(fastx_handle h)
{
return h->seqno;
}
-unsigned long fasta_get_header_length(fasta_handle h)
+unsigned long fasta_get_header_length(fastx_handle h)
{
return h->header_buffer.length;
}
-unsigned long fasta_get_sequence_length(fasta_handle h)
+unsigned long fasta_get_sequence_length(fastx_handle h)
{
return h->sequence_buffer.length;
}
-char * fasta_get_header(fasta_handle h)
+char * fasta_get_header(fastx_handle h)
{
return h->header_buffer.data;
}
-char * fasta_get_sequence(fasta_handle h)
+char * fasta_get_sequence(fastx_handle h)
{
return h->sequence_buffer.data;
}
diff --git a/src/fasta.h b/src/fasta.h
index 0c1d5a9..38f5f4d 100644
--- a/src/fasta.h
+++ b/src/fasta.h
@@ -58,61 +58,24 @@
*/
-struct fasta_buffer_s
-{
- char * data;
- unsigned long length;
- unsigned long alloc;
- unsigned long position;
-};
-
-struct fasta_s
-{
- FILE * fp;
-
-#ifdef HAVE_ZLIB_H
- gzFile fp_gz;
-#endif
-
-#ifdef HAVE_BZLIB_H
- BZFILE * fp_bz;
-#endif
-
- struct fasta_buffer_s file_buffer;
- struct fasta_buffer_s header_buffer;
- struct fasta_buffer_s sequence_buffer;
-
- unsigned long file_size;
- unsigned long file_position;
-
- unsigned long lineno;
- unsigned long lineno_start;
- long seqno;
-
- unsigned long stripped_all;
- unsigned long stripped[256];
-
- int format;
-};
-
-typedef struct fasta_s * fasta_handle;
/* fasta input */
-fasta_handle fasta_open(const char * filename);
-void fasta_close(fasta_handle h);
-bool fasta_next(fasta_handle h,
+void fasta_open_rest(fastx_handle h);
+fastx_handle fasta_open(const char * filename);
+void fasta_close(fastx_handle h);
+bool fasta_next(fastx_handle h,
bool truncateatspace,
char * char_mapping);
-unsigned long fasta_get_position(fasta_handle h);
-unsigned long fasta_get_size(fasta_handle h);
-unsigned long fasta_get_lineno(fasta_handle h);
-unsigned long fasta_get_seqno(fasta_handle h);
-char * fasta_get_header(fasta_handle h);
-char * fasta_get_sequence(fasta_handle h);
-unsigned long fasta_get_header_length(fasta_handle h);
-unsigned long fasta_get_sequence_length(fasta_handle h);
-long fasta_get_abundance(fasta_handle h);
+unsigned long fasta_get_position(fastx_handle h);
+unsigned long fasta_get_size(fastx_handle h);
+unsigned long fasta_get_lineno(fastx_handle h);
+unsigned long fasta_get_seqno(fastx_handle h);
+char * fasta_get_header(fastx_handle h);
+char * fasta_get_sequence(fastx_handle h);
+unsigned long fasta_get_header_length(fastx_handle h);
+unsigned long fasta_get_sequence_length(fastx_handle h);
+long fasta_get_abundance(fastx_handle h);
/* fasta output */
diff --git a/src/fastq.cc b/src/fastq.cc
index 894d4ea..b45cedc 100644
--- a/src/fastq.cc
+++ b/src/fastq.cc
@@ -60,27 +60,6 @@
#include "vsearch.h"
-#define FASTQ_BUFFER_ALLOC 8192
-
-#ifdef HAVE_BZLIB_H
-#define BZ_VERBOSE_0 0
-#define BZ_VERBOSE_1 1
-#define BZ_VERBOSE_2 2
-#define BZ_VERBOSE_3 3
-#define BZ_VERBOSE_4 4
-#define BZ_MORE_MEM 0 /* faster decompression using more memory */
-#define BZ_LESS_MEM 1 /* slower decompression but requires less memory */
-#endif
-
-#define FORMAT_PLAIN 1
-#define FORMAT_BZIP 2
-#define FORMAT_GZIP 3
-
-static unsigned char MAGIC_GZIP[] = "\x1f\x8b";
-static unsigned char MAGIC_BZIP[] = "BZ";
-
-static char map_identity[256];
-
void fastq_fatal(unsigned long lineno, const char * msg)
{
char * string;
@@ -99,286 +78,8 @@ void fastq_fatal(unsigned long lineno, const char * msg)
fatal("Out of memory");
}
-void buffer_init(struct fastq_buffer_s * buffer)
-{
- buffer->alloc = FASTQ_BUFFER_ALLOC;
- buffer->data = (char*) xmalloc(buffer->alloc);
- buffer->data[0] = 0;
- buffer->length = 0;
- buffer->position = 0;
-}
-
-void buffer_free(struct fastq_buffer_s * buffer)
-{
- if (buffer->data)
- free(buffer->data);
- buffer->data = 0;
- buffer->alloc = 0;
- buffer->length = 0;
- buffer->position = 0;
-}
-
-void buffer_makespace(struct fastq_buffer_s * buffer, unsigned long x)
-{
- /* make sure there is space for x more chars in buffer */
-
- if (buffer->length + x > buffer->alloc)
- {
- /* alloc space for x more characters,
- but round up to nearest block size */
- buffer->alloc =
- ((buffer->length + x + FASTQ_BUFFER_ALLOC - 1) / FASTQ_BUFFER_ALLOC)
- * FASTQ_BUFFER_ALLOC;
- buffer->data = (char*) xrealloc(buffer->data, buffer->alloc);
- }
-}
-
-void buffer_truncate(struct fastq_buffer_s * buffer, bool truncateatspace)
-{
- /* Truncate the zero-terminated header string by inserting a new
- terminator (zero byte) at the first space/tab character
- (if truncateatspace) or first linefeed. */
-
- if (truncateatspace)
- buffer->length = strcspn(buffer->data, " \t\n");
- else
- buffer->length = strcspn(buffer->data, "\n");
-
- buffer->data[buffer->length] = 0;
-}
-
-
-fastq_handle fastq_open(const char * filename)
-{
- fastq_handle h = (fastq_handle) xmalloc(sizeof(struct fastq_s));
-
- h->fp = NULL;
- h->fp = fopen(filename, "rb");
- if(!h->fp)
- fatal("Error: Unable to open fastq file for reading (%s)", filename);
-
- /* detect compression (plain, gzipped or bzipped) */
-
- unsigned char magic[2];
- h->format = FORMAT_PLAIN;
- if (fread(&magic, 1, 2, h->fp) >= 2)
- {
- if (!memcmp(magic, MAGIC_GZIP, 2))
- h->format = FORMAT_GZIP;
- else if (!memcmp(magic, MAGIC_BZIP, 2))
- h->format = FORMAT_BZIP;
- }
-
- /* Get size of original (uncompressed) file */
-
- if (fseek(h->fp, 0, SEEK_END))
- fatal("Error: Unable to seek in fastq file (%s)", filename);
- h->file_size = ftell(h->fp);
- rewind(h->fp);
-
- if (h->format == FORMAT_GZIP)
- {
- /* GZIP: Keep original file open, then open as gzipped file as well */
-#ifdef HAVE_ZLIB_H
- if (!gz_lib)
- fatal("Files compressed with gzip are not supported");
- if (! (h->fp_gz = (*gzdopen_p)(dup(fileno(h->fp)), "rb")))
- fatal("Unable to open gzip compressed fastq file (%s)", filename);
-#else
- fatal("Files compressed with gzip are not supported");
-#endif
- }
-
- if (h->format == FORMAT_BZIP)
- {
- /* BZIP2: Keep original file open, then open as bzipped file as well */
-#ifdef HAVE_BZLIB_H
- if (!bz2_lib)
- fatal("Files compressed with bzip2 are not supported");
- int bzError;
- if (! (h->fp_bz = (*BZ2_bzReadOpen_p)(& bzError, h->fp,
- BZ_VERBOSE_0, BZ_MORE_MEM,
- NULL, 0)))
- fatal("Unable to open bzip2 compressed fastq file (%s)", filename);
-#else
- fatal("Files compressed with bzip2 are not supported");
-#endif
- }
-
- h->stripped_all = 0;
-
- for(int i=0; i<256; i++)
- h->stripped[i] = 0;
-
- h->file_position = 0;
-
- buffer_init(& h->file_buffer);
- buffer_init(& h->header_buffer);
- buffer_init(& h->sequence_buffer);
- buffer_init(& h->quality_buffer);
-
- h->lineno = 1;
- h->lineno_start = 1;
- h->seqno = -1;
-
- for(int i=0; i<256; i++)
- map_identity[i] = i;
-
- return h;
-}
-
-void fastq_close(fastq_handle h)
-{
- /* Warn about stripped chars */
-
- if (h->stripped_all)
- {
- fprintf(stderr, "WARNING: invalid characters stripped from fastq file:");
- for (int i=0; i<256;i++)
- if (h->stripped[i])
- fprintf(stderr, " %c(%lu)", i, h->stripped[i]);
- fprintf(stderr, "\n");
-
- if (opt_log)
- {
- fprintf(fp_log, "WARNING: invalid characters stripped from fastq file:");
- for (int i=0; i<256;i++)
- if (h->stripped[i])
- fprintf(fp_log, " %c(%lu)", i, h->stripped[i]);
- fprintf(fp_log, "\n");
- }
- }
-
-#ifdef HAVE_BZLIB_H
- int bz_error;
-#endif
-
- switch(h->format)
- {
- case FORMAT_PLAIN:
- break;
-
- case FORMAT_GZIP:
-#ifdef HAVE_ZLIB_H
- (*gzclose_p)(h->fp_gz);
- h->fp_gz = 0;
- break;
-#endif
-
- case FORMAT_BZIP:
-#ifdef HAVE_BZLIB_H
- (*BZ2_bzReadClose_p)(&bz_error, h->fp_bz);
- h->fp_bz = 0;
- break;
-#endif
-
- default:
- fatal("Internal error");
- }
-
- fclose(h->fp);
- h->fp = 0;
-
- buffer_free(& h->file_buffer);
- buffer_free(& h->header_buffer);
- buffer_free(& h->sequence_buffer);
- buffer_free(& h->quality_buffer);
-
- h->file_size = 0;
- h->file_position = 0;
-
- h->lineno = 0;
- h->seqno = -1;
-
- free(h);
- h=0;
-}
-
-
-unsigned long fastq_file_fill_buffer(fastq_handle h)
-{
- /* read more data if necessary */
- unsigned long rest = h->file_buffer.length - h->file_buffer.position;
-
- if (rest > 0)
- return rest;
- else
- {
- unsigned long space = h->file_buffer.alloc - h->file_buffer.length;
-
- if (space == 0)
- {
- /* back to beginning of buffer */
- h->file_buffer.position = 0;
- h->file_buffer.length = 0;
- space = h->file_buffer.alloc;
- }
-
- int bytes_read = 0;
-
-#ifdef HAVE_BZLIB_H
- int bzError = 0;
-#endif
-
- switch(h->format)
- {
- case FORMAT_PLAIN:
- bytes_read = fread(h->file_buffer.data
- + h->file_buffer.position,
- 1,
- space,
- h->fp);
- break;
-
- case FORMAT_GZIP:
-#ifdef HAVE_ZLIB_H
- bytes_read = (*gzread_p)(h->fp_gz,
- h->file_buffer.data
- + h->file_buffer.position,
- space);
- if (bytes_read < 0)
- fatal("Error reading gzip compressed fastq file");
- break;
-#endif
-
- case FORMAT_BZIP:
-#ifdef HAVE_BZLIB_H
- bytes_read = (*BZ2_bzRead_p)(& bzError,
- h->fp_bz,
- h->file_buffer.data
- + h->file_buffer.position,
- space);
- if ((bytes_read < 0) ||
- ! ((bzError == BZ_OK) ||
- (bzError == BZ_STREAM_END) ||
- (bzError == BZ_SEQUENCE_ERROR)))
- fatal("Error reading bzip2 compressed fastq file");
- break;
-#endif
-
- default:
- fatal("Internal error");
- }
-
- h->file_buffer.length += bytes_read;
- return bytes_read;
- }
-}
-
-void buffer_extend(struct fastq_buffer_s * dest_buffer,
- char * source_buf,
- unsigned long len)
-{
- buffer_makespace(dest_buffer, len+1);
- memcpy(dest_buffer->data + dest_buffer->length,
- source_buf,
- len);
- dest_buffer->length += len;
- dest_buffer->data[dest_buffer->length] = 0;
-}
-
-void buffer_filter_extend(fastq_handle h,
- struct fastq_buffer_s * dest_buffer,
+void buffer_filter_extend(fastx_handle h,
+ struct fastx_buffer_s * dest_buffer,
char * source_buf,
unsigned long len,
unsigned int * char_action,
@@ -446,7 +147,22 @@ void buffer_filter_extend(fastq_handle h,
dest_buffer->length += q - d;
}
-bool fastq_next(fastq_handle h,
+fastx_handle fastq_open(const char * filename)
+{
+ fastx_handle h = fastx_open(filename);
+
+ if (!fastx_is_fastq(h))
+ fatal("FASTQ file expected, FASTA file found (%s)", filename);
+
+ return h;
+}
+
+void fastq_close(fastx_handle h)
+{
+ fastx_close(h);
+}
+
+bool fastq_next(fastx_handle h,
bool truncateatspace,
char * char_mapping)
{
@@ -459,7 +175,7 @@ bool fastq_next(fastq_handle h,
h->lineno_start = h->lineno;
- unsigned long rest = fastq_file_fill_buffer(h);
+ unsigned long rest = fastx_file_fill_buffer(h);
/* check end of file */
@@ -479,7 +195,7 @@ bool fastq_next(fastq_handle h,
while (lf == 0)
{
/* get more data if buffer empty */
- rest = fastq_file_fill_buffer(h);
+ rest = fastx_file_fill_buffer(h);
if (rest == 0)
fastq_fatal(h->lineno, "Unexpected end of file");
@@ -510,7 +226,7 @@ bool fastq_next(fastq_handle h,
while (1)
{
/* get more data, if necessary */
- rest = fastq_file_fill_buffer(h);
+ rest = fastx_file_fill_buffer(h);
/* cannot end here */
if (rest == 0)
@@ -548,7 +264,7 @@ bool fastq_next(fastq_handle h,
unsigned long lineno_plus = h->lineno;
/* read + line */
- fastq_buffer_s plusline_buffer;
+ fastx_buffer_s plusline_buffer;
buffer_init(&plusline_buffer);
/* skip + character */
@@ -559,7 +275,7 @@ bool fastq_next(fastq_handle h,
while (lf == 0)
{
/* get more data if buffer empty */
- rest = fastq_file_fill_buffer(h);
+ rest = fastx_file_fill_buffer(h);
/* cannot end here */
if (rest == 0)
@@ -614,7 +330,7 @@ bool fastq_next(fastq_handle h,
while (1)
{
/* get more data, if necessary */
- rest = fastq_file_fill_buffer(h);
+ rest = fastx_file_fill_buffer(h);
/* end if no more data */
if (rest == 0)
@@ -642,7 +358,7 @@ bool fastq_next(fastq_handle h,
& h->quality_buffer,
h->file_buffer.data + h->file_buffer.position,
len,
- char_fq_action_qual, map_identity, lineno_qual);
+ char_fq_action_qual, chrmap_identity, lineno_qual);
h->file_buffer.position += len;
rest -= len;
}
@@ -656,69 +372,57 @@ bool fastq_next(fastq_handle h,
buffer_truncate(& h->header_buffer, truncateatspace);
-#ifdef HAVE_ZLIB_H
- if (h->format == FORMAT_GZIP)
- {
- /* Circumvent the missing gzoffset function in zlib 1.2.3 and earlier */
- int fd = dup(fileno(h->fp));
- h->file_position = lseek(fd, 0, SEEK_CUR);
- close(fd);
- }
- else
-#endif
- h->file_position = ftell(h->fp);
-
h->seqno++;
return 1;
}
-char * fastq_get_quality(fastq_handle h)
+char * fastq_get_quality(fastx_handle h)
{
return h->quality_buffer.data;
}
-unsigned long fastq_get_quality_length(fastq_handle h)
+unsigned long fastq_get_quality_length(fastx_handle h)
{
return h->quality_buffer.length;
}
-unsigned long fastq_get_position(fastq_handle h)
+unsigned long fastq_get_position(fastx_handle h)
{
return h->file_position;
}
-unsigned long fastq_get_size(fastq_handle h)
+unsigned long fastq_get_size(fastx_handle h)
{
return h->file_size;
}
-unsigned long fastq_get_lineno(fastq_handle h)
+unsigned long fastq_get_lineno(fastx_handle h)
{
return h->lineno_start;
}
-unsigned long fastq_get_seqno(fastq_handle h)
+unsigned long fastq_get_seqno(fastx_handle h)
{
return h->seqno;
}
-unsigned long fastq_get_header_length(fastq_handle h)
+unsigned long fastq_get_header_length(fastx_handle h)
{
return h->header_buffer.length;
}
-unsigned long fastq_get_sequence_length(fastq_handle h)
+unsigned long fastq_get_sequence_length(fastx_handle h)
{
return h->sequence_buffer.length;
}
-char * fastq_get_header(fastq_handle h)
+char * fastq_get_header(fastx_handle h)
{
return h->header_buffer.data;
}
-char * fastq_get_sequence(fastq_handle h)
+char * fastq_get_sequence(fastx_handle h)
{
return h->sequence_buffer.data;
}
@@ -802,7 +506,7 @@ void fastq_print_relabel(FILE * fp,
fastq_print_quality(fp, quality);
}
-long fastq_get_abundance(fastq_handle h)
+long fastq_get_abundance(fastx_handle h)
{
return abundance_get(global_abundance, h->header_buffer.data);
}
diff --git a/src/fastq.h b/src/fastq.h
index 0bffae2..daad7c7 100644
--- a/src/fastq.h
+++ b/src/fastq.h
@@ -58,64 +58,23 @@
*/
-struct fastq_buffer_s
-{
- char * data;
- unsigned long length;
- unsigned long alloc;
- unsigned long position;
-};
-
-struct fastq_s
-{
- FILE * fp;
-
-#ifdef HAVE_ZLIB_H
- gzFile fp_gz;
-#endif
-
-#ifdef HAVE_BZLIB_H
- BZFILE * fp_bz;
-#endif
-
- struct fastq_buffer_s file_buffer;
-
- struct fastq_buffer_s header_buffer;
- struct fastq_buffer_s sequence_buffer;
- struct fastq_buffer_s quality_buffer;
-
- unsigned long file_size;
- unsigned long file_position;
-
- unsigned long lineno;
- unsigned long lineno_start;
- long seqno;
-
- unsigned long stripped_all;
- unsigned long stripped[256];
-
- int format;
-
-};
-
-typedef struct fastq_s * fastq_handle;
-
-fastq_handle fastq_open(const char * filename);
-void fastq_close(fastq_handle h);
-bool fastq_next(fastq_handle h,
+void fastq_open_rest(fastx_handle h);
+fastx_handle fastq_open(const char * filename);
+void fastq_close(fastx_handle h);
+bool fastq_next(fastx_handle h,
bool truncateatspace,
char * char_mapping);
-unsigned long fastq_get_position(fastq_handle h);
-unsigned long fastq_get_size(fastq_handle h);
-unsigned long fastq_get_lineno(fastq_handle h);
-unsigned long fastq_get_seqno(fastq_handle h);
-char * fastq_get_header(fastq_handle h);
-char * fastq_get_sequence(fastq_handle h);
-char * fastq_get_quality(fastq_handle h);
-long fastq_get_abundance(fastq_handle h);
-unsigned long fastq_get_header_length(fastq_handle h);
-unsigned long fastq_get_sequence_length(fastq_handle h);
-unsigned long fastq_get_quality_length(fastq_handle h);
+unsigned long fastq_get_position(fastx_handle h);
+unsigned long fastq_get_size(fastx_handle h);
+unsigned long fastq_get_lineno(fastx_handle h);
+unsigned long fastq_get_seqno(fastx_handle h);
+char * fastq_get_header(fastx_handle h);
+char * fastq_get_sequence(fastx_handle h);
+char * fastq_get_quality(fastx_handle h);
+long fastq_get_abundance(fastx_handle h);
+unsigned long fastq_get_header_length(fastx_handle h);
+unsigned long fastq_get_sequence_length(fastx_handle h);
+unsigned long fastq_get_quality_length(fastx_handle h);
void fastq_print(FILE * fp, char * header, char * sequence, char * quality);
diff --git a/src/fastqops.cc b/src/fastqops.cc
index 2281e31..4721e3b 100644
--- a/src/fastqops.cc
+++ b/src/fastqops.cc
@@ -82,7 +82,7 @@ int fastq_get_qual(char q)
void fastq_filter()
{
- fastq_handle h = fastq_open(opt_fastq_filter);
+ fastx_handle h = fastq_open(opt_fastq_filter);
unsigned long filesize = fastq_get_size(h);
@@ -347,7 +347,7 @@ void fastq_chars()
maxrun[c] = 0;
}
- fastq_handle h = fastq_open(opt_fastq_chars);
+ fastx_handle h = fastq_open(opt_fastq_chars);
unsigned long filesize = fastq_get_size(h);
@@ -525,7 +525,7 @@ double q2p(double q)
void fastq_stats()
{
- fastq_handle h = fastq_open(opt_fastq_stats);
+ fastx_handle h = fastq_open(opt_fastq_stats);
unsigned long filesize = fastq_get_size(h);
@@ -951,7 +951,7 @@ void fastx_revcomp()
void fastq_convert()
{
- fastq_handle h = fastq_open(opt_fastq_convert);
+ fastx_handle h = fastq_open(opt_fastq_convert);
if (!h)
fatal("Unable to open FASTQ file");
diff --git a/src/fastx.cc b/src/fastx.cc
index 7953d69..0d5344a 100644
--- a/src/fastx.cc
+++ b/src/fastx.cc
@@ -60,7 +60,10 @@
#include "vsearch.h"
-/* file type detector and wrapper for fastq and fastx parser */
+/* file compression and format detector */
+/* basic file buffering function for fastq and fastx parsers */
+
+#define FASTX_BUFFER_ALLOC 8192
#ifdef HAVE_BZLIB_H
#define BZ_VERBOSE_0 0
@@ -79,151 +82,308 @@
static unsigned char MAGIC_GZIP[] = "\x1f\x8b";
static unsigned char MAGIC_BZIP[] = "BZ";
-int fastx_detect(const char * filename)
+
+void buffer_init(struct fastx_buffer_s * buffer)
+{
+ buffer->alloc = FASTX_BUFFER_ALLOC;
+ buffer->data = (char*) xmalloc(buffer->alloc);
+ buffer->data[0] = 0;
+ buffer->length = 0;
+ buffer->position = 0;
+}
+
+void buffer_free(struct fastx_buffer_s * buffer)
+{
+ if (buffer->data)
+ free(buffer->data);
+ buffer->data = 0;
+ buffer->alloc = 0;
+ buffer->length = 0;
+ buffer->position = 0;
+}
+
+void buffer_makespace(struct fastx_buffer_s * buffer, unsigned long x)
+{
+ /* make sure there is space for x more chars in buffer */
+
+ if (buffer->length + x > buffer->alloc)
+ {
+ /* alloc space for x more characters,
+ but round up to nearest block size */
+ buffer->alloc =
+ ((buffer->length + x + FASTX_BUFFER_ALLOC - 1) / FASTX_BUFFER_ALLOC)
+ * FASTX_BUFFER_ALLOC;
+ buffer->data = (char*) xrealloc(buffer->data, buffer->alloc);
+ }
+}
+
+void buffer_truncate(struct fastx_buffer_s * buffer, bool truncateatspace)
{
+ /* Truncate the zero-terminated header string by inserting a new
+ terminator (zero byte) at the first space/tab character
+ (if truncateatspace) or first linefeed. */
+
+ if (truncateatspace)
+ buffer->length = strcspn(buffer->data, " \t\n");
+ else
+ buffer->length = strcspn(buffer->data, "\n");
+
+ buffer->data[buffer->length] = 0;
+}
+
+void buffer_extend(struct fastx_buffer_s * dest_buffer,
+ char * source_buf,
+ unsigned long len)
+{
+ buffer_makespace(dest_buffer, len+1);
+ memcpy(dest_buffer->data + dest_buffer->length,
+ source_buf,
+ len);
+ dest_buffer->length += len;
+ dest_buffer->data[dest_buffer->length] = 0;
+}
+
+fastx_handle fastx_open(const char * filename)
+{
+ fastx_handle h = (fastx_handle) xmalloc(sizeof(struct fastx_s));
+
+ h->fp = 0;
+
#ifdef HAVE_ZLIB_H
- gzFile fp_gz = 0;
+ h->fp_gz = 0;
#endif
#ifdef HAVE_BZLIB_H
- BZFILE * fp_bz = 0;
+ h->fp_bz = 0;
+ int bzError = 0;
#endif
+
+ h->fp = fopen(filename, "rb");
+ if (!h->fp)
+ fatal("Unable to open file for reading (%s)", filename);
- int format;
+ /* Get mode and size of original (uncompressed) file */
- FILE * fp = fopen(filename, "rb");
- if (!fp)
- fatal("Error: Unable to open file for reading (%s)", filename);
-
- /* detect compression (plain, gzipped or bzipped) */
-
- unsigned char magic[2];
- format = FORMAT_PLAIN;
- if (fread(&magic, 1, 2, fp) >= 2)
+ struct stat fs;
+ h->file_size = 0;
+
+ if (fstat(fileno(h->fp), & fs))
+ fatal("Unable to fstat on input file (%s)", filename);
+
+ h->is_pipe = ! S_ISREG(fs.st_mode);
+
+ h->file_size = 0;
+
+ if (! h->is_pipe)
{
- if (!memcmp(magic, MAGIC_GZIP, 2))
- format = FORMAT_GZIP;
- else if (!memcmp(magic, MAGIC_BZIP, 2))
- format = FORMAT_BZIP;
+ if (fseek(h->fp, 0, SEEK_END))
+ fatal("Unable to seek in input file (%s)", filename);
+ h->file_size = ftell(h->fp);
+ rewind(h->fp);
}
- /* close and reopen to avoid problems with gzip library */
- /* rewind was not enough */
+ if (opt_gzip_decompress)
+ {
+ h->format = FORMAT_GZIP;
+ }
+ else if (opt_bzip2_decompress)
+ {
+ h->format = FORMAT_BZIP;
+ }
+ else if (h->is_pipe)
+ {
+ h->format = FORMAT_PLAIN;
+ }
+ else
+ {
+ /* autodetect compression (plain, gzipped or bzipped) */
+
+ /* read two characters and compare with magic */
- fclose(fp);
- fp = fopen(filename, "rb");
- if (!fp)
- fatal("Error: Unable to open file for reading (%s)", filename);
+ unsigned char magic[2];
- if (format == FORMAT_GZIP)
+ h->format = FORMAT_PLAIN;
+ if (fread(&magic, 1, 2, h->fp) < 2)
+ fatal("Unable to read from file (%s)", filename);
+
+ if (memcmp(magic, MAGIC_GZIP, 2) == 0)
+ h->format = FORMAT_GZIP;
+ else if (memcmp(magic, MAGIC_BZIP, 2) == 0)
+ h->format = FORMAT_BZIP;
+
+ /* close and reopen to avoid problems with gzip library */
+ /* rewind was not enough */
+
+ fclose(h->fp);
+ h->fp = fopen(filename, "rb");
+ if (!h->fp)
+ fatal("Unable to open file for reading (%s)", filename);
+ }
+
+ if (h->format == FORMAT_GZIP)
{
- /* GZIP: Keep original file open, then open as bzipped file as well */
+ /* GZIP: Keep original file open, then open as gzipped file as well */
#ifdef HAVE_ZLIB_H
if (!gz_lib)
fatal("Files compressed with gzip are not supported");
- if (! (fp_gz = (*gzdopen_p)(fileno(fp), "rb")))
+ if (! (h->fp_gz = (*gzdopen_p)(fileno(h->fp), "rb"))) // dup?
fatal("Unable to open gzip compressed file (%s)", filename);
#else
fatal("Files compressed with gzip are not supported");
#endif
}
- if (format == FORMAT_BZIP)
+ if (h->format == FORMAT_BZIP)
{
/* BZIP2: Keep original file open, then open as bzipped file as well */
#ifdef HAVE_BZLIB_H
if (!bz2_lib)
fatal("Files compressed with bzip2 are not supported");
- int bzError;
- if (! (fp_bz = (*BZ2_bzReadOpen_p)(& bzError, fp,
- BZ_VERBOSE_0, BZ_MORE_MEM, NULL, 0)))
+ if (! (h->fp_bz = (*BZ2_bzReadOpen_p)(& bzError, h->fp,
+ BZ_VERBOSE_0, BZ_MORE_MEM,
+ NULL, 0)))
fatal("Unable to open bzip2 compressed file (%s)", filename);
#else
fatal("Files compressed with bzip2 are not supported");
#endif
}
- /* read one char and see if it starts with > or @ */
+ /* init buffers */
- const int BUFFERLEN = 1;
- char buffer[BUFFERLEN];
-
- int bytes_read = 0;
+ h->file_position = 0;
+
+ buffer_init(& h->file_buffer);
+
+ /* start filling up file buffer */
+
+ unsigned long rest = fastx_file_fill_buffer(h);
-#ifdef HAVE_BZLIB_H
- int bzError = 0;
-#endif
-
- switch(format)
+ if (rest < 2)
+ fatal("File too small");
+
+
+ /* examine first char and see if it starts with > or @ */
+
+ int filetype = 0;
+ char * first = h->file_buffer.data;
+
+ if (*first == '>')
{
- case FORMAT_PLAIN:
- bytes_read = fread(buffer,
- 1,
- BUFFERLEN,
- fp);
- break;
+ filetype = 1;
+ h->is_fastq = 0;
+ }
+ else if (*first == '@')
+ {
+ filetype = 2;
+ h->is_fastq = 1;
+ }
+
+ if (filetype == 0)
+ {
+ /* close files if unrecognized file type */
- case FORMAT_GZIP:
+ switch(h->format)
+ {
+ case FORMAT_PLAIN:
+ break;
+
+ case FORMAT_GZIP:
#ifdef HAVE_ZLIB_H
- bytes_read = (*gzread_p)(fp_gz,
- buffer,
- BUFFERLEN);
- if (bytes_read < 0)
- fatal("Error reading gzip compressed file (%s)", filename);
- break;
+ (*gzclose_p)(h->fp_gz);
+ h->fp_gz = 0;
+ break;
#endif
-
- case FORMAT_BZIP:
+
+ case FORMAT_BZIP:
#ifdef HAVE_BZLIB_H
- bytes_read = (*BZ2_bzRead_p)(& bzError,
- fp_bz,
- buffer,
- BUFFERLEN);
- if ((bytes_read < 0) ||
- ! ((bzError == BZ_OK) ||
- (bzError == BZ_STREAM_END) ||
- (bzError == BZ_SEQUENCE_ERROR)))
- fatal("Error reading bzip2 compressed file (%s)", filename);
- break;
+ (*BZ2_bzReadClose_p)(&bzError, h->fp_bz);
+ h->fp_bz = 0;
+ break;
#endif
+
+ default:
+ fatal("Internal error");
+ }
- default:
- fatal("Internal error");
+ fclose(h->fp);
+ h->fp = 0;
+
+ if (memcmp(first, MAGIC_GZIP, 2) == 0)
+ fatal("File appears to be gzip compressed. Please use --gzip_decompress");
+
+ if (memcmp(first, MAGIC_BZIP, 2) == 0)
+ fatal("File appears to be bzip2 compressed. Please use --bzip2_decompress");
+
+ fatal("File type not recognized.");
+
+ return 0;
}
- if (bytes_read < BUFFERLEN)
- fatal("Error reading file (%s)", filename);
+ /* more initialization */
- int filetype = 0;
- if (buffer[0] == '>')
- filetype = 1;
- else if (buffer[0] == '@')
- filetype = 2;
+ buffer_init(& h->header_buffer);
+ buffer_init(& h->sequence_buffer);
+ buffer_init(& h->quality_buffer);
+
+ h->stripped_all = 0;
+
+ for(int i=0; i<256; i++)
+ h->stripped[i] = 0;
- /* close files */
+ h->lineno = 1;
+ h->lineno_start = 1;
+ h->seqno = -1;
+
+ return h;
+}
+
+bool fastx_is_fastq(fastx_handle h)
+{
+ return h->is_fastq;
+}
+
+void fastx_close(fastx_handle h)
+{
+ /* Warn about stripped chars */
+
+ if (h->stripped_all)
+ {
+ fprintf(stderr, "WARNING: invalid characters stripped from fastq file:");
+ for (int i=0; i<256;i++)
+ if (h->stripped[i])
+ fprintf(stderr, " %c(%lu)", i, h->stripped[i]);
+ fprintf(stderr, "\n");
+
+ if (opt_log)
+ {
+ fprintf(fp_log, "WARNING: invalid characters stripped from fastq file:");
+ for (int i=0; i<256;i++)
+ if (h->stripped[i])
+ fprintf(fp_log, " %c(%lu)", i, h->stripped[i]);
+ fprintf(fp_log, "\n");
+ }
+ }
#ifdef HAVE_BZLIB_H
int bz_error;
#endif
-
- switch(format)
+
+ switch(h->format)
{
case FORMAT_PLAIN:
- fclose(fp);
- fp = 0;
break;
case FORMAT_GZIP:
#ifdef HAVE_ZLIB_H
- (*gzclose_p)(fp_gz);
- fp_gz = 0;
+ (*gzclose_p)(h->fp_gz);
+ h->fp_gz = 0;
break;
#endif
-
+
case FORMAT_BZIP:
#ifdef HAVE_BZLIB_H
- (*BZ2_bzReadClose_p)(&bz_error, fp_bz);
- fp_bz = 0;
+ (*BZ2_bzReadClose_p)(&bz_error, h->fp_bz);
+ h->fp_bz = 0;
break;
#endif
@@ -231,46 +391,107 @@ int fastx_detect(const char * filename)
fatal("Internal error");
}
- return filetype;
-}
+ fclose(h->fp);
+ h->fp = 0;
-bool fastx_is_fastq(fastx_handle h)
-{
- return h->is_fastq;
+ buffer_free(& h->file_buffer);
+ buffer_free(& h->header_buffer);
+ buffer_free(& h->sequence_buffer);
+ buffer_free(& h->quality_buffer);
+
+ h->file_size = 0;
+ h->file_position = 0;
+
+ h->lineno = 0;
+ h->seqno = -1;
+
+ free(h);
+ h=0;
}
-fastx_handle fastx_open(const char * filename)
+unsigned long fastx_file_fill_buffer(fastx_handle h)
{
- int filetype = fastx_detect(filename);
+ /* read more data if necessary */
+ unsigned long rest = h->file_buffer.length - h->file_buffer.position;
- if (filetype == 0)
- return 0;
+ if (rest > 0)
+ return rest;
+ else
+ {
+ unsigned long space = h->file_buffer.alloc - h->file_buffer.length;
- fastx_handle h = (fastx_handle) xmalloc(sizeof(struct fastx_s));
+ if (space == 0)
+ {
+ /* back to beginning of buffer */
+ h->file_buffer.position = 0;
+ h->file_buffer.length = 0;
+ space = h->file_buffer.alloc;
+ }
- h->is_fastq = (filetype == 2);
+ int bytes_read = 0;
- if (h->is_fastq)
- h->handle.fastq = fastq_open(filename);
- else
- h->handle.fasta = fasta_open(filename);
+#ifdef HAVE_BZLIB_H
+ int bzError = 0;
+#endif
- return h;
-}
+ switch(h->format)
+ {
+ case FORMAT_PLAIN:
+ bytes_read = fread(h->file_buffer.data
+ + h->file_buffer.position,
+ 1,
+ space,
+ h->fp);
+ break;
+
+ case FORMAT_GZIP:
+#ifdef HAVE_ZLIB_H
+ bytes_read = (*gzread_p)(h->fp_gz,
+ h->file_buffer.data
+ + h->file_buffer.position,
+ space);
+ if (bytes_read < 0)
+ fatal("Unable to read gzip compressed file");
+ break;
+#endif
-void fastx_close(fastx_handle h)
-{
- if (h->is_fastq)
- {
- fastq_close(h->handle.fastq);
- h->handle.fastq = 0;
- }
- else
- {
- fasta_close(h->handle.fasta);
- h->handle.fasta = 0;
+ case FORMAT_BZIP:
+#ifdef HAVE_BZLIB_H
+ bytes_read = (*BZ2_bzRead_p)(& bzError,
+ h->fp_bz,
+ h->file_buffer.data
+ + h->file_buffer.position,
+ space);
+ if ((bytes_read < 0) ||
+ ! ((bzError == BZ_OK) ||
+ (bzError == BZ_STREAM_END) ||
+ (bzError == BZ_SEQUENCE_ERROR)))
+ fatal("Unable to read from bzip2 compressed file");
+ break;
+#endif
+
+ default:
+ fatal("Internal error");
+ }
+
+ if (!h->is_pipe)
+ {
+#ifdef HAVE_ZLIB_H
+ if (h->format == FORMAT_GZIP)
+ {
+ /* Circumvent the missing gzoffset function in zlib 1.2.3 and earlier */
+ int fd = dup(fileno(h->fp));
+ h->file_position = lseek(fd, 0, SEEK_CUR);
+ close(fd);
+ }
+ else
+#endif
+ h->file_position = ftell(h->fp);
+ }
+
+ h->file_buffer.length += bytes_read;
+ return bytes_read;
}
- free(h);
}
bool fastx_next(fastx_handle h,
@@ -278,83 +499,83 @@ bool fastx_next(fastx_handle h,
char * char_mapping)
{
if (h->is_fastq)
- return fastq_next(h->handle.fastq, truncateatspace, char_mapping);
+ return fastq_next(h, truncateatspace, char_mapping);
else
- return fasta_next(h->handle.fasta, truncateatspace, char_mapping);
+ return fasta_next(h, truncateatspace, char_mapping);
}
unsigned long fastx_get_position(fastx_handle h)
{
if (h->is_fastq)
- return fastq_get_position(h->handle.fastq);
+ return fastq_get_position(h);
else
- return fasta_get_position(h->handle.fasta);
+ return fasta_get_position(h);
}
unsigned long fastx_get_size(fastx_handle h)
{
if (h->is_fastq)
- return fastq_get_size(h->handle.fastq);
+ return fastq_get_size(h);
else
- return fasta_get_size(h->handle.fasta);
+ return fasta_get_size(h);
}
unsigned long fastx_get_lineno(fastx_handle h)
{
if (h->is_fastq)
- return fastq_get_lineno(h->handle.fastq);
+ return fastq_get_lineno(h);
else
- return fasta_get_lineno(h->handle.fasta);
+ return fasta_get_lineno(h);
}
unsigned long fastx_get_seqno(fastx_handle h)
{
if (h->is_fastq)
- return fastq_get_seqno(h->handle.fastq);
+ return fastq_get_seqno(h);
else
- return fasta_get_seqno(h->handle.fasta);
+ return fasta_get_seqno(h);
}
char * fastx_get_header(fastx_handle h)
{
if (h->is_fastq)
- return fastq_get_header(h->handle.fastq);
+ return fastq_get_header(h);
else
- return fasta_get_header(h->handle.fasta);
+ return fasta_get_header(h);
}
char * fastx_get_sequence(fastx_handle h)
{
if (h->is_fastq)
- return fastq_get_sequence(h->handle.fastq);
+ return fastq_get_sequence(h);
else
- return fasta_get_sequence(h->handle.fasta);
+ return fasta_get_sequence(h);
}
unsigned long fastx_get_header_length(fastx_handle h)
{
if (h->is_fastq)
- return fastq_get_header_length(h->handle.fastq);
+ return fastq_get_header_length(h);
else
- return fasta_get_header_length(h->handle.fasta);
+ return fasta_get_header_length(h);
}
unsigned long fastx_get_sequence_length(fastx_handle h)
{
if (h->is_fastq)
- return fastq_get_sequence_length(h->handle.fastq);
+ return fastq_get_sequence_length(h);
else
- return fasta_get_sequence_length(h->handle.fasta);
+ return fasta_get_sequence_length(h);
}
char * fastx_get_quality(fastx_handle h)
{
if (h->is_fastq)
- return fastq_get_quality(h->handle.fastq);
+ return fastq_get_quality(h);
else
return 0;
}
@@ -362,8 +583,8 @@ char * fastx_get_quality(fastx_handle h)
long fastx_get_abundance(fastx_handle h)
{
if (h->is_fastq)
- return fastq_get_abundance(h->handle.fastq);
+ return fastq_get_abundance(h);
else
- return fasta_get_abundance(h->handle.fasta);
+ return fasta_get_abundance(h);
}
diff --git a/src/fastx.h b/src/fastx.h
index de9d68f..d3b396f 100644
--- a/src/fastx.h
+++ b/src/fastx.h
@@ -58,18 +58,59 @@
*/
+struct fastx_buffer_s
+{
+ char * data;
+ unsigned long length;
+ unsigned long alloc;
+ unsigned long position;
+};
+
+void buffer_init(struct fastx_buffer_s * buffer);
+void buffer_free(struct fastx_buffer_s * buffer);
+void buffer_extend(struct fastx_buffer_s * dest_buffer,
+ char * source_buf,
+ unsigned long len);
+void buffer_makespace(struct fastx_buffer_s * buffer, unsigned long x);
+void buffer_truncate(struct fastx_buffer_s * buffer, bool truncateatspace);
+
struct fastx_s
{
+ bool is_pipe;
bool is_fastq;
- union
- {
- fasta_handle fasta;
- fastq_handle fastq;
- } handle;
+
+ FILE * fp;
+
+#ifdef HAVE_ZLIB_H
+ gzFile fp_gz;
+#endif
+
+#ifdef HAVE_BZLIB_H
+ BZFILE * fp_bz;
+#endif
+
+ struct fastx_buffer_s file_buffer;
+
+ struct fastx_buffer_s header_buffer;
+ struct fastx_buffer_s sequence_buffer;
+ struct fastx_buffer_s quality_buffer;
+
+ unsigned long file_size;
+ unsigned long file_position;
+
+ unsigned long lineno;
+ unsigned long lineno_start;
+ long seqno;
+
+ unsigned long stripped_all;
+ unsigned long stripped[256];
+
+ int format;
};
typedef struct fastx_s * fastx_handle;
+
/* fastx input */
bool fastx_is_fastq(fastx_handle h);
@@ -89,3 +130,5 @@ unsigned long fastx_get_sequence_length(fastx_handle h);
char * fastx_get_quality(fastx_handle h);
long fastx_get_abundance(fastx_handle h);
+
+unsigned long fastx_file_fill_buffer(fastx_handle h);
diff --git a/src/maps.cc b/src/maps.cc
index 043212c..7ad4351 100644
--- a/src/maps.cc
+++ b/src/maps.cc
@@ -430,3 +430,56 @@ char chrmap_no_change[256] =
'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N',
'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N'
};
+
+char chrmap_identity[256] =
+ {
+ /* identity map */
+
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+
+ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
+
+ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
+ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
+
+ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
+ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
+
+ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
+ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
+
+ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
+ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
+
+ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
+ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
+
+ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
+ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
+
+ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
+
+ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
+ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
+
+ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
+ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
+
+ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
+ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
+
+ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
+ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
+
+ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
+ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
+
+ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
+ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
+
+ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
+ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff
+ };
diff --git a/src/maps.h b/src/maps.h
index fd51831..10bf8ce 100644
--- a/src/maps.h
+++ b/src/maps.h
@@ -72,3 +72,4 @@ extern char chrmap_complement[256];
extern char chrmap_normalize[256];
extern char chrmap_upcase[256];
extern char chrmap_no_change[256];
+extern char chrmap_identity[256];
diff --git a/src/mergepairs.cc b/src/mergepairs.cc
index 33d478c..1d5fbd0 100644
--- a/src/mergepairs.cc
+++ b/src/mergepairs.cc
@@ -77,8 +77,8 @@ static FILE * fp_fastqout_notmerged_rev = 0;
static FILE * fp_fastaout_notmerged_fwd = 0;
static FILE * fp_fastaout_notmerged_rev = 0;
static FILE * fp_eetabbedout = 0;
-static fastq_handle fastq_fwd;
-static fastq_handle fastq_rev;
+static fastx_handle fastq_fwd;
+static fastx_handle fastq_rev;
static long merged = 0;
static long notmerged = 0;
static long total = 0;
diff --git a/src/msa.cc b/src/msa.cc
index af873c6..959fc9b 100644
--- a/src/msa.cc
+++ b/src/msa.cc
@@ -62,80 +62,48 @@
/* Compute consensus sequence and msa of clustered sequences */
+#define PROFSIZE 6
+
static char * aln;
static int alnpos;
static int * profile;
void msa_add(char c)
{
- int * p = profile + 4 * alnpos;
+ int * p = profile + PROFSIZE * alnpos;
switch(toupper(c))
{
case 'A':
- p[0] += 12;
+ p[0]++;
break;
case 'C':
- p[1] += 12;
+ p[1]++;
break;
case 'G':
- p[2] += 12;
+ p[2]++;
break;
case 'T':
case 'U':
- p[3] += 12;
+ p[3]++;
break;
case 'R':
- p[0] += 6;
- p[2] += 6;
- break;
case 'Y':
- p[1] += 6;
- p[3] += 6;
- break;
case 'S':
- p[1] += 6;
- p[2] += 6;
- break;
case 'W':
- p[0] += 6;
- p[3] += 6;
- break;
case 'K':
- p[2] += 6;
- p[3] += 6;
- break;
case 'M':
- p[0] += 6;
- p[1] += 6;
- break;
case 'B':
- p[1] += 4;
- p[2] += 4;
- p[3] += 4;
- break;
case 'D':
- p[0] += 4;
- p[2] += 4;
- p[3] += 4;
- break;
case 'H':
- p[0] += 4;
- p[1] += 4;
- p[3] += 4;
- break;
case 'V':
- p[0] += 4;
- p[1] += 4;
- p[2] += 4;
- break;
case 'N':
- p[0] += 3;
- p[1] += 3;
- p[2] += 3;
- p[3] += 3;
+ p[4]++;
+ break;
+ case '-':
+ p[5]++;
break;
- }
+ }
aln[alnpos++] = c;
}
@@ -185,8 +153,8 @@ void msa(FILE * fp_msaout, FILE * fp_consout, FILE * fp_profile,
alnlen += centroid_len;
/* allocate memory for profile (for consensus) and aligned seq */
- profile = (int *) xmalloc(4 * sizeof(int) * alnlen);
- memset(profile, 0, 4 * sizeof(int) * alnlen);
+ profile = (int *) xmalloc(PROFSIZE * sizeof(int) * alnlen);
+ memset(profile, 0, PROFSIZE * sizeof(int) * alnlen);
aln = (char *) xmalloc(alnlen+1);
char * cons = (char *) xmalloc(alnlen+1);
@@ -313,26 +281,32 @@ void msa(FILE * fp_msaout, FILE * fp_consout, FILE * fp_profile,
}
else
{
- /* find most common symbol */
+ /* find most common symbol of A, C, G and T */
char best_sym = 0;
- int best_count = -1;
- int nongap_count = 0;
+ int best_count = 0;
for(int c=0; c<4; c++)
{
- int count = profile[4*i+c];
+ int count = profile[PROFSIZE*i+c];
if (count > best_count)
{
best_count = count;
- best_sym = c;
+ best_sym = c+1;
}
- nongap_count += count;
}
- int gap_count = 12 * target_count - nongap_count;
+ /* if no A, C, G, or T, check if there are any N's */
+ int n_count = profile[PROFSIZE*i+4];
+ if ((best_count == 0) && (n_count > 0))
+ {
+ best_count = n_count;
+ best_sym = 15; // N
+ }
+ /* compare to the number of gap symbols */
+ int gap_count = profile[PROFSIZE*i+5];
if (best_count >= gap_count)
{
- char sym = sym_nt_2bit[(int)best_sym];
+ char sym = sym_nt_4bit[(int)best_sym];
aln[i] = sym;
cons[conslen++] = sym;
}
diff --git a/src/rerep.cc b/src/rerep.cc
index d886f93..cf15b1b 100644
--- a/src/rerep.cc
+++ b/src/rerep.cc
@@ -74,7 +74,7 @@ void rereplicate()
fatal("Unable to open fasta output file for writing");
}
- fasta_handle fh = fasta_open(opt_rereplicate);
+ fastx_handle fh = fasta_open(opt_rereplicate);
long filesize = fasta_get_size(fh);
progress_init("Rereplicating", filesize);
diff --git a/src/search.cc b/src/search.cc
index 0fad7bb..8b5dcee 100644
--- a/src/search.cc
+++ b/src/search.cc
@@ -68,7 +68,7 @@ static pthread_t * pthread;
static int tophits; /* the maximum number of hits to keep */
static int seqcount; /* number of database sequences */
static pthread_attr_t attr;
-static fasta_handle query_fasta_h;
+static fastx_handle query_fasta_h;
/* global data protected by mutex */
static pthread_mutex_t mutex_input;
diff --git a/src/searchcore.cc b/src/searchcore.cc
index 9bdcfd9..c4f3e23 100644
--- a/src/searchcore.cc
+++ b/src/searchcore.cc
@@ -433,9 +433,9 @@ int search_acceptable_aligned(struct searchinfo_s * si,
((!opt_rightjust) || (hit->trim_q_right +
hit->trim_t_right == 0)) &&
/* query_cov */
- (hit->internal_alignmentlength >= opt_query_cov * si->qseqlen) &&
+ (hit->matches + hit->mismatches >= opt_query_cov * si->qseqlen) &&
/* target_cov */
- (hit->internal_alignmentlength >=
+ (hit->matches + hit->mismatches >=
opt_target_cov * db_getsequencelen(hit->target)) &&
/* maxid */
(hit->id <= 100.0 * opt_maxid) &&
diff --git a/src/searchexact.cc b/src/searchexact.cc
index 677ccca..739a46d 100644
--- a/src/searchexact.cc
+++ b/src/searchexact.cc
@@ -68,7 +68,7 @@ static pthread_t * pthread;
static int tophits; /* the maximum number of hits to keep */
static int seqcount; /* number of database sequences */
static pthread_attr_t attr;
-static fasta_handle query_fasta_h;
+static fastx_handle query_fasta_h;
/* global data protected by mutex */
static pthread_mutex_t mutex_input;
diff --git a/src/util.cc b/src/util.cc
index f9afc5c..c28aac0 100644
--- a/src/util.cc
+++ b/src/util.cc
@@ -93,7 +93,7 @@ void progress_update(unsigned long progress)
fprintf(stderr, " \r%s %.0f%%", progress_prompt,
100.0 * progress / progress_size);
else
- fprintf(stderr, " \r%s ?%%", progress_prompt);
+ fprintf(stderr, " \r%s 0%%", progress_prompt);
progress_next = progress + progress_chunk;
}
}
diff --git a/src/vsearch.cc b/src/vsearch.cc
index bd62f36..8cfdc10 100644
--- a/src/vsearch.cc
+++ b/src/vsearch.cc
@@ -62,6 +62,7 @@
/* options */
+bool opt_bzip2_decompress;
bool opt_clusterout_id;
bool opt_clusterout_sort;
bool opt_eeout;
@@ -69,6 +70,7 @@ bool opt_fasta_score;
bool opt_fastq_allowmergestagger;
bool opt_fastq_eeout;
bool opt_fastq_nostagger;
+bool opt_gzip_decompress;
bool opt_quiet;
bool opt_relabel_keep;
bool opt_relabel_md5;
@@ -264,6 +266,9 @@ FILE * fp_log = 0;
abundance_t * global_abundance;
+char * STDIN_NAME = (char*) "/dev/stdin";
+char * STDOUT_NAME = (char*) "/dev/stdout";
+
#define cpuid(f1, f2, a, b, c, d) \
__asm__ __volatile__ ("cpuid" \
: "=a" (a), "=b" (b), "=c" (c), "=d" (d) \
@@ -514,6 +519,7 @@ void args_init(int argc, char **argv)
opt_alnout = 0;
opt_blast6out = 0;
opt_borderline = 0;
+ opt_bzip2_decompress = 0;
opt_centroids = 0;
opt_chimeras = 0;
opt_cluster_fast = 0;
@@ -587,6 +593,7 @@ void args_init(int argc, char **argv)
opt_gap_open_target_interior=20;
opt_gap_open_target_left=2;
opt_gap_open_target_right=2;
+ opt_gzip_decompress = 0;
opt_hardmask = 0;
opt_help = 0;
opt_id = -1.0;
@@ -862,6 +869,8 @@ void args_init(int argc, char **argv)
{"minhsp", required_argument, 0, 0 },
{"band", required_argument, 0, 0 },
{"hspw", required_argument, 0, 0 },
+ {"gzip_decompress", no_argument, 0, 0 },
+ {"bzip2_decompress", no_argument, 0, 0 },
{ 0, 0, 0, 0 }
};
@@ -1600,6 +1609,16 @@ void args_init(int argc, char **argv)
fprintf(stderr, "WARNING: Option --hspw is ignored\n");
break;
+ case 173:
+ /* gzip_decompress */
+ opt_gzip_decompress = 1;
+ break;
+
+ case 174:
+ /* bzip2_decompress */
+ opt_bzip2_decompress = 1;
+ break;
+
default:
fatal("Internal error in option parsing");
}
@@ -1751,6 +1770,9 @@ void args_init(int argc, char **argv)
if (opt_fastq_qminout > opt_fastq_qmaxout)
fatal("The argument to --fastq_qminout cannot be larger than to --fastq_qmaxout");
+ if (opt_gzip_decompress && opt_bzip2_decompress)
+ fatal("Specify either --gzip_decompress or --bzip2_decompress, not both");
+
/* TODO: check valid range of gap penalties */
/* adapt/adjust parameters */
@@ -1794,6 +1816,87 @@ void args_init(int argc, char **argv)
else
opt_minseqlength = 1;
}
+
+ /* replace filename "-" by "/dev/stdin" for input file options */
+
+ char * * stdin_options[] =
+ {
+ & opt_allpairs_global,
+ & opt_cluster_fast,
+ & opt_cluster_size,
+ & opt_cluster_smallmem,
+ & opt_db,
+ & opt_derep_fulllength,
+ & opt_derep_prefix,
+ & opt_fastq_chars,
+ & opt_fastq_convert,
+ & opt_fastq_eestats,
+ & opt_fastq_filter,
+ & opt_fastq_mergepairs,
+ & opt_fastq_stats,
+ & opt_fastx_mask,
+ & opt_fastx_revcomp,
+ & opt_fastx_subsample,
+ & opt_maskfasta,
+ & opt_rereplicate,
+ & opt_reverse,
+ & opt_search_exact,
+ & opt_shuffle,
+ & opt_sortbylength,
+ & opt_sortbysize,
+ & opt_uchime_denovo,
+ & opt_uchime_ref,
+ & opt_usearch_global,
+ 0
+ };
+
+ int i = 0;
+ while(char * * stdin_opt = stdin_options[i++])
+ if ((*stdin_opt) && (!strcmp(*stdin_opt, "-")))
+ *stdin_opt = STDIN_NAME;
+
+
+ /* replace filename "-" by "/dev/stdout" for output file options */
+
+ char * * stdout_options[] =
+ {
+ & opt_alnout,
+ & opt_blast6out,
+ & opt_borderline,
+ & opt_centroids,
+ & opt_chimeras,
+ & opt_consout,
+ & opt_dbmatched,
+ & opt_dbnotmatched,
+ & opt_eetabbedout,
+ & opt_fastaout,
+ & opt_fastaout_discarded,
+ & opt_fastaout_notmerged_fwd,
+ & opt_fastaout_notmerged_rev,
+ & opt_fastapairs,
+ & opt_fastqout,
+ & opt_fastqout_discarded,
+ & opt_fastqout_notmerged_fwd,
+ & opt_fastqout_notmerged_rev,
+ & opt_log,
+ & opt_matched,
+ & opt_msaout,
+ & opt_nonchimeras,
+ & opt_notmatched,
+ & opt_output,
+ & opt_profile,
+ & opt_samout,
+ & opt_uc,
+ & opt_uchimealns,
+ & opt_uchimeout,
+ & opt_userout,
+ 0
+ };
+
+ int o = 0;
+ while(char * * stdout_opt = stdout_options[o++])
+ if ((*stdout_opt) && (!strcmp(*stdout_opt, "-")))
+ *stdout_opt = STDOUT_NAME;
}
@@ -1810,7 +1913,9 @@ void cmd_help()
fprintf(stdout,
"\n"
"General options\n"
+ " --bzip2_decompress decompress input with bzip2 (required for pipes)\n"
" --fasta_width INT width of FASTA seq lines, 0 for no wrap (80)\n"
+ " --gzip_decompress decompress input with gzip (required for pipes)\n"
" --help | --h display help information\n"
" --log FILENAME write messages, timing and memory info to file\n"
" --maxseqlength INT maximum sequence length (50000)\n"
@@ -2343,8 +2448,10 @@ void cmd_uchime()
if (opt_minh <= 0.0)
fatal("Argument to --minh must be > 0");
+#if 0
if (opt_abskew <= 1.0)
fatal("Argument to --abskew must be > 1");
+#endif
chimera();
}
@@ -2401,9 +2508,9 @@ void show_header()
{
if (! opt_quiet)
{
- fprintf(stdout, "%s\n", progheader);
- fprintf(stdout, "https://github.com/torognes/vsearch\n");
- fprintf(stdout, "\n");
+ fprintf(stderr, "%s\n", progheader);
+ fprintf(stderr, "https://github.com/torognes/vsearch\n");
+ fprintf(stderr, "\n");
}
}
diff --git a/src/vsearch.h b/src/vsearch.h
index a2353ec..43ba02c 100644
--- a/src/vsearch.h
+++ b/src/vsearch.h
@@ -138,9 +138,9 @@
#include "cpu.h"
#include "allpairs.h"
#include "subsample.h"
+#include "fastx.h"
#include "fasta.h"
#include "fastq.h"
-#include "fastx.h"
#include "fastqops.h"
#include "dbhash.h"
#include "searchexact.h"
@@ -159,6 +159,7 @@
/* options */
+extern bool opt_bzip2_decompress;
extern bool opt_clusterout_id;
extern bool opt_clusterout_sort;
extern bool opt_eeout;
@@ -166,6 +167,7 @@ extern bool opt_fasta_score;
extern bool opt_fastq_allowmergestagger;
extern bool opt_fastq_eeout;
extern bool opt_fastq_nostagger;
+extern bool opt_gzip_decompress;
extern bool opt_quiet;
extern bool opt_relabel_keep;
extern bool opt_relabel_md5;
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/vsearch.git
More information about the debian-med-commit
mailing list