[med-svn] [Git][med-team/andi][upstream] New upstream version 0.13
Andreas Tille
gitlab at salsa.debian.org
Fri Mar 27 18:04:29 GMT 2020
Andreas Tille pushed to branch upstream at Debian Med / andi
Commits:
b8148b95 by Andreas Tille at 2020-03-27T19:01:08+01:00
New upstream version 0.13
- - - - -
29 changed files:
- .travis.yml
- Makefile.am
- README.md
- andi-manual.pdf
- configure.ac
- docs/andi.1.in
- docs/manual/andi-manual.tex
- libs/pfasta.c
- libs/pfasta.h
- − opt/psufsort/Makefile.am
- − opt/psufsort/c_interface.cxx
- − opt/psufsort/interface.h
- − opt/psufsort/psufsort.cxx
- src/Makefile.am
- src/andi.c
- src/dist_hack.h
- src/esa.c
- src/esa.h
- src/io.c
- src/process.c
- src/sequence.c
- src/sequence.h
- test/Makefile.am
- test/low_homo.sh
- test/nan.sh
- test/test_esa.c
- test/test_extra.sh
- test/test_join.sh
- test/test_seq.c
Changes:
=====================================
.travis.yml
=====================================
@@ -36,11 +36,5 @@ script:
- export MYFLAGS="-I$LIBDIVDIR/include"
- ./configure $CONFIGURE_FLAGS --enable-unit-tests LDFLAGS="-L$LIBDIVDIR/lib" CFLAGS="$MYFLAGS" CXXFLAGS="$MYFLAGS"
- make distcheck DISTCHECK_CONFIGURE_FLAGS="LDFLAGS=\"-L$LIBDIVDIR/lib\" CFLAGS=\"-I$LIBDIVDIR/include\" CXXFLAGS=\"-I$LIBDIVDIR/include\" $CONFIGURE_FLAGS"
-- tar xzvf andi-*.tar.gz
-- cd andi-*
-- ./configure $CONFIGURE_FLAGS --enable-unit-tests --without-libdivsufsort
-- make
-- make check || cat ./test-suite.log || exit 1
-- cd ..
after_success:
- if [ "$CXX" = "g++" ]; then coveralls --exclude libdivsufsort-master -E '^andi-.*' --exclude libs --exclude test --gcov `which gcov-4.8` --gcov-options '\-lp'; fi
=====================================
Makefile.am
=====================================
@@ -3,12 +3,8 @@ AM_DISTCHECK_CONFIGURE_FLAGS="--enable-unit-tests"
.PHONY: all
-if !BUILD_WITH_LIBDIVSUFSORT
-OPT_PSUFSORT = opt/psufsort
-endif
-
-SUBDIRS = . $(OPT_PSUFSORT) libs opt src docs
-DIST_SUBDIRS = . libs opt src docs test opt/psufsort
+SUBDIRS = . libs opt src docs
+DIST_SUBDIRS = . libs opt src docs test
# Conditionally build the tests
if BUILD_TESTS
=====================================
README.md
=====================================
@@ -11,13 +11,14 @@ This readme covers all necessary instructions for the impatient to get `andi` up
Stable versions of `andi` are available via package managers. For manual installation see below.
-For Debian and Ubuntu (starting 16.04):
+For Debian and Ubuntu:
sudo apt-get install andi
For OS X with Homebrew:
- brew install homebrew/science/andi
+ brew tap brewsci/bio
+ brew install andi
For ArchLinux with aura:
@@ -46,7 +47,7 @@ This program has the following external dependencies: [libdivsufsort](https://gi
Assuming you have installed all prerequisites, building is as easy as follows.
- $ autoreconf -fi -Im4 # optional when build from tarball
+ $ autoreconf -fi -Im4 # optional when building from tarball
$ ./configure
$ make
$ make install
@@ -67,7 +68,7 @@ The release of this software is accompanied by a paper from [Haubold et al.](htt
## License
-Copyright © 2014 - 2017 Fabian Klötzl
+Copyright © 2014 - 2020 Fabian Klötzl
License GPLv3+: GNU GPL version 3 or later.
This is free software: you are free to change and redistribute it. There is NO WARRANTY, to the extent permitted by law. The full license text is available at <http://gnu.org/licenses/gpl.html>.
=====================================
andi-manual.pdf
=====================================
Binary files a/andi-manual.pdf and b/andi-manual.pdf differ
=====================================
configure.ac
=====================================
@@ -1,4 +1,4 @@
-AC_INIT([andi], [0.12])
+AC_INIT([andi], [0.13])
AM_INIT_AUTOMAKE([-Wall foreign])
AC_CONFIG_MACRO_DIR([m4])
@@ -26,36 +26,16 @@ AS_IF([test "x$have_gsl" = "xno"],[
AC_MSG_ERROR([Missing the Gnu Scientific Library.])
])
-# By default try to build with libdivsufsort.
-AC_ARG_WITH([libdivsufsort],
- AS_HELP_STRING([--without-libdivsufsort], [Build without libdivsufsort and use psufsort instead.]))
-
-AS_IF([test "x$with_libdivsufsort" != "xno"],
- [
- # The libdivsufsort header contains some Microsoft extension making
- # compilation fail on certain systems (i.e. OS X). Add the following
- # flag so the build runs smoothly.
- CPPFLAGS="$CPPFLAGS -fms-extensions"
- AC_CHECK_HEADERS([divsufsort.h],[have_libdivsufsort=yes],[have_libdivsufsort=no])
- AC_CHECK_LIB(divsufsort, divsufsort, [], [have_libdivsufsort=no])
- ],
- [
- have_libdivsufsort=no
- # psufsort needs C++11
- AX_CXX_COMPILE_STDCXX_11([],[mandatory])
- ]
-)
-
-AS_IF([test "x$have_libdivsufsort" = "xno"],
- [AS_IF([test "x$with_libdivsufsort" != "xno"],
- [AC_MSG_ERROR([Configuration for libdivsufsort failed. Either install libdivsufsort, or use our replacement, psufsort, instead.
- ./configure --without-libdivsufsort
-The latter may result in longer runtimes.])
- ])
-])
-
-AM_CONDITIONAL([BUILD_WITH_LIBDIVSUFSORT],[test "x${with_libdivsufsort}" != "xno"])
+# The libdivsufsort header contains some Microsoft extension making
+# compilation fail on certain systems (i.e. OS X). Add the following
+# flag so the build runs smoothly.
+CPPFLAGS="$CPPFLAGS -fms-extensions"
+AC_CHECK_HEADERS([divsufsort.h],[have_libdivsufsort=yes],[have_libdivsufsort=no])
+AC_CHECK_LIB(divsufsort, divsufsort, [], [have_libdivsufsort=no])
+AS_IF([test "x$have_libdivsufsort" = "xno"],[
+ AC_MSG_ERROR([Missing libdivsufsort.])
+])
# The unit tests require GLIB2. So by default do not build the test.
@@ -124,7 +104,6 @@ AC_CONFIG_FILES([
docs/Makefile
libs/Makefile
opt/Makefile
- opt/psufsort/Makefile
src/Makefile
test/Makefile
])
=====================================
docs/andi.1.in
=====================================
@@ -1,11 +1,10 @@
-.TH ANDI "1" "2017-09-17" "@VERSION@" "andi manual"
+.TH ANDI "1" "2020-01-09" "@VERSION@" "andi manual"
.SH NAME
-andi \- estimates evolutionary distance
+andi \- estimates evolutionary distances
.SH SYNOPSIS
.B andi
[\fIOPTIONS...\fR] \fIFILES\fR...
.SH DESCRIPTION
-.TP
\fBandi\fR estimates the evolutionary distance between closely related genomes. For this \fBandi\fR reads the input sequences from \fIFASTA\fR files and computes the pairwise anchor distance. The idea behind this is explained in a paper by Haubold et al. (2015).
.SH OUTPUT
The output is a symmetrical distance matrix in \fIPHYLIP\fR format, with each entry representing divergence with a positive real number. A distance of zero means that two sequences are identical, whereas other values are estimates for the nucleotide substitution rate (Jukes-Cantor corrected). For technical reasons the comparison might fail and no estimate can be computed. In such cases \fInan\fR is printed. This either means that the input sequences were too short (<200bp) or too diverse (K>0.5) for our method to work properly.
@@ -23,7 +22,7 @@ Use this mode if each of your \fIFASTA\fR files represents one assembly with num
\fB\-l\fR, \fB\-\-low-memory\fR
In multithreaded mode, \fBandi\fR requires memory linear to the amount of threads. The low memory mode changes this to a constant demand independent from the used number of threads. Unfortunately, this comes at a significant runtime cost.
.TP
-\fB\-m\fR \fIMODEL\fR, \fB\-\-model\fR=\fIMODEL\fr
+\fB\-m\fR \fIMODEL\fR, \fB\-\-model\fR=\fIMODEL\fR
Set the nucleotide evolution model to one of 'Raw', 'JC', or 'Kimura'. By default the Jukes-Cantor correction is used.
.TP
\fB\-p\fR \fIFLOAT\fR
@@ -49,7 +48,7 @@ Prints the synopsis and an explanation of available options.
\fB\-\-version\fR
Outputs version information and acknowledgments.
.SH COPYRIGHT
-Copyright \(co 2014 - 2017 Fabian Klötzl
+Copyright \(co 2014 - 2020 Fabian Klötzl
License GPLv3+: GNU GPL version 3 or later.
.br
This is free software: you are free to change and redistribute it.
@@ -57,14 +56,13 @@ There is NO WARRANTY, to the extent permitted by law.
The full license text is available at <http://gnu.org/licenses/gpl.html>..
.PP
.SH ACKNOWLEDGMENTS
-1) andi: Haubold, B. Klötzl, F. and Pfaffelhuber, P. (2015). andi: Fast and accurate estimation of evolutionary distances between closely related genomes
+1) andi: Haubold, B. Klötzl, F. and Pfaffelhuber, P. (2015). andi: Fast and accurate estimation of evolutionary distances between closely related genomes, Bioinformatics 31.8.
.br
2) Algorithms: Ohlebusch, E. (2013). Bioinformatics Algorithms. Sequence Analysis, Genome Rearrangements, and Phylogenetic Reconstruction. pp 118f.
.br
-3) SA construction: Mori, Y. (2005). Short description of improved two\-stage suffix sorting algorithm. http://homepage3.nifty.com/wpage/software/itssort.txt
+3) SA construction: Mori, Y. (2005). libdivsufsort, unpublished.
.br
-4) Bootstrapping: Klötzl, F. and Haubold, B. (2016). Support Values for Genome Phylogenies
+4) Bootstrapping: Klötzl, F. and Haubold, B. (2016). Support Values for Genome Phylogenies, Life 6.1.
.SH BUGS
.SS Reporting Bugs
Please report bugs to <kloetzl at evolbio.mpg.de> or at <https://github.com/EvolBioInf/andi>.
-.SS
=====================================
docs/manual/andi-manual.tex
=====================================
@@ -115,7 +115,8 @@ The easiest way to install \andi is via a package manager. This also handles all
\noindent macOS with homebrew:
\begin{lstlisting}
-~ % brew install homebrew/science/andi
+~ % brew tap brewsci/bio
+~ % brew install andi
\end{lstlisting}
\noindent ArchLinux AUR package with aura:
@@ -128,7 +129,7 @@ The easiest way to install \andi is via a package manager. This also handles all
\section{Source Package} \label{sub:regular}
-To build \andi from source, download the latest \href{https://github.com/EvolBioInf/andi/releases}{release} from GitHub. Please note, that \andi requires the \algo{Gnu Scientific Library} and optionally \algo{libdivsufsort}\footnote{\url{https://github.com/y-256/libdivsufsort}} for optimal performance \cite{divsufsort}. If you wish to install \andi without \algo{libdivsufsort}, consult Section~\ref{sub:wo-divsufsort}.
+To build \andi from source, download the latest \href{https://github.com/EvolBioInf/andi/releases}{release} from GitHub. Please note, that \andi requires the \algo{Gnu Scientific Library} and \algo{libdivsufsort}\footnote{\url{https://github.com/y-256/libdivsufsort}} for optimal performance \cite{divsufsort}.
Once you have downloaded the package, unzip it and change into the newly created directory.
@@ -170,16 +171,6 @@ Options:
\noindent \andi also comes with a man page, which can be accessed via \lstinline$man andi$. % But once you are done with this documentation, you will require it scarcely.
-\section{Installing without \algo{libdivsufsort}} \label{sub:wo-divsufsort}
-
-In case you cannot or do not want to install \algo{libdivsufsort}, \andi comes with its own implementation of a \emph{suffix array construction algorithm}. It is called \algo{psufsort} and is also available, separately.\footnote{\url{https://github.com/kloetzl/psufsort}} To activate it for \algo{andi}, proceed as described in Section~\ref{sub:regular}, but with the following twist:
-
-\begin{lstlisting}
-~/andi % ./configure --without-libdivsufsort
-\end{lstlisting}
-
-Please note that this requires a C++11 compiler. Also, \algo{psufsort} may be slower than \algo{libdivsufsort} and thus lead to inferior runtimes..
-
\section{Installing from Git Repository}
To build \andi from the \algo{Git} repo, you will also need the \algo{autotools}. Refer to your OS documentation for installation instructions. Once done, execute the following steps.
=====================================
libs/pfasta.c
=====================================
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2015-2016, Fabian Klötzl <fabian-pfasta at kloetzl.info>
+ * Copyright (c) 2015-2018, Fabian Klötzl <fabian-pfasta at kloetzl.info>
*
* Permission to use, copy, modify, and/or distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
@@ -17,388 +17,472 @@
#include <assert.h>
#include <ctype.h>
+#include <err.h>
#include <errno.h>
-#include <stdint.h>
+#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
-#include <sys/types.h>
#include <unistd.h>
-#include <compat-stdlib.h>
#include "pfasta.h"
-#define BUFFERSIZE 4096
+#define VERSION "v14"
-/** @file
- *
- * Welcome to the code of `pfasta`, the pedantic FASTA parser. For future
- * reference I here explain some general, noteworthy things about the code.
- *
- * - Most functions returning an `int` follow the zero-errors convention. On
- * success a `0` is returned. A negative number indicates an error. Positive
- * numbers can be used to signal a different exceptional state (i.e. EOF).
- * - All functions use pointers to objects for their parameters.
- * - The low-level Unix `read(2)` function is used to grab bytes from a file
- * descriptor. These are stored in a buffer. The contents of this buffer is
- * checked one char at a time and then appended to string buffer. Finally,
- * the resulting strings are returned.
- * - To work around the fact that C has no exceptions, I declare some nifty
- * macros below.
- * - As the length of the sequences are not known in advance I implemented a
- * simple structure for growable strings called `dynstr`. One character at
- * a time can be appended using `dynstr_put`. Internally an array is
- * realloced with growth factor 1.5.
- * - The functions which do the actual parsing are prefixed `pfasta_read_*`.
+#ifdef __SSE2__
+#include <emmintrin.h>
+#endif
+
+#ifdef __STDC_NO_THREADS__
+#define thread_local
+#else
+#include <threads.h>
+#endif
+
+/** The following is the maximum length of an error string. It has to be
+ * carefully chosen, so that all calls to PF_FAIL_STR succeed. For instance,
+ * the line number can account for up to 20 characters.
*/
+#define PF_ERROR_STRING_LENGTH 100
+
+thread_local char errstr_buffer[PF_ERROR_STRING_LENGTH];
-#define PF_EXIT_ERRNO() \
+void *pfasta_reallocarray(void *ptr, size_t nmemb, size_t size);
+
+#define BUFFER_SIZE 16384
+
+#define LIKELY(X) __builtin_expect((intptr_t)(X), 1)
+#define UNLIKELY(X) __builtin_expect((intptr_t)(X), 0)
+
+enum { NO_ERROR, E_EOF, E_ERROR, E_ERRNO, E_BUBBLE, E_STR, E_STR_CONST };
+
+#define PF_FAIL_ERRNO(PP) \
do { \
- pf->errno__ = errno; \
- pf->errstr = NULL; \
- return -2; \
+ (void)strerror_r(errno, errstr_buffer, PF_ERROR_STRING_LENGTH); \
+ (PP)->errstr = errstr_buffer; \
+ return_code = E_ERRNO; \
+ goto cleanup; \
} while (0)
-#define PF_EXIT_FORWARD() return -1
+#define PF_FAIL_BUBBLE_CHECK(PP, CHECK) \
+ do { \
+ if (UNLIKELY(CHECK)) { \
+ return_code = CHECK; \
+ goto cleanup; \
+ } \
+ } while (0)
-#define PF_FAIL_FORWARD() \
+#define PF_FAIL_BUBBLE(PP) \
do { \
- return_code = -1; \
- goto cleanup; \
+ if (UNLIKELY((PP)->errstr)) { \
+ return_code = E_BUBBLE; \
+ goto cleanup; \
+ } \
} while (0)
-#define PF_FAIL_ERRNO() \
+#define PF_FAIL_STR_CONST(PP, STR) \
do { \
- pf->errno__ = errno; \
- pf->errstr = NULL; \
- return_code = -2; \
+ (PP)->errstr = (STR); \
+ return_code = E_STR_CONST; \
goto cleanup; \
} while (0)
-#define PF_FAIL_STR(...) \
+#define PF_FAIL_STR(PP, ...) \
do { \
- pf->errno__ = 0; \
- (void)snprintf(pf->errstr_buf, PF_ERROR_STRING_LENGTH, __VA_ARGS__); \
- pf->errstr = pf->errstr_buf; \
- return_code = -1; \
+ (void)snprintf(errstr_buffer, PF_ERROR_STRING_LENGTH, __VA_ARGS__); \
+ (PP)->errstr = errstr_buffer; \
+ return_code = E_STR; \
goto cleanup; \
} while (0)
-static int buffer_init(pfasta_file *pf);
-static inline int buffer_peek(const pfasta_file *pf);
-static inline int buffer_adv(pfasta_file *pf);
-static int buffer_read(pfasta_file *pf);
+int pfasta_read_name(struct pfasta_parser *pp, struct pfasta_record *pr);
+int pfasta_read_comment(struct pfasta_parser *pp, struct pfasta_record *pr);
+int pfasta_read_sequence(struct pfasta_parser *pp, struct pfasta_record *pr);
+
+static inline char *buffer_begin(struct pfasta_parser *pp);
+static inline char *buffer_end(struct pfasta_parser *pp);
+static inline int buffer_advance(struct pfasta_parser *pp, size_t steps);
+static inline int buffer_is_empty(const struct pfasta_parser *pp);
+static inline int buffer_is_eof(const struct pfasta_parser *pp);
+static inline int buffer_peek(struct pfasta_parser *pp);
+static inline int buffer_read(struct pfasta_parser *pp);
typedef struct dynstr {
char *str;
size_t capacity, count;
} dynstr;
-static inline int dynstr_init(dynstr *ds);
-static inline int dynstr_put(dynstr *ds, char c);
-static inline void dynstr_free(dynstr *ds);
static inline char *dynstr_move(dynstr *ds);
+static inline int dynstr_init(dynstr *ds, struct pfasta_parser *pp);
static inline size_t dynstr_len(const dynstr *ds);
+static inline void dynstr_free(dynstr *ds);
+static inline int dynstr_append(dynstr *ds, const char *str, size_t length,
+ struct pfasta_parser *pp);
-int pfasta_read_name(pfasta_file *pf, pfasta_seq *ps);
-int pfasta_read_comment(pfasta_file *pf, pfasta_seq *ps);
-int pfasta_read_seq(pfasta_file *pf, pfasta_seq *ps);
+static inline int my_isspace(int c) {
+ // ascii whitespace
+ return (c >= '\t' && c <= '\r') || (c == ' ');
+}
-/*
- * When reading from a buffer, basically three things can happen.
- *
- * 1. Bytes are read (success)
- * 2. Low-level error (fail)
- * 3. No more bytes (EOF)
- *
- * A low-level error is indicated by `buffer_adv` returning non-zero. As
- * end-of-file is technically a successful read, EOF is instead signalled by
- * `buffer_peek`.
- */
+const char *pfasta_version(void) { return VERSION; }
-/** @brief Initialises the read buffer. First, memory is allocated and then
- * filled. Both operations may fail. But this ensure we detect problems with
- * an unreadable file at the the initialisation of the parser!
- *
- * @param pf - The parser we want to initialise.
- * @returns 0 iff successful.
- */
-static int buffer_init(pfasta_file *pf) {
- char *buffer = malloc(BUFFERSIZE);
- if (!buffer) PF_EXIT_ERRNO();
+int buffer_init(struct pfasta_parser *pp) {
+ int return_code = 0;
- pf->buffer = pf->readptr = pf->fillptr = buffer;
- if (buffer_read(pf) < 0) PF_EXIT_FORWARD();
- return 0;
+ pp->buffer = malloc(BUFFER_SIZE);
+ if (!pp->buffer) PF_FAIL_ERRNO(pp);
+
+ int check = buffer_read(pp);
+ PF_FAIL_BUBBLE_CHECK(pp, check);
+
+cleanup:
+ return return_code;
}
-/** @brief Returns the current character or EOF.
- *
- * @param pf - The parser to read from.
- * @returns The current character or EOF.
- */
-static inline int buffer_peek(const pfasta_file *pf) {
- if (pf->readptr < pf->fillptr) {
- return (int)*(pf->readptr);
+int buffer_read(struct pfasta_parser *pp) {
+ int return_code = NO_ERROR;
+ ssize_t count = read(pp->file_descriptor, pp->buffer, BUFFER_SIZE);
+
+ if (UNLIKELY(count < 0)) PF_FAIL_ERRNO(pp);
+ if (UNLIKELY(count == 0)) { // EOF
+ pp->fill_ptr = pp->buffer;
+ pp->read_ptr = pp->buffer + 1;
+ pp->errstr = "EOF (maybe error)"; // enable bubbling
+ return E_EOF;
}
- return EOF;
+
+ pp->read_ptr = pp->buffer;
+ pp->fill_ptr = pp->buffer + count;
+
+cleanup:
+ return return_code;
}
-/** @brief Advances the read pointer in the buffer to the next character. If
- * needed, the buffer is filled with fresh bytes. A non-zero value is returned
- * if reading fails.
- *
- * @param pf - The parser which should be advanced.
- * @returns 0 iff successful.
- */
-static inline int buffer_adv(pfasta_file *pf) {
- if (buffer_peek(pf) == '\n') {
- pf->line++;
- }
+int buffer_peek(struct pfasta_parser *pp) {
+ return LIKELY(pp->read_ptr < pp->fill_ptr) ? *pp->read_ptr : EOF;
+}
- if (pf->readptr < pf->fillptr - 1) {
- pf->readptr++;
- return 0;
+char *buffer_begin(struct pfasta_parser *pp) { return pp->read_ptr; }
+
+char *buffer_end(struct pfasta_parser *pp) { return pp->fill_ptr; }
+
+inline int buffer_advance(struct pfasta_parser *pp, size_t steps) {
+ int return_code = 0;
+
+ pp->read_ptr += steps;
+ if (UNLIKELY(pp->read_ptr >= pp->fill_ptr)) {
+ assert(pp->read_ptr == pp->fill_ptr);
+ int check = buffer_read(pp); // resets pointers
+ PF_FAIL_BUBBLE_CHECK(pp, check);
}
- if (buffer_read(pf) < 0) PF_EXIT_FORWARD();
+cleanup:
+ return return_code;
+}
- return 0;
+int buffer_is_empty(const struct pfasta_parser *pp) {
+ return pp->read_ptr == pp->fill_ptr;
}
-/** @brief Fills the buffer with new data.
- *
- * @param pf - The parser which should be updated.
- * @returns 0 iff successful.
- */
-static int buffer_read(pfasta_file *pf) {
- ssize_t count = read(pf->fd, pf->buffer, BUFFERSIZE);
- if (count < 0) PF_EXIT_ERRNO();
- if (count == 0) { // EOF
- pf->fillptr = pf->buffer;
- pf->readptr = pf->buffer + 1;
- return 1;
+int buffer_is_eof(const struct pfasta_parser *pp) {
+ return pp->read_ptr > pp->fill_ptr;
+}
+
+char *find_first_space(const char *begin, const char *end) {
+ size_t offset = 0;
+ size_t length = end - begin;
+
+#ifdef __SSE2__
+
+ typedef __m128i vec_type;
+ static const size_t vec_size = sizeof(vec_type);
+
+ const vec_type all_tab = _mm_set1_epi8('\t' - 1);
+ const vec_type all_carriage = _mm_set1_epi8('\r' + 1);
+ const vec_type all_space = _mm_set1_epi8(' ');
+
+ size_t vec_offset = 0;
+ size_t vec_length = (end - begin) / vec_size;
+
+ for (; vec_offset < vec_length; vec_offset++) {
+ vec_type chunk;
+ memcpy(&chunk, begin + vec_offset * vec_size, vec_size);
+
+ // isspace: \t <= char <= \r || char == space
+ vec_type v1 = _mm_cmplt_epi8(all_tab, chunk);
+ vec_type v2 = _mm_cmplt_epi8(chunk, all_carriage);
+ vec_type v3 = _mm_cmpeq_epi8(chunk, all_space);
+
+ unsigned int vmask = (_mm_movemask_epi8(v1) & _mm_movemask_epi8(v2)) |
+ _mm_movemask_epi8(v3);
+
+ if (UNLIKELY(vmask)) {
+ offset += __builtin_ctz(vmask);
+ offset += vec_offset * vec_size;
+ return (char *)begin + offset;
+ }
}
- pf->readptr = pf->buffer;
- pf->fillptr = pf->buffer + count;
+ offset += vec_offset * vec_size;
+#endif
- return 0;
+ for (; offset < length; offset++) {
+ if (my_isspace(begin[offset])) break;
+ }
+ return (char *)begin + offset;
}
-/** @brief Frees all data associated with a parser. Also nulls pointers to avoid
- * a potential double-free.
- *
- * @param pf - The parser that shall be freed.
- */
-void pfasta_free(pfasta_file *pf) {
- if (!pf) return;
- free(pf->buffer);
- pf->buffer = pf->readptr = pf->fillptr = pf->errstr = NULL;
- pf->errno__ = 0;
- pf->fd = -1;
- pf->line = 0;
- pf->unexpected_char = '\0';
+char *find_first_not_space(const char *begin, const char *end) {
+ size_t offset = 0;
+ size_t length = end - begin;
+
+ for (; offset < length; offset++) {
+ if (!my_isspace(begin[offset])) break;
+ }
+ return (char *)begin + offset;
}
-/** @brief Creates a new parser for the given file. This includes allocating the
- * buffer and reading the first few bytes. These are then used to break on empty
- * or non-FASTA files.
- *
- * @param pf - A pointer to the parser structure we intend to initialise. No
- * assumption is made about the referenced memory except its existence.
- * @returns 0 iff successful.
- */
-int pfasta_parse(pfasta_file *pf, int file_descriptor) {
- assert(pf && file_descriptor >= 0);
+size_t count_newlines(const char *begin, const char *end) {
+ size_t offset = 0;
+ size_t length = end - begin;
+ size_t newlines = 0;
+
+ for (; offset < length; offset++) {
+ if (begin[offset] == '\n') newlines++;
+ }
+
+ return newlines;
+}
+
+static int copy_word(struct pfasta_parser *pp, dynstr *target) {
int return_code = 0;
- pf->errno__ = 0;
- pf->buffer = pf->readptr = pf->fillptr = pf->errstr = NULL;
- pf->fd = file_descriptor;
- pf->line = 1;
- pf->unexpected_char = '\0';
+ while (LIKELY(!my_isspace(buffer_peek(pp)))) {
+ char *end_of_word = find_first_space(buffer_begin(pp), buffer_end(pp));
+ size_t word_length = end_of_word - buffer_begin(pp);
- if (buffer_init(pf) != 0) PF_FAIL_FORWARD();
+ assert(word_length > 0);
- int c = buffer_peek(pf);
- if (c == EOF) PF_FAIL_STR("Empty file");
- if (c != '>') PF_FAIL_STR("File does not start with '>'");
+ int check = dynstr_append(target, buffer_begin(pp), word_length, pp);
+ PF_FAIL_BUBBLE_CHECK(pp, check);
+
+ check = buffer_advance(pp, word_length);
+ PF_FAIL_BUBBLE_CHECK(pp, check);
+ }
cleanup:
return return_code;
}
-/** @brief Frees the memory of a FASTA sequence. **/
-void pfasta_seq_free(pfasta_seq *ps) {
- if (!ps) return;
- free(ps->name);
- free(ps->comment);
- free(ps->seq);
- ps->name = ps->comment = ps->seq = NULL;
-}
-
-/** @brief Reads the next sequence from the parser into the memory pointed to by
- * the parameter `ps`. This may fail for various reasons. No matter, what
- * happens, always free `ps` after usage!
- *
- * @param pf - The parser to read from.
- * @param ps - A reference to memory for the sequence data.
- *
- * @returns 0 if successful, 1 if the end of the file was reached and a negative
- * number on error.
- */
-int pfasta_read(pfasta_file *pf, pfasta_seq *ps) {
- assert(pf && ps && pf->buffer);
- *ps = (pfasta_seq){NULL, NULL, NULL};
+static int skip_whitespace(struct pfasta_parser *pp) {
int return_code = 0;
- int c = buffer_peek(pf);
- if (c == EOF) return 1;
- if (c != '>') {
- PF_FAIL_STR("Expected '>', but found '%c' on line %zu", c, pf->line);
- }
+ while (my_isspace(buffer_peek(pp))) {
+ char *split = find_first_not_space(buffer_begin(pp), buffer_end(pp));
- if (pfasta_read_name(pf, ps) < 0) PF_FAIL_FORWARD();
- if (isblank(buffer_peek(pf))) {
- if (pfasta_read_comment(pf, ps) < 0) PF_FAIL_FORWARD();
- }
- if (pfasta_read_seq(pf, ps) < 0) PF_FAIL_FORWARD();
+ // advance may clear the buffer. So count first …
+ size_t newlines = count_newlines(buffer_begin(pp), split);
+ int check = buffer_advance(pp, split - buffer_begin(pp));
+ PF_FAIL_BUBBLE_CHECK(pp, check);
- // Skip blank lines
- while (buffer_peek(pf) == '\n') {
- if (buffer_adv(pf) != 0) PF_FAIL_FORWARD();
+ // … and then increase the counter.
+ pp->line_number += newlines;
}
cleanup:
return return_code;
}
-/** @brief Reads the sequence name and saves it into the structure.
- *
- * @param pf - The parser used for reading.
- * @param ps - The structure used to hold the name, later.
- *
- * @returns 0 iff successful
- */
-int pfasta_read_name(pfasta_file *pf, pfasta_seq *ps) {
+struct pfasta_parser pfasta_init(int file_descriptor) {
int return_code = 0;
- dynstr name;
- if (dynstr_init(&name) != 0) PF_FAIL_ERRNO();
+ struct pfasta_parser pp = {0};
+ pp.line_number = 1;
- while (1) {
- if (buffer_adv(pf) != 0) PF_FAIL_FORWARD();
+ pp.file_descriptor = file_descriptor;
+ int check = buffer_init(&pp);
+ if (check && check != E_EOF) PF_FAIL_BUBBLE_CHECK(&pp, check);
- int c = buffer_peek(pf);
- if (c == EOF) {
- PF_FAIL_STR("Unexpected EOF in sequence name on line %zu",
- pf->line);
- }
- if (!isgraph(c)) break;
+ if (buffer_is_empty(&pp) || buffer_is_eof(&pp)) {
+ PF_FAIL_STR(&pp, "File is empty.");
+ }
- if (dynstr_put(&name, c) != 0) PF_FAIL_ERRNO();
+ if (buffer_peek(&pp) != '>') {
+ PF_FAIL_STR(&pp, "File must start with '>'.");
}
- if (dynstr_len(&name) == 0) PF_FAIL_STR("Empty name on line %zu", pf->line);
+cleanup:
+ // free buffer if necessary
+ if (return_code) {
+ pfasta_free(&pp);
+ }
+ pp.done = return_code || buffer_is_eof(&pp);
+ return pp;
+}
+
+struct pfasta_record pfasta_read(struct pfasta_parser *pp) {
+ int return_code = 0;
+ struct pfasta_record pr = {0};
+
+ int check = pfasta_read_name(pp, &pr);
+ PF_FAIL_BUBBLE_CHECK(pp, check);
+
+ check = pfasta_read_comment(pp, &pr);
+ PF_FAIL_BUBBLE_CHECK(pp, check);
- ps->name = dynstr_move(&name);
+ check = pfasta_read_sequence(pp, &pr);
+ PF_FAIL_BUBBLE_CHECK(pp, check);
cleanup:
- dynstr_free(&name);
- return return_code;
+ if (return_code) {
+ pfasta_record_free(&pr);
+ pfasta_free(pp);
+ }
+ pp->done = return_code || buffer_is_eof(pp);
+ return pr;
}
-/** @brief Reads the sequence comment and saves it into the structure.
- *
- * @param pf - The parser used for reading.
- * @param ps - The structure used to hold the comment, later.
- *
- * @returns 0 iff successful
- */
-int pfasta_read_comment(pfasta_file *pf, pfasta_seq *ps) {
+int pfasta_read_name(struct pfasta_parser *pp, struct pfasta_record *pr) {
int return_code = 0;
- dynstr comment;
- if (dynstr_init(&comment) != 0) PF_FAIL_ERRNO();
- while (1) {
- if (buffer_adv(pf) != 0) PF_FAIL_FORWARD();
-
- int c = buffer_peek(pf);
- if (c == '\n') break;
- if (c == EOF) {
- PF_FAIL_STR("Unexpected EOF in sequence comment on line %zu",
- pf->line);
- }
+ dynstr name;
+ dynstr_init(&name, pp);
+ PF_FAIL_BUBBLE(pp);
- if (dynstr_put(&comment, c) != 0) PF_FAIL_ERRNO();
+ assert(!buffer_is_empty(pp));
+ if (buffer_peek(pp) != '>') {
+ PF_FAIL_STR(pp, "Expected '>' but found '%c' on line %zu.",
+ buffer_peek(pp), pp->line_number);
}
- ps->comment = dynstr_move(&comment);
+ int check = buffer_advance(pp, 1); // skip >
+ if (check == E_EOF)
+ PF_FAIL_STR(pp, "Unexpected EOF in name on line %zu.", pp->line_number);
+ PF_FAIL_BUBBLE(pp);
+
+ check = copy_word(pp, &name);
+ if (check == E_EOF)
+ PF_FAIL_STR(pp, "Unexpected EOF in name on line %zu.", pp->line_number);
+ PF_FAIL_BUBBLE(pp);
+
+ if (dynstr_len(&name) == 0)
+ PF_FAIL_STR(pp, "Empty name on line %zu.", pp->line_number);
+
+ pr->name_length = dynstr_len(&name);
+ pr->name = dynstr_move(&name);
cleanup:
- dynstr_free(&comment);
+ if (return_code) {
+ dynstr_free(&name);
+ }
return return_code;
}
-/** @brief Reads the sequence data and saves it into the structure.
- *
- * @param pf - The parser used for reading.
- * @param ps - The structure used to hold the data, later.
- *
- * @returns 0 iff successful
- */
-int pfasta_read_seq(pfasta_file *pf, pfasta_seq *ps) {
+int pfasta_read_comment(struct pfasta_parser *pp, struct pfasta_record *pr) {
int return_code = 0;
- dynstr seq;
- if (dynstr_init(&seq) != 0) PF_FAIL_ERRNO();
- while (1) {
- // The only guaranty is !graph && !blank
- assert(!isgraph(buffer_peek(pf)) && !isblank(buffer_peek(pf)));
+ if (buffer_peek(pp) == '\n') {
+ pr->comment_length = 0;
+ pr->comment = NULL;
+ return 0;
+ }
+
+ dynstr comment;
+ dynstr_init(&comment, pp);
+ PF_FAIL_BUBBLE(pp);
- // deal with the first character explicitly
- if (buffer_adv(pf) != 0) PF_FAIL_FORWARD();
+ assert(!buffer_is_empty(pp));
- int c = buffer_peek(pf);
- if (c == EOF || c == '>' || c == '\n') break;
+ int check = buffer_advance(pp, 1); // skip first whitespace
+ if (check == E_EOF) goto label_eof;
+ PF_FAIL_BUBBLE(pp);
- goto regular;
+ assert(!buffer_is_empty(pp));
- // read line
- while (1) {
- if (buffer_adv(pf) != 0) PF_FAIL_FORWARD();
+ // get comment
+ while (buffer_peek(pp) != '\n') {
+ check = dynstr_append(&comment, buffer_begin(pp), 1, pp);
+ PF_FAIL_BUBBLE_CHECK(pp, check);
- c = buffer_peek(pf);
- if (c == '\n') break;
- if (c == EOF) break;
+ check = buffer_advance(pp, 1);
+ if (check == E_EOF) goto label_eof;
+ PF_FAIL_BUBBLE_CHECK(pp, check);
+ }
- regular:
- if (!isgraph(c)) {
- PF_FAIL_STR("Unexpected character '%c' in sequence on line %zu",
- c, pf->line);
- }
- if (dynstr_put(&seq, c) != 0) PF_FAIL_ERRNO();
- }
+label_eof:
+ if (buffer_is_eof(pp))
+ PF_FAIL_STR(pp, "Unexpected EOF in comment on line %zu.",
+ pp->line_number);
+
+ pr->comment_length = dynstr_len(&comment);
+ pr->comment = dynstr_move(&comment);
+
+cleanup:
+ if (return_code) {
+ dynstr_free(&comment);
}
+ return return_code;
+}
- if (dynstr_len(&seq) == 0) {
- PF_FAIL_STR("Empty sequence on line %zu", pf->line);
+int pfasta_read_sequence(struct pfasta_parser *pp, struct pfasta_record *pr) {
+ int return_code = 0;
+
+ dynstr sequence;
+ dynstr_init(&sequence, pp);
+ PF_FAIL_BUBBLE(pp);
+
+ assert(!buffer_is_empty(pp));
+ assert(!buffer_is_eof(pp));
+ assert(buffer_peek(pp) == '\n');
+
+ int check = skip_whitespace(pp);
+ if (check == E_EOF)
+ PF_FAIL_STR(pp, "Empty sequence on line %zu.", pp->line_number);
+ PF_FAIL_BUBBLE_CHECK(pp, check);
+
+ while (LIKELY(isalpha(buffer_peek(pp)))) {
+ int check = copy_word(pp, &sequence);
+ if (UNLIKELY(check == E_EOF)) break;
+ PF_FAIL_BUBBLE_CHECK(pp, check);
+
+ // optimize for more common case
+ ptrdiff_t length = buffer_end(pp) - buffer_begin(pp);
+ if (LIKELY(length >= 2 && buffer_begin(pp)[0] == '\n' &&
+ buffer_begin(pp)[1] > ' ')) {
+ pp->read_ptr++; // nasty hack
+ pp->line_number += 1;
+ } else {
+ check = skip_whitespace(pp);
+ if (UNLIKELY(check == E_EOF)) break;
+ PF_FAIL_BUBBLE_CHECK(pp, check);
+ }
}
- ps->seq = dynstr_move(&seq);
+
+ if (dynstr_len(&sequence) == 0)
+ PF_FAIL_STR(pp, "Empty sequence on line %zu.", pp->line_number);
+
+ pr->sequence_length = dynstr_len(&sequence);
+ pr->sequence = dynstr_move(&sequence);
+ pp->errstr = NULL; // reset error
cleanup:
- dynstr_free(&seq);
+ if (return_code) {
+ dynstr_free(&sequence);
+ }
return return_code;
}
-/** @brief Returns an explanatory string for encountered errors. */
-const char *pfasta_strerror(const pfasta_file *pf) {
- if (!pf) return NULL;
- if (pf->errno__ == 0) {
- return pf->errstr;
- } else {
- return strerror(pf->errno__);
- }
+void pfasta_record_free(struct pfasta_record *pr) {
+ if (!pr) return;
+ free(pr->name);
+ free(pr->comment);
+ free(pr->sequence);
+ pr->name = pr->comment = pr->sequence = NULL;
+}
+
+void pfasta_free(struct pfasta_parser *pp) {
+ if (!pp) return;
+ free(pp->buffer);
+ pp->buffer = NULL;
}
/** @brief Creates a new string that can grow dynamically.
@@ -407,35 +491,49 @@ const char *pfasta_strerror(const pfasta_file *pf) {
*
* @returns 0 iff successful.
*/
-static inline int dynstr_init(dynstr *ds) {
+static inline int dynstr_init(dynstr *ds, struct pfasta_parser *pp) {
+ int return_code = 0;
+
*ds = (dynstr){NULL, 0, 0};
ds->str = malloc(61);
- if (!ds->str) return -1;
+ if (!ds->str) PF_FAIL_ERRNO(pp);
+
ds->str[0] = '\0';
ds->capacity = 61;
- return 0;
+ ds->count = 0;
+
+cleanup:
+ return return_code;
}
-/** @brief A append a character to a string.
+/** @brief A append more than one character to a string.
*
* @param ds - A reference to the dynstr container.
- * @param c - The new character.
+ * @param str - The new characters.
+ * @param length - number of new characters to append
*
* @returns 0 iff successful.
*/
-static inline int dynstr_put(dynstr *ds, char c) {
- if (ds->count >= ds->capacity - 1) {
- char *neu = reallocarray(ds->str, ds->capacity / 2, 3);
- if (!neu) {
+static inline int dynstr_append(dynstr *ds, const char *str, size_t length,
+ struct pfasta_parser *pp) {
+ int return_code = 0;
+ size_t required = ds->count + length;
+
+ if (UNLIKELY(required >= ds->capacity)) {
+ char *neu = pfasta_reallocarray(ds->str, required / 2, 3);
+ if (UNLIKELY(!neu)) {
dynstr_free(ds);
- return -1;
+ PF_FAIL_ERRNO(pp);
}
ds->str = neu;
- ds->capacity = (ds->capacity / 2) * 3;
+ ds->capacity = (required / 2) * 3;
}
- ds->str[ds->count++] = c;
- return 0;
+ memcpy(ds->str + ds->count, str, length);
+ ds->count = required;
+
+cleanup:
+ return return_code;
}
/** @brief Frees a dynamic string. */
@@ -452,9 +550,8 @@ static inline void dynstr_free(dynstr *ds) {
*
* @returns a `char*` to a standard null-terminated string.
*/
-
static inline char *dynstr_move(dynstr *ds) {
- char *out = reallocarray(ds->str, ds->count + 1, 1);
+ char *out = pfasta_reallocarray(ds->str, ds->count + 1, 1);
if (!out) {
out = ds->str;
}
@@ -465,3 +562,16 @@ static inline char *dynstr_move(dynstr *ds) {
/** @brief Returns the current length of the dynamic string. */
static inline size_t dynstr_len(const dynstr *ds) { return ds->count; }
+
+__attribute__((weak)) void *reallocarray(void *ptr, size_t nmemb, size_t size);
+
+/**
+ * @brief Unsafe fallback in case reallocarray isn't provided by the stdlib.
+ */
+void *pfasta_reallocarray(void *ptr, size_t nmemb, size_t size) {
+ if (reallocarray == NULL) {
+ return realloc(ptr, nmemb * size);
+ } else {
+ return reallocarray(ptr, nmemb, size);
+ }
+}
=====================================
libs/pfasta.h
=====================================
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2015, Fabian Klötzl <fabian-pfasta at kloetzl.info>
+ * Copyright (c) 2015-2018, Fabian Klötzl <fabian-pfasta at kloetzl.info>
*
* Permission to use, copy, modify, and/or distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
@@ -18,31 +18,79 @@
#ifndef PFASTA_H
#define PFASTA_H
-/** The following is the maximum length of an error string. It has to be
- * carefully chosen, so that all calls to PF_FAIL_STR succeed. For instance,
- * the line number can account for up to 20 characters.
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stddef.h>
+
+/**
+ * There is no magic to this structure. Its just a container of three strings.
+ * Feel free to duplicate or move them. But don't forget to free the data after
+ * usage!
+ */
+struct pfasta_record {
+ char *name, *comment, *sequence;
+ size_t name_length, comment_length, sequence_length;
+};
+
+/**
+ * This structure holds a number of members to represent the state of the FASTA
+ * parser. Please make sure that it is properly initialized before usage..
+ * Always free this structure when the parser is done.
+ */
+struct pfasta_parser {
+ const char *errstr;
+ int done;
+
+ /*< private -- do not touch! >*/
+ int file_descriptor;
+ char *buffer;
+ char *read_ptr, *fill_ptr;
+ size_t line_number;
+};
+
+/**
+ * This function initializes a `pfasta_parser` struct with a parser bound to a
+ * specific file descriptor. Iff an error occurred `errstr` is set to contain a
+ * suitable message. Otherwise you can read data from it as long as `done` isn't
+ * set. The parser should be freed after usage.
+ *
+ * Please note that the user is responsible for opening the file descriptor as
+ * readable and closing after usage.
+ */
+struct pfasta_parser pfasta_init(int file_descriptor);
+
+/**
+ * Using a properly initialized parser, this function can read FASTA sequences.
+ * These are stored in the simple structure and returned. On error, the `errstr`
+ * property of the parser is set.
*/
-#define PF_ERROR_STRING_LENGTH 100
+struct pfasta_record pfasta_read(struct pfasta_parser *pp);
-typedef struct pfasta_file {
- char *buffer, *readptr, *fillptr;
- char *errstr;
- int errno__;
- int fd;
- size_t line;
- char errstr_buf[PF_ERROR_STRING_LENGTH];
- char unexpected_char;
-} pfasta_file;
+/**
+ * This function frees the resources held by a pfasta record.
+ */
+void pfasta_record_free(struct pfasta_record *pr);
-typedef struct pfasta_seq {
- char *name, *comment, *seq;
-} pfasta_seq;
+/**
+ * This function frees the resources held by a pfasta parser.
+ */
+void pfasta_free(struct pfasta_parser *pp);
+
+/**
+ * Get a string defining the version of the pfasta library.
+ */
+const char *pfasta_version(void);
-int pfasta_parse(pfasta_file *, int file_descriptor);
-void pfasta_free(pfasta_file *);
-void pfasta_seq_free(pfasta_seq *);
-int pfasta_read(pfasta_file *, pfasta_seq *);
+#ifdef __STDC_NO_THREADS__
+/** If the preprocessor macro `PFASTA_NO_THREADS` is defined, the parser is not
+ * fully thread safe. */
+#define PFASTA_NO_THREADS
+#endif
-const char *pfasta_strerror(const pfasta_file *);
+#ifdef __cplusplus
+}
+#endif
#endif /* PFASTA_H */
=====================================
opt/psufsort/Makefile.am deleted
=====================================
@@ -1,4 +0,0 @@
-noinst_LIBRARIES = libpsufsort.a
-libpsufsort_a_SOURCES = psufsort.cxx c_interface.cxx interface.h $(top_srcdir)/src/global.h
-libpsufsort_a_CXXFLAGS = $(OPENMP_CXXFLAGS) -Wall -Wextra
-libpsufsort_a_CPPFLAGS = $(OPENMP_CXXFLAGS) -I$(top_srcdir)/src
=====================================
opt/psufsort/c_interface.cxx deleted
=====================================
@@ -1,15 +0,0 @@
-#include <string>
-#include <vector>
-#include <cstring>
-
-std::vector<int> psufsort(const std::string& T);
-
-extern "C" int c_psufsort(const char *str, int* SA){
- if( !str || !SA){
- return 1;
- }
- auto T = std::string(str);
- auto temp = psufsort(T);
- memcpy(SA, temp.data()+1, T.size() * sizeof(int));
- return 0;
-}
=====================================
opt/psufsort/interface.h deleted
=====================================
@@ -1,17 +0,0 @@
-#ifdef __cplusplus
-
-std::vector<int> psufsort(const std::string& T);
-
-#else
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-int c_psufsort(const char *str, int* SA);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
=====================================
opt/psufsort/psufsort.cxx deleted
=====================================
@@ -1,336 +0,0 @@
-#include <string>
-#include <vector>
-#include <utility>
-#include <cstring>
-#include <algorithm>
-#include <iostream>
-#include <cassert>
-#include <cmath>
-#include <global.h>
-
-void mk_sort (std::vector<int>& SA, const std::string& T, size_t l, size_t r, size_t depth);
-void insertion_sort (std::vector<int>& SA, const std::string& T, size_t l, size_t r, size_t depth);
-void TSQS (std::vector<int>& SA, const std::string& T, size_t l, size_t r, size_t depth);
-void mk_buildin (std::vector<int>& SA, const std::string& T, size_t l, size_t r, size_t depth);
-
-std::vector<int> psufsort(const std::string& T);
-
-class Bucket {
-
-public:
- size_t start, size;
- constexpr Bucket() noexcept : start(0), size(0) {};
-};
-
-class PSufSort
-{
- const std::string& T;
- std::vector<int>& SA;
- size_t threshold;
-public:
- PSufSort(const std::string& _T, std::vector<int>& _SA, size_t size) : T(_T), SA(_SA) {
- threshold = std::log(size);
- }
- ~PSufSort() {};
-
- // All intervals are semi-open: [l,r)
- void sort(size_t l, size_t r, size_t depth, size_t calls);
- void sort_tsqs(size_t l, size_t r, size_t depth, size_t calls);
- void sort_insert(size_t l, size_t r, size_t depth, size_t calls);
- void sort_heap(size_t l, size_t r, size_t depth, size_t calls);
-
- void build_heap( int* rSA, size_t n, size_t depth);
- void heapify( int* rSA, size_t heap_size, size_t i, size_t depth);
-
- char median3(size_t a, size_t b, size_t c, size_t depth);
-
- void swap_range(size_t a, size_t b, size_t n);
- inline char char_at( size_t sai, size_t depth);
- inline const char *str_from( size_t sai, size_t depth);
-};
-
-std::vector<int> psufsort(const std::string& T){
- auto n = T.size();
- auto SA = std::vector<int>(n+1);
-
- auto L = std::vector<Bucket>(256);
- auto bucket_SM = std::vector<Bucket>(256*256); // S-
- auto bucket_SS = std::vector<Bucket>(256*256); // S*
-
- auto SM = [&](size_t X, size_t Y) -> Bucket& {
- return bucket_SM[(X<<8) + Y];
- };
- auto SS = [&](size_t X, size_t Y) -> Bucket& {
- return bucket_SS[(X<<8) + Y];
- };
-
- // classify suffixes and compute the bucket sizes
- ssize_t i = n -1;
- while( i >= 0){
- if( T[i] >= T[i+1]){
- L[T[i]].size++;
- i--;
- continue;
- }
-
- SS(T[i], T[i+1]).size++;
- i--;
-
- while( i >= 0 && T[i] <= T[i+1]){
- SM(T[i], T[i+1]).size++;
- i--;
- }
- }
-
- // Deal with the null byte
- SS(0,0).size = 1;
- SA[0] = n;
-
- // compute bucket starting points
- int j = 0;
- for(i=0; i<256; i++){
- L[i].start = j;
- j += L[i].size;
- for(auto k =i; k< 256; k++){
- SS(i, k).start = j;
- j += SS(i, k).size;
- SM(i, k).start = j;
- j += SM(i, k).size;
- }
- }
-
- // fill in S* buckets
- i = n -1;
- while( i >= 0){
- auto c = T[i];
-
- if( c >= T[i+1]){ // skip L
- i--;
- continue;
- }
-
- SA[SS(c,T[i+1]).start] = i;
- SS(c,T[i+1]).start++;
- i--;
-
- while( i >= 0 && T[i] <= T[i+1]){ // skip S-
- i--;
- }
- }
-
- // correct the `++` from the previous loop.
- for(i=0; i<256*256; i++){
- bucket_SS[i].start -= bucket_SS[i].size;
- }
-
- // sort all S* suffixes
- #pragma omp parallel for shared(SA,T) schedule(dynamic, 1) num_threads( (FLAGS & F_LOW_MEMORY) ? THREADS : 1)
- for(i=0; i<256*256; i++){
- const auto buc = bucket_SS[i];
- if( buc.size > 1){
- int b = buc.start;
- int e = b + buc.size;
- auto sorter = PSufSort( T, SA, buc.size);
-
- // sort
- sorter.sort( b, e, 2, 0);
- }
- }
-
- // induced insert all S-
- for(i=n; i >= 0;i--){
- int j = SA[i];
- if( j == 0) continue;
-
- auto a = T[j-1];
- auto b = T[j];
- if( a <= b){
- SA[SM(a, b).start + SM(a, b).size - 1] = j-1;
- SM(a, b).size--;
- }
- }
-
- // induced insert all Ls
- for(i=0; i<n+1; i++){
- int j = SA[i];
- if( j == 0) continue;
-
- auto a = T[j-1];
- if( SA[L[a].start] != 0) continue;
- if( a >= T[j]){
- SA[L[a].start] = j -1;
- L[a].start++;
- }
- }
-
- return SA; // move doesnt move
-}
-
-inline char PSufSort::char_at( size_t sai, size_t depth){
- return T[sai + depth];
-}
-
-inline const char *PSufSort::str_from( size_t sai, size_t depth){
- return &T[sai + depth];
-}
-
-void PSufSort::swap_range(size_t a, size_t b, size_t n){
- auto& SA = this->SA;
-
- for(auto i=0; i< n; i++){
- std::swap(SA[a++], SA[b++]);
- }
-}
-
-void PSufSort::sort (size_t l, size_t r, size_t depth, size_t calls) {
- if(l >= r){
- return;
- }
-
- auto m = r - l;
- if( m < 2 ){
- return;
- }
-
- if (m <= 16){
- sort_insert(l, r, depth, calls);
- return;
- }
-
- if( calls < threshold){
- sort_tsqs(l, r, depth, calls);
- } else {
- sort_heap(l, r, depth, calls);
- }
-}
-
-
-void PSufSort::sort_insert (size_t l, size_t r, size_t depth, size_t unused){
- auto cmp_from = [&]( size_t a, size_t b){
- auto ta = T.data()+ a +depth;
- auto tb = T.data()+ b +depth;
-
- return strcmp(ta,tb);
- };
-
- for(auto j = l+1; j < r; j++){
- auto X = SA[j];
-
- auto i = j;
- for(; i > l && cmp_from( SA[i-1], X) > 0 ; i--){
- SA[i] = SA[i-1];
- }
-
- SA[i] = X;
- }
-}
-
-char PSufSort::median3(size_t b, size_t a, size_t c, size_t depth){
- auto key = [&](size_t i){
- return char_at(SA[i], depth);
- };
-
- if( key(a) > key(b) ){ std::swap(SA[a], SA[b]); }
- if( key(b) > key(c) ){ std::swap(SA[b], SA[c]); }
- if( key(a) > key(b) ){ std::swap(SA[a], SA[b]); }
-
- return key(b);
-}
-
-void PSufSort::sort_tsqs (size_t l, size_t r, size_t depth, size_t calls){
- auto K = median3(l, (r-l)/2 + l, r-1, depth); // pick K
-
- auto a = l;
- auto b = l;
- auto c = r-1;
- auto d = r-1;
-
- while(true) {
- for(; b <= c && char_at(SA[b], depth) <= K; b++){
- if( char_at(SA[b], depth) == K){
- std::swap(SA[a], SA[b]);
- a++;
- }
- }
-
- for(; b <= c && char_at(SA[c], depth) >= K; c--){
- if( char_at(SA[c], depth) == K){
- std::swap(SA[c],SA[d]);
- d--;
- }
- }
-
- if( b > c) break;
-
- std::swap(SA[b],SA[c]);
- b++, c--;
- }
-
- auto m = std::min(a-l, b-a);
- swap_range(l, b-m, m);
-
- m = std::min(d-c, r-d-1);
- swap_range(b, r-m, m);
-
- auto i = l + b - a;
- auto j = r - d + c;
-
- this->sort(l, i, depth, calls + 1);
- this->sort(i, j, depth+1, calls + 1);
- this->sort(j, r, depth, calls + 1);
-}
-
-constexpr inline size_t LEFT(size_t i) noexcept {
- return (i << 1) + 1;
-}
-
-constexpr inline size_t RIGHT(size_t i) noexcept {
- return (i << 1) + 2;
-}
-
-constexpr inline size_t PARENT( size_t i) noexcept {
- return (i-1) >> 1;
-}
-
-void PSufSort::build_heap( int* rSA, size_t n, size_t depth){
- auto heap_size = n;
- for( ssize_t i= PARENT(n-1); i>=0 ; i--){
- heapify(rSA, heap_size, i, depth);
- }
-}
-
-void PSufSort::heapify( int* rSA, size_t heap_size, size_t i, size_t depth){ // aka. siftDown
- auto key = [&](size_t j){
- return T.data() + j + depth;
- };
-
- auto l = LEFT(i);
- auto r = RIGHT(i);
- auto largest = i;
-
- if( l < heap_size && strcmp( key(rSA[l]) , key(rSA[i])) > 0 ){
- largest = l;
- }
- if( r < heap_size && strcmp( key(rSA[r]) , key(rSA[largest])) > 0){
- largest = r;
- }
- if( largest != i){
- std::swap(rSA[i], rSA[largest]);
- heapify(rSA, heap_size, largest, depth);
- }
-}
-
-void PSufSort::sort_heap(size_t left, size_t right, size_t depth, size_t calls){
- auto rSA = SA.data() + left;
- auto n = right - left;
- auto heap_size = n;
-
- build_heap(rSA, n, depth);
-
- for( auto i = n-1; i ; i--){
- std::swap(rSA[0],rSA[i]);
- heap_size--;
- heapify(rSA, heap_size, 0, depth);
- }
-}
-
=====================================
src/Makefile.am
=====================================
@@ -1,20 +1,11 @@
bin_PROGRAMS = andi
-if !BUILD_WITH_LIBDIVSUFSORT
-PSUFSORT=$(top_builddir)/opt/psufsort/libpsufsort.a
-# This is a hack to make sure, andi is created with a C++ compiler
-DUMMY=dummy.cxx
-endif
-
andi_SOURCES = andi.c esa.c process.c sequence.c io.c global.h esa.h process.h sequence.h io.h dist_hack.h \
model.h model.c
andi_CPPFLAGS = $(OPENMP_CFLAGS) -I$(top_srcdir)/libs -I$(top_srcdir)/opt -std=gnu99
andi_CFLAGS = $(OPENMP_CFLAGS) -Wall -Wextra -Wno-missing-field-initializers
-andi_CXXFLAGS = $(OPENMP_CXXFLAGS) -Wall -Wextra
andi_LDADD = $(PSUFSORT) $(top_builddir)/libs/libpfasta.a $(top_builddir)/opt/libcompat.a
-nodist_EXTRA_andi_SOURCES = $(DUMMY)
.PHONY: perf
perf: CFLAGS+= -g -O3 -ggdb -fno-omit-frame-pointer
-perf: CXXFLAGS+= -g -O3 -ggdb -fno-omit-frame-pointer
perf: andi
=====================================
src/andi.c
=====================================
@@ -373,7 +373,7 @@ void usage(int status) {
void version(void) {
const char str[] = {
"andi " VERSION "\n"
- "Copyright (C) 2014 - 2017 Fabian Klötzl\n"
+ "Copyright (C) 2014 - 2020 Fabian Klötzl\n"
"License GPLv3+: GNU GPL version 3 or later "
"<http://gnu.org/licenses/gpl.html>\n"
"This is free software: you are free to change and redistribute it.\n"
@@ -381,13 +381,13 @@ void version(void) {
"Acknowledgments:\n"
"1) Andi: Haubold, B. Klötzl, F. and Pfaffelhuber, P. (2015). "
"Fast and accurate estimation of evolutionary distances between "
- "closely related genomes\n"
+ "closely related genomes, Bioinformatics.\n"
"2) Algorithms: Ohlebusch, E. (2013). Bioinformatics Algorithms. "
"Sequence Analysis, Genome Rearrangements, and Phylogenetic "
"Reconstruction. pp 118f.\n"
- "3) SA construction: Mori, Y. (2005). Short description of improved "
- "two-stage suffix sorting algorithm. "
- "http://homepage3.nifty.com/wpage/software/itssort.txt\n"};
+ "3) SA construction: Mori, Y. (2005). libdivsufsort, unpublished.\n"
+ "4) Bootstrapping: Klötzl, F. and Haubold, B. (2016). Support Values "
+ "for Genome Phylogenies, Life 6.1.\n"};
printf("%s", str);
exit(EXIT_SUCCESS);
}
=====================================
src/dist_hack.h
=====================================
@@ -65,7 +65,7 @@ void NAME(struct model *M, const seq_t *sequences, size_t n) {
size_t ql = sequences[j].len;
- M(i, j) = dist_anchor(&E, sequences[j].S, ql, subject.gc);
+ M(i, j) = dist_anchor(&E, sequences[j].S, ql, subject.threshold);
#pragma omp atomic update
progress_counter++;
=====================================
src/esa.c
=====================================
@@ -91,8 +91,8 @@ int esa_init_cache(esa_s *self) {
*
* This function is a depth first search on the virtual suffix tree and fills
* the cache. Or rather it calls it self until some value to cache is
- * calculated. This function is a recursive version of get_inteval but with more
- * edge cases.
+ * calculated. This function is a recursive version of get_interval but with
+ * more edge cases.
*
* @param C - The ESA.
* @param str - The current prefix.
@@ -121,6 +121,15 @@ void esa_init_cache_dfs(esa_s *C, char *str, size_t pos, const lcp_inter_t in) {
// fail early
if (ij.i == -1 && ij.j == -1) {
+ // if the current extension cannot be found, will with previous one
+ esa_init_cache_fill(C, str, pos + 1, in);
+ continue;
+ }
+
+ // singleton
+ if (ij.i == ij.j) {
+ // fix length
+ ij.l = pos + 1;
esa_init_cache_fill(C, str, pos + 1, ij);
continue;
}
@@ -166,6 +175,10 @@ void esa_init_cache_dfs(esa_s *C, char *str, size_t pos, const lcp_inter_t in) {
str[k] = c;
}
+ // We are skipping intervals here. Maybe for each of them we should also
+ // fill the cache. However, I haven't yet figured out how to do that
+ // properly and whether it is worth it.
+
if (non_acgt) {
esa_init_cache_fill(C, str, k, ij);
} else {
@@ -287,15 +300,7 @@ int esa_init_SA(esa_s *C) {
C->SA = malloc(C->len * sizeof(*C->SA));
CHECK_MALLOC(C->SA);
- saidx_t result = 1;
-
-#ifdef HAVE_LIBDIVSUFSORT
- result = divsufsort((const unsigned char *)C->S, C->SA, C->len);
-#else
- result = c_psufsort(C->S, C->SA);
-#endif
-
- return result;
+ return divsufsort((const unsigned char *)C->S, C->SA, C->len);
}
/** @brief Initializes the CLD (child) array.
@@ -322,7 +327,7 @@ int esa_init_CLD(esa_s *C) {
pair_t *top = stack; // points at the topmost filled element
pair_t last;
- R(CLD, 0) = C->len + 1;
+ R(CLD, 0) = C->len;
top->idx = 0;
top->lcp = -1;
@@ -462,9 +467,17 @@ static lcp_inter_t get_interval(const esa_s *self, lcp_inter_t ij, char a) {
SoSueMe:
if (c == a) {
/* found ! */
- saidx_t n = L(CLD, m);
- ij = (lcp_inter_t){.i = i, .j = m - 1, .m = n, .l = LCP[n]};
+ if (i != m - 1) {
+ // found interval contains >1 element
+ saidx_t n = L(CLD, m);
+
+ ij = (lcp_inter_t){.i = i, .j = m - 1, .m = n, .l = LCP[n]};
+ } else {
+ // empty or singleton
+ // doing L(CLD, m) is not valid in this case!
+ ij = (lcp_inter_t){.i = i, .j = i, .m = -1, .l = LCP[i]};
+ }
return ij;
}
=====================================
src/esa.h
=====================================
@@ -8,17 +8,8 @@
#include "config.h"
#include "sequence.h"
-#include <sys/types.h>
-
-#ifdef HAVE_LIBDIVSUFSORT
#include <divsufsort.h>
-#else
-
-#include "../opt/psufsort/interface.h"
-
-typedef int saidx_t;
-
-#endif
+#include <sys/types.h>
/**
* @brief Represents LCP-Intervals.
=====================================
src/io.c
=====================================
@@ -96,7 +96,7 @@ size_t string_vector_size(const struct string_vector *sv) {
}
/**
- * @brief Read a *fof* and add it's contents into a vector.
+ * @brief Read a *fof* and add its contents into a vector.
* @param file_name - The file of file names aka. fof.
* @param sv - The vector to add file names to.
*/
@@ -204,35 +204,32 @@ void read_fasta(const char *file_name, dsa_t *dsa) {
return;
}
- int l;
- int check;
-
- seq_t top = {};
- pfasta_file pf;
-
- if ((l = pfasta_parse(&pf, file_descriptor)) != 0) {
- soft_errx("%s: %s", file_name, pfasta_strerror(&pf));
+ struct pfasta_parser pp = pfasta_init(file_descriptor);
+ if (pp.errstr) {
+ soft_errx("%s: %s", file_name, pp.errstr);
goto fail;
}
- pfasta_seq ps;
- while ((l = pfasta_read(&pf, &ps)) == 0) {
- check = seq_init(&top, ps.seq, ps.name);
+ seq_t top = {};
+ while (!pp.done) {
+ struct pfasta_record pr = pfasta_read(&pp);
+ if (pp.errstr) {
+ soft_errx("%s: %s", file_name, pp.errstr);
+ goto fail;
+ }
+
+ int check = seq_init(&top, pr.sequence, pr.name);
// skip broken sequences
if (check != 0) continue;
dsa_push(dsa, top);
- pfasta_seq_free(&ps);
+ pfasta_record_free(&pr);
}
- if (l < 0) {
- soft_errx("%s: %s", file_name, pfasta_strerror(&pf));
- pfasta_seq_free(&ps);
- }
fail:
- pfasta_free(&pf);
+ pfasta_free(&pp);
close(file_descriptor);
}
@@ -295,10 +292,10 @@ void print_distances(const struct model *D, const seq_t *sequences, size_t n,
double coverage1 = model_coverage(&D(i, j));
double coverage2 = model_coverage(&D(j, i));
- if (coverage1 < 0.05 || coverage2 < 0.05) {
+ if (coverage1 < 0.2 || coverage2 < 0.2) {
const char str[] = {
- "For the two sequences '%s' and '%s' less than 5%% "
- "homology were found (%f and %f, respectively)."};
+ "For the two sequences '%s' and '%s' very little "
+ "homology was found (%f and %f, respectively)."};
soft_errx(str, sequences[i].name, sequences[j].name,
coverage1, coverage2);
}
=====================================
src/process.c
=====================================
@@ -10,112 +10,18 @@
#include "io.h"
#include "model.h"
#include "sequence.h"
-#include <assert.h>
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
-#include <string.h>
#ifdef _OPENMP
#include <omp.h>
#endif
-double shustring_cum_prob(size_t x, double g, size_t l);
int calculate_bootstrap(const struct model *M, const seq_t *sequences,
size_t n);
-/**
- * @brief Calculates the minimum anchor length.
- *
- * Given some parameters calculate the minimum length for anchors according
- * to the distribution from Haubold et al. (2009).
- *
- * @param p - The probability with which an anchor will be created under a
- * random model.
- * @param g - The the relative amount of GC in the subject.
- * @param l - The length of the subject (includes revcomp).
- * @returns The minimum length of an anchor.
- */
-size_t min_anchor_length(double p, double g, size_t l) {
- size_t x = 1;
-
- while (shustring_cum_prob(x, g / 2, l) < 1 - p) {
- x++;
- }
-
- return x;
-}
-
-/**
- * @brief Calculates the binomial coefficient of n and k.
- *
- * We could (and probably should) use gsl_sf_lnchoose(xx,kk) for this.
- *
- * @param n - The n part of the binomial coefficient.
- * @param k - analogue.
- * @returns (n choose k)
- */
-size_t binomial_coefficient(size_t n, size_t k) {
- if (n <= 0 || k > n) {
- return 0;
- }
-
- if (k == 0 || k == n) {
- return 1;
- }
-
- if (k > n - k) {
- k = n - k;
- }
-
- size_t res = 1;
-
- for (size_t i = 1; i <= k; i++) {
- res *= n - k + i;
- res /= i;
- }
-
- return res;
-}
-
-/**
- * @brief Given `x` this function calculates the probability of a shustring
- * with a length less or equal to `x` under a random model. This means, it is
- * the cumulative probability.
- *
- * Let X be the longest shortest unique substring (shustring) at any position.
- * Then this function computes P{X <= x} with respect to the given parameter
- * set. See Haubold et al. (2009). Note that `x` includes the final mismatch.
- * Thus, `x` is `match length + 1`.
- *
- * @param x - The maximum length of a shustring.
- * @param p - The half of the relative amount of GC in the DNA.
- * @param l - The length of the subject.
- * @returns The probability of a certain shustring length.
- */
-double shustring_cum_prob(size_t x, double p, size_t l) {
- double xx = (double)x;
- double ll = (double)l;
- size_t k;
-
- double s = 0.0;
-
- for (k = 0; k <= x; k++) {
- double kk = (double)k;
- double t = pow(p, kk) * pow(0.5 - p, xx - kk);
-
- s += pow(2, xx) * (t * pow(1 - t, ll)) *
- (double)binomial_coefficient(x, k);
- if (s >= 1.0) {
- s = 1.0;
- break;
- }
- }
-
- return s;
-}
-
typedef _Bool bool;
#define false 0
#define true !false
@@ -229,18 +135,17 @@ static inline bool anchor(const struct context *ctx,
* @param query - The actual query string.
* @param query_length - The length of the query string. Needed for speed
* reasons.
- * @param gc - The relative GC content of the subject.
+ * @param threshold - Minimal length for an anchor.
* @returns A matrix with estimates of base substitutions.
*/
model dist_anchor(const esa_s *C, const char *query, size_t query_length,
- double gc) {
+ size_t threshold) {
struct model ret = {.seq_len = query_length, .counts = {0}};
struct anchor this_match = {0};
struct anchor last_match = {0};
bool last_was_right_anchor = false;
-
- size_t threshold = min_anchor_length(ANCHOR_P_VALUE, gc, C->len);
+ size_t border = C->len / 2;
struct context ctx = {C, query, query_length, threshold};
@@ -256,7 +161,8 @@ model dist_anchor(const esa_s *C, const char *query, size_t query_length,
size_t end_Q = last_match.pos_Q + last_match.length;
// Check if this can be a right anchor to the last one.
if (this_match.pos_S > end_S &&
- this_match.pos_Q - end_Q == this_match.pos_S - end_S) {
+ this_match.pos_Q - end_Q == this_match.pos_S - end_S &&
+ (this_match.pos_S < border) == (last_match.pos_S < border)) {
// classify nucleotides in the left qanchor
model_count_equal(&ret, query + last_match.pos_Q,
=====================================
src/sequence.c
=====================================
@@ -6,6 +6,7 @@
*/
#include <ctype.h>
#include <limits.h>
+#include <math.h>
#include <stdlib.h>
#include <string.h>
@@ -14,6 +15,8 @@
#include <compat-stdlib.h>
void normalize(seq_t *S);
+double shustring_cum_prob(size_t x, double g, size_t l);
+size_t min_anchor_length(double p, double g, size_t l);
/** Create a new dynamic array for sequences. */
int dsa_init(dsa_t *A) {
@@ -147,17 +150,13 @@ char *revcomp(const char *str, size_t len) {
rev[len] = '\0';
do {
+ char c = *s--;
char d;
- switch (*s--) {
- case 'A': d = 'T'; break;
- case 'T': d = 'A'; break;
- case 'G': d = 'C'; break;
- case 'C': d = 'G'; break;
- case '!':
- d = ';';
- break; // rosebud
- default: continue;
+ if (c < 'A') {
+ d = ';'; // rosebud
+ } else {
+ d = c ^= c & 2 ? 4 : 21;
}
*r++ = d;
@@ -213,6 +212,9 @@ int seq_subject_init(seq_subject *S, const seq_t *base) {
S->RS = catcomp(base->S, base->len);
if (!S->RS) return 1;
S->RSlen = 2 * base->len + 1;
+
+ S->threshold = min_anchor_length(ANCHOR_P_VALUE, S->gc, S->RSlen);
+
return 0;
}
@@ -278,3 +280,94 @@ void normalize(seq_t *S) {
FLAGS |= F_NON_ACGT;
}
}
+
+/**
+ * @brief Calculates the minimum anchor length.
+ *
+ * Given some parameters calculate the minimum length for anchors according
+ * to the distribution from Haubold et al. (2009).
+ *
+ * @param p - The probability with which an anchor will be created under a
+ * random model.
+ * @param g - The the relative amount of GC in the subject.
+ * @param l - The length of the subject (includes revcomp).
+ * @returns The minimum length of an anchor.
+ */
+size_t min_anchor_length(double p, double g, size_t l) {
+ size_t x = 1;
+
+ while (shustring_cum_prob(x, g / 2, l) < 1 - p) {
+ x++;
+ }
+
+ return x;
+}
+
+/**
+ * @brief Calculates the binomial coefficient of n and k.
+ *
+ * We could (and probably should) use gsl_sf_lnchoose(xx,kk) for this.
+ *
+ * @param n - The n part of the binomial coefficient.
+ * @param k - analogue.
+ * @returns (n choose k)
+ */
+size_t binomial_coefficient(size_t n, size_t k) {
+ if (n <= 0 || k > n) {
+ return 0;
+ }
+
+ if (k == 0 || k == n) {
+ return 1;
+ }
+
+ if (k > n - k) {
+ k = n - k;
+ }
+
+ size_t res = 1;
+
+ for (size_t i = 1; i <= k; i++) {
+ res *= n - k + i;
+ res /= i;
+ }
+
+ return res;
+}
+
+/**
+ * @brief Given `x` this function calculates the probability of a shustring
+ * with a length less or equal to `x` under a random model. This means, it is
+ * the cumulative probability.
+ *
+ * Let X be the longest shortest unique substring (shustring) at any position.
+ * Then this function computes P{X <= x} with respect to the given parameter
+ * set. See Haubold et al. (2009). Note that `x` includes the final mismatch.
+ * Thus, `x` is `match length + 1`.
+ *
+ * @param x - The maximum length of a shustring.
+ * @param p - The half of the relative amount of GC in the DNA.
+ * @param l - The length of the subject.
+ * @returns The probability of a certain shustring length.
+ */
+double shustring_cum_prob(size_t x, double p, size_t l) {
+ double xx = (double)x;
+ double ll = (double)l;
+ size_t k;
+
+ double s = 0.0;
+
+ for (k = 0; k <= x; k++) {
+ double kk = (double)k;
+ double t = pow(p, kk) * pow(0.5 - p, xx - kk);
+
+ s += pow(2, xx) * (t * pow(1 - t, ll)) *
+ (double)binomial_coefficient(x, k);
+ if (s >= 1.0) {
+ s = 1.0;
+ break;
+ }
+ }
+
+ return s;
+}
=====================================
src/sequence.h
=====================================
@@ -40,6 +40,8 @@ typedef struct seq_subject {
* The relative amount of G or C in the DNA.
*/
double gc;
+ /** The minimum length for an anchor. */
+ size_t threshold;
} seq_subject;
void seq_free(seq_t *S);
=====================================
test/Makefile.am
=====================================
@@ -1,12 +1,6 @@
check_PROGRAMS = test_esa test_seq test_fasta test_process
dist_noinst_DATA = test_extra.sh test_random.sh test_join.sh nan.sh low_homo.sh
-if !BUILD_WITH_LIBDIVSUFSORT
-PSUFSORT=$(top_builddir)/opt/psufsort/libpsufsort.a
-# This is a hack to make sure, the test_esa executable is created with a C++ compiler
-DUMMY=dummy.cxx
-endif
-
test_seq_SOURCES = test_seq.c $(top_srcdir)/src/sequence.c
test_seq_CPPFLAGS = -I$(top_srcdir)/src -I$(top_srcdir)/opt -DDEBUG -std=gnu99
test_seq_CFLAGS = -Wall -Wextra $(GLIB_CFLAGS) -Wno-missing-field-initializers
@@ -16,15 +10,11 @@ test_process_SOURCES = test_process.c $(top_srcdir)/src/esa.c $(top_srcdir)/src/
test_process_CPPFLAGS = $(OPENMP_CFLAGS) -I$(top_srcdir)/src -I$(top_srcdir)/opt -I$(top_srcdir)/libs -DDEBUG -std=gnu99
test_process_CFLAGS = $(OPENMP_CFLAGS) -Wall -Wextra $(GLIB_CFLAGS) -Wno-missing-field-initializers
test_process_LDADD = $(GLIB_LIBS) $(PSUFSORT) $(top_builddir)/opt/libcompat.a $(top_builddir)/libs/libpfasta.a
-test_process_CXXFLAGS = $(OPENMP_CXXFLAGS) -Wall -Wextra
-nodist_EXTRA_test_process_SOURCES = $(DUMMY)
test_esa_SOURCES = test_esa.c $(top_srcdir)/src/esa.c $(top_srcdir)/src/sequence.c $(top_srcdir)/src/esa.h
test_esa_CPPFLAGS = $(OPENMP_CFLAGS) -I$(top_srcdir)/libs -I$(top_srcdir)/opt -I$(top_srcdir)/src -DDEBUG -std=gnu99
test_esa_CFLAGS = $(OPENMP_CFLAGS) -Wall -Wextra $(GLIB_CFLAGS) -Wno-missing-field-initializers
test_esa_LDADD = $(GLIB_LIBS) $(PSUFSORT) $(top_builddir)/opt/libcompat.a
-test_esa_CXXFLAGS = $(OPENMP_CXXFLAGS) -Wall -Wextra
-nodist_EXTRA_test_esa_SOURCES = $(DUMMY)
test_fasta_SOURCES = test_fasta.cxx
=====================================
test/low_homo.sh
=====================================
@@ -1,20 +1,29 @@
#!/bin/bash -f
-./test/test_fasta -l 100000 > a.fa
-./test/test_fasta -l 100000 > b.fa
-./test/test_fasta -l 100 > both.fa
+SEED=${RANDOM_SEED:-0}
+SEED2=0
+SEED3=0
+if test $SEED -ne 0; then
+ SEED=$((SEED + 1))
+ SEED2=$((SEED + 2))
+ SEED3=$((SEED + 3))
+fi
+
+./test/test_fasta -s $SEED -l 100000 > a_low.fa
+./test/test_fasta -s $SEED2 -l 100000 > b_low.fa
+./test/test_fasta -s $SEED3 -l 100 > both_low.fa
-cat both.fa a.fa | awk -vRS='>' '{if($1 == "S0")print ">"$0 > "S0.fa"}'
-cat both.fa b.fa | awk -vRS='>' '{if($1 == "S1")print ">"$0 > "S1.fa"}'
+cat both_low.fa a_low.fa | awk -v RS='>' '{if($1 == "S0")print ">"$0 > "S0_low.fa"}'
+cat both_low.fa b_low.fa | awk -v RS='>' '{if($1 == "S1")print ">"$0 > "S1_low.fa"}'
# this is expected to trigger the low homology warning
-./src/andi -j S0.fa S1.fa 2>&1 | grep 'homology'
+./src/andi -j S0_low.fa S1_low.fa 2>&1 | grep 'homology'
EXIT_VAL=$?
if [[ EXIT_VAL -ge 1 ]]; then
echo "Triggering low homology failed" >&2
- grep '^>' a.fa b.fa both.fa
+ grep '^>' a_low.fa b_low.fa both_low.fa
fi
-rm -f a.fa b.fa both.fa S0.fa S1.fa
+rm -f a_low.fa b_low.fa both_low.fa S0_low.fa S1_low.fa
exit $EXIT_VAL
=====================================
test/nan.sh
=====================================
@@ -1,17 +1,25 @@
#!/bin/bash -f
-./test/test_fasta -l 10000 > a.fa
-./test/test_fasta -l 10000 > b.fa
+SEED=${RANDOM_SEED:-0}
+SEED2=0
+if test $SEED -ne 0; then
+ SEED=$((SEED + 1))
+ SEED2=$((SEED + 2))
+fi
+
+
+./test/test_fasta -s $SEED -l 10000 > a_nan.fa
+./test/test_fasta -s $SEED2 -l 10000 > b_nan.fa
# this is expected to trigger the nan warning
-./src/andi -j a.fa b.fa 2>&1 | grep 'nan'
+./src/andi -j a_nan.fa b_nan.fa 2>&1 | grep 'nan'
EXIT_VAL=$?
if [[ EXIT_VAL -ge 1 ]]; then
echo "Triggering nan failed" >&2
- grep '^>' a.fa b.fa both.fa
+ grep '^>' a_nan.fa b_nan.fa
fi
-rm -f a.fa b.fa
+rm -f a_nan.fa b_nan.fa
exit $EXIT_VAL
=====================================
test/test_esa.c
=====================================
@@ -6,6 +6,7 @@
int FLAGS = F_NONE;
int THREADS = 1;
+double ANCHOR_P_VALUE = 0.025;
extern const int CACHE_LENGTH;
@@ -98,6 +99,7 @@ void teardown( esa_fixture *ef, gconstpointer test_data){
free(ef->C);
seq_free(ef->S);
free(ef->S);
+ seq_subject_free(&ef->subject);
}
extern int count;
@@ -162,6 +164,9 @@ void normq_cached( esa_fixture *ef, gconstpointer test_data){
b = get_match(C, "AAAAAAAAAAAA", 12);
assert_equal_lcp( &a, &b);
+ a = get_match_cached(C, "!AAAAAAAAAAAA", 12);
+ b = get_match(C, "!AAAAAAAAAAAA", 12);
+ assert_equal_lcp( &a, &b);
}
size_t MAX_DEPTH = 11;
=====================================
test/test_extra.sh
=====================================
@@ -3,17 +3,26 @@
# Test if andi exists, and can be executed
./src/andi --version > /dev/null || exit 1
+SEED=${RANDOM_SEED:-0}
+SEED2=0
+SEED3=0
+if test $SEED -ne 0; then
+ SEED=$((SEED + 1))
+ SEED2=$((SEED + 2))
+ SEED3=$((SEED + 3))
+fi
+
# Test andi for more than just two sequences at a time
-./test/test_fasta -l 100000 -d 0.01 -d 0.01 -d 0.01 -d 0.01 | ./src/andi > /dev/null || exit 1
+./test/test_fasta -s $SEED -l 100000 -d 0.01 -d 0.01 -d 0.01 -d 0.01 | ./src/andi > /dev/null || exit 1
# Test low-memory mode
-./test/test_fasta -l 10000 > test_extra.fasta
+./test/test_fasta -s $SEED2 -l 10000 > test_extra.fasta
./src/andi test_extra.fasta > extra.out
./src/andi test_extra.fasta --low-memory > extra_low_memory.out
diff extra.out extra_low_memory.out || exit 1
# Test file of filenames
-./test/test_fasta -l 10000 > test_extra.fasta
+./test/test_fasta -s $SEED3 -l 10000 > test_extra.fasta
echo "$PWD/test_extra.fasta" > fof.txt
./src/andi test_extra.fasta > extra.out
./src/andi --file-of-filenames fof.txt > fof.out
=====================================
test/test_join.sh
=====================================
@@ -2,18 +2,27 @@
./src/andi --help > /dev/null || exit 1
+SEED=${RANDOM_SEED:-0}
+SEED2=0
+SEED3=0
+if test $SEED -ne 0; then
+ SEED=$((SEED + 1))
+ SEED2=$((SEED + 2))
+ SEED3=$((SEED + 3))
+fi
+
# Simple join test
-./test/test_fasta -l 1000 -L 1000 -d 0.1 > p1.fasta
-./test/test_fasta -l 1000 -L 1000 -d 0.1 > p2.fasta
-./test/test_fasta -l 10000 -L 10000 -d 0.1 > p3.fasta
+./test/test_fasta -s $SEED -l 1000 -L 1000 -d 0.1 > p1_join.fasta
+./test/test_fasta -s $SEED2 -l 1000 -L 1000 -d 0.1 > p2_join.fasta
+./test/test_fasta -s $SEED3 -l 10000 -L 10000 -d 0.1 > p3_join.fasta
-head -qn 2 p1.fasta p2.fasta p3.fasta > S0.fasta
-tail -qn 2 p1.fasta p2.fasta p3.fasta > S1.fasta
+head -qn 2 p1_join.fasta p2_join.fasta p3_join.fasta > S0_join.fasta
+tail -qn 2 p1_join.fasta p2_join.fasta p3_join.fasta > S1_join.fasta
-rm p1.fasta p2.fasta p3.fasta;
+rm p1_join.fasta p2_join.fasta p3_join.fasta;
-RES=$(./src/andi -m RAW -t 1 -j S0.fasta S1.fasta |
+RES=$(./src/andi -m RAW -t 1 -j S0_join.fasta S1_join.fasta |
tail -n 1 |
awk '{print ($2 - 0.1)}' |
awk 'function abs(x){return ((x < 0.0) ? -x : x)} {print abs($1-$2) < 0.03}'
@@ -21,21 +30,28 @@ RES=$(./src/andi -m RAW -t 1 -j S0.fasta S1.fasta |
if test $RES -ne 1; then
echo "The last test computed a distance deviating more than three percent from its intended value."
- echo "See S0.fasta and S1.fasta for the used sequences."
+ echo "See S0_join.fasta and S1_join.fasta for the used sequences."
exit 1;
fi
+SEED=${RANDOM_SEED:-0}
+SEED2=0
+if test $SEED -ne 0; then
+ SEED=$((SEED + 5))
+ SEED2=$((SEED + 6))
+fi
+
#unbalanced number of contigs
-./test/test_fasta -l 1000 -L 1000 -d 0.1 > p2.fasta
-./test/test_fasta -l 10000 -L 10000 -d 0.1 > p3.fasta
+./test/test_fasta -s $SEED -l 1000 -L 1000 -d 0.1 > p2_join.fasta
+./test/test_fasta -s $SEED2 -l 10000 -L 10000 -d 0.1 > p3_join.fasta
-head -qn 2 p3.fasta > S0.fasta
-tail -qn 2 p2.fasta p3.fasta > S1.fasta
+head -qn 2 p3_join.fasta > S0_join.fasta
+tail -qn 2 p2_join.fasta p3_join.fasta > S1_join.fasta
-rm p2.fasta p3.fasta;
+rm p2_join.fasta p3_join.fasta;
-RES=$(./src/andi -m RAW -t1 -j S0.fasta S1.fasta |
+RES=$(./src/andi -m RAW -t1 -j S0_join.fasta S1_join.fasta |
tail -n 1 |
awk '{print ($2 - 0.1)}' |
awk 'function abs(x){return ((x < 0.0) ? -x : x)} {print abs($1-$2) < 0.03}'
@@ -43,23 +59,31 @@ RES=$(./src/andi -m RAW -t1 -j S0.fasta S1.fasta |
if test $RES -ne 1; then
echo "The last test computed a distance deviating more than three percent from its intended value."
- echo "See S0.fasta and S1.fasta for the used sequences."
+ echo "See S0_join.fasta and S1_join.fasta for the used sequences."
exit 1;
fi
+SEED=${RANDOM_SEED:-0}
+SEED2=0
+SEED3=0
+if test $SEED -ne 0; then
+ SEED=$((SEED + 11))
+ SEED2=$((SEED + 12))
+ SEED3=$((SEED + 13))
+fi
#unbalanced number of contigs 2
-./test/test_fasta -l 1000 -L 1000 -d 0.1 > p1.fasta
-./test/test_fasta -l 1000 -L 1000 -d 0.1 > p2.fasta
-./test/test_fasta -l 10000 -L 10000 -d 0.1 > p3.fasta
+./test/test_fasta -s $SEED -l 1000 -L 1000 -d 0.1 > p1_join.fasta
+./test/test_fasta -s $SEED2 -l 1000 -L 1000 -d 0.1 > p2_join.fasta
+./test/test_fasta -s $SEED3 -l 10000 -L 10000 -d 0.1 > p3_join.fasta
-head -qn 2 p1.fasta p3.fasta > S0.fasta
-tail -qn 2 p1.fasta p2.fasta p3.fasta > S1.fasta
+head -qn 2 p1_join.fasta p3_join.fasta > S0_join.fasta
+tail -qn 2 p1_join.fasta p2_join.fasta p3_join.fasta > S1_join.fasta
-rm p1.fasta p2.fasta p3.fasta;
+rm p1_join.fasta p2_join.fasta p3_join.fasta;
-RES=$(./src/andi -mRAW -t 1 -j S0.fasta S1.fasta |
+RES=$(./src/andi -mRAW -t 1 -j S0_join.fasta S1_join.fasta |
tail -n 1 |
awk '{print ($2 - 0.1)}' |
awk 'function abs(x){return ((x < 0.0) ? -x : x)} {print abs($1-$2) < 0.03}'
@@ -67,9 +91,9 @@ RES=$(./src/andi -mRAW -t 1 -j S0.fasta S1.fasta |
if test $RES -ne 1; then
echo "The last test computed a distance deviating more than three percent from its intended value."
- echo "See S0.fasta and S1.fasta for the used sequences."
+ echo "See S0_join.fasta and S1_join.fasta for the used sequences."
exit 1;
fi
-rm S0.fasta S1.fasta
+rm S0_join.fasta S1_join.fasta
=====================================
test/test_seq.c
=====================================
@@ -4,6 +4,8 @@
#include <string.h>
#include "sequence.h"
+double ANCHOR_P_VALUE = 0.025;
+
int FLAGS = F_NONE;
void test_seq_basic(){
View it on GitLab: https://salsa.debian.org/med-team/andi/-/commit/b8148b95c7203f2ce523a97ff9bcd39e6b9485b4
--
View it on GitLab: https://salsa.debian.org/med-team/andi/-/commit/b8148b95c7203f2ce523a97ff9bcd39e6b9485b4
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20200327/6642b1a4/attachment-0001.html>
More information about the debian-med-commit
mailing list