[med-svn] [andi] 01/06: Imported Upstream version 0.9.5

Fabian Klötzl kloetzl-guest at moszumanska.debian.org
Tue Dec 8 13:55:53 UTC 2015


This is an automated email from the git hooks/post-receive script.

kloetzl-guest pushed a commit to branch master
in repository andi.

commit abc0e9a9c0983d5c3c1e5c5e8675f9e911a43f5c
Author: Fabian Klötzl <fabian at kloetzl.info>
Date:   Tue Dec 8 10:31:43 2015 +0100

    Imported Upstream version 0.9.5
---
 .travis.yml                 |  10 ++---
 Makefile.am                 |   1 +
 README.md                   |   6 +--
 andi-manual.pdf             | Bin 420249 -> 423356 bytes
 configure.ac                |  12 ++++--
 docs/andi.1.in              |   2 +-
 docs/manual/andi-manual.tex |  58 ++++++++++++++++++++++------
 libs/pfasta.c               |  49 ++++++++++++++++++------
 libs/pfasta.h               |   9 +++++
 src/Makefile.am             |   7 +---
 src/andi.c                  |  32 ++++++++++++++--
 src/esa.c                   |   4 +-
 src/global.h                |   2 +
 src/io.c                    |  40 ++++++++++++-------
 src/process.c               |  91 +++++++++++++++++++++++++++++++++++++++++++-
 src/sequence.c              |   4 +-
 test/test_join.sh           |   6 +--
 17 files changed, 266 insertions(+), 67 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index bc97df0..ff9e05f 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -8,10 +8,10 @@ before_install:
 - if [ "$CXX" = "g++" ]; then sudo apt-get install -qq g++-4.8; fi
 - if [ "$CXX" = "g++" ]; then export CXX="g++-4.8" CC="gcc-4.8"; fi
 - sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-4.8 90
-- sudo apt-get install libglib2.0-dev
-- wget https://libdivsufsort.googlecode.com/files/libdivsufsort-2.0.1.tar.gz
-- tar -xzvf libdivsufsort-2.0.1.tar.gz
-- cd libdivsufsort-2.0.1
+- sudo apt-get install libglib2.0-dev libgsl0-dev
+- wget https://github.com/y-256/libdivsufsort/archive/master.tar.gz
+- tar -xzvf master.tar.gz
+- cd libdivsufsort-master
 - mkdir build
 - cd build
 - cmake -DCMAKE_BUILD_TYPE="Release" -DCMAKE_INSTALL_PREFIX="/usr/local" ..
@@ -34,4 +34,4 @@ script:
 - make check
 - cd ..
 after_success:
-- coveralls --exclude libdivsufsort-2.0.1 --exclude libs --exclude test --gcov `which gcov-4.8` --gcov-options '\-lp'
+- coveralls --exclude libdivsufsort-master -E '^andi-.*' --exclude libs --exclude test --gcov `which gcov-4.8` --gcov-options '\-lp'
diff --git a/Makefile.am b/Makefile.am
index 7975954..86f9f73 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -25,6 +25,7 @@ endif # BUILD_TESTS
 
 dist_noinst_DATA = README ChangeLog README.md
 dist_pdf_DATA = andi-manual.pdf
+dist_noinst_SCRIPTS= scripts/maf2phy.awk scripts/vmatch.sh
 
 # This is a small hack to satisfy both GitHub and the GNU coding style.
 README: README.md
diff --git a/README.md b/README.md
index 9be208b..4d16243 100644
--- a/README.md
+++ b/README.md
@@ -14,7 +14,7 @@ For the latest [stable release](https://github.com/EvolBioInf/andi/releases) of
 
 ## Prerequisites
 
-This program has the following external dependency: [libdivsufsort](https://code.google.com/p/libdivsufsort/). Please make sure you installed `libdivsufsort` before attempting a build. If you did get the source, not as a tarball, but straight from the git repository, you will also need the autotools. Run `autoreconf -i` to generate the configure script and continue with the next step.
+This program has the following external dependencies: [libdivsufsort](https://code.google.com/p/libdivsufsort/) and the [GSL](https://www.gnu.org/software/gsl/). Please make sure you installed both before attempting a build. If you did get the source, not as a tarball, but straight from the git repository, you will also need the autotools. Run `autoreconf -i` to generate the configure script and continue with the next step.
 
 
 ## Compiling
@@ -30,14 +30,14 @@ Excessive build instructions are located in `INSTALL`. If the build was successf
 	$ andi --help
 	$ man andi
 
-You can use simply `andi` with your genomes in `FASTA` format.
+You can simply use `andi` with your genomes in `FASTA` format.
 
 	$ andi S1.fasta S2.fasta
 	2
 	S1     0.0  0.1
 	s2     0.1  0.0
 
-From this distance matrix the phylogeny can be inferred via neighbor-joining. Check the [manual](andi-manual.pdf). for a more thorough description.
+From this distance matrix the phylogeny can be inferred via neighbor-joining. Check the [manual](andi-manual.pdf) for a more thorough description.
 
 
 # Links and Additional Resources
diff --git a/andi-manual.pdf b/andi-manual.pdf
index 0f731f6..f032ac5 100644
Binary files a/andi-manual.pdf and b/andi-manual.pdf differ
diff --git a/configure.ac b/configure.ac
index fdf319f..fcd04d9 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,4 @@
-AC_INIT([andi], [0.9.5-beta])
+AC_INIT([andi], [0.9.5])
 AM_INIT_AUTOMAKE([-Wall foreign ])
 
 AC_CONFIG_MACRO_DIR([m4])
@@ -16,10 +16,16 @@ AC_OPENMP
 
 # Execute all tests using C
 AC_LANG(C)
-
-AC_CHECK_LIB(m, cos)
 AC_OPENMP
 
+AC_CHECK_LIB([m],[cos])
+AC_CHECK_LIB([gslcblas],[cblas_dgemm], [], [have_gsl=no])
+AC_CHECK_LIB([gsl],[gsl_ran_binomial], [], [have_gsl=no])
+
+AS_IF([test "x$have_gsl" = "xno"],[
+	AC_MSG_ERROR([Missing the Gnu Scientific Library.])
+])
+
 # By default try to build with libdivsufsort.
 AC_ARG_WITH([libdivsufsort],
     AS_HELP_STRING([--without-libdivsufsort], [Build without libdivsufsort and use psufsort instead.]))
diff --git a/docs/andi.1.in b/docs/andi.1.in
index 3f46b6e..35e57ec 100644
--- a/docs/andi.1.in
+++ b/docs/andi.1.in
@@ -12,7 +12,7 @@ The output is a symmetrical distance matrix in \fIPHYLIP\fR format, with each en
 .SH OPTIONS
 .TP
 \fB\-j\fR, \fB\-\-join\fR
-Use this mode if each of your \fIFASTA\fR files represents one assembly with numerous contigs. \fBandi\fR will then treat all of the contained sequences per file as a single genome. In this mode at least one file name must be provided via command line arguments. For the output the filename is used to identify each sequence.
+Use this mode if each of your \fIFASTA\fR files represents one assembly with numerous contigs. \fBandi\fR will then treat all of the contained sequences per file as a single genome. In this mode at least one filename must be provided via command line arguments. For the output the filename is used to identify each sequence.
 .TP
 \fB\-m\fR, \fB\-\-low-memory\fR
 In multithreaded mode, \fBandi\fR requires memory linear to the amount of threads. The low memory mode changes this to a constant demand independent from the used number of threads. Unfortunately, this comes at a significant runtime cost.
diff --git a/docs/manual/andi-manual.tex b/docs/manual/andi-manual.tex
index 2a60356..6bdf268 100644
--- a/docs/manual/andi-manual.tex
+++ b/docs/manual/andi-manual.tex
@@ -46,7 +46,7 @@
 \newcommand{\thymine}{\textsc{m}\oldstylenums{2}\xspace}
 \newcommand{\local}{\textsc{m}\oldstylenums{1}\xspace}
 \newcommand{\algo}[1]{\textsc{{#1}}}
-\newcommand{\andi}{\algo{andi} }
+\newcommand{\andi}{\algo{andi}\xspace}
 \newcommand{\word}[1]{\textsf{\small#1}}
 \newcommand{\wchar}[1]{\textsf{\small#1}}
 \newcommand{\eco}{\textsc{eco}\oldstylenums{29}\xspace}
@@ -74,7 +74,8 @@
   xrightmargin=12pt,
   breaklines=true,
   basicstyle=\small\ttfamily,
-  morekeywords={make, tar, git, sudo, andi, time, man, head, cut, fneighbor, fretree, figtree},
+  morekeywords={make, tar, git, sudo, andi, time, man, head, cut, fneighbor,
+   fretree, figtree, brew, aura, autoreconf},
  % literate={~} {$\sim$}{1}
 }
 
@@ -94,7 +95,7 @@
 \section*{Abstract}
 This is the documentation of the \andi program for estimating the evolutionary distance between closely related genomes. These distances can be used to rapidly infer phylogenies for big sets of genomes. Because \andi does not compute full alignments, it is so efficient that it scales well up to thousands of bacterial genomes.
 
-This is scientific software. Please cite our paper \cite{andi} if you use \andi in your publication. Also refer to the paper for the iterna of \andi. Additionally, there is a Master's thesis with even more in depth analysis of \andi \cite{kloetzl}.
+This is scientific software. Please cite our paper \cite{andi} if you use \andi in your publication. Also refer to the paper for the internals of \andi. Additionally, there is a Master's thesis with even more in depth analysis of \andi \cite{kloetzl}.
 
 \vspace*{1cm}
 \section*{License}
@@ -104,11 +105,33 @@ This document is release under the Creative Commons Attribution Share-Alike lice
 
 \chapter{Installation} %%%%%
 
-The easiest way to obtain your own copy of \algo{andi}, is to download the latest \href{https://github.com/EvolBioInf/andi/releases}{release} from GitHub. Please note, that \andi requires \algo{libdivsufsort}\footnote{\url{https://github.com/y-256/libdivsufsort}} for optimal performance \cite{divsufsort}. If you wish to install \andi without \algo{libdivsufsort}, consult Section~\ref{sub:wo-divsufsort}.
+\section{Package Manager}
 
-\andi runs best under a \algo{Unix} commandline such as \lstinline$bash$. All following examples are also intended for that environment.
+The easiest way to install \andi is via a package manager. This also handles all dependencies for you.
 
-\section{Installing from Source Package} \label{sub:regular}
+\noindent OS X with homebrew:
+
+\begin{lstlisting}
+~ %  brew install science/andi
+\end{lstlisting}
+
+\noindent ArchLinux:
+
+\begin{lstlisting}
+~ %  aura -A andi
+\end{lstlisting}
+
+\noindent Debian and Ubuntu (soon):
+
+\begin{lstlisting}
+~ %  sudo apt-get install andi
+\end{lstlisting}
+
+\andi is intended to run in a \algo{Unix} commandline such as \lstinline$bash$ or \lstinline$zsh$. All examples in this document are also intended for that environment. You can verify that \andi was installed correctly by executing \lstinline$andi -h$. This should give you a list of all available options (see Section~\ref{sec:options}).
+
+\section{Source Package} \label{sub:regular}
+
+Download the latest \href{https://github.com/EvolBioInf/andi/releases}{release} from GitHub. Please note, that \andi requires the \algo{Gnu Scientific Library} and optionally \algo{libdivsufsort}\footnote{\url{https://github.com/y-256/libdivsufsort}} for optimal performance \cite{divsufsort}. If you wish to install \andi without \algo{libdivsufsort}, consult Section~\ref{sub:wo-divsufsort}.
 
 Once you have downloaded the package, unzip it and change into the newly created directory. 
 
@@ -191,7 +214,7 @@ S1        0.0000 0.0979
 S2        0.0979 0.0000
 \end{lstlisting}
 
-The output of \andi is a matrix in \algo{Phylip} style: On the first line the number of compared sequences is given, \lstinline!2! in our example. Then the matrix is printed, where each line is preceded by the name of the $i$th sequence. Note that the matrix is symmetric and the main diagonal contains only zeros. The numbers themselves are evolutionary distances, estimates as substitution rates.
+The output of \andi is a matrix in \algo{Phylip} style: On the first line the number of compared sequences is given, \lstinline!2! in our example. Then the matrix is printed, where each line is preceded by the name of the $i$th sequence. Note that the matrix is symmetric and the main diagonal contains only zeros. The numbers themselves are evolutionary distances, estimated from substitution rates.
 
 
 \section{Input} \label{sec:join}
@@ -222,7 +245,7 @@ S2        0.0979 0.0000
 
 If the computation completed successfully, \andi exits with the status code 0. Otherwise, the value of \lstinline$errno$ is used as the exit code. \andi can also produce warnings and error messages for the user's convenience. These messages are printed to \lstinline$stderr$ and thus do not interfere with the normal output.
 
-\section{Options}
+\section{Options} \label{sec:options}
 
 \andi takes a small number of commandline options, of which even fewer are of interest on a day-to-day basis. If \lstinline$andi -h$ displays a \lstinline$-t$ option, then \andi was compiled with multi-threading support (implemented using \algo{OpenMP}). By default \andi uses all available processors. However, to restrict the number of threads, use \lstinline$-t$.
 
@@ -243,6 +266,18 @@ In the above examples the runtime dropped from \SI{0.613}{\second}, to \SI{0.362
 
 By default, the distances computed by \andi are \emph{Jukes-Cantor} corrected \cite{jukescantor}. This means, the output is substitution rates, which is greater than the mismatch rate. To obtain the pure mismatch rate, use the \lstinline$--raw$ switch.
 
+Since version 0.9.4 \andi includes a bootstrapping method. It can be activated via the \lstinline$--bootstrap$ or \lstinline$-b$ switch. This option takes a numeric argument representing the number of matrices to create. The output can then be piped into \algo{phylip}.
+
+\begin{lstlisting}
+~ %  andi -b 2 ../test/1M.1.fasta
+2
+S1        0.0000 0.1067
+S2        0.1067 0.0000
+2
+S1        0.0000 0.1071
+S2        0.1071 0.0000
+\end{lstlisting}
+
 \section{Example: \algo{eco29}}
 
 Here follows a real-world example of how to use \algo{andi}. It makes heavy use of the commandline and tools like \algo{Phylip}. If you prefer \algo{R}, check out this excellent \href{http://holtlab.net/2015/05/08/r-code-to-infer-tree-from-andi-output/}{blog post} by Kathryn Holt.
@@ -299,9 +334,9 @@ EOF
 
 \chapter{DevOps} %%%%%
 
-\andi is written in C/C++; mostly C99 with some parts in C++11. The sources are released on \algo{GitHub} as \emph{free software} under the \textsc{Gnu General Public License version~3} \cite{GPL}. Prebundled packages using \algo{autoconf} are also available, with the latest release being {v0.9} at the time of writing.
+\andi is written in C/C++; mostly C99 with some parts in C++11. The sources are released on \algo{GitHub} as \emph{free software} under the \textsc{Gnu General Public License version~3} \cite{GPL}. Prebundled packages using \algo{autoconf} are also available, with the latest release being {\version} at the time of writing.
 
-If you are interested in the internals of \andi, consult the paper \cite{andi} or my master thesis \cite{kloetzl}. Both explain the used approach in detail. The latter emphasizes the used algorithms, data structures and their efficient implementation.
+If you are interested in the internals of \algo{andi}, consult the paper \cite{andi} or my Master's thesis \cite{kloetzl}. Both explain the used approach in detail. The latter emphasizes the used algorithms, data structures and their efficient implementation.
 
 \section{Dependencies}
 
@@ -310,6 +345,7 @@ Here is a complete list of dependencies required for developing \algo{andi}.
 \begin{itemize}
   \item A C and a C++11 compiler,
   \item the \algo{autotools},
+  \item the \algo{Gnu Scientific Library},
   \item \algo{Pdflatex} with various packages for the manual,
   \item \algo{Git},
   \item \algo{glib2} for the unit tests,
@@ -344,7 +380,7 @@ The unit tests are located in the \andi repository under the \lstinline$./test$
 These minor issues are known. I intend to fix them, when I have time.
 
 \begin{enumerate}
-  \item This code will not work under Windows. At two places Unix-only code is used: filepath-seperators are assumed to be / and file-descriptors are used for I/O.
+  \item This code will not work under Windows. At two places Unix-only code is used: filepath-seperators are assumed to be \lstinline$/$ and file-descriptors are used for I/O.
 \end{enumerate}
 
 
diff --git a/libs/pfasta.c b/libs/pfasta.c
index c54cea7..8a629fc 100644
--- a/libs/pfasta.c
+++ b/libs/pfasta.c
@@ -56,7 +56,7 @@
 	do {                                                                       \
 		pf->errno__ = errno;                                                   \
 		pf->errstr = NULL;                                                     \
-		return errno;                                                          \
+		return -2;                                                             \
 	} while (0)
 
 #define PF_EXIT_FORWARD() return -1
@@ -71,14 +71,15 @@
 	do {                                                                       \
 		pf->errno__ = errno;                                                   \
 		pf->errstr = NULL;                                                     \
-		return_code = errno;                                                   \
+		return_code = -2;                                                      \
 		goto cleanup;                                                          \
 	} while (0)
 
-#define PF_FAIL_STR(str)                                                       \
+#define PF_FAIL_STR(...)                                                       \
 	do {                                                                       \
 		pf->errno__ = 0;                                                       \
-		pf->errstr = str;                                                      \
+		(void) snprintf(pf->errstr_buf, PF_ERROR_STRING_LENGTH, __VA_ARGS__);  \
+		pf->errstr = pf->errstr_buf;                                           \
 		return_code = -1;                                                      \
 		goto cleanup;                                                          \
 	} while (0)
@@ -151,6 +152,10 @@ static inline int buffer_peek(const pfasta_file *pf) {
  * @returns 0 iff successful.
  */
 static inline int buffer_adv(pfasta_file *pf) {
+	if (buffer_peek(pf) == '\n') {
+		pf->line++;
+	}
+
 	if (pf->readptr < pf->fillptr - 1) {
 		pf->readptr++;
 		return 0;
@@ -192,6 +197,8 @@ void pfasta_free(pfasta_file *pf) {
 	pf->buffer = pf->readptr = pf->fillptr = pf->errstr = NULL;
 	pf->errno__ = 0;
 	pf->fd = -1;
+	pf->line = 0;
+	pf->unexpected_char = '\0';
 }
 
 /** @brief Creates a new parser for the given file. This includes allocating the
@@ -209,6 +216,8 @@ int pfasta_parse(pfasta_file *pf, int file_descriptor) {
 	pf->errno__ = 0;
 	pf->buffer = pf->readptr = pf->fillptr = pf->errstr = NULL;
 	pf->fd = file_descriptor;
+	pf->line = 1;
+	pf->unexpected_char = '\0';
 
 	if (buffer_init(pf) != 0) PF_FAIL_FORWARD();
 
@@ -244,8 +253,11 @@ int pfasta_read(pfasta_file *pf, pfasta_seq *ps) {
 	*ps = (pfasta_seq){NULL, NULL, NULL};
 	int return_code = 0;
 
-	if (buffer_peek(pf) == EOF) return 1;
-	if (buffer_peek(pf) != '>') PF_FAIL_STR("Expected '>'");
+	int c = buffer_peek(pf);
+	if (c == EOF) return 1;
+	if (c != '>') {
+		PF_FAIL_STR("Expected '>', but found '%c' on line %zu", c, pf->line);
+	}
 
 	if (pfasta_read_name(pf, ps) < 0) PF_FAIL_FORWARD();
 	if (isblank(buffer_peek(pf))) {
@@ -278,13 +290,16 @@ int pfasta_read_name(pfasta_file *pf, pfasta_seq *ps) {
 		if (buffer_adv(pf) != 0) PF_FAIL_FORWARD();
 
 		int c = buffer_peek(pf);
-		if (c == EOF) PF_FAIL_STR("Unexpected EOF in sequence name");
+		if (c == EOF) {
+			PF_FAIL_STR("Unexpected EOF in sequence name on line %zu",
+			            pf->line);
+		}
 		if (!isgraph(c)) break;
 
 		if (dynstr_put(&name, c) != 0) PF_FAIL_ERRNO();
 	}
 
-	if (dynstr_len(&name) == 0) PF_FAIL_STR("Empty name");
+	if (dynstr_len(&name) == 0) PF_FAIL_STR("Empty name on line %zu", pf->line);
 
 	ps->name = dynstr_move(&name);
 
@@ -309,8 +324,11 @@ int pfasta_read_comment(pfasta_file *pf, pfasta_seq *ps) {
 		if (buffer_adv(pf) != 0) PF_FAIL_FORWARD();
 
 		int c = buffer_peek(pf);
-		if (c == EOF) PF_FAIL_STR("Unexpected EOF in sequence comment");
 		if (c == '\n') break;
+		if (c == EOF) {
+			PF_FAIL_STR("Unexpected EOF in sequence comment on line %zu",
+			            pf->line);
+		}
 
 		if (dynstr_put(&comment, c) != 0) PF_FAIL_ERRNO();
 	}
@@ -335,7 +353,8 @@ int pfasta_read_seq(pfasta_file *pf, pfasta_seq *ps) {
 	if (dynstr_init(&seq) != 0) PF_FAIL_ERRNO();
 
 	while (1) {
-		assert(buffer_peek(pf) == '\n');
+		// The only guaranty is !graph && !blank
+		assert(!isgraph(buffer_peek(pf)) && !isblank(buffer_peek(pf)));
 
 		// deal with the first character explicitly
 		if (buffer_adv(pf) != 0) PF_FAIL_FORWARD();
@@ -351,14 +370,20 @@ int pfasta_read_seq(pfasta_file *pf, pfasta_seq *ps) {
 
 			c = buffer_peek(pf);
 			if (c == '\n') break;
+			if (c == EOF) break;
 
 		regular:
-			if (!isgraph(c)) PF_FAIL_STR("Unexpected character in sequence");
+			if (!isgraph(c)) {
+				PF_FAIL_STR("Unexpected character '%c' in sequence on line %zu",
+				            c, pf->line);
+			}
 			if (dynstr_put(&seq, c) != 0) PF_FAIL_ERRNO();
 		}
 	}
 
-	if (dynstr_len(&seq) == 0) PF_FAIL_STR("Empty sequence");
+	if (dynstr_len(&seq) == 0) {
+		PF_FAIL_STR("Empty sequence on line %zu", pf->line);
+	}
 	ps->seq = dynstr_move(&seq);
 
 cleanup:
diff --git a/libs/pfasta.h b/libs/pfasta.h
index 603cff1..c6de3fd 100644
--- a/libs/pfasta.h
+++ b/libs/pfasta.h
@@ -18,11 +18,20 @@
 #ifndef PFASTA_H
 #define PFASTA_H
 
+/** The following is the maximum length of an error string. It has to be
+ * carefully chosen, so that all calls to PF_FAIL_STR succeed. For instance,
+ * the line number can account for up to 20 characters.
+ */
+#define PF_ERROR_STRING_LENGTH 100
+
 typedef struct pfasta_file {
 	char *buffer, *readptr, *fillptr;
 	char *errstr;
 	int errno__;
 	int fd;
+	size_t line;
+	char errstr_buf[PF_ERROR_STRING_LENGTH];
+	char unexpected_char;
 } pfasta_file;
 
 typedef struct pfasta_seq {
diff --git a/src/Makefile.am b/src/Makefile.am
index 8eebbe6..6f5183a 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -14,10 +14,5 @@ andi_LDADD = $(PSUFSORT) $(top_builddir)/libs/libpfasta.a $(top_builddir)/opt/li
 nodist_EXTRA_andi_SOURCES = $(DUMMY)
 
 .PHONY: perf
-perf: CFLAGS+= -g -O3 -ggdb
+perf: CFLAGS+= -g -O3 -ggdb -fno-omit-frame-pointer
 perf: andi
-
-.PHONY: asan
-asan: CFLAGS+= -fsanitize=address -fno-omit-frame-pointer -g -O1
-asan: CXXFLAGS+= -fsanitize=address -fno-omit-frame-pointer -g -O1
-asan: andi
diff --git a/src/andi.c b/src/andi.c
index b1ba8d8..c2c3421 100644
--- a/src/andi.c
+++ b/src/andi.c
@@ -40,6 +40,7 @@
 /* Global variables */
 int FLAGS = 0;
 int THREADS = 1;
+long unsigned int BOOTSTRAP = 0;
 double RANDOM_ANCHOR_PROP = 0.05;
 
 void usage(void);
@@ -55,7 +56,7 @@ void version(void);
 int main( int argc, char *argv[]){
 	int c;
 	int version_flag = 0;
-	
+
 	struct option long_options[] = {
 		{"version", no_argument, &version_flag, 1},
 		{"help", no_argument, NULL, 'h'},
@@ -64,6 +65,7 @@ int main( int argc, char *argv[]){
 		{"join", no_argument, NULL, 'j'},
 		{"low-memory", no_argument, NULL, 'm'},
 		{"threads", required_argument, NULL, 't'},
+		{"bootstrap", required_argument, NULL, 'b'},
 		{0,0,0,0}
 	};
 
@@ -77,7 +79,7 @@ int main( int argc, char *argv[]){
 	
 		int option_index = 0;
 		
-		c = getopt_long( argc, argv, "jvhrt:p:m", long_options, &option_index);
+		c = getopt_long( argc, argv, "jvhrt:p:mb:", long_options, &option_index);
 		
 		if( c == -1){
 			break;
@@ -156,6 +158,23 @@ int main( int argc, char *argv[]){
 #endif
 				}
 				break;
+			case 'b':
+				{
+					errno = 0;
+					char *end;
+					long unsigned int bootstrap = strtoul( optarg, &end, 10);
+
+					if( errno || end == optarg || *end != '\0' || bootstrap == 0){
+						warnx(
+							"Expected a positive number for -b argument, but '%s' was given. "
+							"Ignoring -b argument.", optarg
+						);
+						break;
+					}
+
+					BOOTSTRAP = bootstrap - 1;
+				}
+				break;
 			case '?': /* intentional fall-through */
 			default:
 				usage();
@@ -217,6 +236,8 @@ int main( int argc, char *argv[]){
 		warnx("I am truly sorry, but with less than two sequences (%zu given) there is nothing to compare.", n);
 	}
 
+
+
 	dsa_free( &dsa);
 	return 0;
 }
@@ -226,16 +247,19 @@ int main( int argc, char *argv[]){
  */
 void usage(void){
 	const char str[]= {
-		"Usage: andi [-jrv] [-p FLOAT] [-t INT] FILES...\n"
+		"Usage: andi [-bjrv] [-p FLOAT] [-t INT] FILES...\n"
 		"\tFILES... can be any sequence of FASTA files. If no files are supplied, stdin is used instead.\n"
 		"Options:\n"
+		"  -b, --bootstrap <INT> \n"
+		"                    Print additional bootstrap matrices\n"
 		"  -j, --join        Treat all sequences from one file as a single genome\n"
 		"  -m, --low-memory  Use less memory at the cost of speed\n"
 		"  -p <FLOAT>        Significance of an anchor pair; default: 0.05\n"
 		"  -r, --raw         Calculates raw distances; default: Jukes-Cantor corrected\n"
 		"  -v, --verbose     Prints additional information\n"
 #ifdef _OPENMP
-		"  -t <INT>          The number of threads to be used; by default, all available processors are used\n"
+		"  -t, --threads <INT> \n"
+		"                    The number of threads to be used; by default, all available processors are used\n"
 #endif
 		"  -h, --help        Display this help and exit\n"
 		"      --version     Output version information and acknowledgments\n"
diff --git a/src/esa.c b/src/esa.c
index 992878f..3916c70 100644
--- a/src/esa.c
+++ b/src/esa.c
@@ -229,7 +229,7 @@ int esa_init_FVC(esa_s *self){
 	const int *LCP= self->LCP;
 
 	FVC[0] = '\0';
-	for(size_t i=len; i--; FVC++, SA++, LCP++){
+	for(size_t i=len; i; i--, FVC++, SA++, LCP++){
 		*FVC = S[*SA + *LCP];
 	}
 
@@ -648,7 +648,7 @@ lcp_inter_t get_match_cached( const esa_s *C, const char *query, size_t qlen){
 	if( qlen <= CACHE_LENGTH) return get_match( C, query, qlen);
 
 	ssize_t offset = 0;
-	for( size_t i = 0; i< CACHE_LENGTH; i++){
+	for( size_t i = 0; i< CACHE_LENGTH && offset >= 0; i++){
 		offset <<= 2;
 		offset |= char2code(query[i]);
 	}
diff --git a/src/global.h b/src/global.h
index 4f6f1d3..c21ff46 100644
--- a/src/global.h
+++ b/src/global.h
@@ -29,6 +29,8 @@ extern int THREADS;
  */
 extern double RANDOM_ANCHOR_PROP;
 
+extern long unsigned int BOOTSTRAP;
+
 /** 
  * This enum contains the available flags. Please note that all
  * available options are a power of 2.
diff --git a/src/io.c b/src/io.c
index ad638ec..e17ec2c 100644
--- a/src/io.c
+++ b/src/io.c
@@ -75,7 +75,7 @@ void read_fasta( const char* file_name, dsa_t *dsa){
 	pfasta_file pf;
 
 	if ((l = pfasta_parse(&pf, file_descriptor)) != 0) {
-		warnx("%s: Parser initialization failed: %s", file_name, pfasta_strerror(&pf));
+		warnx("%s: %s", file_name, pfasta_strerror(&pf));
 		goto fail;
 	}
 
@@ -91,7 +91,7 @@ void read_fasta( const char* file_name, dsa_t *dsa){
 	}
 
 	if (l < 0) {
-		warnx("%s: Input parsing failed: %s", file_name, pfasta_strerror(&pf));
+		warnx("%s: %s", file_name, pfasta_strerror(&pf));
 		pfasta_seq_free(&ps);
 	}
 
@@ -113,25 +113,29 @@ fail:
 void print_distances( const data_t *D, const seq_t *sequences, size_t n){
 
 	int use_scientific = 0;
-	int failed = 0;
 	size_t i,j;
 
-	for( i=0; i<n && (!use_scientific || !failed); i++){
+	for( i=0; i<n; i++){
 		for( j=0; j<n; j++){
-			if( D(i,j).distance > 0 && D(i,j).distance < 0.001 ){
-				use_scientific = 1;
-			}
 			if( isnan(D(i,j).distance)){
-				failed = 1;
+				const char str[] = {
+					"For the two sequences '%s' and '%s' the distance computation "
+					"failed and is reported as nan. "
+					"Please refer to the documentation for further details."
+				};
+				warnx(str, sequences[i].name, sequences[j].name);
+			} else if( D(i,j).distance > 0 && D(i,j).distance < 0.001 ){
+				use_scientific = 1;
+			} else if( i < j && D(i,j).coverage < 0.05 && D(j,i).coverage < 0.05){
+				const char str[] = {
+					"For the two sequences '%s' and '%s' less than 5%% "
+					"homology were found (%f and %f, respectively)."
+				};
+				warnx(str, sequences[i].name, sequences[j].name, D(i,j).coverage, D(j,i).coverage);
 			}
 		}
 	}
 
-	if( failed){
-		warnx("Some distance computations failed and are reported as nan. "
-			"Please refer to the documentation for further details.");
-	}
-
 	printf("%zu\n", n);
 	for( i=0;i<n;i++){
 		// Print exactly nine characters of the name. Pad with spaces if necessary.
@@ -159,6 +163,15 @@ void print_distances( const data_t *D, const seq_t *sequences, size_t n){
 				val = 0.0;
 			}
 
+			if( !(FLAGS & F_RAW)){
+				val = -0.75 * log(1.0- (4.0 / 3.0) * val ); // jukes cantor
+			}
+
+			// fix negative zero
+			if( val <= 0.0 ){
+				val = 0.0;
+			}
+
 			// use scientific notation for small numbers
 			printf(use_scientific ? " %1.4e" : " %1.4f", val);
 		}
@@ -181,4 +194,3 @@ void print_coverages( const data_t *D, size_t n){
 		printf("\n");
 	}
 }
-
diff --git a/src/process.c b/src/process.c
index 4b7a8a9..e58cace 100644
--- a/src/process.c
+++ b/src/process.c
@@ -15,11 +15,16 @@
 #include "sequence.h"
 #include "io.h"
 
+#include <time.h>
+#include <gsl/gsl_rng.h>
+#include <gsl/gsl_randist.h>
+
 #ifdef _OPENMP
 #include <omp.h>
 #endif
 
 double shuprop( size_t x, double g, size_t l);
+int calculate_bootstrap(const data_t *M, const seq_t *sequences, size_t n);
 
 /**
  * @brief Calculates the minimum anchor length.
@@ -186,7 +191,8 @@ data_t dist_anchor( const esa_s *C, const char *query, size_t query_length, doub
 #endif
 
 			// Check if this can be a right anchor to the last one.
-			if( this_pos_Q - last_pos_Q == this_pos_S - last_pos_S ){
+			if( this_pos_S > last_pos_S &&
+				this_pos_Q - last_pos_Q == this_pos_S - last_pos_S ){
 				num_right_anchors++;
 #ifdef DEBUG
 				if( this_pos_S < (size_t)(C->len / 2)){
@@ -355,12 +361,95 @@ void calculate_distances( seq_t* sequences, int n){
 	// print the results
 	print_distances( M, sequences, n);
 
+
 	// print additional information.
 	if( FLAGS & F_VERBOSE){
 		print_coverages( M, n);
 	}
+
+	// create new bootstrapped distance matrices
+	if( BOOTSTRAP ){
+		int res = calculate_bootstrap(M, sequences, n);
+		if( res){
+			warnx("Bootstrapping failed.");
+		}
+	}
+
 	
 	free(M);
 }
 
+/** Yet another hack. */
+#define B( X, Y) (B[ (X)*n + (Y) ])
 
+/** @brief Computes a bootstrap from _pairwise_ aligments.
+ *
+ * Doing bootstrapping for alignments with only two sequences is easy. As
+ * we only count substitutions, this boils down to a binomial distribution.
+ * `n` is the number of homologous sites, and `p` is the (uncorrected)
+ * substitution rate. Thus the bootstrapped distance `q` is `B(n;p)/n`.
+ * Note that `E(q) = p` and `Var(q) = p*(1-p)/n`. So in the limit of long
+ * sequences (i.e. big `n`) `q` almost certainly converges to `p`.
+ *
+ * @param M - the initial distance matrix
+ * @param sequences - a list of the sequences, containing their lengths
+ * @param n - the number of sequences
+ *
+ * The number of bootstrapped distance matrices to print is implicitly
+ * passed via the global `BOOTSTRAP` variable.
+ *
+ * @returns 0 iff successful.
+ */
+int calculate_bootstrap(const data_t *M, const seq_t *sequences, size_t n){
+	if( !M || !sequences || !n){
+		return 1;
+	}
+
+	// B is the new bootstrap matrix
+	data_t *B = malloc(n * n * sizeof(data_t));
+	if( !B) return 2;
+
+	gsl_rng *rng = gsl_rng_alloc(gsl_rng_default);
+	if( !rng){
+		free(B);
+		return 3;
+	}
+
+	// seed the random number generator with the current time
+	gsl_rng_set( rng, time(NULL));
+
+	// Compute a number of new distance matrices
+	while( BOOTSTRAP--){
+		for( size_t i=0; i< n; i++){
+			for( size_t j=i; j<n; j++){
+				if( i == j){
+					B(i,j) = (data_t){0.0,0.0};
+					continue;
+				}
+				/* The classical bootstrapping process, as described by
+					Felsenstein, resamples all nucleotides of a MSA. As andi
+					only computes a pairwise alignment, this process boils
+					down to a simple binomial distribution around a mean of
+					the original distance. */
+
+				double ijnucl = M(i,j).coverage * (double)sequences[j].len;
+				double jinucl = M(j,i).coverage * (double)sequences[i].len;
+				double homonucl = ijnucl + jinucl;
+				double avg_distance  = (M(i,j).distance * ijnucl + M(j,i).distance * jinucl)
+					/ (homonucl);
+
+				// The actual coverage value doesn't matter, just ensure its > 0
+				data_t datum = {.coverage = 1.0};
+
+				datum.distance = gsl_ran_binomial( rng, avg_distance, homonucl) / homonucl;
+				B(j,i) = B(i,j) = datum;
+			}
+		}
+
+		print_distances(B, sequences, n);
+	}
+
+	free(B);
+	gsl_rng_free(rng);
+	return 0;
+}
diff --git a/src/sequence.c b/src/sequence.c
index 973699d..3fcc088 100644
--- a/src/sequence.c
+++ b/src/sequence.c
@@ -156,7 +156,7 @@ char *revcomp( const char *str, size_t len){
 	char c, d;
 	char local_non_acgt = 0;
 	
-	while( len --> 0 ){
+	do {
 		c = *s--;
 		
 		switch( c){
@@ -171,7 +171,7 @@ char *revcomp( const char *str, size_t len){
 		}
 		
 		*r++ = d;
-	}
+	} while ( --len );
 	
 	if( local_non_acgt ){
 		#pragma omp atomic
diff --git a/test/test_join.sh b/test/test_join.sh
index 3053efb..bf17bc9 100755
--- a/test/test_join.sh
+++ b/test/test_join.sh
@@ -13,7 +13,7 @@ tail -qn 2 p1.fasta p2.fasta p3.fasta > S1.fasta
 rm p1.fasta p2.fasta p3.fasta;
 
 
-RES=$($srcdir/src/andi -rt 1 -j S0.fasta S1.fasta |
+RES=$(./src/andi -rt 1 -j S0.fasta S1.fasta |
 	tail -n 1 |
 	awk '{print ($2 - 0.1)}' |
 	awk 'function abs(x){return ((x < 0.0) ? -x : x)} {print abs($1-$2) < 0.01}'
@@ -35,7 +35,7 @@ tail -qn 2 p2.fasta p3.fasta > S1.fasta
 rm p2.fasta p3.fasta;
 
 
-RES=$($srcdir/src/andi -rt1 -j S0.fasta S1.fasta |
+RES=$(./src/andi -rt1 -j S0.fasta S1.fasta |
         tail -n 1 |
         awk '{print ($2 - 0.1)}' |
         awk 'function abs(x){return ((x < 0.0) ? -x : x)} {print abs($1-$2) < 0.01}'
@@ -59,7 +59,7 @@ tail -qn 2 p1.fasta p2.fasta p3.fasta > S1.fasta
 rm p1.fasta p2.fasta p3.fasta;
 
 
-RES=$($srcdir/src/andi -rt 1 -j S0.fasta S1.fasta |
+RES=$(./src/andi -rt 1 -j S0.fasta S1.fasta |
         tail -n 1 |
         awk '{print ($2 - 0.1)}' |
         awk 'function abs(x){return ((x < 0.0) ? -x : x)} {print abs($1-$2) < 0.01}'

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/andi.git



More information about the debian-med-commit mailing list