[med-svn] [andi] 02/07: Imported Upstream version 0.9.6
Fabian Klötzl
kloetzl-guest at moszumanska.debian.org
Fri Feb 5 19:18:49 UTC 2016
This is an automated email from the git hooks/post-receive script.
kloetzl-guest pushed a commit to branch master
in repository andi.
commit a88fbdf73c90e8061c27c83925099450a4f25c03
Author: Fabian Klötzl <fabian at kloetzl.info>
Date: Fri Feb 5 12:50:47 2016 +0000
Imported Upstream version 0.9.6
---
.clang-format | 9 ++
.travis.yml | 47 +++---
Makefile.am | 6 +-
README.md | 6 +-
andi-manual.pdf | Bin 423356 -> 425545 bytes
configure.ac | 17 +-
docs/andi.1.in | 13 +-
docs/manual/andi-manual.tex | 17 +-
opt/psufsort/Makefile.am | 2 +-
src/Makefile.am | 7 +-
src/andi.c | 326 ++++++++++++++++++++-------------------
src/dist_hack.h | 46 +++---
src/esa.c | 368 +++++++++++++++++++++-----------------------
src/esa.h | 19 ++-
src/global.h | 21 ++-
src/io.c | 161 +++++++++----------
src/io.h | 24 +--
src/model.c | 255 ++++++++++++++++++++++++++++++
src/model.h | 64 ++++++++
src/process.c | 336 ++++++++++++++++++----------------------
src/process.h | 6 +-
src/sequence.c | 203 +++++++++++-------------
src/sequence.h | 20 +--
test/Makefile.am | 6 +-
test/test_extra.sh | 6 +-
test/test_fasta.cxx | 26 +++-
test/test_join.sh | 6 +-
test/test_random.sh | 54 ++++++-
test/test_seq.c | 4 +-
29 files changed, 1218 insertions(+), 857 deletions(-)
diff --git a/.clang-format b/.clang-format
new file mode 100644
index 0000000..b2ff8f9
--- /dev/null
+++ b/.clang-format
@@ -0,0 +1,9 @@
+BasedOnStyle: LLVM
+IndentWidth: 4
+TabWidth: 4
+UseTab: Always
+AllowShortIfStatementsOnASingleLine: true
+AllowShortFunctionsOnASingleLine: false
+IndentCaseLabels: true
+AllowShortCaseLabelsOnASingleLine: true
+BreakBeforeBraces: Attach
diff --git a/.travis.yml b/.travis.yml
index ff9e05f..2ca5d14 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,32 +1,39 @@
language: cpp
compiler:
- gcc
+sudo: false
+addons:
+ apt:
+ sources:
+ - deadsnakes
+ - ubuntu-toolchain-r-test
+ packages:
+ - cmake
+ - g++-4.8
+ - libglib2.0-dev
+ - libgsl0-dev
+install:
+ - export LIBDIVDIR="$HOME/libdivsufsort"
+ - pip install --user cpp-coveralls
+ - wget https://github.com/y-256/libdivsufsort/archive/master.tar.gz
+ - tar -xzvf master.tar.gz
+ - cd libdivsufsort-master && mkdir build && cd build
+ - cmake -DCMAKE_BUILD_TYPE="Release" -DCMAKE_INSTALL_PREFIX="$LIBDIVDIR" ..
+ - make && make install
before_install:
-- sudo pip install cpp-coveralls
-- sudo add-apt-repository ppa:ubuntu-toolchain-r/test -y
-- sudo apt-get update -qq
-- if [ "$CXX" = "g++" ]; then sudo apt-get install -qq g++-4.8; fi
- if [ "$CXX" = "g++" ]; then export CXX="g++-4.8" CC="gcc-4.8"; fi
-- sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-4.8 90
-- sudo apt-get install libglib2.0-dev libgsl0-dev
-- wget https://github.com/y-256/libdivsufsort/archive/master.tar.gz
-- tar -xzvf master.tar.gz
-- cd libdivsufsort-master
-- mkdir build
-- cd build
-- cmake -DCMAKE_BUILD_TYPE="Release" -DCMAKE_INSTALL_PREFIX="/usr/local" ..
-- make
-- sudo make install
-- cd $TRAVIS_BUILD_DIR
-- export LD_LIBRARY_PATH=/usr/local:$LD_LIBRARY_PATH
+
+# - sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-4.8 90
script:
-- export LD_LIBRARY_PATH=/usr/local/lib:/usr/local:$LD_LIBRARY_PATH
-- export OMP_NUM_THREADS=4
+- export LD_LIBRARY_PATH="$LIBDIVDIR:$LIBDIVDIR/lib"
+- export LIBRARY_PATH="$LIBDIVDIR:$LIBRARY_PATH"
+- cd $TRAVIS_BUILD_DIR
- autoreconf -fvi -Im4
-- ./configure --enable-unit-tests CFLAGS='-fprofile-arcs -ftest-coverage' CXXFLAGS='-fprofile-arcs -ftest-coverage'
+- export MYFLAGS="-fprofile-arcs -ftest-coverage -I$LIBDIVDIR/include"
+- ./configure --enable-unit-tests LDFLAGS="-L$LIBDIVDIR/lib" CFLAGS="$MYFLAGS" CXXFLAGS="$MYFLAGS"
- make
- make check
-- make distcheck
+- make distcheck DISTCHECK_CONFIGURE_FLAGS="LDFLAGS=\"-L$LIBDIVDIR/lib\" CFLAGS=\"-I$LIBDIVDIR/include\" CXXFLAGS=\"-I$LIBDIVDIR/include\""
- tar xzvf andi-*.tar.gz
- cd andi-*
- ./configure --enable-unit-tests --without-libdivsufsort
diff --git a/Makefile.am b/Makefile.am
index 86f9f73..0ef0c80 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -10,15 +10,17 @@ endif
SUBDIRS = . $(OPT_PSUFSORT) libs opt src docs
DIST_SUBDIRS = . libs opt src docs test opt/psufsort
-# Conditionaly build the tests
+# Conditionally build the tests
if BUILD_TESTS
SUBDIRS+= test
+AM_TESTS_ENVIRONMENT= \
+ RANDOM_SEED='@SEED@' ; export RANDOM_SEED ;
+
TESTS = test/test_esa test/test_seq test/test_extra.sh test/test_random.sh test/test_join.sh
$(TESTS): src/andi
-# $(MAKE) -C test
endif # BUILD_TESTS
diff --git a/README.md b/README.md
index 4d16243..b863c16 100644
--- a/README.md
+++ b/README.md
@@ -14,7 +14,7 @@ For the latest [stable release](https://github.com/EvolBioInf/andi/releases) of
## Prerequisites
-This program has the following external dependencies: [libdivsufsort](https://code.google.com/p/libdivsufsort/) and the [GSL](https://www.gnu.org/software/gsl/). Please make sure you installed both before attempting a build. If you did get the source, not as a tarball, but straight from the git repository, you will also need the autotools. Run `autoreconf -i` to generate the configure script and continue with the next step.
+This program has the following external dependencies: [libdivsufsort](https://github.com/y-256/libdivsufsort) and the [GSL](https://www.gnu.org/software/gsl/). Please make sure you installed both before attempting a build. If you did get the source, not as a tarball, but straight from the git repository, you will also need the autotools. Run `autoreconf -i` to generate the configure script and continue with the next step.
## Compiling
@@ -42,11 +42,11 @@ From this distance matrix the phylogeny can be inferred via neighbor-joining. Ch
# Links and Additional Resources
-The release of this software is accompanied by a paper from [Haubold et al.](http://bioinformatics.oxfordjournals.org/content/31/8/1169). It explains the used *anchor distance* strategy in great detail. The `maf2phy.awk` script used in the validation process is located under `scripts`. Simulations were done using our own [simK](http://guanine.evolbio.mpg.de/bioBox/) tool.
+The release of this software is accompanied by a paper from [Haubold et al.](http://bioinformatics.oxfordjournals.org/content/31/8/1169). It explains the used *anchor distance* strategy in great detail. The `maf2phy.awk` script used in the validation process is located under `scripts`. Simulations were done using our own [simK](http://guanine.evolbio.mpg.de/bioBox/) tool. For a demo visualising the internals of andi visit our [GitHub pages](http://evolbioinf.github.io/andi/).
## Data Sets
-1. 29. E. coli strains: [data](http://guanine.evolbio.mpg.de/andi/eco29.fasta.gz)
+1. 29 E. coli and Shigella strains: [data](http://guanine.evolbio.mpg.de/andi/eco29.fasta.gz)
2. 109 E. coli ST131 strains ([paper](http://www.pnas.org/content/early/2014/03/28/1322678111.abstract)):
* [99 newly sequenced strains](https://github.com/BeatsonLab-MicrobialGenomics/ST131_99)
* [10 previously published strains](http://guanine.evolbio.mpg.de/andi/st131_extra.tgz)
diff --git a/andi-manual.pdf b/andi-manual.pdf
index f032ac5..f8d4467 100644
Binary files a/andi-manual.pdf and b/andi-manual.pdf differ
diff --git a/configure.ac b/configure.ac
index fcd04d9..718131f 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,4 @@
-AC_INIT([andi], [0.9.5])
+AC_INIT([andi], [0.9.6])
AM_INIT_AUTOMAKE([-Wall foreign ])
AC_CONFIG_MACRO_DIR([m4])
@@ -52,6 +52,8 @@ The latter may result in longer runtimes.])
AM_CONDITIONAL([BUILD_WITH_LIBDIVSUFSORT],[test "x${with_libdivsufsort}" != "xno"])
+
+
# The unit tests require GLIB2. So by default do not build the test.
# If enabled, check for glib.
@@ -62,6 +64,16 @@ AC_ARG_ENABLE([unit-tests],
AM_CONDITIONAL([BUILD_TESTS],[test "x${try_unit_tests}" = xyes])
+# The user may set a seed for the unit tests, so that builds are reproducible.
+# A value of 0 makes the tests random.
+AC_ARG_WITH([seed],
+ [AS_HELP_STRING([--with-seed=INT],
+ [random seed for reproducible builds. @<:@default: 0@:>@])],
+ [SEED=$withval],
+ [SEED=0])
+
+AC_SUBST([SEED])
+
if test "x${try_unit_tests}" = xyes; then
have_glib=yes
@@ -74,6 +86,7 @@ if test "x${try_unit_tests}" = xyes; then
AX_CXX_COMPILE_STDCXX_11([],[mandatory])
fi
+
# Check for various headers including those used by libdivsufsort.
AC_CHECK_HEADERS([limits.h stdlib.h string.h unistd.h stdint.h inttypes.h err.h errno.h fcntl.h])
@@ -92,7 +105,7 @@ AC_HEADER_STDBOOL
# it should be avoided in the first place. Thus I really don't need these checks.
AC_CHECK_FUNCS([floor pow sqrt strdup strerror])
-AC_CHECK_FUNCS([strndup])
+AC_CHECK_FUNCS([strndup strcasecmp])
AC_CHECK_FUNCS([strchr strrchr strchrnul])
AC_CHECK_FUNCS([strtoul strtod])
AC_CHECK_FUNCS([reallocarray])
diff --git a/docs/andi.1.in b/docs/andi.1.in
index 35e57ec..94e8efd 100644
--- a/docs/andi.1.in
+++ b/docs/andi.1.in
@@ -3,7 +3,7 @@
andi \- estimates evolutionary distance
.SH SYNOPSIS
.B andi
-[\fI-jrv\fR] [\fI-p FLOAT\fR] [\fI-t INT\fR] \fIFILES\fR...
+[\fI-jlv\fR] [\fI-b INT\fR] [\fI-p FLOAT\fR] [\fI-m MODEL\fR] [\fI-t INT\fR] \fIFILES\fR...
.SH DESCRIPTION
.TP
\fBandi\fR estimates the evolutionary distance between closely related genomes. For this \fBandi\fR reads the input sequences from \fIFASTA\fR files and computes the pairwise anchor distance. The idea behind this is explained in a paper by Haubold et al. (see below).
@@ -11,18 +11,21 @@ andi \- estimates evolutionary distance
The output is a symmetrical distance matrix in \fIPHYLIP\fR format, with each entry representing divergence with a positive real number. A distance of zero means that two sequences are identical, whereas other values are estimates for the nucleotide substitution rate (Jukes-Cantor corrected). For technical reasons the comparison might fail and no estimate can be computed. In such cases \fInan\fR is printed. This either means that the input sequences were too short (<200bp) or too diverse [...]
.SH OPTIONS
.TP
+\fB\-b, \-\-bootstrap\fR <INT>
+Compute multiple distance matrices, with \fIn-1\fR bootstrapped from the first. See the paper Klötzl & Haubold (2016, in review) for a detailed explanation.
+.TP
\fB\-j\fR, \fB\-\-join\fR
Use this mode if each of your \fIFASTA\fR files represents one assembly with numerous contigs. \fBandi\fR will then treat all of the contained sequences per file as a single genome. In this mode at least one filename must be provided via command line arguments. For the output the filename is used to identify each sequence.
.TP
-\fB\-m\fR, \fB\-\-low-memory\fR
+\fB\-l\fR, \fB\-\-low-memory\fR
In multithreaded mode, \fBandi\fR requires memory linear to the amount of threads. The low memory mode changes this to a constant demand independent from the used number of threads. Unfortunately, this comes at a significant runtime cost.
.TP
+\fB\-m\fR, \fB\-\-model\fR <Raw|JC|Kimura>
+Different models of nucleotide evolution are supported. By default the Jukes-Cantor correction is used.
+.TP
\fB\-p\fR <FLOAT>
Significance of an anchor pair; default: 0.05.
.TP
-\fB\-r\fR, \fB\-\-raw\fR
-Calculates raw distances; default: Jukes\-Cantor corrected.
-.TP
\fB\-t\fR <INT>
The number of threads to be used; by default, all available processors are used.
.br
diff --git a/docs/manual/andi-manual.tex b/docs/manual/andi-manual.tex
index 6bdf268..d2fd86b 100644
--- a/docs/manual/andi-manual.tex
+++ b/docs/manual/andi-manual.tex
@@ -153,16 +153,19 @@ Once you have downloaded the package, unzip it and change into the newly created
Now \andi should be ready for use. Try invoking the help.
\begin{lstlisting}
-~/andi-0.9 % andi -h
-Usage: andi [-jrv] [-p FLOAT] FILES...
- FILES... can be any sequence of FASTA files. If no files are supplied, stdin is used instead.
+Usage: andi [-jlv] [-b INT] [-p FLOAT] [-m MODEL] [-t INT] FILES...
+ FILES... can be any sequence of FASTA files. If no files are supplied, stdin is used instead.
Options:
+ -b, --bootstrap <INT>
+ Print additional bootstrap matrices
-j, --join Treat all sequences from one file as a single genome
- -m, --low-memory Use less memory at the cost of speed
+ -l, --low-memory Use less memory at the cost of speed
+ -m, --model <Raw|JC|Kimura>
+ Pick an evolutionary model; default: JC
-p <FLOAT> Significance of an anchor pair; default: 0.05
- -r, --raw Calculates raw distances; default: Jukes-Cantor corrected
-v, --verbose Prints additional information
- -t <INT> The number of threads to be used; default: 1
+ -t, --threads <INT>
+ The number of threads to be used; by default, all available processors are used
-h, --help Display this help and exit
--version Output version information and acknowledgments
\end{lstlisting}
@@ -264,7 +267,7 @@ S2 0.0995 0.0000
In the above examples the runtime dropped from \SI{0.613}{\second}, to \SI{0.362}{\second} using two threads. Giving \andi more threads than input genomes leads to no further speed improvement. \, The other important option is \lstinline$--join$ (see Section~\ref{sec:join}).
-By default, the distances computed by \andi are \emph{Jukes-Cantor} corrected \cite{jukescantor}. This means, the output is substitution rates, which is greater than the mismatch rate. To obtain the pure mismatch rate, use the \lstinline$--raw$ switch.
+By default, the distances computed by \andi are \emph{Jukes-Cantor} corrected \cite{jukescantor}. Other evolutionary models are also implemented (Kimura, raw). The \lstinline$--model$ parameter can be used to switch between them.
Since version 0.9.4 \andi includes a bootstrapping method. It can be activated via the \lstinline$--bootstrap$ or \lstinline$-b$ switch. This option takes a numeric argument representing the number of matrices to create. The output can then be piped into \algo{phylip}.
diff --git a/opt/psufsort/Makefile.am b/opt/psufsort/Makefile.am
index 2682dc4..85fa64e 100644
--- a/opt/psufsort/Makefile.am
+++ b/opt/psufsort/Makefile.am
@@ -1,4 +1,4 @@
noinst_LIBRARIES = libpsufsort.a
libpsufsort_a_SOURCES = psufsort.cxx c_interface.cxx interface.h $(top_srcdir)/src/global.h
-libpsufsort_a_CXXFLAGS = $(OPENMP_CXXFLAGS) -W -Wall
+libpsufsort_a_CXXFLAGS = $(OPENMP_CXXFLAGS) -Wall -Wextra
libpsufsort_a_CPPFLAGS = $(OPENMP_CXXFLAGS) -I$(top_srcdir)/src
diff --git a/src/Makefile.am b/src/Makefile.am
index 6f5183a..4384dd4 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -6,10 +6,11 @@ PSUFSORT=$(top_builddir)/opt/psufsort/libpsufsort.a
DUMMY=dummy.cxx
endif
-andi_SOURCES = andi.c esa.c process.c sequence.c io.c global.h esa.h process.h sequence.h io.h dist_hack.h
+andi_SOURCES = andi.c esa.c process.c sequence.c io.c global.h esa.h process.h sequence.h io.h dist_hack.h \
+model.h model.c
andi_CPPFLAGS = $(OPENMP_CFLAGS) -I$(top_srcdir)/libs -I$(top_srcdir)/opt -std=gnu99
-andi_CFLAGS = $(OPENMP_CFLAGS) -W -Wall -Wno-missing-field-initializers
-andi_CXXFLAGS = $(OPENMP_CXXFLAGS) -W -Wall
+andi_CFLAGS = $(OPENMP_CFLAGS) -Wall -Wextra -Wno-missing-field-initializers
+andi_CXXFLAGS = $(OPENMP_CXXFLAGS) -Wall -Wextra
andi_LDADD = $(PSUFSORT) $(top_builddir)/libs/libpfasta.a $(top_builddir)/opt/libcompat.a
nodist_EXTRA_andi_SOURCES = $(DUMMY)
diff --git a/src/andi.c b/src/andi.c
index c2c3421..b607eaa 100644
--- a/src/andi.c
+++ b/src/andi.c
@@ -1,14 +1,14 @@
/**
* @file
*
- * This is the main file. It contains functions to parse the commandline arguments,
- * read files etc.
- *
+ * This is the main file. It contains functions to parse the commandline
+ * arguments, read files etc.
+ *
* @brief The main file
* @author Fabian Klötzl
-
+ *
* @section License
-
+ *
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation; either version 3 of
@@ -22,12 +22,14 @@
*
*/
+#include <assert.h>
+#include <errno.h>
+#include <getopt.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
-#include <assert.h>
-#include <getopt.h>
-#include <errno.h>
+#include <time.h>
+#include <gsl/gsl_rng.h>
#include "global.h"
#include "process.h"
#include "io.h"
@@ -42,6 +44,8 @@ int FLAGS = 0;
int THREADS = 1;
long unsigned int BOOTSTRAP = 0;
double RANDOM_ANCHOR_PROP = 0.05;
+gsl_rng *RNG = NULL;
+int MODEL = M_JC;
void usage(void);
void version(void);
@@ -53,136 +57,133 @@ void version(void);
* the set flags it reads the input files and forwards the contained sequences
* to processing.
*/
-int main( int argc, char *argv[]){
+int main(int argc, char *argv[]) {
int c;
int version_flag = 0;
- struct option long_options[] = {
- {"version", no_argument, &version_flag, 1},
- {"help", no_argument, NULL, 'h'},
- {"raw", no_argument, NULL, 'r'},
- {"verbose", no_argument, NULL, 'v'},
- {"join", no_argument, NULL, 'j'},
- {"low-memory", no_argument, NULL, 'm'},
- {"threads", required_argument, NULL, 't'},
- {"bootstrap", required_argument, NULL, 'b'},
- {0,0,0,0}
- };
+ struct option long_options[] = {{"version", no_argument, &version_flag, 1},
+ {"help", no_argument, NULL, 'h'},
+ {"verbose", no_argument, NULL, 'v'},
+ {"join", no_argument, NULL, 'j'},
+ {"low-memory", no_argument, NULL, 'l'},
+ {"threads", required_argument, NULL, 't'},
+ {"bootstrap", required_argument, NULL, 'b'},
+ {"model", required_argument, NULL, 'm'},
+ {0, 0, 0, 0}};
#ifdef _OPENMP
// Use all available processors by default.
THREADS = omp_get_num_procs();
#endif
-
+
// parse arguments
- while( 1 ){
-
+ while (1) {
+
int option_index = 0;
-
- c = getopt_long( argc, argv, "jvhrt:p:mb:", long_options, &option_index);
-
- if( c == -1){
+
+ c = getopt_long(argc, argv, "jvht:p:m:b:l", long_options,
+ &option_index);
+
+ if (c == -1) {
break;
}
-
- switch (c){
- case 0:
- break;
- case 'h':
- usage();
- break;
- case 'r':
- FLAGS |= F_RAW;
- break;
+
+ switch (c) {
+ case 0: break;
+ case 'h': usage(); break;
case 'v':
FLAGS |= FLAGS & F_VERBOSE ? F_EXTRA_VERBOSE : F_VERBOSE;
break;
- case 'p':
- {
- errno = 0;
- char *end;
- double prop = strtod( optarg, &end);
-
- if( errno || end == optarg || *end != '\0'){
- warnx(
- "Expected a floating point number for -p argument, but '%s' was given. "
- "Skipping argument.", optarg
- );
- break;
- }
-
- if( prop < 0.0 || prop > 1.0 ){
- warnx(
- "A probability should be a value between 0 and 1; "
- "Ignoring -p %f argument.", prop
- );
- break;
- }
-
- RANDOM_ANCHOR_PROP = prop;
+ case 'p': {
+ errno = 0;
+ char *end;
+ double prop = strtod(optarg, &end);
+
+ if (errno || end == optarg || *end != '\0') {
+ warnx(
+ "Expected a floating point number for -p argument, but "
+ "'%s' was given. Skipping argument.",
+ optarg);
+ break;
}
+
+ if (prop < 0.0 || prop > 1.0) {
+ warnx("A probability should be a value between 0 and 1; "
+ "Ignoring -p %f argument.",
+ prop);
+ break;
+ }
+
+ RANDOM_ANCHOR_PROP = prop;
break;
- case 'm':
- FLAGS |= F_LOW_MEMORY;
- break;
- case 'j':
- FLAGS |= F_JOIN;
- break;
- case 't':
- {
+ }
+ case 'l': FLAGS |= F_LOW_MEMORY; break;
+ case 'j': FLAGS |= F_JOIN; break;
+ case 't': {
#ifdef _OPENMP
- errno = 0;
- char *end;
- long unsigned int threads = strtoul( optarg, &end, 10);
-
- if( errno || end == optarg || *end != '\0'){
- warnx(
- "Expected a number for -t argument, but '%s' was given. "
- "Ignoring -t argument.", optarg
- );
- break;
- }
-
- if( threads > (long unsigned int) omp_get_num_procs() ){
- warnx(
- "The number of threads to be used, is greater then the number of available processors; "
- "Ignoring -t %lu argument.", threads
- );
- break;
- }
-
- THREADS = threads;
+ errno = 0;
+ char *end;
+ long unsigned int threads = strtoul(optarg, &end, 10);
+
+ if (errno || end == optarg || *end != '\0') {
+ warnx("Expected a number for -t argument, but '%s' was "
+ "given. Ignoring -t argument.",
+ optarg);
+ break;
+ }
+
+ if (threads > (long unsigned int)omp_get_num_procs()) {
+ warnx(
+ "The number of threads to be used, is greater then the "
+ "number of available processors; Ignoring -t %lu "
+ "argument.",
+ threads);
+ break;
+ }
+
+ THREADS = threads;
#else
- warnx("This version of andi was built without OpenMP and thus "
- "does not support multi threading. Ignoring -t argument.");
+ warnx(
+ "This version of andi was built without OpenMP and thus "
+ "does not support multi threading. Ignoring -t argument.");
#endif
+ break;
+ }
+ case 'b': {
+ errno = 0;
+ char *end;
+ long unsigned int bootstrap = strtoul(optarg, &end, 10);
+
+ if (errno || end == optarg || *end != '\0' || bootstrap == 0) {
+ warnx(
+ "Expected a positive number for -b argument, but '%s' "
+ "was given. Ignoring -b argument.",
+ optarg);
+ break;
}
+
+ BOOTSTRAP = bootstrap - 1;
break;
- case 'b':
- {
- errno = 0;
- char *end;
- long unsigned int bootstrap = strtoul( optarg, &end, 10);
-
- if( errno || end == optarg || *end != '\0' || bootstrap == 0){
- warnx(
- "Expected a positive number for -b argument, but '%s' was given. "
- "Ignoring -b argument.", optarg
- );
- break;
- }
-
- BOOTSTRAP = bootstrap - 1;
+ }
+ case 'm': {
+ // valid options are 'RAW' and 'JC'
+ if (strcasecmp(optarg, "RAW") == 0) {
+ MODEL = M_RAW;
+ } else if (strcasecmp(optarg, "JC") == 0) {
+ MODEL = M_JC;
+ } else if (strcasecmp(optarg, "KIMURA") == 0) {
+ MODEL = M_KIMURA;
+ } else {
+ warnx("Ignoring argument for --model. Expected Raw JC or Kimura");
}
break;
+ }
case '?': /* intentional fall-through */
- default:
- usage();
- break;
+ default: usage(); break;
}
}
-
- if( version_flag ){
+
+ if (version_flag) {
version();
}
@@ -190,22 +191,22 @@ int main( int argc, char *argv[]){
argv += optind;
// at least one file name must be given
- if( FLAGS & F_JOIN && argc == 0 ){
+ if (FLAGS & F_JOIN && argc == 0) {
errx(1, "In join mode at least one filename needs to be supplied.");
}
-
+
dsa_t dsa;
- if(dsa_init(&dsa)){
- errx(errno,"Out of memory.");
+ if (dsa_init(&dsa)) {
+ errx(errno, "Out of memory.");
}
const char *file_name;
// parse all files
int minfiles = FLAGS & F_JOIN ? 2 : 1;
- for( ; ; minfiles-- ){
- if( !*argv){
- if( minfiles <= 0) break;
+ for (;; minfiles--) {
+ if (!*argv) {
+ if (minfiles <= 0) break;
// if no files are supplied, read from stdin
file_name = "-";
@@ -213,57 +214,68 @@ int main( int argc, char *argv[]){
file_name = *argv++;
}
- if( FLAGS & F_JOIN){
- read_fasta_join( file_name, &dsa);
+ if (FLAGS & F_JOIN) {
+ read_fasta_join(file_name, &dsa);
} else {
- read_fasta( file_name, &dsa);
+ read_fasta(file_name, &dsa);
}
}
+ size_t n = dsa_size(&dsa);
- size_t n = dsa_size( &dsa);
-
- if( FLAGS & F_VERBOSE){
- fprintf( stderr, "Comparing %zu sequences\n", n);
- fflush( stderr);
+ if (FLAGS & F_VERBOSE) {
+ fprintf(stderr, "Comparing %zu sequences\n", n);
+ fflush(stderr);
}
-
- seq_t *sequences = dsa_data( &dsa);
+
+ RNG = gsl_rng_alloc(gsl_rng_default);
+ if (!RNG) {
+ err(1, "RNG allocation failed.");
+ }
+
+ // seed the random number generator with the current time
+ gsl_rng_set(RNG, time(NULL));
+
+ seq_t *sequences = dsa_data(&dsa);
// compute distance matrix
- if( n >= 2){
+ if (n >= 2) {
calculate_distances(sequences, n);
} else {
- warnx("I am truly sorry, but with less than two sequences (%zu given) there is nothing to compare.", n);
+ warnx("I am truly sorry, but with less than two sequences (%zu given) "
+ "there is nothing to compare.",
+ n);
}
-
-
- dsa_free( &dsa);
+ dsa_free(&dsa);
+ gsl_rng_free(RNG);
return 0;
}
/**
* Prints the usage to stdout and then exits successfully.
*/
-void usage(void){
- const char str[]= {
- "Usage: andi [-bjrv] [-p FLOAT] [-t INT] FILES...\n"
- "\tFILES... can be any sequence of FASTA files. If no files are supplied, stdin is used instead.\n"
+void usage(void) {
+ const char str[] = {
+ "Usage: andi [-jlv] [-b INT] [-p FLOAT] [-m MODEL] [-t INT] FILES...\n"
+ "\tFILES... can be any sequence of FASTA files. If no files are "
+ "supplied, stdin is used instead.\n"
"Options:\n"
" -b, --bootstrap <INT> \n"
" Print additional bootstrap matrices\n"
- " -j, --join Treat all sequences from one file as a single genome\n"
- " -m, --low-memory Use less memory at the cost of speed\n"
+ " -j, --join Treat all sequences from one file as a single "
+ "genome\n"
+ " -l, --low-memory Use less memory at the cost of speed\n"
+ " -m, --model <Raw|JC|Kimura>\n"
+ " Pick an evolutionary model; default: JC\n"
" -p <FLOAT> Significance of an anchor pair; default: 0.05\n"
- " -r, --raw Calculates raw distances; default: Jukes-Cantor corrected\n"
" -v, --verbose Prints additional information\n"
#ifdef _OPENMP
" -t, --threads <INT> \n"
- " The number of threads to be used; by default, all available processors are used\n"
+ " The number of threads to be used; by default, all "
+ "available processors are used\n"
#endif
" -h, --help Display this help and exit\n"
- " --version Output version information and acknowledgments\n"
- };
+ " --version Output version information and acknowledgments\n"};
printf("%s", str);
exit(EXIT_SUCCESS);
@@ -271,25 +283,27 @@ void usage(void){
/**
* This function just prints the version string and then aborts
- * the program. It conforms to the [GNU Coding Standard](http://www.gnu.org/prep/standards/html_node/_002d_002dversion.html#g_t_002d_002dversion).
+ * the program. It conforms to the [GNU Coding
+ * Standard](http://www.gnu.org/prep/standards/html_node/_002d_002dversion.html#g_t_002d_002dversion).
*/
-void version(void){
- const char str[]= {
- "andi " VERSION "\n"
+void version(void) {
+ const char str[] = {
+ "andi " VERSION "\n"
"Copyright (C) 2014, 2015 Fabian Klötzl\n"
- "License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>\n"
+ "License GPLv3+: GNU GPL version 3 or later "
+ "<http://gnu.org/licenses/gpl.html>\n"
"This is free software: you are free to change and redistribute it.\n"
"There is NO WARRANTY, to the extent permitted by law.\n\n"
"Acknowledgments:\n"
"1) Andi: Haubold, B. Klötzl, F. and Pfaffelhuber, P. (2015). "
- "Fast and accurate estimation of evolutionary distances between closely related genomes\n"
- "2) Algorithms: Ohlebusch, E. (2013). Bioinformatics Algorithms. Sequence Analysis, "
- "Genome Rearrangements, and Phylogenetic Reconstruction. pp 118f.\n"
- "3) SA construction: Mori, Y. (2005). Short description of improved two-stage suffix "
- "sorting alorithm. http://homepage3.nifty.com/wpage/software/itssort.txt\n"
- };
+ "Fast and accurate estimation of evolutionary distances between "
+ "closely related genomes\n"
+ "2) Algorithms: Ohlebusch, E. (2013). Bioinformatics Algorithms. "
+ "Sequence Analysis, Genome Rearrangements, and Phylogenetic "
+ "Reconstruction. pp 118f.\n"
+ "3) SA construction: Mori, Y. (2005). Short description of improved "
+ "two-stage suffix sorting alorithm. "
+ "http://homepage3.nifty.com/wpage/software/itssort.txt\n"};
printf("%s", str);
exit(EXIT_SUCCESS);
}
-
-
diff --git a/src/dist_hack.h b/src/dist_hack.h
index 53f4c87..f66d37d 100644
--- a/src/dist_hack.h
+++ b/src/dist_hack.h
@@ -1,5 +1,6 @@
/** @file
- * @brief This file is a preprocessor hack for the two functions `distMatrix` and `distMatrixLM`.
+ * @brief This file is a preprocessor hack for the two functions `distMatrix`
+ * and `distMatrixLM`.
*/
#ifdef FAST
#define NAME distMatrix
@@ -10,63 +11,56 @@
#undef P_OUTER
#undef P_INNER
#define NAME distMatrixLM
-#define P_OUTER
+#define P_OUTER
#define P_INNER _Pragma("omp parallel for num_threads( THREADS)")
#endif
-/** @brief This function calls dist_andi for pairs of subjects and queries, and thereby fills the distance matrix.
+/** @brief This function calls dist_andi for pairs of subjects and queries, and
+ *thereby fills the distance matrix.
*
- * This function is actually two functions. It is one template that gets compiled into two functions via
+ * This function is actually two functions. It is one template that gets
+ *compiled into two functions via
* preprocessor hacks. The reason is DRY (Do not Repeat Yourselves).
- * The two functions only differ by their name and pragmas; i.e. They run in different parallel modes.
+ * The two functions only differ by their name and pragmas; i.e. They run in
+ *different parallel modes.
* `distMatrix` is faster than `distMatrixLM` but needs more memory.
*
* @param sequences - The sequences to compare
* @param n - The number of sequences
* @param M - A matrix for additional output data
*/
-void NAME( data_t *M, seq_t* sequences, size_t n){
+void NAME(struct model *M, seq_t *sequences, size_t n) {
size_t i;
//#pragma
P_OUTER
- for(i=0;i<n;i++){
+ for (i = 0; i < n; i++) {
seq_t *subject = &sequences[i];
esa_s E;
- seq_subject_init( subject);
- if( esa_init( &E, subject)){
- warnx("Failed to create index for %s.", subject->name);
-
- for( size_t j=0; j< n; j++){
- M(i,j).distance = (i==j) ? 0.0 : NAN;
- M(i,j).coverage = 0.0;
- }
-
- continue;
+ if (seq_subject_init(subject) || esa_init(&E, subject)) {
+ errx(1, "Failed to create index for %s.", subject->name);
}
// now compare every other sequence to i
size_t j;
P_INNER
- for(j=0; j<n; j++){
- if( j == i) {
- M(i,j) = (data_t){0.0,0.0};
+ for (j = 0; j < n; j++) {
+ if (j == i) {
+ M(i, j) = (struct model){.seq_len = 9, .counts = {9}};
continue;
}
// TODO: Provide a nicer progress indicator.
- if( FLAGS & F_EXTRA_VERBOSE ){
- #pragma omp critical
- {
- fprintf( stderr, "comparing %zu and %zu\n", i, j);
- }
+ if (FLAGS & F_EXTRA_VERBOSE) {
+#pragma omp critical
+ { fprintf(stderr, "comparing %zu and %zu\n", i, j); }
}
size_t ql = sequences[j].len;
- M(i,j) = dist_anchor( &E, sequences[j].S, ql, subject->gc);
+ M(i, j) = dist_anchor(&E, sequences[j].S, ql, subject->gc);
}
esa_free(&E);
diff --git a/src/esa.c b/src/esa.c
index 3916c70..d9297b7 100644
--- a/src/esa.c
+++ b/src/esa.c
@@ -3,12 +3,12 @@
* @brief ESA functions
*
* This file contains various functions that operate on an enhanced suffix
- * array. The basic algorithms originate from the book of Ohlebusch
+ * array. The basic algorithms originate from the book of Ohlebusch
* "Bioinformatics Algorithms" (2013). Most of these were heavily modified
* for improved performance. One example is the lcp-cache.
*
* The ESA structure defined in esa.h contains a `cache` field. This cache is
- * used to quickly look up lcp-intervals. Consider the queries "AAGT" and
+ * used to quickly look up lcp-intervals. Consider the queries "AAGT" and
* "AACG". In both cases the interval for "AA" has to be looked up in the
* ESA. If we simply store the interval for "AA" in the cache, once and use it
* for each query we are significantly faster (up to 7 times).
@@ -18,23 +18,24 @@
#include <assert.h>
#include "esa.h"
-static void esa_init_cache_dfs( esa_s *, char *str, size_t pos, lcp_inter_t in);
-static void esa_init_cache_fill( esa_s *, char *str, size_t pos, lcp_inter_t in);
+static void esa_init_cache_dfs(esa_s *, char *str, size_t pos, lcp_inter_t in);
+static void esa_init_cache_fill(esa_s *, char *str, size_t pos, lcp_inter_t in);
-static lcp_inter_t get_interval( const esa_s *, lcp_inter_t ij, char a);
-lcp_inter_t get_match( const esa_s *, const char *query, size_t qlen);
-static lcp_inter_t get_match_from( const esa_s *, const char *query, size_t qlen, saidx_t k, lcp_inter_t ij);
+static lcp_inter_t get_interval(const esa_s *, lcp_inter_t ij, char a);
+lcp_inter_t get_match(const esa_s *, const char *query, size_t qlen);
+static lcp_inter_t get_match_from(const esa_s *, const char *query, size_t qlen,
+ saidx_t k, lcp_inter_t ij);
-static int esa_init_SA( esa_s *);
-static int esa_init_LCP( esa_s *);
-static int esa_init_CLD( esa_s *);
+static int esa_init_SA(esa_s *);
+static int esa_init_LCP(esa_s *);
+static int esa_init_CLD(esa_s *);
/** @brief The prefix length up to which LCP-intervals are cached. */
const size_t CACHE_LENGTH = 10;
/** @brief Map a code to the character. */
-char code2char( ssize_t code){
- switch( code & 0x3){
+char code2char(ssize_t code) {
+ switch (code & 0x3) {
case 0: return 'A';
case 1: return 'C';
case 2: return 'G';
@@ -44,13 +45,13 @@ char code2char( ssize_t code){
}
/** @brief Map a character to a two bit code. */
-ssize_t char2code( const char c){
+ssize_t char2code(const char c) {
ssize_t result = -1;
- switch( c){
- case 'A' : result = 0; break;
- case 'C' : result = 1; break;
- case 'G' : result = 2; break;
- case 'T' : result = 3; break;
+ switch (c) {
+ case 'A': result = 0; break;
+ case 'C': result = 1; break;
+ case 'G': result = 2; break;
+ case 'T': result = 3; break;
}
return result;
}
@@ -58,39 +59,33 @@ ssize_t char2code( const char c){
#define R(CLD, i) ((CLD)[(i)])
#define L(CLD, i) ((CLD)[(i)-1])
-
-
/** @brief Fills the LCP-Interval cache.
*
- * Traversing the virtual suffix tree, created by SA, LCP and CLD is rather slow.
- * Hence we create a cache, holding the LCP-interval for a prefix of a certain
- * length ::CACHE_LENGTH. This function it the entry point for the cache filling
- * routine.
+ * Traversing the virtual suffix tree, created by SA, LCP and CLD is rather
+ * slow. Hence we create a cache, holding the LCP-interval for a prefix of a
+ * certain length ::CACHE_LENGTH. This function it the entry point for the
+ * cache filling routine.
*
* @param self - The ESA.
* @returns 0 iff successful
*/
-int esa_init_cache( esa_s *self){
- lcp_inter_t* cache = malloc((1 << (2*CACHE_LENGTH)) * sizeof(lcp_inter_t) );
+int esa_init_cache(esa_s *self) {
+ lcp_inter_t *cache =
+ malloc((1 << (2 * CACHE_LENGTH)) * sizeof(lcp_inter_t));
- if( !cache){
+ if (!cache) {
return 1;
}
self->cache = cache;
- char str[CACHE_LENGTH+1];
+ char str[CACHE_LENGTH + 1];
str[CACHE_LENGTH] = '\0';
saidx_t m = L(self->CLD, self->len);
- lcp_inter_t ij = {
- .i = 0,
- .j = self->len - 1,
- .m = m,
- .l = self->LCP[m]
- };
+ lcp_inter_t ij = {.i = 0, .j = self->len - 1, .m = m, .l = self->LCP[m]};
- esa_init_cache_dfs( self, str, 0, ij);
+ esa_init_cache_dfs(self, str, 0, ij);
return 0;
}
@@ -98,59 +93,60 @@ int esa_init_cache( esa_s *self){
/** @brief Fills the cache — one char at a time.
*
* This function is a depth first search on the virtual suffix tree and fills
- * the cache. Or rather it calls it self until some value to cache is calculated.
- * This function is a recursive version of get_inteval but with more edge cases.
+ * the cache. Or rather it calls it self until some value to cache is
+ * calculated. This function is a recursive version of get_inteval but with more
+ * edge cases.
*
* @param C - The ESA.
* @param str - The current prefix.
* @param pos - The length of the prefix.
* @param in - The LCP-interval of prefix[0..pos-1].
*/
-void esa_init_cache_dfs( esa_s *C, char *str, size_t pos, const lcp_inter_t in){
+void esa_init_cache_dfs(esa_s *C, char *str, size_t pos, const lcp_inter_t in) {
// we are not yet done, but the current strings do not exist in the subject.
- if( pos < CACHE_LENGTH && in.i == -1 && in.j == -1){
- esa_init_cache_fill(C,str,pos,in);
+ if (pos < CACHE_LENGTH && in.i == -1 && in.j == -1) {
+ esa_init_cache_fill(C, str, pos, in);
return;
}
// we are past the caching length
- if( pos >= CACHE_LENGTH){
- esa_init_cache_fill(C,str,pos,in);
+ if (pos >= CACHE_LENGTH) {
+ esa_init_cache_fill(C, str, pos, in);
return;
}
lcp_inter_t ij;
// iterate over all nucleotides
- for( int code = 0; code < 4; ++code){
+ for (int code = 0; code < 4; ++code) {
str[pos] = code2char(code);
ij = get_interval(C, in, str[pos]);
// fail early
- if( ij.i == -1 && ij.j == -1){
+ if (ij.i == -1 && ij.j == -1) {
esa_init_cache_fill(C, str, pos + 1, ij);
continue;
}
// The LCP-interval is deeper than expected
- if ( ij.l > (ssize_t)(pos + 1)){
+ if (ij.l > (ssize_t)(pos + 1)) {
// Check if it still fits into the cache
- if( (size_t)ij.l < CACHE_LENGTH){
+ if ((size_t)ij.l < CACHE_LENGTH) {
// fill with dummy value
- esa_init_cache_fill(C, str, pos+1, in);
+ esa_init_cache_fill(C, str, pos + 1, in);
char non_acgt = 0;
// fast forward
size_t k = pos + 1;
- for(;k < (size_t)ij.l; k++){
- // In some very edgy edge cases the lcp-interval `ij` contains
- // a `;` or another non-acgt character. Since we cannot cache
- // those, break.
- char c = C->S[C->SA[ij.i]+k];
- if( char2code(c) < 0){
+ for (; k < (size_t)ij.l; k++) {
+ // In some very edgy edge cases the lcp-interval `ij`
+ // contains a `;` or another non-acgt character. Since we
+ // cannot cache those, break.
+ char c = C->S[C->SA[ij.i] + k];
+ if (char2code(c) < 0) {
non_acgt = 1;
break;
}
@@ -158,7 +154,7 @@ void esa_init_cache_dfs( esa_s *C, char *str, size_t pos, const lcp_inter_t in){
str[k] = c;
}
- if( non_acgt) {
+ if (non_acgt) {
esa_init_cache_fill(C, str, k, ij);
} else {
esa_init_cache_dfs(C, str, k, ij);
@@ -168,12 +164,12 @@ void esa_init_cache_dfs( esa_s *C, char *str, size_t pos, const lcp_inter_t in){
}
// If the lcp-interval exceeds the cache depth, stop here and fill
- esa_init_cache_fill(C, str, pos+1, in);
+ esa_init_cache_fill(C, str, pos + 1, in);
continue;
}
// Continue one level deeper
- esa_init_cache_dfs(C,str,pos+1, ij);
+ esa_init_cache_dfs(C, str, pos + 1, ij);
}
}
@@ -187,15 +183,15 @@ void esa_init_cache_dfs( esa_s *C, char *str, size_t pos, const lcp_inter_t in){
* @param pos - The length of the prefix.
* @param in - The LCP-interval of prefix[0..pos-1].
*/
-void esa_init_cache_fill( esa_s *C, char *str, size_t pos, lcp_inter_t in){
- if( pos < CACHE_LENGTH){
- for( int code = 0; code < 4; ++code){
+void esa_init_cache_fill(esa_s *C, char *str, size_t pos, lcp_inter_t in) {
+ if (pos < CACHE_LENGTH) {
+ for (int code = 0; code < 4; ++code) {
str[pos] = code2char(code);
- esa_init_cache_fill( C, str, pos + 1, in);
+ esa_init_cache_fill(C, str, pos + 1, in);
}
} else {
ssize_t code = 0;
- for( size_t i = 0; i < CACHE_LENGTH; ++i ){
+ for (size_t i = 0; i < CACHE_LENGTH; ++i) {
code <<= 2;
code |= char2code(str[i]);
}
@@ -216,20 +212,20 @@ void esa_init_cache_fill( esa_s *C, char *str, size_t pos, lcp_inter_t in){
* @param self - The ESA
* @returns 0 iff successful
*/
-int esa_init_FVC(esa_s *self){
+int esa_init_FVC(esa_s *self) {
size_t len = self->len;
char *FVC = self->FVC = malloc(len);
- if(!FVC){
+ if (!FVC) {
return 1;
}
const char *S = self->S;
const int *SA = self->SA;
- const int *LCP= self->LCP;
+ const int *LCP = self->LCP;
FVC[0] = '\0';
- for(size_t i=len; i; i--, FVC++, SA++, LCP++){
+ for (size_t i = len; i; i--, FVC++, SA++, LCP++) {
*FVC = S[*SA + *LCP];
}
@@ -243,41 +239,38 @@ int esa_init_FVC(esa_s *self){
* @param S - The sequence
* @returns 0 iff successful
*/
-int esa_init( esa_s *C, const seq_t *S){
- if( !C || !S || !S->S) return 1;
+int esa_init(esa_s *C, const seq_t *S) {
+ if (!C || !S || !S->S) return 1;
- *C = (esa_s){
- .S = S->RS,
- .len = S->RSlen
- };
+ *C = (esa_s){.S = S->RS, .len = S->RSlen};
int result;
result = esa_init_SA(C);
- if(result) return result;
+ if (result) return result;
result = esa_init_LCP(C);
- if(result) return result;
+ if (result) return result;
result = esa_init_CLD(C);
- if(result) return result;
+ if (result) return result;
result = esa_init_FVC(C);
- if( result) return result;
+ if (result) return result;
result = esa_init_cache(C);
- if(result) return result;
+ if (result) return result;
return 0;
}
/** @brief Free the private data of an ESA. */
-void esa_free( esa_s *self){
- free( self->SA);
- free( self->LCP);
- free( self->CLD);
- free( self->cache);
- free( self->FVC);
+void esa_free(esa_s *self) {
+ free(self->SA);
+ free(self->LCP);
+ free(self->CLD);
+ free(self->cache);
+ free(self->FVC);
*self = (esa_s){};
}
@@ -286,25 +279,25 @@ void esa_free( esa_s *self){
* @param C The enhanced suffix array to use. Reads C->S, fills C->SA.
* @returns 0 iff successful
*/
-int esa_init_SA(esa_s *C){
+int esa_init_SA(esa_s *C) {
// assert c.S
- if( !C || !C->S ){
+ if (!C || !C->S) {
return 1;
}
C->SA = malloc(C->len * sizeof(saidx_t));
- if( !C->SA ){
+ if (!C->SA) {
return 2;
}
-
+
saidx_t result = 1;
- #ifdef HAVE_LIBDIVSUFSORT
- result = divsufsort((const unsigned char*)C->S, C->SA, C->len);
- #else
+#ifdef HAVE_LIBDIVSUFSORT
+ result = divsufsort((const unsigned char *)C->S, C->SA, C->len);
+#else
result = c_psufsort(C->S, C->SA);
- #endif
-
+#endif
+
return result;
}
@@ -314,44 +307,42 @@ int esa_init_SA(esa_s *C){
*
* @param C - The ESA
*/
-int esa_init_CLD( esa_s *C){
- if( !C || !C->LCP){
+int esa_init_CLD(esa_s *C) {
+ if (!C || !C->LCP) {
return 1;
}
- saidx_t* CLD = C->CLD = malloc((C->len+1) * sizeof(saidx_t));
- if( !C->CLD) {
+ saidx_t *CLD = C->CLD = malloc((C->len + 1) * sizeof(saidx_t));
+ if (!C->CLD) {
return 2;
}
saidx_t *LCP = C->LCP;
- typedef struct pair_s {
- saidx_t idx, lcp;
- } pair_t;
+ typedef struct pair_s { saidx_t idx, lcp; } pair_t;
- pair_t *stack = malloc((C->len+1) * sizeof(pair_t));
+ pair_t *stack = malloc((C->len + 1) * sizeof(pair_t));
pair_t *top = stack; // points at the topmost filled element
pair_t last;
- R(CLD,0) = C->len + 1;
+ R(CLD, 0) = C->len + 1;
top->idx = 0;
top->lcp = -1;
// iterate over all elements
- for( size_t k = 1; k < (size_t)(C->len + 1); k++){
- while( LCP[k] < top->lcp){
+ for (size_t k = 1; k < (size_t)(C->len + 1); k++) {
+ while (LCP[k] < top->lcp) {
// top->lcp is a leaf
last = *top--;
// link all elements of same lcp value in a chain
- while( top->lcp == last.lcp){
- R(CLD,top->idx) = last.idx;
+ while (top->lcp == last.lcp) {
+ R(CLD, top->idx) = last.idx;
last = *top--;
}
// store the l-index of last
- if( LCP[k] < top->lcp){
+ if (LCP[k] < top->lcp) {
R(CLD, top->idx) = last.idx;
} else {
L(CLD, k) = last.idx;
@@ -364,7 +355,7 @@ int esa_init_CLD( esa_s *C){
top->lcp = LCP[k];
}
- free( stack);
+ free(stack);
return 0;
}
@@ -376,63 +367,62 @@ int esa_init_CLD( esa_s *C){
* @param C The enhanced suffix array to compute the LCP from.
* @returns 0 iff successful
*/
-int esa_init_LCP( esa_s *C){
+int esa_init_LCP(esa_s *C) {
const char *S = C->S;
- saidx_t *SA = C->SA;
- saidx_t len = C->len;
-
+ saidx_t *SA = C->SA;
+ saidx_t len = C->len;
+
// Trivial safety checks
- if( !C || !S || !SA || len == 0){
+ if (!C || !S || !SA || len == 0) {
return 1;
}
-
+
// Allocate new memory
// The LCP array is one element longer than S.
- saidx_t *LCP = C->LCP = malloc((len+1)*sizeof(saidx_t));
- if( !LCP ){
+ saidx_t *LCP = C->LCP = malloc((len + 1) * sizeof(saidx_t));
+ if (!LCP) {
return 3;
}
-
+
LCP[0] = -1;
LCP[len] = -1;
-
+
// Allocate temporary arrays
- saidx_t *PHI = malloc( len * sizeof(saidx_t));
+ saidx_t *PHI = malloc(len * sizeof(saidx_t));
saidx_t *PLCP = PHI;
- if( !PHI ){
+ if (!PHI) {
free(PHI);
return 2;
}
-
+
PHI[SA[0]] = -1;
saidx_t k;
ssize_t i;
-
-
- for( i=1; i< len; i++){
- PHI[SA[i]] = SA[ i-1];
+
+ for (i = 1; i < len; i++) {
+ PHI[SA[i]] = SA[i - 1];
}
-
+
ssize_t l = 0;
- for( i = 0; i< len ; i++){
+ for (i = 0; i < len; i++) {
k = PHI[i];
- if( k != -1 ){
- while( S[k+l] == S[i+l]){
+ if (k != -1) {
+ while (S[k + l] == S[i + l]) {
l++;
}
PLCP[i] = l;
l--;
- if( l < 0) l = 0;
+ if (l < 0) l = 0;
} else {
PLCP[i] = -1;
}
}
-
+
// unpermutate the LCP array
- for( i=1; i< len; i++){
+ for (i = 1; i < len; i++) {
LCP[i] = PLCP[SA[i]];
}
-
+
free(PHI);
return 0;
}
@@ -450,7 +440,7 @@ int esa_init_LCP( esa_s *C){
* @param a - The next character.
* @returns The lcp-interval one level deeper.
*/
-static lcp_inter_t get_interval( const esa_s *self, lcp_inter_t ij, char a){
+static lcp_inter_t get_interval(const esa_s *self, lcp_inter_t ij, char a) {
saidx_t i = ij.i;
saidx_t j = ij.j;
@@ -458,10 +448,10 @@ static lcp_inter_t get_interval( const esa_s *self, lcp_inter_t ij, char a){
const saidx_t *LCP = self->LCP;
const char *S = self->S;
const saidx_t *CLD = self->CLD;
- const char *FVC= self->FVC;
+ const char *FVC = self->FVC;
// check for singleton or empty interval
- if( i == j ){
- if( S[SA[i] + ij.l] != a){
+ if (i == j) {
+ if (S[SA[i] + ij.l] != a) {
ij.i = ij.j = -1;
}
return ij;
@@ -476,36 +466,31 @@ static lcp_inter_t get_interval( const esa_s *self, lcp_inter_t ij, char a){
do {
c = FVC[i];
- SoSueMe:
- if( c == a){
+ SoSueMe:
+ if (c == a) {
/* found ! */
saidx_t n = L(CLD, m);
- ij = (lcp_inter_t){
- .i = i,
- .j = m-1,
- .m = n,
- .l = LCP[n]
- };
+ ij = (lcp_inter_t){.i = i, .j = m - 1, .m = n, .l = LCP[n]};
return ij;
}
- if( c > a){
+ if (c > a) {
break;
}
i = m;
- if( i == j ){
+ if (i == j) {
break; // singleton interval, or `a` not found
}
- m = R(CLD,m);
- } while ( /*m != "bottom" && */ LCP[m] == l);
+ m = R(CLD, m);
+ } while (/*m != "bottom" && */ LCP[m] == l);
// final sanity check
- if( i != ij.i ? FVC[i] == a : S[SA[i] + l] == a){
+ if (i != ij.i ? FVC[i] == a : S[SA[i] + l] == a) {
ij.i = i;
ij.j = j;
/* Also return the length of the LCP interval including `a` and
@@ -522,11 +507,13 @@ static lcp_inter_t get_interval( const esa_s *self, lcp_inter_t ij, char a){
/** @brief Compute the longest match of a query with the subject.
*
* The *longest match* is the core concept of `andi`. Its simply defined as the
- * longest prefix of a query Q appearing anywhere in the subject S. Talking about
- * genetic sequences, a match is a homologous region, likely followed by a SNP.
+ * longest prefix of a query Q appearing anywhere in the subject S. Talking
+ * about genetic sequences, a match is a homologous region, likely followed by a
+ * SNP.
*
* This function returns the interval for where the longest match of the query
- * can be found in the ESA. Thereto it expects a starting interval for the search.
+ * can be found in the ESA. Thereto it expects a starting interval for the
+ * search.
*
* @param C - The enhanced suffix array for the subject.
* @param query - The query sequence.
@@ -535,22 +522,23 @@ static lcp_inter_t get_interval( const esa_s *self, lcp_inter_t ij, char a){
* @param ij - The LCP interval for the string `query[0..k]`.
* @returns The LCP interval for the longest prefix.
*/
-lcp_inter_t get_match_from( const esa_s *C, const char *query, size_t qlen, saidx_t k, lcp_inter_t ij){
+lcp_inter_t get_match_from(const esa_s *C, const char *query, size_t qlen,
+ saidx_t k, lcp_inter_t ij) {
- if( ij.i == -1 && ij.j == -1){
+ if (ij.i == -1 && ij.j == -1) {
return ij;
}
// fail early on singleton intervals.
- if( ij.i == ij.j){
+ if (ij.i == ij.j) {
// try to extend the match. See line 513 below.
saidx_t p = C->SA[ij.i];
size_t k = ij.l;
const char *S = (const char *)C->S;
- for( ; k< qlen && S[p+k]; k++ ){
- if( S[p+k] != query[k]){
+ for (; k < qlen && S[p + k]; k++) {
+ if (S[p + k] != query[k]) {
ij.l = k;
return ij;
}
@@ -563,44 +551,44 @@ lcp_inter_t get_match_from( const esa_s *C, const char *query, size_t qlen, said
saidx_t l, i, j;
lcp_inter_t res = ij;
-
+
const saidx_t *SA = C->SA;
const char *S = C->S;
-
+
// Loop over the query until a mismatch is found
do {
// Get the subinterval for the next character.
- ij = get_interval( C, ij, query[k]);
+ ij = get_interval(C, ij, query[k]);
i = ij.i;
j = ij.j;
-
+
// If our match cannot be extended further, return.
- if( i == -1 && j == -1 ){
+ if (i == -1 && j == -1) {
res.l = k;
return res;
}
-
+
res.i = ij.i;
res.j = ij.j;
l = qlen;
- if( i < j && ij.l < l){
- /* Instead of making another look up we can use the LCP interval calculated
- * in get_interval */
+ if (i < j && ij.l < l) {
+ /* Instead of making another look up we can use the LCP interval
+ * calculated in get_interval */
l = ij.l;
}
-
+
// By definition, the kth letter of the query was matched.
k++;
// Extend the match
- for( int p = SA[i]; k < l; k++){
- if( S[p+k] != query[k] ){
+ for (int p = SA[i]; k < l; k++) {
+ if (S[p + k] != query[k]) {
res.l = k;
return res;
}
}
- } while ( k < (ssize_t)qlen);
+ } while (k < (ssize_t)qlen);
res.l = qlen;
return res;
@@ -608,61 +596,55 @@ lcp_inter_t get_match_from( const esa_s *C, const char *query, size_t qlen, said
/** @brief Get a match.
*
- * Given an ESA and a string Q find the longest prefix of Q that matches somewhere
- * in C. This search is done entirely via jumping around in the ESA, and thus is
- * slow.
+ * Given an ESA and a string Q find the longest prefix of Q that matches
+ * somewhere in C. This search is done entirely via jumping around in the ESA,
+ * and thus is slow.
*
* @param C - The ESA.
* @param query - The query string — duh.
* @param qlen - The length of the query.
* @returns the lcp interval of the match.
*/
-lcp_inter_t get_match( const esa_s *C, const char *query, size_t qlen){
+lcp_inter_t get_match(const esa_s *C, const char *query, size_t qlen) {
// sanity checks
- if( !C || !query || !C->len || !C->SA || !C->LCP || !C->S || !C->CLD ){
- return (lcp_inter_t){-1,-1,-1,-1};
+ if (!C || !query || !C->len || !C->SA || !C->LCP || !C->S || !C->CLD) {
+ return (lcp_inter_t){-1, -1, -1, -1};
}
saidx_t m = L(C->CLD, C->len);
- lcp_inter_t ij = {
- .i = 0,
- .j = C->len - 1,
- .m = m,
- .l = C->LCP[m]
- };
+ lcp_inter_t ij = {.i = 0, .j = C->len - 1, .m = m, .l = C->LCP[m]};
return get_match_from(C, query, qlen, 0, ij);
}
-/** @brief Compute the LCP interval of a query. For a certain prefix length of the
- * query its LCP interval is retrieved from a cache. Hence this is faster than the
- * naive `get_match`. If the cache fails to provide a proper value, we fall back
- * to the standard search.
+/** @brief Compute the LCP interval of a query. For a certain prefix length of
+ * the query its LCP interval is retrieved from a cache. Hence this is faster
+ * than the naive `get_match`. If the cache fails to provide a proper value, we
+ * fall back to the standard search.
*
* @param C - The enhanced suffix array for the subject.
* @param query - The query sequence.
* @param qlen - The length of the query. Should correspond to `strlen(query)`.
* @returns The LCP interval for the longest prefix.
*/
-lcp_inter_t get_match_cached( const esa_s *C, const char *query, size_t qlen){
- if( qlen <= CACHE_LENGTH) return get_match( C, query, qlen);
+lcp_inter_t get_match_cached(const esa_s *C, const char *query, size_t qlen) {
+ if (qlen <= CACHE_LENGTH) return get_match(C, query, qlen);
ssize_t offset = 0;
- for( size_t i = 0; i< CACHE_LENGTH && offset >= 0; i++){
+ for (size_t i = 0; i < CACHE_LENGTH && offset >= 0; i++) {
offset <<= 2;
offset |= char2code(query[i]);
}
- if( offset < 0){
- return get_match( C, query, qlen);
+ if (offset < 0) {
+ return get_match(C, query, qlen);
}
lcp_inter_t ij = C->cache[offset];
- if( ij.i == -1 && ij.j == -1){
- return get_match( C, query, qlen);
+ if (ij.i == -1 && ij.j == -1) {
+ return get_match(C, query, qlen);
}
return get_match_from(C, query, qlen, ij.l, ij);
}
-
diff --git a/src/esa.h b/src/esa.h
index 6580a2d..6595925 100644
--- a/src/esa.h
+++ b/src/esa.h
@@ -10,7 +10,7 @@
#include "config.h"
#ifdef HAVE_LIBDIVSUFSORT
-# include <divsufsort.h>
+#include <divsufsort.h>
#else
#include "../opt/psufsort/interface.h"
@@ -41,9 +41,9 @@ typedef struct {
saidx_t m;
} lcp_inter_t;
-/**
+/**
* @brief The ESA type.
- *
+ *
* This structure holds arrays and objects associated with an enhanced
* suffix array (ESA).
*/
@@ -66,16 +66,15 @@ typedef struct esa_s {
saidx_t *CLD;
} esa_s;
-lcp_inter_t get_match_cached( const esa_s *, const char *query, size_t qlen);
-lcp_inter_t get_match( const esa_s *, const char *query, size_t qlen);
-int esa_init( esa_s *, const seq_t *S);
-void esa_free( esa_s *);
+lcp_inter_t get_match_cached(const esa_s *, const char *query, size_t qlen);
+lcp_inter_t get_match(const esa_s *, const char *query, size_t qlen);
+int esa_init(esa_s *, const seq_t *S);
+void esa_free(esa_s *);
#ifdef DEBUG
-char code2char( ssize_t code);
+char code2char(ssize_t code);
-#endif //DEBUG
+#endif // DEBUG
#endif
-
diff --git a/src/global.h b/src/global.h
index c21ff46..faa244e 100644
--- a/src/global.h
+++ b/src/global.h
@@ -7,6 +7,8 @@
*/
#ifndef _GLOBAL_H_
#define _GLOBAL_H_
+#include <gsl/gsl_rng.h>
+
#include "config.h"
/**
@@ -29,15 +31,29 @@ extern int THREADS;
*/
extern double RANDOM_ANCHOR_PROP;
+/**
+ * The number of matrices that should be bootstrapped.
+ */
extern long unsigned int BOOTSTRAP;
-/**
+/**
+ * A globel random number generator. Has to be seedable.
+ */
+extern gsl_rng *RNG;
+
+/**
+ * The evolutionary model.
+ */
+extern int MODEL;
+
+enum { M_RAW, M_JC, M_KIMURA };
+
+/**
* This enum contains the available flags. Please note that all
* available options are a power of 2.
*/
enum {
F_NONE = 0,
- F_RAW = 1,
F_VERBOSE = 2,
F_EXTRA_VERBOSE = 4,
F_NON_ACGT = 8,
@@ -47,4 +63,3 @@ enum {
};
#endif
-
diff --git a/src/io.c b/src/io.c
index e17ec2c..9175f89 100644
--- a/src/io.c
+++ b/src/io.c
@@ -28,45 +28,52 @@
* @param dsa - An array that holds found sequences.
* @param name - The name of the file to be used for the name of the sequence.
*/
-void read_fasta_join( const char* file_name, dsa_t *dsa){
- if( !file_name || !dsa ) return;
+void read_fasta_join(const char *file_name, dsa_t *dsa) {
+ if (!file_name || !dsa) return;
dsa_t single;
dsa_init(&single);
- read_fasta( file_name, &single);
-
- if( dsa_size( &single) == 0 ){
+ read_fasta(file_name, &single);
+
+ if (dsa_size(&single) == 0) {
return;
}
-
- seq_t joined = dsa_join( &single);
+
+ seq_t joined = dsa_join(&single);
/* In join mode we try to be clever about the sequence name. Given the file
* path we extract just the file name. ie. path/file.ext -> file
* This obviously fails on Windows.
*/
- const char *left = strrchr( file_name, '/'); // find the last path separator
- left = (left == NULL) ? file_name : left + 1; // left is the position one of to the right of the path separator
-
- const char *dot = strchrnul( left, '.'); // find the extension
-
- joined.name = strndup( left, dot-left ); // copy only the file name, not its path or extension
- dsa_push( dsa, joined);
- dsa_free( &single);
-}
+ const char *left = strrchr(file_name, '/'); // find the last path separator
+ left = (left == NULL) ? file_name : left + 1; // left is the position one of
+ // to the right of the path
+ // separator
+
+ const char *dot = strchrnul(left, '.'); // find the extension
+ joined.name = strndup(
+ left, dot - left); // copy only the file name, not its path or extension
+ dsa_push(dsa, joined);
+ dsa_free(&single);
+}
/**
* @brief This function reads sequences from a file.
* @param in - The file pointer to read from.
* @param dsa - An array that holds found sequences.
*/
-void read_fasta( const char* file_name, dsa_t *dsa){
- if( !file_name || !dsa) return;
+void read_fasta(const char *file_name, dsa_t *dsa) {
+ if (!file_name || !dsa) return;
+
+ int file_descriptor =
+ strcmp(file_name, "-") ? open(file_name, O_RDONLY) : STDIN_FILENO;
- int file_descriptor = strcmp(file_name, "-") ? open(file_name, O_RDONLY) : STDIN_FILENO;
- if (file_descriptor < 0) warn("%s", file_name);
+ if (file_descriptor < 0) {
+ warn("%s", file_name);
+ return;
+ }
int l;
int check;
@@ -81,12 +88,12 @@ void read_fasta( const char* file_name, dsa_t *dsa){
pfasta_seq ps;
while ((l = pfasta_read(&pf, &ps)) == 0) {
- check = seq_init( &top, ps.seq, ps.name);
+ check = seq_init(&top, ps.seq, ps.name);
// skip broken sequences
- if( check != 0) continue;
-
- dsa_push( dsa, top);
+ if (check != 0) continue;
+
+ dsa_push(dsa, top);
pfasta_seq_free(&ps);
}
@@ -100,83 +107,79 @@ fail:
close(file_descriptor);
}
-
/**
* @brief Prints the distance matrix.
*
* This function pretty prints the distance matrix. For small distances
* scientific notation is used.
+ *
* @param D - The distance matrix
* @param sequences - An array of pointers to the sequences.
* @param n - The number of sequences.
+ * @param warnings - Print warnings? Set to 0 for bootstrapped matrices.
*/
-void print_distances( const data_t *D, const seq_t *sequences, size_t n){
-
+void print_distances(const struct model *D, const seq_t *sequences, size_t n,
+ int warnings) {
+ size_t i, j;
int use_scientific = 0;
- size_t i,j;
- for( i=0; i<n; i++){
- for( j=0; j<n; j++){
- if( isnan(D(i,j).distance)){
+ double *DD = malloc(n * n * sizeof(*DD));
+ if (!DD) err(errno, "meh.");
+
+#define DD(X, Y) (DD[(X)*n + (Y)])
+
+ typedef double(estimate_fn)(const model *);
+ estimate_fn *estimate;
+
+ switch (MODEL) {
+ case M_RAW: estimate = &estimate_RAW; break;
+ case M_JC: estimate = &estimate_JC; break;
+ case M_KIMURA: estimate = &estimate_KIMURA; break;
+ }
+
+ for (i = 0; i < n; i++) {
+ for (j = 0; j < n; j++) {
+ model datum = D(i, j);
+
+ if (!(FLAGS & F_EXTRA_VERBOSE)) {
+ datum = model_average(&D(i, j), &D(j, i));
+ }
+
+ double dist = DD(i, j) = estimate(&datum);
+ double coverage = model_coverage(&datum);
+
+ if (isnan(dist) && warnings) {
const char str[] = {
- "For the two sequences '%s' and '%s' the distance computation "
- "failed and is reported as nan. "
- "Please refer to the documentation for further details."
- };
+ "For the two sequences '%s' and '%s' the distance "
+ "computation failed and is reported as nan. "
+ "Please refer to the documentation for further details."};
warnx(str, sequences[i].name, sequences[j].name);
- } else if( D(i,j).distance > 0 && D(i,j).distance < 0.001 ){
+ } else if (dist > 0 && dist < 0.001) {
use_scientific = 1;
- } else if( i < j && D(i,j).coverage < 0.05 && D(j,i).coverage < 0.05){
+ } else if (i < j && coverage < 0.05 && warnings) {
const char str[] = {
"For the two sequences '%s' and '%s' less than 5%% "
- "homology were found (%f and %f, respectively)."
- };
- warnx(str, sequences[i].name, sequences[j].name, D(i,j).coverage, D(j,i).coverage);
+ "homology were found (%f and %f, respectively)."};
+ warnx(str, sequences[i].name, sequences[j].name,
+ model_coverage(&D(i, j)), model_coverage(&D(j, i)));
}
}
}
printf("%zu\n", n);
- for( i=0;i<n;i++){
- // Print exactly nine characters of the name. Pad with spaces if necessary.
+ for (i = 0; i < n; i++) {
+ // Print exactly nine characters of the name. Pad with spaces if
+ // necessary.
printf("%-9.9s", sequences[i].name);
-
- for( j=0;j<n;j++){
- // print average, weighted by covered nucleotides
- double val = 0;
- if( i != j){
- double ijnucl = D(i,j).coverage * (double)sequences[j].len;
- double jinucl = D(j,i).coverage * (double)sequences[i].len;
- val = (D(i,j).distance * ijnucl + D(j,i).distance * jinucl)
- / (ijnucl + jinucl);
- }
-
- if( FLAGS & F_EXTRA_VERBOSE ){
- val = D(i,j).distance;
- }
-
- if( !(FLAGS & F_RAW)){
- val = -0.75 * log(1.0- (4.0 / 3.0) * val ); // jukes cantor
- }
- // fix negative zero
- if( val <= 0.0 ){
- val = 0.0;
- }
-
- if( !(FLAGS & F_RAW)){
- val = -0.75 * log(1.0- (4.0 / 3.0) * val ); // jukes cantor
- }
-
- // fix negative zero
- if( val <= 0.0 ){
- val = 0.0;
- }
+ for (j = 0; j < n; j++) {
// use scientific notation for small numbers
- printf(use_scientific ? " %1.4e" : " %1.4f", val);
+ printf(use_scientific ? " %1.4e" : " %1.4f", DD(i, j));
}
printf("\n");
}
+
+ free(DD);
}
/**
@@ -184,12 +187,12 @@ void print_distances( const data_t *D, const seq_t *sequences, size_t n){
* @param D - The distance matrix
* @param n - The number of sequences.
*/
-void print_coverages( const data_t *D, size_t n){
- size_t i,j;
+void print_coverages(const struct model *D, size_t n) {
+ size_t i, j;
printf("\nCoverage:\n");
- for(i=0; i<n; i++){
- for(j=0; j<n; j++){
- printf("%1.4e ", D(i,j).coverage);
+ for (i = 0; i < n; i++) {
+ for (j = 0; j < n; j++) {
+ printf("%1.4e ", model_coverage(&D(i, j)));
}
printf("\n");
}
diff --git a/src/io.h b/src/io.h
index 38e6856..e8290f7 100644
--- a/src/io.h
+++ b/src/io.h
@@ -9,28 +9,18 @@
#include <errno.h>
#include <stdio.h>
#include "sequence.h"
-
-/** @brief The data structure can be used to store output data resulting
- * from the computation of distance.
- */
-typedef struct data_s {
- /** The distance */
- double distance;
- /** The coverage */
- double coverage;
-} data_t;
-
+#include "model.h"
/**
* This is a neat hack for dealing with matrices.
*/
-#define D( X, Y) (D[ (X)*n + (Y) ])
-#define M( X, Y) (M[ (X)*n + (Y) ])
+#define D(X, Y) (D[(X)*n + (Y)])
+#define M(X, Y) (M[(X)*n + (Y)])
-void read_fasta( const char *, dsa_t *dsa);
-void read_fasta_join( const char *, dsa_t *dsa);
+void read_fasta(const char *, dsa_t *dsa);
+void read_fasta_join(const char *, dsa_t *dsa);
-void print_distances( const data_t *D, const seq_t *sequences, size_t n);
-void print_coverages( const data_t *D, size_t n);
+void print_distances(const struct model *, const seq_t *, size_t, int);
+void print_coverages(const struct model *, size_t);
#endif // _IO_H_
diff --git a/src/model.c b/src/model.c
new file mode 100644
index 0000000..98db34c
--- /dev/null
+++ b/src/model.c
@@ -0,0 +1,255 @@
+/** @file
+ * @brief This file contains all functions for the mutation matrix and the
+ * estimation of evolutionary distances thereof.
+ */
+
+#include <inttypes.h>
+#include <math.h>
+#include <stdio.h>
+#include <gsl/gsl_randist.h>
+#include "global.h"
+#include "model.h"
+
+/**
+ * @brief Sum some mutation count specified by `summands`. Intended to be used
+ * through the `model_sum` macro.
+ *
+ * @param MM - The mutation matrix.
+ * @param summands - The mutations to add.
+ * @returns The sum of mutations.
+ */
+static size_t model_sum_types(const model *MM, const int summands[]) {
+ size_t total = 0;
+ for (int i = 0; summands[i] != MUTCOUNTS; ++i) {
+ total += MM->counts[summands[i]];
+ }
+ return total;
+}
+
+#define model_sum(MM, ...) \
+ model_sum_types((MM), (int[]){__VA_ARGS__, MUTCOUNTS})
+
+/**
+ * @brief Average two mutation matrices.
+ *
+ * @param MM - One matrix
+ * @param NN - Second matrix
+ * @returns The average (sum) of two mutation matrices.
+ */
+model model_average(const model *MM, const model *NN) {
+ model ret = *MM;
+ for (int i = 0; i != MUTCOUNTS; ++i) {
+ ret.counts[i] += NN->counts[i];
+ }
+ ret.seq_len += NN->seq_len;
+ return ret;
+}
+
+/**
+ * @brief Compute the total number of nucleotides in the pairwise alignment.
+ *
+ * @param MM - The mutation matrix.
+ * @returns The length of the alignment.
+ */
+size_t model_total(const model *MM) {
+ size_t total = 0;
+ for (size_t i = 0; i < MUTCOUNTS; ++i) {
+ total += MM->counts[i];
+ }
+ return total;
+}
+
+/**
+ * @brief Compute the coverage of an alignment.
+ *
+ * @param MM - The mutation matrix.
+ * @returns The relative coverage
+ */
+double model_coverage(const model *MM) {
+ size_t covered = model_total(MM);
+ size_t actual = MM->seq_len;
+
+ return (double)covered / (double)actual;
+}
+
+/**
+ * @brief Estimate the uncorrected distance of a pairwise alignment.
+ *
+ * @param MM - The mutation matrix.
+ * @returns The uncorrected substitution rate.
+ */
+double estimate_RAW(const model *MM) {
+ size_t nucl = model_total(MM);
+ size_t SNPs = model_sum(MM, AtoC, AtoG, AtoT, CtoG, CtoT, GtoT);
+
+ // Insignificant results. All abort the fail train.
+ if (nucl <= 3) {
+ return NAN;
+ }
+
+ return (double)SNPs / (double)nucl;
+}
+
+/**
+ * @brief Compute the Jukes-Cantor distance.
+ *
+ * @param MM - The mutation matrix.
+ * @returns The corrected JC distance.
+ */
+double estimate_JC(const model *MM) {
+ double dist = estimate_RAW(MM);
+ dist = -0.75 * log(1.0 - (4.0 / 3.0) * dist); // jukes cantor
+
+ // fix negative zero
+ return dist <= 0.0 ? 0.0 : dist;
+}
+
+/** @brief computes the evolutionary distance using K80.
+ *
+ * @param MM - The mutation matrix.
+ * @returns The corrected Kimura distance.
+ */
+double estimate_KIMURA(const model *MM) {
+ size_t nucl = model_total(MM);
+ size_t transitions = model_sum(MM, AtoG, CtoT);
+ size_t transversions = model_sum(MM, AtoC, AtoT, GtoC, GtoT);
+
+ double P = (double)transitions / (double)nucl;
+ double Q = (double)transversions / (double)nucl;
+
+ double tmp = 1.0 - 2.0 * P - Q;
+ double dist = -0.25 * log((1.0 - 2.0 * Q) * tmp * tmp);
+
+ // fix negative zero
+ return dist <= 0.0 ? 0.0 : dist;
+}
+
+/* @brief Bootstrap a mutation matrix.
+ *
+ * The classical bootstrapping process, as described by Felsenstein, resamples
+ * all nucleotides of a MSA. As andi only computes a pairwise alignment, this
+ * process boils down to a simple multinomial distribution. We just have to
+ * resample the elements of the mutation matrix. (Paper in review.)
+ *
+ * @param MM - The original mutation matrix.
+ * @returns A bootstrapped mutation matrix.
+ */
+model model_bootstrap(const model MM) {
+ model datum = MM;
+
+ size_t nucl = model_total(&MM);
+ double p[MUTCOUNTS];
+ for (size_t i = 0; i < MUTCOUNTS; ++i) {
+ p[i] = MM.counts[i] / (double)nucl;
+ }
+
+ unsigned int n[MUTCOUNTS];
+
+ gsl_ran_multinomial(RNG, MUTCOUNTS, nucl, p, n);
+
+ for (size_t i = 0; i < MUTCOUNTS; ++i) {
+ datum.counts[i] = n[i];
+ }
+
+ return datum;
+}
+
+/**
+ * @brief Given an anchor, classify nucleotides.
+ *
+ * For anchors we already know that the nucleotides of the subject and the query
+ * are equal. Thus only one sequence has to be analysed. See `model_count` for
+ * an explanation of the algorithm.
+ *
+ * @param MM - The mutation matrix
+ * @param S - The subject
+ * @param len - The anchor length
+ */
+void model_count_equal(model *MM, const char *S, size_t len) {
+ size_t local_counts[4] = {0};
+
+ for (; len--;) {
+ char s = *S++;
+
+ if (s < 'A') {
+ continue;
+ }
+
+ unsigned char nibble_s = s & 7;
+
+ static const unsigned int mm1 = 0x20031000;
+
+ unsigned char index = (mm1 >> (4 * nibble_s)) & 0x3;
+ local_counts[index]++;
+ }
+
+ MM->counts[AtoA] += local_counts[0];
+ MM->counts[CtoC] += local_counts[1];
+ MM->counts[GtoG] += local_counts[2];
+ MM->counts[TtoT] += local_counts[3];
+}
+
+/**
+ * @brief Count the substitutions and add them to the mutation matrix.
+ *
+ * @param MM - The mutation matrix.
+ * @param S - The subject
+ * @param Q - The query
+ * @param len - The length of the alignment
+ */
+void model_count(model *MM, const char *S, const char *Q, size_t len) {
+ size_t local_counts[MUTCOUNTS] = {0};
+
+ for (; len--; S++, Q++) {
+ char s = *S;
+ char q = *Q;
+
+ // Skip special characters.
+ if (s < 'A' || q < 'A') {
+ continue;
+ }
+
+ /* We want to map characters:
+ * A → 0
+ * C → 1
+ * G → 2
+ * T → 3
+ * The trick used below is that the three lower bits of the
+ * characters are unique. Thus, they can be used to compute the mapping
+ * above. The mapping itself is done via tricky bitwise operations.
+ */
+
+ unsigned char nibble_s = s & 7;
+ unsigned char nibble_q = q & 7;
+
+ static const unsigned int mm1 = 0x20031000;
+
+ // Pick the correct two bits representing s and q.
+ unsigned char foo = (mm1 >> (4 * nibble_s)) & 0x3;
+ unsigned char baz = (mm1 >> (4 * nibble_q)) & 0x3;
+
+ /*
+ * The mutation matrix is symmetric. For convenience we define the
+ * mutation XtoY with X > Y.
+ */
+ if (baz > foo) {
+ int temp = foo;
+ foo = baz;
+ baz = temp;
+ }
+
+ /*
+ * Finally, we want to map the indices to the correct mutation. This is
+ * done by utilising the mutation types in model.h.
+ */
+ static const unsigned int map4 = 0x9740;
+ unsigned int base = (map4 >> (4 * baz)) & 0xf;
+ unsigned int index = base + (foo - baz);
+
+ local_counts[index]++;
+ }
+
+ for (int i = 0; i != MUTCOUNTS; ++i) {
+ MM->counts[i] += local_counts[i];
+ }
+}
diff --git a/src/model.h b/src/model.h
new file mode 100644
index 0000000..a904af5
--- /dev/null
+++ b/src/model.h
@@ -0,0 +1,64 @@
+/** @file
+ * @brief This header contains all structures and prototypes for creating a
+ * mutation matrix and estimating distances trough an evolutionary model
+ * thereof.
+ */
+#pragma once
+
+#include <stdlib.h>
+
+/**
+ * This enum contains all possible mutations. Some of them are considered
+ * symmetric (e.g. AtoT and TtoA) and thus have the same value. The total number
+ * of different possible mutations is MUTCOUNTS.
+ */
+enum {
+ AtoA,
+ AtoC,
+ AtoG,
+ AtoT,
+ CtoC,
+ CtoG,
+ CtoT,
+ GtoG,
+ GtoT,
+ TtoT,
+ MUTCOUNTS,
+ CtoA = AtoC,
+ GtoA = AtoG,
+ GtoC = CtoG,
+ TtoA = AtoT,
+ TtoC = CtoT,
+ TtoG = GtoT
+};
+
+/** @brief The mutation matrix.
+ *
+ * We need to keep track of the different types of mutations between two
+ * sequences. For this the following matrix is filled.
+ *
+ * To A C G T
+ * From
+ * A ( )
+ * C ( )
+ * G ( )
+ * T ( )
+ *
+ * The cells are absolute counts. Together with seq_len (the query length),
+ * we can deduce the substitution rate and coverage.
+ */
+typedef struct model {
+ /** The absolute counts of mutation types. */
+ size_t counts[MUTCOUNTS];
+ /** The query length. */
+ size_t seq_len;
+} model;
+
+void model_count_equal(model *, const char *, size_t);
+void model_count(model *, const char *, const char *, size_t);
+model model_average(const model *, const model *);
+double model_coverage(const model *);
+double estimate_RAW(const model *);
+double estimate_JC(const model *);
+double estimate_KIMURA(const model *);
+model model_bootstrap(const model);
diff --git a/src/process.c b/src/process.c
index e58cace..0aaf1ec 100644
--- a/src/process.c
+++ b/src/process.c
@@ -11,9 +11,10 @@
#include <stdio.h>
#include "esa.h"
#include "global.h"
+#include "io.h"
+#include "model.h"
#include "process.h"
#include "sequence.h"
-#include "io.h"
#include <time.h>
#include <gsl/gsl_rng.h>
@@ -23,8 +24,9 @@
#include <omp.h>
#endif
-double shuprop( size_t x, double g, size_t l);
-int calculate_bootstrap(const data_t *M, const seq_t *sequences, size_t n);
+double shuprop(size_t x, double g, size_t l);
+int calculate_bootstrap(const struct model *M, const seq_t *sequences,
+ size_t n);
/**
* @brief Calculates the minimum anchor length.
@@ -37,15 +39,15 @@ int calculate_bootstrap(const data_t *M, const seq_t *sequences, size_t n);
* @param l - The length of the subject.
* @returns The minimum length of an anchor.
*/
-size_t minAnchorLength( double p, double g, size_t l){
+size_t minAnchorLength(double p, double g, size_t l) {
size_t x = 1;
-
+
double prop = 0.0;
- while( prop < 1 - p){
- prop = shuprop( x, g/2, l);
+ while (prop < 1 - p) {
+ prop = shuprop(x, g / 2, l);
x++;
}
-
+
return x;
}
@@ -64,55 +66,56 @@ size_t minAnchorLength( double p, double g, size_t l){
* @param k - analog.
* @returns (n choose k)
*/
-size_t binomial_coefficient( size_t n, size_t k){
- if( n <= 0 || k > n){
+size_t binomial_coefficient(size_t n, size_t k) {
+ if (n <= 0 || k > n) {
return 0;
}
-
- if( k == 0 || k == n ){
+
+ if (k == 0 || k == n) {
return 1;
}
-
- if( k > n-k ){
- k = n-k;
+
+ if (k > n - k) {
+ k = n - k;
}
-
+
size_t res = 1;
- for( size_t i= 1; i <= k; i++){
+ for (size_t i = 1; i <= k; i++) {
res *= n - k + i;
res /= i;
}
-
+
return res;
}
/**
- * @brief Given `x` this function calculates the probability of a shustring
+ * @brief Given `x` this function calculates the probability of a shustring
* with a length less than `x`.
*
- * Let X be the longest shortest unique substring (shustring) at any position. Then
- * this function computes P{X <= x} with respect to the given parameter set. See
- * Haubold et al. (2009).
+ * Let X be the longest shortest unique substring (shustring) at any position.
+ * Then this function computes P{X <= x} with respect to the given parameter
+ * set. See Haubold et al. (2009).
*
* @param x - The maximum length of a shustring.
* @param g - The the half of the relative amount of GC in the DNA.
* @param l - The length of the subject.
* @returns The probability of a certain shustring length.
*/
-double shuprop( size_t x, double p, size_t l){
+double shuprop(size_t x, double p, size_t l) {
double xx = (double)x;
double ll = (double)l;
size_t k;
-
+
double s = 0.0;
-
- for(k=0; k<= x; k++){
+
+ for (k = 0; k <= x; k++) {
double kk = (double)k;
- double t = pow(p,kk) * pow(0.5 - p, xx - kk);
-
- s += pow(2,xx) * (t * pow(1-t,ll)) * (double)binomial_coefficient(x,k);
- if( s >= 1.0){
+ double t = pow(p, kk) * pow(0.5 - p, xx - kk);
+
+ s += pow(2, xx) * (t * pow(1 - t, ll)) *
+ (double)binomial_coefficient(x, k);
+ if (s >= 1.0) {
s = 1.0;
break;
}
@@ -129,28 +132,30 @@ double shuprop( size_t x, double p, size_t l){
* is a simple string. This function then looks for *anchors* -- long
* substrings that exist in both sequences. Then it manually checks for
* mutations between those anchors.
- *
+ *
* @return An estimate for the number of mutations within homologous regions.
* @param C - The enhanced suffix array of the subject.
* @param query - The actual query string.
- * @param query_length - The length of the query string. Needed for speed reasons.
+ * @param query_length - The length of the query string. Needed for speed
+ * reasons.
*/
-data_t dist_anchor( const esa_s *C, const char *query, size_t query_length, double gc){
- size_t snps = 0; // Total number of found SNPs
- size_t homo = 0; // Total number of homologous nucleotides.
-
+model dist_anchor(const esa_s *C, const char *query, size_t query_length,
+ double gc) {
+ struct model ret = {.seq_len = query_length, .counts = {0}};
+
lcp_inter_t inter;
-
+
size_t last_pos_Q = 0;
size_t last_pos_S = 0;
size_t last_length = 0;
- // This variable indicates that the last anchor was the right anchor of a pair.
+ // This variable indicates that the last anchor was the right anchor of a
+ // pair.
size_t last_was_right_anchor = 0;
-
+
size_t this_pos_Q = 0;
size_t this_pos_S;
size_t this_length;
-
+
size_t num_right_anchors = 0;
#ifdef DEBUG
@@ -163,143 +168,120 @@ data_t dist_anchor( const esa_s *C, const char *query, size_t query_length, doub
double off_dem = 0.0;
#endif
- size_t threshold = minAnchorLength( 1-sqrt(1-RANDOM_ANCHOR_PROP), gc, C->len);
-
- data_t retval = {0.0,0.0};
+ size_t threshold =
+ minAnchorLength(1 - sqrt(1 - RANDOM_ANCHOR_PROP), gc, C->len);
// Iterate over the complete query.
- while( this_pos_Q < query_length){
- inter = get_match_cached( C, query + this_pos_Q, query_length - this_pos_Q);
+ while (this_pos_Q < query_length) {
+ inter =
+ get_match_cached(C, query + this_pos_Q, query_length - this_pos_Q);
#ifdef DEBUG
num_matches++;
#endif
this_length = inter.l <= 0 ? 0 : inter.l;
-
- if( inter.i == inter.j && this_length >= threshold)
- {
+
+ if (inter.i == inter.j && this_length >= threshold) {
// We have reached a new anchor.
- this_pos_S = C->SA[ inter.i];
+ this_pos_S = C->SA[inter.i];
#ifdef DEBUG
num_anchors++;
length_anchors += this_length;
- if( this_pos_S < (size_t)(C->len / 2)){
+ if (this_pos_S < (size_t)(C->len / 2)) {
num_anchors_in_rc++;
}
#endif
// Check if this can be a right anchor to the last one.
- if( this_pos_S > last_pos_S &&
- this_pos_Q - last_pos_Q == this_pos_S - last_pos_S ){
+ if (this_pos_S > last_pos_S &&
+ this_pos_Q - last_pos_Q == this_pos_S - last_pos_S) {
num_right_anchors++;
#ifdef DEBUG
- if( this_pos_S < (size_t)(C->len / 2)){
+ if (this_pos_S < (size_t)(C->len / 2)) {
num_right_anchors_in_rc++;
}
#endif
-
+ // classify nucleotides in the qanchor
+ model_count_equal(&ret, query + last_pos_Q, last_length);
+
// Count the SNPs in between.
- size_t i;
- for( i= last_length; i< this_pos_Q - last_pos_Q; i++){
- if( C->S[ last_pos_S + i] != query[ last_pos_Q + i] ){
- snps++;
- }
- }
- homo += this_pos_Q - last_pos_Q;
+ model_count(&ret, C->S + last_pos_S + last_length, query + last_pos_Q + last_length,
+ this_pos_Q - last_pos_Q - last_length);
last_was_right_anchor = 1;
} else {
#ifdef DEBUG
- double off = fabs((double)(this_pos_Q - last_pos_Q)- (double)(this_pos_S - last_pos_S));
- if( off < 100 ){
+ double off = fabs((double)(this_pos_Q - last_pos_Q) -
+ (double)(this_pos_S - last_pos_S));
+ if (off < 100) {
off_num += off;
off_dem++;
}
#endif
- if( last_was_right_anchor){
- // If the last was a right anchor, but with the current one, we
- // cannot extend, then add its length.
- homo += last_length;
- } else if( (last_length / 2) >= threshold){
- // The last anchor wasn't neither a left or right anchor. But,
- // it was as long as an anchor pair. So still count it.
- homo += last_length;
+ if (last_was_right_anchor) {
+ // If the last was a right anchor, but with the current one,
+ // we cannot extend, then add its length.
+ model_count_equal(&ret, C->S + last_pos_S, last_length);
+ } else if ((last_length / 2) >= threshold) {
+ // The last anchor wasn't neither a left or right anchor.
+ // But, it was as long as an anchor pair. So still count it.
+ model_count_equal(&ret, C->S + last_pos_S, last_length);
}
-
+
last_was_right_anchor = 0;
}
-
+
// Cache values for later
last_pos_Q = this_pos_Q;
last_pos_S = this_pos_S;
- last_length= this_length;
+ last_length = this_length;
}
-
+
// Advance
this_pos_Q += this_length + 1;
}
#ifdef DEBUG
- if( FLAGS & F_EXTRA_VERBOSE ){
- const char str[] = {
- "- threshold: %ld\n"
- "- matches: %lu\n"
- "- anchors: %lu (rc: %lu)\n"
- "- right anchors: %lu (rc: %lu)\n"
- "- avg length: %lf\n"
- "- off: %f (skipped: %.0f)\n"
- "\n"
- };
-
- #pragma omp critical
+ if (FLAGS & F_EXTRA_VERBOSE) {
+ const char str[] = {"- threshold: %ld\n"
+ "- matches: %lu\n"
+ "- anchors: %lu (rc: %lu)\n"
+ "- right anchors: %lu (rc: %lu)\n"
+ "- avg length: %lf\n"
+ "- off: %f (skipped: %.0f)\n"
+ "\n"};
+
+#pragma omp critical
{
- fprintf(stderr, str, threshold, num_matches, num_anchors, num_anchors_in_rc, num_right_anchors, num_right_anchors_in_rc, (double)length_anchors/ num_anchors, off_num/off_dem, off_dem );
+ fprintf(stderr, str, threshold, num_matches, num_anchors,
+ num_anchors_in_rc, num_right_anchors,
+ num_right_anchors_in_rc,
+ (double)length_anchors / num_anchors, off_num / off_dem,
+ off_dem);
}
}
#endif
-
+
// Very special case: The sequences are identical
- if( last_length >= query_length ){
- retval.coverage = 1.0;
- return retval;
- }
-
- // We might miss a few nucleotides if the last anchor was also a right anchor.
- if( last_was_right_anchor ){
- homo += last_length;
- }
-
- // Nearly identical sequences
- if( homo == query_length){
- retval.distance = (double)snps/(double)homo;
- retval.coverage = 1.0;
- return retval;
+ if (last_length >= query_length) {
+ model_count(&ret, C->S + last_pos_S, query, query_length);
+ return ret;
}
- // Insignificant results. All abort the fail train.
- if ( homo <= 3){
- retval.distance = NAN;
- return retval;
+ // We might miss a few nucleotides if the last anchor was also a right
+ // anchor.
+ if (last_was_right_anchor) {
+ model_count(&ret, C->S + last_pos_S, query + last_pos_Q, last_length);
}
-
- // Abort if we have more homologous nucleotides than just nucleotides. This might
- // happen with sequences of different lengths.
- if( homo >= (size_t) C->len ){
- retval.distance = NAN;
- retval.coverage = 1.0;
- return retval;
- }
-
- retval.distance = (double)snps/(double)homo;
- retval.coverage = (double)homo/(double)query_length;
- return retval;
+
+ return ret;
}
/**
* @brief Computes the distance matrix.
*
- * The distMatrix() populates the D matrix with computed distances. It allocates D and
+ * The distMatrix() populates the D matrix with computed distances.
* @param sequences An array of pointers to the sequences.
* @param n The number of sequences.
*/
@@ -309,7 +291,7 @@ data_t dist_anchor( const esa_s *C, const char *query, size_t query_length, doub
/**
* @brief Computes the distance matrix.
*
- * The distMatrixLM() populates the D matrix with computed distances. It allocates D and
+ * The distMatrixLM() populates the D matrix with computed distances.
* @param sequences An array of pointers to the sequences.
* @param n The number of sequences.
*/
@@ -321,75 +303,71 @@ data_t dist_anchor( const esa_s *C, const char *query, size_t query_length, doub
* @param sequences - An array of pointers to the sequences.
* @param n - The number of sequences.
*/
-void calculate_distances( seq_t* sequences, int n){
+void calculate_distances(seq_t *sequences, int n) {
int i;
// check the sequences
- for( i=0;i<n;i++){
- if( sequences[i].S == NULL || sequences[i].len == 0){
+ for (i = 0; i < n; i++) {
+ if (sequences[i].S == NULL || sequences[i].len == 0) {
errx(1, "Missing sequence: %s", sequences[i].name);
}
- if( sequences[i].len < 1000){
+ if (sequences[i].len < 1000) {
FLAGS |= F_SHORT;
}
}
- if( FLAGS & F_SHORT ){
- warnx("One of the given input sequences is shorter than a thousand nucleotides. "
- "This may result in inaccurate distances. Try an alignment instead.");
+ if (FLAGS & F_SHORT) {
+ warnx("One of the given input sequences is shorter than a thousand "
+ "nucleotides. This may result in inaccurate distances. Try an "
+ "alignment instead.");
}
-
+
// Warn about non ACGT residues.
- if( FLAGS & F_NON_ACGT ){
+ if (FLAGS & F_NON_ACGT) {
warnx("The input sequences contained characters other than acgtACGT. "
- "These were automatically stripped to ensure correct results.");
+ "These were automatically stripped to ensure correct results.");
}
- data_t *M = malloc(n*n*sizeof(data_t));
- if( !M){
- err( errno, "Could not allocate enough memory for the comparison matrix. Try using --join or --low-memory.");
+ model *M = malloc(n * n * sizeof(*M));
+ if (!M) {
+ err(errno, "Could not allocate enough memory for the comparison "
+ "matrix. Try using --join or --low-memory.");
}
// compute the distances
- if( FLAGS & F_LOW_MEMORY){
- distMatrixLM( M, sequences, n);
+ if (FLAGS & F_LOW_MEMORY) {
+ distMatrixLM(M, sequences, n);
} else {
- distMatrix( M, sequences, n);
+ distMatrix(M, sequences, n);
}
// print the results
- print_distances( M, sequences, n);
-
+ print_distances(M, sequences, n, 1);
// print additional information.
- if( FLAGS & F_VERBOSE){
- print_coverages( M, n);
+ if (FLAGS & F_VERBOSE) {
+ print_coverages(M, n);
}
// create new bootstrapped distance matrices
- if( BOOTSTRAP ){
+ if (BOOTSTRAP) {
int res = calculate_bootstrap(M, sequences, n);
- if( res){
+ if (res) {
warnx("Bootstrapping failed.");
}
}
-
free(M);
}
/** Yet another hack. */
-#define B( X, Y) (B[ (X)*n + (Y) ])
+#define B(X, Y) (B[(X)*n + (Y)])
/** @brief Computes a bootstrap from _pairwise_ aligments.
*
- * Doing bootstrapping for alignments with only two sequences is easy. As
- * we only count substitutions, this boils down to a binomial distribution.
- * `n` is the number of homologous sites, and `p` is the (uncorrected)
- * substitution rate. Thus the bootstrapped distance `q` is `B(n;p)/n`.
- * Note that `E(q) = p` and `Var(q) = p*(1-p)/n`. So in the limit of long
- * sequences (i.e. big `n`) `q` almost certainly converges to `p`.
+ * Doing bootstrapping for alignments with only two sequences is easy. It boils
+ * down to a simple multi-nomial process over the substitution matrix.
*
* @param M - the initial distance matrix
* @param sequences - a list of the sequences, containing their lengths
@@ -400,56 +378,36 @@ void calculate_distances( seq_t* sequences, int n){
*
* @returns 0 iff successful.
*/
-int calculate_bootstrap(const data_t *M, const seq_t *sequences, size_t n){
- if( !M || !sequences || !n){
+int calculate_bootstrap(const struct model *M, const seq_t *sequences,
+ size_t n) {
+ if (!M || !sequences || !n) {
return 1;
}
// B is the new bootstrap matrix
- data_t *B = malloc(n * n * sizeof(data_t));
- if( !B) return 2;
-
- gsl_rng *rng = gsl_rng_alloc(gsl_rng_default);
- if( !rng){
- free(B);
- return 3;
- }
-
- // seed the random number generator with the current time
- gsl_rng_set( rng, time(NULL));
+ struct model *B = malloc(n * n * sizeof(*B));
+ if (!B) return 2;
// Compute a number of new distance matrices
- while( BOOTSTRAP--){
- for( size_t i=0; i< n; i++){
- for( size_t j=i; j<n; j++){
- if( i == j){
- B(i,j) = (data_t){0.0,0.0};
+ while (BOOTSTRAP--) {
+ for (size_t i = 0; i < n; i++) {
+ for (size_t j = i; j < n; j++) {
+ if (i == j) {
+ B(i, j) = (struct model){.seq_len = 1.0, .counts = {1.0}};
continue;
}
- /* The classical bootstrapping process, as described by
- Felsenstein, resamples all nucleotides of a MSA. As andi
- only computes a pairwise alignment, this process boils
- down to a simple binomial distribution around a mean of
- the original distance. */
-
- double ijnucl = M(i,j).coverage * (double)sequences[j].len;
- double jinucl = M(j,i).coverage * (double)sequences[i].len;
- double homonucl = ijnucl + jinucl;
- double avg_distance = (M(i,j).distance * ijnucl + M(j,i).distance * jinucl)
- / (homonucl);
-
- // The actual coverage value doesn't matter, just ensure its > 0
- data_t datum = {.coverage = 1.0};
-
- datum.distance = gsl_ran_binomial( rng, avg_distance, homonucl) / homonucl;
- B(j,i) = B(i,j) = datum;
+
+ // Bootstrapping should only be used with averaged distances.
+ model datum = model_average(&M(i, j), &M(j, i));
+ datum = model_bootstrap(datum);
+
+ B(j, i) = B(i, j) = datum;
}
}
- print_distances(B, sequences, n);
+ print_distances(B, sequences, n, 0);
}
free(B);
- gsl_rng_free(rng);
return 0;
}
diff --git a/src/process.h b/src/process.h
index 262db4a..eb632fe 100644
--- a/src/process.h
+++ b/src/process.h
@@ -1,15 +1,13 @@
/**
* @file
* @brief This file contains the declarations of functions in process.c
- *
+ *
*/
#ifndef _PROCESS_H_
#define _PROCESS_H_
#include "sequence.h"
-void calculate_distances( seq_t* sequences, int n);
+void calculate_distances(seq_t *sequences, int n);
#endif
-
-
diff --git a/src/sequence.c b/src/sequence.c
index 3fcc088..48de282 100644
--- a/src/sequence.c
+++ b/src/sequence.c
@@ -13,14 +13,14 @@
#include "sequence.h"
#include "global.h"
-void normalize( seq_t *S);
+void normalize(seq_t *S);
/** Create a new dynamic array for sequences. */
-int dsa_init(dsa_t *A){
+int dsa_init(dsa_t *A) {
// allocate at least 4 slots so the growth by 1.5 below doesn't get stuck
// at 3 slots.
A->data = malloc(sizeof(seq_t) * 4);
- if(!A->data){
+ if (!A->data) {
return 1;
}
@@ -30,13 +30,13 @@ int dsa_init(dsa_t *A){
}
/** Add a sequence to an array. */
-void dsa_push( dsa_t *A, seq_t S){
- if( A->size < A->capacity){
+void dsa_push(dsa_t *A, seq_t S) {
+ if (A->size < A->capacity) {
A->data[A->size++] = S;
} else {
// use the near-optimal growth factor of 1.5
- seq_t* ptr = reallocarray(A->data, A->capacity / 2, sizeof(seq_t) * 3);
- if(ptr == NULL){
+ seq_t *ptr = reallocarray(A->data, A->capacity / 2, sizeof(seq_t) * 3);
+ if (ptr == NULL) {
err(errno, "out of memory?");
}
@@ -47,9 +47,9 @@ void dsa_push( dsa_t *A, seq_t S){
}
/** Frees the array and all sequences stored within. */
-void dsa_free( dsa_t *A){
+void dsa_free(dsa_t *A) {
size_t i;
- for( i=0; i< A->size; i++){
+ for (i = 0; i < A->size; i++) {
seq_free(&A->data[i]);
}
@@ -58,32 +58,32 @@ void dsa_free( dsa_t *A){
}
/** Returns the number of sequences stored within an array. */
-size_t dsa_size( dsa_t *A){
+size_t dsa_size(dsa_t *A) {
return A->size;
}
/** Get the raw C array. */
-seq_t* dsa_data( dsa_t *A){
+seq_t *dsa_data(dsa_t *A) {
return A->data;
}
/**
* @brief Convert an array of multiple sequences into a single sequence.
- *
+ *
* This function joins all sequences contained in an array into one
* long sequence. The sequences are separated by a `!` character. The
* caller has to free the initial array.
*
* @returns A new sequence representation the union of the array.
*/
-seq_t dsa_join( dsa_t *A){
+seq_t dsa_join(dsa_t *A) {
seq_t joined = {};
- if( A->size == 0){
+ if (A->size == 0) {
return joined;
}
- if(A->size == 1){
+ if (A->size == 1) {
/* If we are to join just one sequence, _move_ its contents. */
joined = A->data[0];
A->data[0] = (seq_t){};
@@ -92,38 +92,38 @@ seq_t dsa_join( dsa_t *A){
seq_t *data = A->data;
seq_t *it = data;
-
+
// Compute the total length
size_t total = 0, i;
- for( i=0; i< A->size; i++, it++ ){
+ for (i = 0; i < A->size; i++, it++) {
total += it->len + 1;
}
-
+
// A single malloc for the whole new sequence
- char *ptr = malloc( total);
- if( ptr == NULL ){
+ char *ptr = malloc(total);
+ if (ptr == NULL) {
return joined;
}
char *next = ptr;
-
+
// Copy all old sequences and add a `!` in between
it = data;
- memcpy( next, it->S, it->len);
+ memcpy(next, it->S, it->len);
next += it->len;
- for( i=1, it++; i < A->size; i++, it++){
+ for (i = 1, it++; i < A->size; i++, it++) {
*next++ = '!';
- memcpy( next, it->S, it->len);
+ memcpy(next, it->S, it->len);
next += it->len;
}
-
+
// Don't forget the null byte.
*next = '\0';
-
+
joined.S = ptr;
- joined.len = total -1; // subtract the null byte
-
+ joined.len = total - 1; // subtract the null byte
+
return joined;
}
@@ -131,10 +131,10 @@ seq_t dsa_join( dsa_t *A){
* @brief Frees the memory of a given sequence.
* @param S - The sequence to free.
*/
-void seq_free( seq_t *S){
- free( S->S);
- free( S->RS);
- free( S->name);
+void seq_free(seq_t *S) {
+ free(S->S);
+ free(S->RS);
+ free(S->name);
*S = (seq_t){};
}
@@ -144,66 +144,55 @@ void seq_free( seq_t *S){
* @param len The length of the master string
* @return The reverse complement. The caller has to free it!
*/
-char *revcomp( const char *str, size_t len){
- char *rev = malloc( len + 1);
- if( !str || !rev) return NULL;
-
+char *revcomp(const char *str, size_t len) {
+ char *rev = malloc(len + 1);
+ if (!str || !rev) return NULL;
+
char *r = rev;
- const char *s = str + len-1;
-
+ const char *s = &str[len - 1];
rev[len] = '\0';
-
- char c, d;
- char local_non_acgt = 0;
-
+
do {
- c = *s--;
-
- switch( c){
+ char d;
+
+ switch (*s--) {
case 'A': d = 'T'; break;
case 'T': d = 'A'; break;
case 'G': d = 'C'; break;
case 'C': d = 'G'; break;
case '!': d = ';'; break; // rosebud
- default:
- local_non_acgt = 1;
- continue;
+ default: continue;
}
-
+
*r++ = d;
- } while ( --len );
-
- if( local_non_acgt ){
- #pragma omp atomic
- FLAGS |= F_NON_ACGT;
- }
-
+ } while (--len);
+
return rev;
}
/**
- * @brief This function concatenates the reverse complement to a given master string. A
- * `#` sign is used as a separator.
+ * @brief This function concatenates the reverse complement to a given master
+ * string. A `#` sign is used as a separator.
* @param s The master string.
* @param len Its length.
* @return The newly concatenated string.
*/
-char *catcomp( char *s , size_t len){
- if( !s) return NULL;
-
- char *rev = revcomp( s, len);
-
- char *temp = (char*) realloc( rev, 2 * len + 2);
- if( !temp){
+char *catcomp(char *s, size_t len) {
+ if (!s) return NULL;
+
+ char *rev = revcomp(s, len);
+
+ char *temp = realloc(rev, 2 * len + 2);
+ if (!temp) {
free(rev);
return NULL;
}
-
+
rev = temp;
rev[len] = '#';
-
- memcpy( rev+len+1, s, len+1);
-
+
+ memcpy(rev + len + 1, s, len + 1);
+
return rev;
}
@@ -212,29 +201,31 @@ char *catcomp( char *s , size_t len){
*
* This function computes the relative amount of G and C in the total sequence.
*/
-double calc_gc( seq_t *S){
+double calc_gc(const seq_t *S) {
size_t GC = 0;
- char *p = S->S;
-
- for(; *p; p++){
- if( *p == 'G' || *p == 'C'){
+ const char *p = S->S;
+
+ for (; *p; p++) {
+ if (*p == 'G' || *p == 'C') {
GC++;
}
}
-
- return S->gc = (double)GC/S->len;
+
+ return (double)GC / S->len;
}
/** @brief Prepares a sequences to be used as the subject in a comparison. */
-void seq_subject_init( seq_t *S){
- calc_gc(S);
-
+int seq_subject_init(seq_t *S) {
+ S->gc = calc_gc(S);
S->RS = catcomp(S->S, S->len);
+ if (!S->RS) return 1;
S->RSlen = 2 * S->len + 1;
+ return 0;
}
-/** @brief Frees some memory unused for when a sequence is only used as query. */
-void seq_subject_free( seq_t *S){
+/** @brief Frees some memory unused for when a sequence is only used as query.
+ */
+void seq_subject_free(seq_t *S) {
free(S->RS);
S->RS = NULL;
S->RSlen = 0;
@@ -245,29 +236,28 @@ void seq_subject_free( seq_t *S){
*
* @returns 0 iff successful.
*/
-int seq_init( seq_t *S, const char *seq, const char *name){
- if( !S || !seq || !name) {
+int seq_init(seq_t *S, const char *seq, const char *name) {
+ if (!S || !seq || !name) {
return 1;
}
- *S = (seq_t){
- .S = strdup(seq),
- .name=strdup(name)
- };
+ *S = (seq_t){.S = strdup(seq), .name = strdup(name)};
- if( !S->S || !S->name){
+ if (!S->S || !S->name) {
seq_free(S);
return 2;
}
- normalize( S);
+ normalize(S);
- // recalculate the length because `normalize` might have stripped some characters.
+ // recalculate the length because `normalize` might have stripped some
+ // characters.
S->len = strlen(S->S);
- const size_t LENGTH_LIMIT = (INT_MAX - 1) /2;
- if(S->len > LENGTH_LIMIT){
- warnx("The input sequence %s is too long. The technical limit is %zu.", S->name, LENGTH_LIMIT);
+ const size_t LENGTH_LIMIT = (INT_MAX - 1) / 2;
+ if (S->len > LENGTH_LIMIT) {
+ warnx("The input sequence %s is too long. The technical limit is %zu.",
+ S->name, LENGTH_LIMIT);
return 3;
}
@@ -278,36 +268,29 @@ int seq_init( seq_t *S, const char *seq, const char *name){
* @brief Restricts a sequence characters set to ACGT.
*
* This function strips a sequence of non ACGT characters and converts acgt to
- * the upper case equivalent. A flag is set if a non-canonical character was encountered.
+ * the upper case equivalent. A flag is set if a non-canonical character was
+ * encountered.
*/
-void normalize( seq_t *S){
+void normalize(seq_t *S) {
char *p, *q;
char local_non_acgt = 0;
- for( p= q= S->S; *p; p++){
- switch( *p){
+ for (p = q = S->S; *p; p++) {
+ switch (*p) {
case 'A':
case 'C':
case 'G':
case 'T':
- case '!':
- *q++ = *p;
- break;
+ case '!': *q++ = *p; break;
case 'a':
case 'c':
case 'g':
- case 't':
- *q++ = toupper( (unsigned char)*p);
- break;
- default:
- local_non_acgt = 1;
- break;
-
+ case 't': *q++ = toupper((unsigned char)*p); break;
+ default: local_non_acgt = 1; break;
}
}
*q = '\0';
- if ( local_non_acgt ){
- #pragma omp atomic
+ if (local_non_acgt) {
+#pragma omp atomic
FLAGS |= F_NON_ACGT;
}
}
-
diff --git a/src/sequence.h b/src/sequence.h
index b28a513..db7aff8 100644
--- a/src/sequence.h
+++ b/src/sequence.h
@@ -35,26 +35,26 @@ typedef struct seq_s {
double gc;
} seq_t;
-void seq_free( seq_t *S);
-void seq_subject_init( seq_t *S);
-void seq_subject_free( seq_t *S);
-int seq_init( seq_t *S, const char *seq, const char *name);
+void seq_free(seq_t *S);
+int seq_subject_init(seq_t *S);
+void seq_subject_free(seq_t *S);
+int seq_init(seq_t *S, const char *seq, const char *name);
/**
* A dynamically growing structure for sequences.
*/
typedef struct dsa_s {
- seq_t* data;
+ seq_t *data;
size_t capacity, size;
} dsa_t;
int dsa_init(dsa_t *A);
-void dsa_push( dsa_t *A, seq_t S);
-void dsa_free( dsa_t *A);
-size_t dsa_size( dsa_t *A);
-seq_t* dsa_data( dsa_t *A);
+void dsa_push(dsa_t *A, seq_t S);
+void dsa_free(dsa_t *A);
+size_t dsa_size(dsa_t *A);
+seq_t *dsa_data(dsa_t *A);
-seq_t dsa_join( dsa_t *dsa);
+seq_t dsa_join(dsa_t *dsa);
#endif
diff --git a/test/Makefile.am b/test/Makefile.am
index 8476cd0..32332a5 100644
--- a/test/Makefile.am
+++ b/test/Makefile.am
@@ -9,14 +9,14 @@ endif
test_seq_SOURCES = test_seq.c ../src/sequence.c
test_seq_CPPFLAGS = -I$(top_srcdir)/src -I$(top_srcdir)/opt -DDEBUG -std=gnu99
-test_seq_CFLAGS = -W -Wall $(GLIB_CFLAGS) -Wno-missing-field-initializers
+test_seq_CFLAGS = -Wall -Wextra $(GLIB_CFLAGS) -Wno-missing-field-initializers
test_seq_LDADD = $(GLIB_LIBS) $(top_builddir)/opt/libcompat.a
test_esa_SOURCES = test_esa.c ../src/esa.c ../src/sequence.c $(top_srcdir)/src/esa.h
test_esa_CPPFLAGS = $(OPENMP_CFLAGS) -I$(top_srcdir)/libs -I$(top_srcdir)/opt -I$(top_srcdir)/src -DDEBUG -std=gnu99
-test_esa_CFLAGS = $(OPENMP_CFLAGS) -W -Wall $(GLIB_CFLAGS) -Wno-missing-field-initializers
+test_esa_CFLAGS = $(OPENMP_CFLAGS) -Wall -Wextra $(GLIB_CFLAGS) -Wno-missing-field-initializers
test_esa_LDADD = $(GLIB_LIBS) $(PSUFSORT) $(top_builddir)/opt/libcompat.a
-test_esa_CXXFLAGS = $(OPENMP_CXXFLAGS) -W -Wall
+test_esa_CXXFLAGS = $(OPENMP_CXXFLAGS) -Wall -Wextra
nodist_EXTRA_test_esa_SOURCES = $(DUMMY)
test_fasta_SOURCES = test_fasta.cxx
diff --git a/test/test_extra.sh b/test/test_extra.sh
index ac931bd..17266f4 100755
--- a/test/test_extra.sh
+++ b/test/test_extra.sh
@@ -9,8 +9,8 @@
# Test low-memory mode
./test/test_fasta -l 10000 > test_extra.fasta
./src/andi test_extra.fasta > extra.out
-./src/andi test_extra.fasta -m > extra_m.out
-diff extra.out extra_m.out || exit 1
+./src/andi test_extra.fasta --low-memory > extra_low_memory.out
+diff extra.out extra_low_memory.out || exit 1
-rm -f test_extra.fasta extra.out extra_m.out
+rm -f test_extra.fasta extra.out extra_low_memory.out
diff --git a/test/test_fasta.cxx b/test/test_fasta.cxx
index 3426e25..d0f4f20 100644
--- a/test/test_fasta.cxx
+++ b/test/test_fasta.cxx
@@ -19,15 +19,25 @@ int main(int argc, char *argv[]){
auto seed = rd();
int length = 1000;
int line_length = 70;
+ int raw = 0;
auto seqs = vector<double>{0};
int check;
- while((check = getopt(argc, argv, "l:L:d:")) != -1){
+ while((check = getopt(argc, argv, "s:l:L:d:r")) != -1){
switch(check) {
+ case 's':
+ {
+ seed = static_cast<unsigned int>(stol(optarg));
+ if( seed == 0){
+ seed = rd();
+ }
+ break;
+ }
case 'l': length = stoi(optarg); break;
case 'L': line_length = stoi(optarg); break;
case 'd': seqs.push_back(stod(optarg)); break;
+ case 'r': raw = 1; break;
case '?':
default: usage(); return 1;
}
@@ -37,9 +47,19 @@ int main(int argc, char *argv[]){
seqs.push_back(0.1);
}
+ if( !raw){
+ for(auto& dist : seqs) {
+ auto d = dist;
+ auto p = 0.75 - 0.75 * exp(-(4.0/3.0) * d);
+ dist = p;
+ }
+ }
+
+ auto base_seed = seed;
+
for( int i=0; i< seqs.size(); i++){
- cout << ">S" << i << endl;
- print_seq( seed, rd(), length, line_length, seqs[i]);
+ cout << ">S" << i << " (base_seed: " << base_seed << ")" << endl;
+ print_seq( base_seed, seed++, length, line_length, seqs[i]);
}
return 0;
diff --git a/test/test_join.sh b/test/test_join.sh
index bf17bc9..b47a11b 100755
--- a/test/test_join.sh
+++ b/test/test_join.sh
@@ -13,7 +13,7 @@ tail -qn 2 p1.fasta p2.fasta p3.fasta > S1.fasta
rm p1.fasta p2.fasta p3.fasta;
-RES=$(./src/andi -rt 1 -j S0.fasta S1.fasta |
+RES=$(./src/andi -m RAW -t 1 -j S0.fasta S1.fasta |
tail -n 1 |
awk '{print ($2 - 0.1)}' |
awk 'function abs(x){return ((x < 0.0) ? -x : x)} {print abs($1-$2) < 0.01}'
@@ -35,7 +35,7 @@ tail -qn 2 p2.fasta p3.fasta > S1.fasta
rm p2.fasta p3.fasta;
-RES=$(./src/andi -rt1 -j S0.fasta S1.fasta |
+RES=$(./src/andi -m RAW -t1 -j S0.fasta S1.fasta |
tail -n 1 |
awk '{print ($2 - 0.1)}' |
awk 'function abs(x){return ((x < 0.0) ? -x : x)} {print abs($1-$2) < 0.01}'
@@ -59,7 +59,7 @@ tail -qn 2 p1.fasta p2.fasta p3.fasta > S1.fasta
rm p1.fasta p2.fasta p3.fasta;
-RES=$(./src/andi -rt 1 -j S0.fasta S1.fasta |
+RES=$(./src/andi -mRAW t 1 -j S0.fasta S1.fasta |
tail -n 1 |
awk '{print ($2 - 0.1)}' |
awk 'function abs(x){return ((x < 0.0) ? -x : x)} {print abs($1-$2) < 0.01}'
diff --git a/test/test_random.sh b/test/test_random.sh
index afb10c8..acfb0bf 100755
--- a/test/test_random.sh
+++ b/test/test_random.sh
@@ -1,22 +1,68 @@
#!/bin/sh -f
+# This scripts test the accuracy of andi with random inputs. For that
+# it uses the small program test_random to generate pairs of sequences
+# with a given distance. By default, test_random creates a new set of
+# sequences each time it is called. Thus, this test has a small, but
+# non-zero probability of failing. That is a problem with Debian's
+# reproducible builds effort. So this script acts as a wrapper around
+# this issue.
+#
+# Simply calling this script via
+# % ./test/test_random.sh
+# checks a new test-case every time. But with the right parameter
+# % RANDOM_SEED=1729 ./test/test_random.sh
+# one specific set of sequences is validated.
+
./src/andi --help > /dev/null || exit 1
LENGTH=100000
-for dist in 0.001 0.01 0.02 0.05 0.1 0.2 0.3
+# If RANDOM_SEED is set, use its value. Otherwise 0 is used to signal
+# to test_random that a new set of sequences shall be generated.
+SEED=${RANDOM_SEED:-0}
+
+for dist in 0.0 0.001 0.01 0.02 0.05 0.1 0.2 0.3
do
for n in $(seq 10)
do
- res=$(./test/test_fasta -l $LENGTH -d $dist |
+ if test $SEED -ne 0; then
+ SEED=$((SEED + 1))
+ fi
+
+ res=$(./test/test_fasta -s $SEED -l $LENGTH -d $dist |
+ tee ./test/test_random.fasta |
+ ./src/andi -t 1 |
+ tail -n 1 |
+ awk -v dist=$dist '{print $2, dist}' |
+ awk 'function abs(x){return ((x < 0.0) ? -x : x)} {print abs($1-$2) <= 0.02 && abs($1-$2) <= 0.02 * $2}')
+ if test $res -ne 1; then
+ echo "The last test computed a distance deviating more than two percent from its intended value."
+ echo "See test_random.fasta for the used sequences."
+ echo "./test/test_fasta -s $SEED -l $LENGTH -d $dist"
+ head -n 1 ./test/test_random.fasta
+ exit 1;
+ fi
+ done
+
+ # raw
+ for n in $(seq 10)
+ do
+ if test $SEED -ne 0; then
+ SEED=$((SEED + 1))
+ fi
+
+ res=$(./test/test_fasta -r -s $SEED -l $LENGTH -d $dist |
tee ./test/test_random.fasta |
- ./src/andi -r -t 1 |
+ ./src/andi -m RAW -t 1 |
tail -n 1 |
awk -v dist=$dist '{print $2, dist}' |
- awk 'function abs(x){return ((x < 0.0) ? -x : x)} {print abs($1-$2) < 0.02 && abs($1-$2) < 0.02 * $2}')
+ awk 'function abs(x){return ((x < 0.0) ? -x : x)} {print abs($1-$2) <= 0.02 && abs($1-$2) <= 0.02 * $2}')
if test $res -ne 1; then
echo "The last test computed a distance deviating more than two percent from its intended value."
echo "See test_random.fasta for the used sequences."
+ echo "./test/test_fasta -r -s $SEED -l $LENGTH -d $dist"
+ head -n 1 ./test/test_random.fasta
exit 1;
fi
done
diff --git a/test/test_seq.c b/test/test_seq.c
index bbae326..b36c45a 100644
--- a/test/test_seq.c
+++ b/test/test_seq.c
@@ -24,7 +24,9 @@ void test_seq_full(){
seq_t S;
seq_init( &S, "ACGTTGCA", "name");
- seq_subject_init( &S);
+ int check = seq_subject_init( &S);
+
+ g_assert_cmpint(check, ==, 0);
g_assert_cmpstr(S.RS, ==, "TGCAACGT#ACGTTGCA");
g_assert_cmpuint(S.RSlen, ==, 8*2+1);
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/andi.git
More information about the debian-med-commit
mailing list