[med-svn] [andi] 01/04: Imported Upstream version 0.10
Andreas Tille
tille at debian.org
Fri May 20 13:23:13 UTC 2016
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository andi.
commit 37d2bab93a9b818a253898bc0effaab95c009fa9
Author: Andreas Tille <tille at debian.org>
Date: Fri May 20 15:01:34 2016 +0200
Imported Upstream version 0.10
---
.travis.yml | 27 ++++++----
README.md | 51 ++++++++++++-------
andi-manual.pdf | Bin 425545 -> 440783 bytes
configure.ac | 6 ++-
docs/andi.1.in | 8 +--
docs/manual/andi-manual.tex | 105 +++++++++++++++++++++++++++++++-------
docs/manual/references.bib | 16 +++++-
opt/strchrnul.c | 2 +
scripts/failed.zsh | 20 ++++++++
src/Makefile.am | 1 +
src/andi.c | 69 +++++++++++++++++++------
src/dist_hack.h | 13 ++---
src/esa.c | 121 ++++++++++++++++++++------------------------
src/esa.h | 3 +-
src/global.h | 13 +++++
src/io.c | 31 +++++++-----
src/model.c | 32 ++++++++----
src/process.c | 57 +++++++--------------
src/process.h | 2 +-
src/sequence.c | 46 +++++------------
src/sequence.h | 21 +++++---
test/test_esa.c | 13 ++---
test/test_seq.c | 27 ++++++----
23 files changed, 425 insertions(+), 259 deletions(-)
diff --git a/.travis.yml b/.travis.yml
index 2ca5d14..c043ebc 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,17 +1,21 @@
language: cpp
+os:
+ - linux
+ - osx
compiler:
-- gcc
+ - gcc
+ - clang
sudo: false
addons:
apt:
sources:
- - deadsnakes
- - ubuntu-toolchain-r-test
+ - deadsnakes
+ - ubuntu-toolchain-r-test
packages:
- - cmake
- - g++-4.8
- - libglib2.0-dev
- - libgsl0-dev
+ - cmake
+ - g++-4.8
+ - libglib2.0-dev
+ - libgsl0-dev
install:
- export LIBDIVDIR="$HOME/libdivsufsort"
- pip install --user cpp-coveralls
@@ -20,11 +24,14 @@ install:
- cd libdivsufsort-master && mkdir build && cd build
- cmake -DCMAKE_BUILD_TYPE="Release" -DCMAKE_INSTALL_PREFIX="$LIBDIVDIR" ..
- make && make install
+ - if [ "${TRAVIS_OS_NAME}" = "osx" ]; then brew install gsl; fi
+ - if [ "${TRAVIS_OS_NAME}" = "osx" ]; then brew install glib; fi
before_install:
-- if [ "$CXX" = "g++" ]; then export CXX="g++-4.8" CC="gcc-4.8"; fi
+ - if [ "$CXX" = "g++" ]; then export CXX="g++-4.8" CC="gcc-4.8"; fi
# - sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-4.8 90
script:
+- export DYLD_LIBRARY_PATH="$DYLD_LIBRARY_PATH:$LIBDIVDIR/lib"
- export LD_LIBRARY_PATH="$LIBDIVDIR:$LIBDIVDIR/lib"
- export LIBRARY_PATH="$LIBDIVDIR:$LIBRARY_PATH"
- cd $TRAVIS_BUILD_DIR
@@ -33,6 +40,8 @@ script:
- ./configure --enable-unit-tests LDFLAGS="-L$LIBDIVDIR/lib" CFLAGS="$MYFLAGS" CXXFLAGS="$MYFLAGS"
- make
- make check
+- export MYFLAGS="-I$LIBDIVDIR/include"
+- ./configure --enable-unit-tests LDFLAGS="-L$LIBDIVDIR/lib" CFLAGS="$MYFLAGS" CXXFLAGS="$MYFLAGS"
- make distcheck DISTCHECK_CONFIGURE_FLAGS="LDFLAGS=\"-L$LIBDIVDIR/lib\" CFLAGS=\"-I$LIBDIVDIR/include\" CXXFLAGS=\"-I$LIBDIVDIR/include\""
- tar xzvf andi-*.tar.gz
- cd andi-*
@@ -41,4 +50,4 @@ script:
- make check
- cd ..
after_success:
-- coveralls --exclude libdivsufsort-master -E '^andi-.*' --exclude libs --exclude test --gcov `which gcov-4.8` --gcov-options '\-lp'
+- if [ "${TRAVIS_OS_NAME}" = "linux" -a "$CXX" = "g++-4.8" ]; then coveralls --exclude libdivsufsort-master -E '^andi-.*' --exclude libs --exclude test --gcov `which gcov-4.8` --gcov-options '\-lp'; fi
diff --git a/README.md b/README.md
index b863c16..72b9e86 100644
--- a/README.md
+++ b/README.md
@@ -7,39 +7,52 @@ This is the `andi` program for estimating the evolutionary distance between clos
This readme covers all necessary instructions for the impatient to get `andi` up and running. For extensive instructions please consult the [manual](andi-manual.pdf).
-# Build Instructions
+# Installation and Usage
-For the latest [stable release](https://github.com/EvolBioInf/andi/releases) of `andi` download the tar ball. If you'd like to contribute to this software, feel free to create a fork of our [git repository](https://github.com/EvolBioInf/andi) and send pull requests.
+Stable versions of `andi` are available via package managers. For manual installation see below.
+For Debian and Ubuntu (starting 16.04):
-## Prerequisites
+ sudo apt-get install andi
-This program has the following external dependencies: [libdivsufsort](https://github.com/y-256/libdivsufsort) and the [GSL](https://www.gnu.org/software/gsl/). Please make sure you installed both before attempting a build. If you did get the source, not as a tarball, but straight from the git repository, you will also need the autotools. Run `autoreconf -i` to generate the configure script and continue with the next step.
+For OS X with Homebrew:
+ brew install science/andi
-## Compiling
+For ArchLinux with aura:
-Assuming you have installed all prerequisites, building is as easy as follows.
-
- $ ./configure
- $ make
- $ make install
-
-Excessive build instructions are located in `INSTALL`. If the build was successful you can get the usage instructions via `--help` or the man page.
+ sudo aura -A andi
+
+With a successful installation you can get the usage instructions via `--help` or the man page.
- $ andi --help
- $ man andi
+ $ andi --help
+ $ man andi
You can simply use `andi` with your genomes in `FASTA` format.
- $ andi S1.fasta S2.fasta
- 2
- S1 0.0 0.1
- s2 0.1 0.0
+ $ andi S1.fasta S2.fasta
+ 2
+ S1 0.0 0.1
+ s2 0.1 0.0
From this distance matrix the phylogeny can be inferred via neighbor-joining. Check the [manual](andi-manual.pdf) for a more thorough description.
+## Manual installation
+
+If your system does not support one of the above package managers you have to manually build the latest [stable release](https://github.com/EvolBioInf/andi/releases) from a tarball. See the [manual](andi-manual.pdf) for extensive building instructions.
+
+This program has the following external dependencies: [libdivsufsort](https://github.com/y-256/libdivsufsort) and the [GSL](https://www.gnu.org/software/gsl/). Please make sure you installed both before attempting a build. If you did get the source, not as a tarball, but straight from the git repository, you will also need the autotools.
+
+Assuming you have installed all prerequisites, building is as easy as follows.
+
+ $ autoreconf -fi -Im4 # optional when build from tarball
+ $ ./configure
+ $ make
+ $ make install
+
+Excessive build instructions are located in `INSTALL`.
+
# Links and Additional Resources
The release of this software is accompanied by a paper from [Haubold et al.](http://bioinformatics.oxfordjournals.org/content/31/8/1169). It explains the used *anchor distance* strategy in great detail. The `maf2phy.awk` script used in the validation process is located under `scripts`. Simulations were done using our own [simK](http://guanine.evolbio.mpg.de/bioBox/) tool. For a demo visualising the internals of andi visit our [GitHub pages](http://evolbioinf.github.io/andi/).
@@ -54,7 +67,7 @@ The release of this software is accompanied by a paper from [Haubold et al.](htt
## License
-Copyright © 2014, 2015 Fabian Klötzl
+Copyright © 2014 - 2016 Fabian Klötzl
License GPLv3+: GNU GPL version 3 or later.
This is free software: you are free to change and redistribute it. There is NO WARRANTY, to the extent permitted by law. The full license text is available at <http://gnu.org/licenses/gpl.html>.
diff --git a/andi-manual.pdf b/andi-manual.pdf
index f8d4467..14e1522 100644
Binary files a/andi-manual.pdf and b/andi-manual.pdf differ
diff --git a/configure.ac b/configure.ac
index 1d55963..9034e42 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,4 @@
-AC_INIT([andi], [0.9.6.1])
+AC_INIT([andi], [0.10])
AM_INIT_AUTOMAKE([-Wall foreign ])
AC_CONFIG_MACRO_DIR([m4])
@@ -32,6 +32,10 @@ AC_ARG_WITH([libdivsufsort],
AS_IF([test "x$with_libdivsufsort" != "xno"],
[
+ # The libdivsufsort header contains some Microsoft extension making
+ # compilation fail on certain systems (i.e. OS X). Add the following
+ # flag so the build runs smoothly.
+ CPPFLAGS="$CPPFLAGS -fms-extensions"
AC_CHECK_HEADERS([divsufsort.h],[have_libdivsufsort=yes],[have_libdivsufsort=no])
AC_CHECK_LIB(divsufsort, divsufsort, [], [have_libdivsufsort=no])
],
diff --git a/docs/andi.1.in b/docs/andi.1.in
index 94e8efd..a29a3f3 100644
--- a/docs/andi.1.in
+++ b/docs/andi.1.in
@@ -1,4 +1,4 @@
-.TH ANDI "1" "June 2015" "@VERSION@" ""
+.TH ANDI "1" "2016-03-05" "@VERSION@" "andi manual"
.SH NAME
andi \- estimates evolutionary distance
.SH SYNOPSIS
@@ -11,7 +11,7 @@ andi \- estimates evolutionary distance
The output is a symmetrical distance matrix in \fIPHYLIP\fR format, with each entry representing divergence with a positive real number. A distance of zero means that two sequences are identical, whereas other values are estimates for the nucleotide substitution rate (Jukes-Cantor corrected). For technical reasons the comparison might fail and no estimate can be computed. In such cases \fInan\fR is printed. This either means that the input sequences were too short (<200bp) or too diverse [...]
.SH OPTIONS
.TP
-\fB\-b, \-\-bootstrap\fR <INT>
+\fB\-b\fR, \fB\-\-bootstrap\fR <INT>
Compute multiple distance matrices, with \fIn-1\fR bootstrapped from the first. See the paper Klötzl & Haubold (2016, in review) for a detailed explanation.
.TP
\fB\-j\fR, \fB\-\-join\fR
@@ -26,7 +26,7 @@ Different models of nucleotide evolution are supported. By default the Jukes-Can
\fB\-p\fR <FLOAT>
Significance of an anchor pair; default: 0.05.
.TP
-\fB\-t\fR <INT>
+\fB\-t\fR, \fB\-\-threads\fR <INT>
The number of threads to be used; by default, all available processors are used.
.br
Multithreading is only available if \fBandi\fR was compiled with OpenMP support.
@@ -40,7 +40,7 @@ Prints the synopsis and an explanation of available options.
\fB\-\-version\fR
Outputs version information and acknowledgments.
.SH COPYRIGHT
-Copyright \(co 2014, 2015 Fabian Klötzl
+Copyright \(co 2014 - 2016 Fabian Klötzl
License GPLv3+: GNU GPL version 3 or later.
.br
This is free software: you are free to change and redistribute it.
diff --git a/docs/manual/andi-manual.tex b/docs/manual/andi-manual.tex
index d2fd86b..3f34333 100644
--- a/docs/manual/andi-manual.tex
+++ b/docs/manual/andi-manual.tex
@@ -15,13 +15,9 @@
\usepackage{amsthm}
\usepackage{acronym}
\usepackage{amssymb}
-\usepackage{tikz}
-\usepackage{pgfplots}
\usepackage{caption}
\usepackage{subcaption}
-\usepackage{booktabs}
\usepackage{xspace}
-\usepackage{overpic}
\bibliographystyle{alpha}
@@ -109,22 +105,23 @@ This document is release under the Creative Commons Attribution Share-Alike lice
The easiest way to install \andi is via a package manager. This also handles all dependencies for you.
-\noindent OS X with homebrew:
+
+\noindent Debian and Ubuntu (since 16.04):
\begin{lstlisting}
-~ % brew install science/andi
+~ % sudo apt-get install andi
\end{lstlisting}
-\noindent ArchLinux:
+\noindent OS X with homebrew:
\begin{lstlisting}
-~ % aura -A andi
+~ % brew install science/andi
\end{lstlisting}
-\noindent Debian and Ubuntu (soon):
+\noindent ArchLinux:
\begin{lstlisting}
-~ % sudo apt-get install andi
+~ % aura -A andi
\end{lstlisting}
\andi is intended to run in a \algo{Unix} commandline such as \lstinline$bash$ or \lstinline$zsh$. All examples in this document are also intended for that environment. You can verify that \andi was installed correctly by executing \lstinline$andi -h$. This should give you a list of all available options (see Section~\ref{sec:options}).
@@ -189,11 +186,9 @@ To build \andi from the \algo{Git} repo, you will also need the \algo{autotools}
\begin{lstlisting}
~ % git clone git at github.com:EvolBioInf/andi.git
~ % cd andi
-~/andi % autoreconf -i
+~/andi % autoreconf -fi -Im4
\end{lstlisting}
-\noindent For old versions of \algo{autoconf} you may instead have to use \lstinline$autoreconf -i -Im4$.
-
\noindent Continue with the \algo{Gnu} trinity as described in Section~\ref{sub:regular}.
@@ -269,7 +264,7 @@ In the above examples the runtime dropped from \SI{0.613}{\second}, to \SI{0.362
By default, the distances computed by \andi are \emph{Jukes-Cantor} corrected \cite{jukescantor}. Other evolutionary models are also implemented (Kimura, raw). The \lstinline$--model$ parameter can be used to switch between them.
-Since version 0.9.4 \andi includes a bootstrapping method. It can be activated via the \lstinline$--bootstrap$ or \lstinline$-b$ switch. This option takes a numeric argument representing the number of matrices to create. The output can then be piped into \algo{phylip}.
+Since version 0.9.4 \andi includes a bootstrapping method. It can be activated via the \lstinline$--bootstrap$ or \lstinline$-b$ switch. This option takes a numeric argument representing the number of matrices to create. The output can then be piped into \algo{phylip}. For more information on computing support values from distance matrices see \cite{afra}.
\begin{lstlisting}
~ % andi -b 2 ../test/1M.1.fasta
@@ -333,6 +328,75 @@ EOF
\end{figure}
+\chapter{Warnings and Errors}
+
+Here be an explanation of all possible errors. Other errors may occur and are due to the failure of underlying functions (e.\,g.~\lstinline$read(3)$). All warning messages are printed to \lstinline$stderr$. Most errors are non-recoverable and will result in \andi exiting with a non-zero state.
+
+\section{Sequence Related Messages}
+
+\subsection*{Unexpected Character}
+
+\andi is pretty pedantic about the formatting of \algo{FASTA} files. If you violate the syntax, \andi will print the file name, the line and the problematic character. These errors are non-recovering, meaning no further sequences are read from the invalid file. The checks are implemented by the \href{https://github.com/kloetzl/pfasta}{\algo{pfasta}} library.
+
+
+\subsection*{Non acgtACGT Nucleotides Stripped}
+
+Our models of genome evolution (JC, Kimura) only work on the four canonical types of nucleotides. All others are stripped from the sequences. This can be ignored in most cases.
+
+\subsection*{Too Short Sequence}
+
+\andi was designed for big data sets of whole genomes. On short sequences the distance estimates are inaccurate. Use an multiple sequence alignment instead.
+
+\subsection*{Too Long Sequence}
+
+\algo{libdivsufsort} limits the length of a sequence to 31 bits. That count includes the reverse complement. So the technical limit for a sequence analysis is $2^{30} = 1.073.741.824$. Unfortunately, that excludes (full) human and mice genomes. Per-chromosome analysis works just fine.
+
+\subsection*{Empty Sequence}
+
+One of the given sequences contained either no nucleotides at all, or only non-canonical ones.
+
+\subsection*{Less than two sequences given}
+
+As \andi tries to compare sequences, at least two need to be supplied. Note that \andi may have regarded some of your given sequences as unusable.
+
+\subsection*{Maximum Number of Sequences}
+
+The maximum number of sequences \andi can possible compare is huge (roughly $457.845.052$). I doubt anyone will ever reach that limit. Please send me a mail, if you do.
+
+\section{Technical Messages}
+
+\subsection*{Out of Memory}
+
+If \andi runs out of memory, it gives up. Either free memory, run \andi on a bigger machine, try the \lstinline$--low-memory$ mode or reduce the number of threads.
+
+\subsection*{RNG allocation}
+
+Some technical thing failed. If it keeps failing repeatedly, file a bug.
+
+\subsection*{Bootstrapping failed}
+
+This should not happen.
+
+\subsection*{Failed index creation}
+
+This should not happen, either.
+
+\subsection*{Skipped and ignored Arguments}
+
+Some command line parameters of \andi require arguments. If these are not of the expected type, a warning is given. See Section~\ref{sec:options} for their correct usage.
+
+
+\section{Output-related Warnings}
+
+As the input sequences get more evolutionary divergent, \andi finds less anchors. With less anchors, less nucleotides are cosidered homologous between two sequences. If no anchors are found comparison fails and \lstinline!nan! is printed instead. See out paper and especiallt Figure~2 for details.
+
+\subsection*{NaN}
+
+No anchors were found. Your sequences are very divergent ($d>0.5$) or sprout a lot of indels that make comparison difficult.
+
+\subsection*{Little Homology}
+
+Very few anchors were found and thus only a tiny part of the sequences is considered homologous. Expect that the given distance is errorneous.
\chapter{DevOps} %%%%%
@@ -376,14 +440,17 @@ The unit tests are located in the \andi repository under the \lstinline$./test$
~/andi % make check
\end{lstlisting}
-\noindent The unit tests are also checked each time a commit is sent to the repository. This is done via \algo{TravisCI}.\footnote{\url{https://travis-ci.org/EvolBioInf/andi}} Thus, a warning is produced, when the builds fail, or the unit tests to not run successfully. Currently, the unit tests cover more than 80\% of the code. This is computed via the \algo{Travis} builds and a service called \algo{Coveralls}.\footnote{\url{https://coveralls.io/r/EvolBioInf/andi}}
+\noindent The unit tests are also checked each time a commit is sent to the repository. This is done via \algo{TravisCI}.\footnote{\url{https://travis-ci.org/EvolBioInf/andi}} Thus, a warning is produced, when the builds fail, or the unit tests to not run successfully. Currently, the unit tests cover more than 75\% of the code. This is computed via the \algo{Travis} builds and a service called \algo{Coveralls}.\footnote{\url{https://coveralls.io/r/EvolBioInf/andi}} Unfortunately, coveral [...]
\section{Known Issues}
These minor issues are known. I intend to fix them, when I have time.
\begin{enumerate}
- \item This code will not work under Windows. At two places Unix-only code is used: filepath-seperators are assumed to be \lstinline$/$ and file-descriptors are used for I/O.
+ \item This code will not work under Windows. At two places Unix-only code is used: filepath-separators are assumed to be \lstinline$/$ and file-descriptors are used for I/O.
+ \item Only the first 10 characters are printed in the output matrix. This can make long gi numbers indistinguishable. In version 0.10 we started printing a warning for this.
+ \item Unit tests for the bootstrapped matrices are missing.
+ \item Cached intervals are sometimes not “as deep as they could be”. If that got fixed \lstinline$get_match_cache$ could bail out on \lstinline$ij.lcp < CACHE_LENGTH$. However the \lstinline$esa_init_cache$ code is the most fragile part and should be handled with care.
\end{enumerate}
@@ -393,7 +460,7 @@ A release should be a stable version of \andi with significant improvements over
%\subsection{Preparing a new Release}
-Once \andi is matured, the new features implemented, and all tests were run, a new release can be created. First, increase the version number in \lstinline$configure.ac$. Commit that change in git, and tag this commit with \lstinline$vX.y$. Create another commit, where you set the version number to the next release (e.\,g., \lstinline$vX.z-beta$). This assures that there is only one commit and build with that specific version. Now return to the previous commit \lstinline$git checkout vX.y$.
+Once \andi is matured, the new features implemented, and all tests were run, a new release can be created. First, increase the version number in \lstinline$configure.ac$. Commit that change in git, and tag this commit with \lstinline$vX.y$. Tags should be annotated and signed, if possible. This manual then needs manual rebuilding.
Ensure that \andi is ready for packaging with \algo{autoconf}.
@@ -414,9 +481,11 @@ andi-0.9.1-beta.tar.gz
=================================================
\end{lstlisting}
-If the command does not build successfully, no tarballs will be created. This may necessitate further study of \algo{autoconf} and \algo{automake}.
+If the command does not build successfully, no tarballs will be created. This may necessitate further study of \algo{autoconf} and \algo{automake}.
+Also verify that the recent changes did not create a performance regression. This includes testing both ends of the scale: \eco and \pneu. Both should be reasonable close to previous releases.
+Create another commit, where you set the version number to the next release (e.\,g., \lstinline$vX.z-beta$). This assures that there is only one commit and build with that specific version.
\backmatter
%\addcontentsline{toc}{chapter}{Bibliography}
diff --git a/docs/manual/references.bib b/docs/manual/references.bib
index cc2cff6..6922e15 100644
--- a/docs/manual/references.bib
+++ b/docs/manual/references.bib
@@ -45,7 +45,7 @@
Author = {Chris Lattner and Vikram Adve},
Title = {{LLVM}: A Compilation Framework for Lifelong Program
Analysis and Transformation},
- Booktitle = "Code Generation and Optimizatio",
+ Booktitle = "Code Generation and Optimization",
Month = {Mar},
Year = {2004},
pages = {75--88},
@@ -109,4 +109,18 @@
title={Efficient Estimation of Evolutionary Distances}
}
+ at article{afra,
+ AUTHOR = {Klötzl, Fabian and Haubold, Bernhard},
+ TITLE = {Support Values for Genome Phylogenies},
+ JOURNAL = {Life},
+ VOLUME = {6},
+ YEAR = {2016},
+ NUMBER = {1},
+ PAGES = {11},
+ URL = {http://www.mdpi.com/2075-1729/6/1/11},
+ ISSN = {2075-1729},
+ DOI = {10.3390/life6010011}
+}
+
+
diff --git a/opt/strchrnul.c b/opt/strchrnul.c
index a36fe5c..219c251 100644
--- a/opt/strchrnul.c
+++ b/opt/strchrnul.c
@@ -1,6 +1,8 @@
/* @brief Here follows a simple implementation of the GNU function `strchrnul`.
* Please check the gnulib manual for details.
*/
+#include <string.h>
+
char *strchrnul(const char *s, int c){
char *p = strchr(s,c);
diff --git a/scripts/failed.zsh b/scripts/failed.zsh
new file mode 100755
index 0000000..f0d2de8
--- /dev/null
+++ b/scripts/failed.zsh
@@ -0,0 +1,20 @@
+#!/usr/bin/zsh
+
+# Compute the number of failing comparisons for different distances.
+
+DISTS=(0.1 0.2 0.3 0.35 0.4 0.45 0.5 0.55 0.6 0.65 0.7)
+
+LENGTH=100000
+
+for dist in $DISTS; do
+ echo "" > est_$dist.dist
+ for (( i = 0; i < 1000; i++ )); do
+ ../test/test_fasta -l $LENGTH -d $dist > temp.fa
+ ../src/andi ./temp.fa > est.dist 2> /dev/null
+ tail -n 1 est.dist >> est_$dist.dist
+ done
+ avg=$(cat est_$dist.dist | awk '"nan" !~ $2 {sum+=$2;c++}END{print sum/c}')
+ sd=$(grep -v 'nan' est_$dist.dist | awk '{a[c++]=$2;aa+=$2}END{aa/=NR;for(c=0;c<NR;c++){t=a[c]-aa;sd+=t*t}print sqrt(sd/(NR-1))}')
+ failed=$(cat est_$dist.dist | grep -c 'nan')
+ echo $dist "\t" $avg"\t±" $sd "\t" $failed
+done
diff --git a/src/Makefile.am b/src/Makefile.am
index 4384dd4..3a02c51 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -16,4 +16,5 @@ nodist_EXTRA_andi_SOURCES = $(DUMMY)
.PHONY: perf
perf: CFLAGS+= -g -O3 -ggdb -fno-omit-frame-pointer
+perf: CXXFLAGS+= -g -O3 -ggdb -fno-omit-frame-pointer
perf: andi
diff --git a/src/andi.c b/src/andi.c
index b607eaa..ce7ad93 100644
--- a/src/andi.c
+++ b/src/andi.c
@@ -25,6 +25,7 @@
#include <assert.h>
#include <errno.h>
#include <getopt.h>
+#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
@@ -55,7 +56,8 @@ void version(void);
*
* The main function reads and parses the commandline arguments. Depending on
* the set flags it reads the input files and forwards the contained sequences
- * to processing.
+ * to processing. Also it verifies the input for correctness and issues warnings
+ * and errors.
*/
int main(int argc, char *argv[]) {
int c;
@@ -134,7 +136,7 @@ int main(int argc, char *argv[]) {
if (threads > (long unsigned int)omp_get_num_procs()) {
warnx(
- "The number of threads to be used, is greater then the "
+ "The number of threads to be used, is greater than the "
"number of available processors; Ignoring -t %lu "
"argument.",
threads);
@@ -174,7 +176,8 @@ int main(int argc, char *argv[]) {
} else if (strcasecmp(optarg, "KIMURA") == 0) {
MODEL = M_KIMURA;
} else {
- warnx("Ignoring argument for --model. Expected Raw JC or Kimura");
+ warnx("Ignoring argument for --model. Expected Raw, JC or "
+ "Kimura");
}
break;
}
@@ -196,9 +199,7 @@ int main(int argc, char *argv[]) {
}
dsa_t dsa;
- if (dsa_init(&dsa)) {
- errx(errno, "Out of memory.");
- }
+ dsa_init(&dsa);
const char *file_name;
@@ -223,6 +224,13 @@ int main(int argc, char *argv[]) {
size_t n = dsa_size(&dsa);
+ if (n < 2) {
+ errx(1,
+ "I am truly sorry, but with less than two sequences (%zu given) "
+ "there is nothing to compare.",
+ n);
+ }
+
if (FLAGS & F_VERBOSE) {
fprintf(stderr, "Comparing %zu sequences\n", n);
fflush(stderr);
@@ -236,16 +244,45 @@ int main(int argc, char *argv[]) {
// seed the random number generator with the current time
gsl_rng_set(RNG, time(NULL));
- seq_t *sequences = dsa_data(&dsa);
- // compute distance matrix
- if (n >= 2) {
- calculate_distances(sequences, n);
- } else {
- warnx("I am truly sorry, but with less than two sequences (%zu given) "
- "there is nothing to compare.",
- n);
+ // Warn about non ACGT residues.
+ if (FLAGS & F_NON_ACGT) {
+ warnx("The input sequences contained characters other than acgtACGT. "
+ "These were automatically stripped to ensure correct results.");
}
+ // validate sequence correctness
+ const seq_t *seq = dsa_data(&dsa);
+ for (size_t i = 0; i < n; ++i, seq++) {
+ if (strlen(seq->name) > 10) {
+ warnx("The sequence name '%s' is longer than ten characters. It "
+ "will be truncated in the output to '%.10s'.",
+ seq->name, seq->name);
+ }
+
+ const size_t LENGTH_LIMIT = (INT_MAX - 1) / 2;
+ if (seq->len > LENGTH_LIMIT) {
+ errx(1, "The sequence %s is too long. The technical limit is %zu.",
+ seq->name, LENGTH_LIMIT);
+ }
+
+ if (seq->len == 0) {
+ errx(1, "The sequence %s is empty.", seq->name);
+ }
+
+ if (seq->len < 1000) {
+ FLAGS |= F_SHORT;
+ }
+ }
+
+ if (FLAGS & F_SHORT) {
+ warnx("One of the given input sequences is shorter than a thousand "
+ "nucleotides. This may result in inaccurate distances. Try an "
+ "alignment instead.");
+ }
+
+ // compute distance matrix
+ calculate_distances(dsa_data(&dsa), n);
+
dsa_free(&dsa);
gsl_rng_free(RNG);
return 0;
@@ -289,7 +326,7 @@ void usage(void) {
void version(void) {
const char str[] = {
"andi " VERSION "\n"
- "Copyright (C) 2014, 2015 Fabian Klötzl\n"
+ "Copyright (C) 2014 - 2016 Fabian Klötzl\n"
"License GPLv3+: GNU GPL version 3 or later "
"<http://gnu.org/licenses/gpl.html>\n"
"This is free software: you are free to change and redistribute it.\n"
@@ -302,7 +339,7 @@ void version(void) {
"Sequence Analysis, Genome Rearrangements, and Phylogenetic "
"Reconstruction. pp 118f.\n"
"3) SA construction: Mori, Y. (2005). Short description of improved "
- "two-stage suffix sorting alorithm. "
+ "two-stage suffix sorting algorithm. "
"http://homepage3.nifty.com/wpage/software/itssort.txt\n"};
printf("%s", str);
exit(EXIT_SUCCESS);
diff --git a/src/dist_hack.h b/src/dist_hack.h
index f66d37d..dc3668d 100644
--- a/src/dist_hack.h
+++ b/src/dist_hack.h
@@ -29,17 +29,18 @@
* @param n - The number of sequences
* @param M - A matrix for additional output data
*/
-void NAME(struct model *M, seq_t *sequences, size_t n) {
+void NAME(struct model *M, const seq_t *sequences, size_t n) {
size_t i;
//#pragma
P_OUTER
for (i = 0; i < n; i++) {
- seq_t *subject = &sequences[i];
+ seq_subject subject;
esa_s E;
- if (seq_subject_init(subject) || esa_init(&E, subject)) {
- errx(1, "Failed to create index for %s.", subject->name);
+ if (seq_subject_init(&subject, &sequences[i]) ||
+ esa_init(&E, &subject)) {
+ errx(1, "Failed to create index for %s.", sequences[i].name);
}
// now compare every other sequence to i
@@ -60,10 +61,10 @@ void NAME(struct model *M, seq_t *sequences, size_t n) {
size_t ql = sequences[j].len;
- M(i, j) = dist_anchor(&E, sequences[j].S, ql, subject->gc);
+ M(i, j) = dist_anchor(&E, sequences[j].S, ql, subject.gc);
}
esa_free(&E);
- seq_subject_free(subject);
+ seq_subject_free(&subject);
}
}
diff --git a/src/esa.c b/src/esa.c
index d9297b7..a6b4128 100644
--- a/src/esa.c
+++ b/src/esa.c
@@ -17,6 +17,7 @@
#include <string.h>
#include <assert.h>
#include "esa.h"
+#include "global.h"
static void esa_init_cache_dfs(esa_s *, char *str, size_t pos, lcp_inter_t in);
static void esa_init_cache_fill(esa_s *, char *str, size_t pos, lcp_inter_t in);
@@ -70,12 +71,8 @@ ssize_t char2code(const char c) {
* @returns 0 iff successful
*/
int esa_init_cache(esa_s *self) {
- lcp_inter_t *cache =
- malloc((1 << (2 * CACHE_LENGTH)) * sizeof(lcp_inter_t));
-
- if (!cache) {
- return 1;
- }
+ lcp_inter_t *cache = malloc((1 << (2 * CACHE_LENGTH)) * sizeof(*cache));
+ CHECK_MALLOC(cache);
self->cache = cache;
@@ -128,48 +125,52 @@ void esa_init_cache_dfs(esa_s *C, char *str, size_t pos, const lcp_inter_t in) {
continue;
}
- // The LCP-interval is deeper than expected
- if (ij.l > (ssize_t)(pos + 1)) {
-
- // Check if it still fits into the cache
- if ((size_t)ij.l < CACHE_LENGTH) {
-
- // fill with dummy value
- esa_init_cache_fill(C, str, pos + 1, in);
-
- char non_acgt = 0;
-
- // fast forward
- size_t k = pos + 1;
- for (; k < (size_t)ij.l; k++) {
- // In some very edgy edge cases the lcp-interval `ij`
- // contains a `;` or another non-acgt character. Since we
- // cannot cache those, break.
- char c = C->S[C->SA[ij.i] + k];
- if (char2code(c) < 0) {
- non_acgt = 1;
- break;
- }
-
- str[k] = c;
- }
-
- if (non_acgt) {
- esa_init_cache_fill(C, str, k, ij);
- } else {
- esa_init_cache_dfs(C, str, k, ij);
- }
-
- continue;
- }
+ if (ij.l <= (ssize_t)(pos + 1)) {
+ // Continue one level deeper
+ // This is the usual case
+ esa_init_cache_dfs(C, str, pos + 1, ij);
+ continue;
+ }
+ // The LCP-interval is deeper than expected
+ // Check if it still fits into the cache
+ if ((size_t)ij.l >= CACHE_LENGTH) {
// If the lcp-interval exceeds the cache depth, stop here and fill
esa_init_cache_fill(C, str, pos + 1, in);
continue;
}
- // Continue one level deeper
- esa_init_cache_dfs(C, str, pos + 1, ij);
+ /* At this point the prefix `str` of length `pos` has been found.
+ * However, the call to `getInterval` above found an interval with
+ * an LCP value bigger than `pos`. This means that not all elongations
+ * (more precise: just one) of `str` appear in the subject. Thus fill
+ * all values with the matched result to far and continue only with
+ * the one special substring.
+ */
+ esa_init_cache_fill(C, str, pos + 1, in);
+
+ char non_acgt = 0;
+
+ // fast forward
+ size_t k = pos + 1;
+ for (; k < (size_t)ij.l; k++) {
+ // In some very edgy edge cases the lcp-interval `ij`
+ // contains a `;` or another non-acgt character. Since we
+ // cannot cache those, break.
+ char c = C->S[C->SA[ij.i] + k];
+ if (char2code(c) < 0) {
+ non_acgt = 1;
+ break;
+ }
+
+ str[k] = c;
+ }
+
+ if (non_acgt) {
+ esa_init_cache_fill(C, str, k, ij);
+ } else {
+ esa_init_cache_dfs(C, str, k, ij);
+ }
}
}
@@ -216,9 +217,7 @@ int esa_init_FVC(esa_s *self) {
size_t len = self->len;
char *FVC = self->FVC = malloc(len);
- if (!FVC) {
- return 1;
- }
+ CHECK_MALLOC(FVC);
const char *S = self->S;
const int *SA = self->SA;
@@ -239,8 +238,8 @@ int esa_init_FVC(esa_s *self) {
* @param S - The sequence
* @returns 0 iff successful
*/
-int esa_init(esa_s *C, const seq_t *S) {
- if (!C || !S || !S->S) return 1;
+int esa_init(esa_s *C, const seq_subject *S) {
+ if (!C || !S || !S->RS) return 1;
*C = (esa_s){.S = S->RS, .len = S->RSlen};
@@ -285,10 +284,8 @@ int esa_init_SA(esa_s *C) {
return 1;
}
- C->SA = malloc(C->len * sizeof(saidx_t));
- if (!C->SA) {
- return 2;
- }
+ C->SA = malloc(C->len * sizeof(*C->SA));
+ CHECK_MALLOC(C->SA);
saidx_t result = 1;
@@ -311,16 +308,15 @@ int esa_init_CLD(esa_s *C) {
if (!C || !C->LCP) {
return 1;
}
- saidx_t *CLD = C->CLD = malloc((C->len + 1) * sizeof(saidx_t));
- if (!C->CLD) {
- return 2;
- }
+ saidx_t *CLD = C->CLD = malloc((C->len + 1) * sizeof(*CLD));
+ CHECK_MALLOC(CLD);
saidx_t *LCP = C->LCP;
typedef struct pair_s { saidx_t idx, lcp; } pair_t;
- pair_t *stack = malloc((C->len + 1) * sizeof(pair_t));
+ pair_t *stack = malloc((C->len + 1) * sizeof(*stack));
+ CHECK_MALLOC(stack);
pair_t *top = stack; // points at the topmost filled element
pair_t last;
@@ -379,21 +375,16 @@ int esa_init_LCP(esa_s *C) {
// Allocate new memory
// The LCP array is one element longer than S.
- saidx_t *LCP = C->LCP = malloc((len + 1) * sizeof(saidx_t));
- if (!LCP) {
- return 3;
- }
+ saidx_t *LCP = C->LCP = malloc((len + 1) * sizeof(*LCP));
+ CHECK_MALLOC(LCP);
LCP[0] = -1;
LCP[len] = -1;
// Allocate temporary arrays
- saidx_t *PHI = malloc(len * sizeof(saidx_t));
+ saidx_t *PHI = malloc(len * sizeof(*PHI));
saidx_t *PLCP = PHI;
- if (!PHI) {
- free(PHI);
- return 2;
- }
+ CHECK_MALLOC(PHI);
PHI[SA[0]] = -1;
saidx_t k;
diff --git a/src/esa.h b/src/esa.h
index 6595925..df6dbaa 100644
--- a/src/esa.h
+++ b/src/esa.h
@@ -6,6 +6,7 @@
#ifndef _ESA_H_
#define _ESA_H_
+#include <sys/types.h>
#include "sequence.h"
#include "config.h"
@@ -68,7 +69,7 @@ typedef struct esa_s {
lcp_inter_t get_match_cached(const esa_s *, const char *query, size_t qlen);
lcp_inter_t get_match(const esa_s *, const char *query, size_t qlen);
-int esa_init(esa_s *, const seq_t *S);
+int esa_init(esa_s *, const seq_subject *S);
void esa_free(esa_s *);
#ifdef DEBUG
diff --git a/src/global.h b/src/global.h
index faa244e..ea375f6 100644
--- a/src/global.h
+++ b/src/global.h
@@ -9,6 +9,7 @@
#define _GLOBAL_H_
#include <gsl/gsl_rng.h>
+#include <err.h>
#include "config.h"
/**
@@ -62,4 +63,16 @@ enum {
F_SHORT = 64
};
+/**
+ * @brief This macro is used to unify the checks for the return value of malloc.
+ *
+ * @param PTR - The pointer getting checked.
+ */
+#define CHECK_MALLOC(PTR) \
+ do { \
+ if (PTR == NULL) { \
+ err(errno, "Out of memory"); \
+ } \
+ } while (0);
+
#endif
diff --git a/src/io.c b/src/io.c
index 9bc7664..ee81831 100644
--- a/src/io.c
+++ b/src/io.c
@@ -5,6 +5,7 @@
#define _GNU_SOURCE
#include <fcntl.h>
#include <math.h>
+#include <limits.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
@@ -24,9 +25,9 @@
* "I didn't learn joined up handwriting for nothing, you know."
* ~ Gilderoy Lockhart
*
- * @param in - The file pointer to read from.
- * @param dsa - An array that holds found sequences.
- * @param name - The name of the file to be used for the name of the sequence.
+ * @param file_name - The name of the file to be used for reading. The name is
+ * also used to infer the sequence name.
+ * @param dsa - (output parameter) An array that holds found sequences.
*/
void read_fasta_join(const char *file_name, dsa_t *dsa) {
if (!file_name || !dsa) return;
@@ -47,22 +48,23 @@ void read_fasta_join(const char *file_name, dsa_t *dsa) {
*/
const char *left = strrchr(file_name, '/'); // find the last path separator
- left = (left == NULL) ? file_name : left + 1; // left is the position one of
- // to the right of the path
- // separator
+ left = (left == NULL) ? file_name : left + 1;
+ // left is the position one of to the right of the path separator
const char *dot = strchrnul(left, '.'); // find the extension
- joined.name = strndup(
- left, dot - left); // copy only the file name, not its path or extension
+ // copy only the file name, not its path or extension
+ joined.name = strndup(left, dot - left);
+ CHECK_MALLOC(joined.name);
+
dsa_push(dsa, joined);
dsa_free(&single);
}
/**
* @brief This function reads sequences from a file.
- * @param in - The file pointer to read from.
- * @param dsa - An array that holds found sequences.
+ * @param file_name - The file to read.
+ * @param dsa - (output parameter) An array that holds found sequences.
*/
void read_fasta(const char *file_name, dsa_t *dsa) {
if (!file_name || !dsa) return;
@@ -124,7 +126,7 @@ void print_distances(const struct model *D, const seq_t *sequences, size_t n,
int use_scientific = 0;
double *DD = malloc(n * n * sizeof(*DD));
- if (!DD) err(errno, "meh.");
+ CHECK_MALLOC(DD);
#define DD(X, Y) (DD[(X)*n + (Y)])
@@ -133,6 +135,9 @@ void print_distances(const struct model *D, const seq_t *sequences, size_t n,
switch (MODEL) {
case M_RAW: estimate = &estimate_RAW; break;
+ default:
+ /* intentional fall-through. This is just here to silence any
+ * compiler warnings. The real default is set in andi.c.*/
case M_JC: estimate = &estimate_JC; break;
case M_KIMURA: estimate = &estimate_KIMURA; break;
}
@@ -168,9 +173,9 @@ void print_distances(const struct model *D, const seq_t *sequences, size_t n,
printf("%zu\n", n);
for (i = 0; i < n; i++) {
- // Print exactly nine characters of the name. Pad with spaces if
+ // Print exactly ten characters of the name. Pad with spaces if
// necessary.
- printf("%-9.9s", sequences[i].name);
+ printf("%-10.10s", sequences[i].name);
for (j = 0; j < n; j++) {
// use scientific notation for small numbers
diff --git a/src/model.c b/src/model.c
index 98db34c..3577bb1 100644
--- a/src/model.c
+++ b/src/model.c
@@ -158,35 +158,47 @@ model model_bootstrap(const model MM) {
* @brief Given an anchor, classify nucleotides.
*
* For anchors we already know that the nucleotides of the subject and the query
- * are equal. Thus only one sequence has to be analysed. See `model_count` for
- * an explanation of the algorithm.
+ * are equal. Thus only one sequence has to be analysed. Most models don't
+ * actually care about the individual nucleotides as long as they are equal in
+ * the two sequences. For these models, we just assume equal distribution.
*
* @param MM - The mutation matrix
* @param S - The subject
* @param len - The anchor length
*/
void model_count_equal(model *MM, const char *S, size_t len) {
+ if (MODEL == M_RAW || MODEL == M_JC || MODEL == M_KIMURA) {
+ size_t fourth = len / 4;
+ MM->counts[AtoA] += fourth;
+ MM->counts[CtoC] += fourth;
+ MM->counts[GtoG] += fourth;
+ MM->counts[TtoT] += fourth + (len & 3);
+ return;
+ }
+
+ // Fall-back algorithm for future models. Note, as this works on a
+ // per-character basis it is slow.
+
size_t local_counts[4] = {0};
for (; len--;) {
char s = *S++;
+ // ';!#' are all smaller than 'A'
if (s < 'A') {
+ // Technically, s can only be ';' at this point.
continue;
}
- unsigned char nibble_s = s & 7;
-
- static const unsigned int mm1 = 0x20031000;
-
- unsigned char index = (mm1 >> (4 * nibble_s)) & 0x3;
- local_counts[index]++;
+ // The four canonical nucleotides can be uniquely identified by the bits
+ // 0x6: A -> 0, C → 1, G → 3, T → 2. Thus the order below is changed.
+ local_counts[(s >> 1) & 3]++;
}
MM->counts[AtoA] += local_counts[0];
MM->counts[CtoC] += local_counts[1];
- MM->counts[GtoG] += local_counts[2];
- MM->counts[TtoT] += local_counts[3];
+ MM->counts[GtoG] += local_counts[3];
+ MM->counts[TtoT] += local_counts[2];
}
/**
diff --git a/src/process.c b/src/process.c
index 0aaf1ec..83d5097 100644
--- a/src/process.c
+++ b/src/process.c
@@ -4,11 +4,12 @@
* @file
* @brief This file contains various distance methods.
*/
-#include <stdlib.h>
-#include <string.h>
#include <assert.h>
#include <math.h>
+#include <stdint.h>
#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
#include "esa.h"
#include "global.h"
#include "io.h"
@@ -16,7 +17,6 @@
#include "process.h"
#include "sequence.h"
-#include <time.h>
#include <gsl/gsl_rng.h>
#include <gsl/gsl_randist.h>
@@ -54,13 +54,7 @@ size_t minAnchorLength(double p, double g, size_t l) {
/**
* @brief Calculates the binomial coefficient of n and k.
*
- * We used to use gsl_sf_lnchoose(xx,kk) for this functionality.
- * After all, why implement something that has already been done?
- * Well, the reason is simplicity: GSL is used for only this one
- * function and the input (n<=20) is not even considered big.
- * Hence its much easier to have our own implementation and ditch
- * the GSL dependency even if that means our code is a tiny bit
- * less optimized and slower.
+ * We could (and probalby should) use gsl_sf_lnchoose(xx,kk) for this.
*
* @param n - The n part of the binomial coefficient.
* @param k - analog.
@@ -207,7 +201,8 @@ model dist_anchor(const esa_s *C, const char *query, size_t query_length,
model_count_equal(&ret, query + last_pos_Q, last_length);
// Count the SNPs in between.
- model_count(&ret, C->S + last_pos_S + last_length, query + last_pos_Q + last_length,
+ model_count(&ret, C->S + last_pos_S + last_length,
+ query + last_pos_Q + last_length,
this_pos_Q - last_pos_Q - last_length);
last_was_right_anchor = 1;
} else {
@@ -222,11 +217,11 @@ model dist_anchor(const esa_s *C, const char *query, size_t query_length,
if (last_was_right_anchor) {
// If the last was a right anchor, but with the current one,
// we cannot extend, then add its length.
- model_count_equal(&ret, C->S + last_pos_S, last_length);
+ model_count_equal(&ret, query + last_pos_Q, last_length);
} else if ((last_length / 2) >= threshold) {
// The last anchor wasn't neither a left or right anchor.
// But, it was as long as an anchor pair. So still count it.
- model_count_equal(&ret, C->S + last_pos_S, last_length);
+ model_count_equal(&ret, query + last_pos_Q, last_length);
}
last_was_right_anchor = 0;
@@ -303,33 +298,17 @@ model dist_anchor(const esa_s *C, const char *query, size_t query_length,
* @param sequences - An array of pointers to the sequences.
* @param n - The number of sequences.
*/
-void calculate_distances(seq_t *sequences, int n) {
- int i;
-
- // check the sequences
- for (i = 0; i < n; i++) {
- if (sequences[i].S == NULL || sequences[i].len == 0) {
- errx(1, "Missing sequence: %s", sequences[i].name);
- }
-
- if (sequences[i].len < 1000) {
- FLAGS |= F_SHORT;
- }
- }
-
- if (FLAGS & F_SHORT) {
- warnx("One of the given input sequences is shorter than a thousand "
- "nucleotides. This may result in inaccurate distances. Try an "
- "alignment instead.");
- }
-
- // Warn about non ACGT residues.
- if (FLAGS & F_NON_ACGT) {
- warnx("The input sequences contained characters other than acgtACGT. "
- "These were automatically stripped to ensure correct results.");
+void calculate_distances(seq_t *sequences, size_t n) {
+ struct model *M = NULL;
+
+ // The maximum number of sequences is near 457'845'052.
+ size_t intermediate = SIZE_MAX / sizeof(*M) / n;
+ if (intermediate < n) {
+ size_t root = (size_t)sqrt(SIZE_MAX / sizeof(*M));
+ err(1, "Comparison is limited to %zu sequences (%zu given).", root, n);
}
- model *M = malloc(n * n * sizeof(*M));
+ M = malloc(n * n * sizeof(*M));
if (!M) {
err(errno, "Could not allocate enough memory for the comparison "
"matrix. Try using --join or --low-memory.");
@@ -386,7 +365,7 @@ int calculate_bootstrap(const struct model *M, const seq_t *sequences,
// B is the new bootstrap matrix
struct model *B = malloc(n * n * sizeof(*B));
- if (!B) return 2;
+ CHECK_MALLOC(B);
// Compute a number of new distance matrices
while (BOOTSTRAP--) {
diff --git a/src/process.h b/src/process.h
index eb632fe..b056812 100644
--- a/src/process.h
+++ b/src/process.h
@@ -8,6 +8,6 @@
#include "sequence.h"
-void calculate_distances(seq_t *sequences, int n);
+void calculate_distances(seq_t *sequences, size_t n);
#endif
diff --git a/src/sequence.c b/src/sequence.c
index 48de282..422fbee 100644
--- a/src/sequence.c
+++ b/src/sequence.c
@@ -19,10 +19,8 @@ void normalize(seq_t *S);
int dsa_init(dsa_t *A) {
// allocate at least 4 slots so the growth by 1.5 below doesn't get stuck
// at 3 slots.
- A->data = malloc(sizeof(seq_t) * 4);
- if (!A->data) {
- return 1;
- }
+ A->data = malloc(sizeof(*A->data) * 4);
+ CHECK_MALLOC(A->data);
A->capacity = 4;
A->size = 0;
@@ -36,9 +34,7 @@ void dsa_push(dsa_t *A, seq_t S) {
} else {
// use the near-optimal growth factor of 1.5
seq_t *ptr = reallocarray(A->data, A->capacity / 2, sizeof(seq_t) * 3);
- if (ptr == NULL) {
- err(errno, "out of memory?");
- }
+ CHECK_MALLOC(ptr);
A->capacity = (A->capacity / 2) * 3;
A->data = ptr;
@@ -101,9 +97,7 @@ seq_t dsa_join(dsa_t *A) {
// A single malloc for the whole new sequence
char *ptr = malloc(total);
- if (ptr == NULL) {
- return joined;
- }
+ CHECK_MALLOC(ptr);
char *next = ptr;
// Copy all old sequences and add a `!` in between
@@ -133,7 +127,6 @@ seq_t dsa_join(dsa_t *A) {
*/
void seq_free(seq_t *S) {
free(S->S);
- free(S->RS);
free(S->name);
*S = (seq_t){};
}
@@ -145,8 +138,9 @@ void seq_free(seq_t *S) {
* @return The reverse complement. The caller has to free it!
*/
char *revcomp(const char *str, size_t len) {
+ if (!str) return NULL;
char *rev = malloc(len + 1);
- if (!str || !rev) return NULL;
+ CHECK_MALLOC(rev);
char *r = rev;
const char *s = &str[len - 1];
@@ -183,10 +177,7 @@ char *catcomp(char *s, size_t len) {
char *rev = revcomp(s, len);
char *temp = realloc(rev, 2 * len + 2);
- if (!temp) {
- free(rev);
- return NULL;
- }
+ CHECK_MALLOC(temp);
rev = temp;
rev[len] = '#';
@@ -215,17 +206,17 @@ double calc_gc(const seq_t *S) {
}
/** @brief Prepares a sequences to be used as the subject in a comparison. */
-int seq_subject_init(seq_t *S) {
- S->gc = calc_gc(S);
- S->RS = catcomp(S->S, S->len);
+int seq_subject_init(seq_subject *S, const seq_t *base) {
+ S->gc = calc_gc(base);
+ S->RS = catcomp(base->S, base->len);
if (!S->RS) return 1;
- S->RSlen = 2 * S->len + 1;
+ S->RSlen = 2 * base->len + 1;
return 0;
}
/** @brief Frees some memory unused for when a sequence is only used as query.
*/
-void seq_subject_free(seq_t *S) {
+void seq_subject_free(seq_subject *S) {
free(S->RS);
S->RS = NULL;
S->RSlen = 0;
@@ -243,10 +234,8 @@ int seq_init(seq_t *S, const char *seq, const char *name) {
*S = (seq_t){.S = strdup(seq), .name = strdup(name)};
- if (!S->S || !S->name) {
- seq_free(S);
- return 2;
- }
+ CHECK_MALLOC(S->S);
+ CHECK_MALLOC(S->name);
normalize(S);
@@ -254,13 +243,6 @@ int seq_init(seq_t *S, const char *seq, const char *name) {
// characters.
S->len = strlen(S->S);
- const size_t LENGTH_LIMIT = (INT_MAX - 1) / 2;
- if (S->len > LENGTH_LIMIT) {
- warnx("The input sequence %s is too long. The technical limit is %zu.",
- S->name, LENGTH_LIMIT);
- return 3;
- }
-
return 0;
}
diff --git a/src/sequence.h b/src/sequence.h
index db7aff8..ffdc3aa 100644
--- a/src/sequence.h
+++ b/src/sequence.h
@@ -18,26 +18,33 @@
typedef struct seq_s {
/** This is the DNAs forward strand as a string. */
char *S;
+ /** The length of the forward strand. */
+ size_t len;
+ /** A name for this sequence */
+ char *name;
+} seq_t;
+
+/**
+ * @brief This structure enhances the usual sequence with its reverse
+ * complement.
+ */
+typedef struct seq_subject {
/** This member contains first the reverse strand and then the
forward strand. */
char *RS;
- /** The length of the forward strand. */
- size_t len;
/** Corresponds to strlen(RS) */
size_t RSlen;
- /** A name for this sequence */
- char *name;
/**
* @brief GC-Content
*
* The relative amount of G or C in the DNA.
*/
double gc;
-} seq_t;
+} seq_subject;
void seq_free(seq_t *S);
-int seq_subject_init(seq_t *S);
-void seq_subject_free(seq_t *S);
+int seq_subject_init(seq_subject *S, const seq_t *);
+void seq_subject_free(seq_subject *S);
int seq_init(seq_t *S, const char *seq, const char *name);
/**
diff --git a/test/test_esa.c b/test/test_esa.c
index 299872a..ff39221 100644
--- a/test/test_esa.c
+++ b/test/test_esa.c
@@ -25,6 +25,7 @@ char code3char( ssize_t code){
typedef struct {
esa_s *C;
seq_t *S;
+ seq_subject subject;
} esa_fixture;
void assert_equal_lcp( const lcp_inter_t *a, const lcp_inter_t *b){
@@ -60,9 +61,9 @@ void setup( esa_fixture *ef, gconstpointer test_data){
};
g_assert( seq_init( ef->S, seq, "S0" ) == 0);
- seq_subject_init( ef->S);
- g_assert( ef->S->RS != NULL);
- int check = esa_init( ef->C, ef->S);
+ seq_subject_init( &ef->subject, ef->S);
+ g_assert( ef->subject.RS != NULL);
+ int check = esa_init( ef->C, &ef->subject);
g_assert( check == 0);
}
@@ -86,9 +87,9 @@ void setup2( esa_fixture *ef, gconstpointer test_data){
};
g_assert( seq_init( ef->S, seq, "S0" ) == 0);
- seq_subject_init( ef->S);
- g_assert( ef->S->RS != NULL);
- int check = esa_init( ef->C, ef->S);
+ seq_subject_init( &ef->subject, ef->S);
+ g_assert( ef->subject.RS != NULL);
+ int check = esa_init( ef->C, &ef->subject);
g_assert( check == 0);
}
diff --git a/test/test_seq.c b/test/test_seq.c
index b36c45a..4f5cedd 100644
--- a/test/test_seq.c
+++ b/test/test_seq.c
@@ -22,47 +22,52 @@ void test_seq_basic(){
void test_seq_full(){
seq_t S;
+ seq_subject subject;
seq_init( &S, "ACGTTGCA", "name");
- int check = seq_subject_init( &S);
+ int check = seq_subject_init( &subject, &S);
g_assert_cmpint(check, ==, 0);
- g_assert_cmpstr(S.RS, ==, "TGCAACGT#ACGTTGCA");
- g_assert_cmpuint(S.RSlen, ==, 8*2+1);
- g_assert( S.gc == 0.5);
+ g_assert_cmpstr(subject.RS, ==, "TGCAACGT#ACGTTGCA");
+ g_assert_cmpuint(subject.RSlen, ==, 8*2+1);
+ g_assert( subject.gc == 0.5);
+ seq_subject_free( &subject);
seq_free( &S);
}
void test_seq_nonacgt(){
seq_t S;
+ seq_subject subject;
seq_init( &S, "11ACGTNN7682394689NNTGCA11", "name");
- seq_subject_init( &S);
+ seq_subject_init( &subject, &S);
g_assert_cmpstr(S.S, ==, "ACGTTGCA");
g_assert_cmpuint(S.len, ==, 8 );
g_assert( FLAGS & F_NON_ACGT);
- g_assert_cmpstr(S.RS, ==, "TGCAACGT#ACGTTGCA");
- g_assert_cmpuint(S.RSlen, ==, 8*2+1);
- g_assert( S.gc == 0.5);
+ g_assert_cmpstr(subject.RS, ==, "TGCAACGT#ACGTTGCA");
+ g_assert_cmpuint(subject.RSlen, ==, 8*2+1);
+ g_assert( subject.gc == 0.5);
+ seq_subject_free( &subject);
seq_free( &S);
FLAGS = F_NONE;
seq_init( &S, "@ACGT_!0TGCA ", "name");
- seq_subject_init( &S);
+ seq_subject_init( &subject, &S);
g_assert_cmpstr(S.S, ==, "ACGT!TGCA");
g_assert_cmpuint(S.len, ==, 9 );
g_assert( FLAGS & F_NON_ACGT);
- g_assert_cmpstr(S.RS, ==, "TGCA;ACGT#ACGT!TGCA");
- g_assert_cmpuint(S.RSlen, ==, 9*2+1);
+ g_assert_cmpstr(subject.RS, ==, "TGCA;ACGT#ACGT!TGCA");
+ g_assert_cmpuint(subject.RSlen, ==, 9*2+1);
+ seq_subject_free( &subject);
seq_free( &S);
FLAGS = F_NONE;
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/andi.git
More information about the debian-med-commit
mailing list