[med-svn] [andi] 01/04: Imported Upstream version 0.11
Fabian Klötzl
kloetzl-guest at moszumanska.debian.org
Wed Jul 12 13:09:43 UTC 2017
This is an automated email from the git hooks/post-receive script.
kloetzl-guest pushed a commit to branch master
in repository andi.
commit 88a91de1793ca06acf38325055d5786557ffb4cd
Author: Fabian Klötzl <fabian at kloetzl.info>
Date: Wed Jul 12 14:48:52 2017 +0200
Imported Upstream version 0.11
---
.travis.yml | 2 +-
Makefile.am | 9 +--
README.md | 4 +-
andi-manual.pdf | Bin 440783 -> 444354 bytes
configure.ac | 2 +-
docs/andi.1.in | 16 +++--
docs/manual/andi-manual.tex | 76 +++++++++++++++-------
libs/pfasta.c | 9 ++-
scripts/_andi | 40 ++++++++++++
src/andi.c | 155 +++++++++++++++++++++++++-------------------
src/dist_hack.h | 22 +++----
src/esa.c | 10 +--
src/esa.h | 4 +-
src/global.h | 10 ++-
src/io.c | 137 +++++++++++++++++++++++++++++++++++++--
src/io.h | 22 ++++++-
src/model.c | 11 ++--
src/process.c | 113 +++++++-------------------------
src/sequence.c | 10 +--
src/sequence.h | 5 +-
test/test_extra.sh | 12 +++-
test/test_fasta.cxx | 2 +-
22 files changed, 432 insertions(+), 239 deletions(-)
diff --git a/.travis.yml b/.travis.yml
index c043ebc..b49f5c5 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,7 +1,7 @@
language: cpp
os:
- linux
- - osx
+ # - osx
compiler:
- gcc
- clang
diff --git a/Makefile.am b/Makefile.am
index 0ef0c80..e9ebaf1 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -25,14 +25,9 @@ $(TESTS): src/andi
endif # BUILD_TESTS
-dist_noinst_DATA = README ChangeLog README.md
+dist_noinst_DATA = ChangeLog README.md
dist_pdf_DATA = andi-manual.pdf
-dist_noinst_SCRIPTS= scripts/maf2phy.awk scripts/vmatch.sh
-
-# This is a small hack to satisfy both GitHub and the GNU coding style.
-README: README.md
- cp README.md README
-
+dist_noinst_SCRIPTS= scripts/maf2phy.awk scripts/vmatch.sh scripts/_andi
# Recreate the changelog, when the version string changes.
ChangeLog: configure.ac
diff --git a/README.md b/README.md
index 72b9e86..3b038da 100644
--- a/README.md
+++ b/README.md
@@ -17,7 +17,7 @@ For Debian and Ubuntu (starting 16.04):
For OS X with Homebrew:
- brew install science/andi
+ brew install homebrew/science/andi
For ArchLinux with aura:
@@ -67,7 +67,7 @@ The release of this software is accompanied by a paper from [Haubold et al.](htt
## License
-Copyright © 2014 - 2016 Fabian Klötzl
+Copyright © 2014 - 2017 Fabian Klötzl
License GPLv3+: GNU GPL version 3 or later.
This is free software: you are free to change and redistribute it. There is NO WARRANTY, to the extent permitted by law. The full license text is available at <http://gnu.org/licenses/gpl.html>.
diff --git a/andi-manual.pdf b/andi-manual.pdf
index 14e1522..1a41f54 100644
Binary files a/andi-manual.pdf and b/andi-manual.pdf differ
diff --git a/configure.ac b/configure.ac
index 9034e42..2ed82b1 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,4 @@
-AC_INIT([andi], [0.10])
+AC_INIT([andi], [0.11])
AM_INIT_AUTOMAKE([-Wall foreign ])
AC_CONFIG_MACRO_DIR([m4])
diff --git a/docs/andi.1.in b/docs/andi.1.in
index a29a3f3..f92deb2 100644
--- a/docs/andi.1.in
+++ b/docs/andi.1.in
@@ -1,4 +1,4 @@
-.TH ANDI "1" "2016-03-05" "@VERSION@" "andi manual"
+.TH ANDI "1" "2017-06-30" "@VERSION@" "andi manual"
.SH NAME
andi \- estimates evolutionary distance
.SH SYNOPSIS
@@ -6,13 +6,16 @@ andi \- estimates evolutionary distance
[\fI-jlv\fR] [\fI-b INT\fR] [\fI-p FLOAT\fR] [\fI-m MODEL\fR] [\fI-t INT\fR] \fIFILES\fR...
.SH DESCRIPTION
.TP
-\fBandi\fR estimates the evolutionary distance between closely related genomes. For this \fBandi\fR reads the input sequences from \fIFASTA\fR files and computes the pairwise anchor distance. The idea behind this is explained in a paper by Haubold et al. (see below).
+\fBandi\fR estimates the evolutionary distance between closely related genomes. For this \fBandi\fR reads the input sequences from \fIFASTA\fR files and computes the pairwise anchor distance. The idea behind this is explained in a paper by Haubold et al. (2015).
.SH OUTPUT
The output is a symmetrical distance matrix in \fIPHYLIP\fR format, with each entry representing divergence with a positive real number. A distance of zero means that two sequences are identical, whereas other values are estimates for the nucleotide substitution rate (Jukes-Cantor corrected). For technical reasons the comparison might fail and no estimate can be computed. In such cases \fInan\fR is printed. This either means that the input sequences were too short (<200bp) or too diverse [...]
.SH OPTIONS
.TP
\fB\-b\fR, \fB\-\-bootstrap\fR <INT>
-Compute multiple distance matrices, with \fIn-1\fR bootstrapped from the first. See the paper Klötzl & Haubold (2016, in review) for a detailed explanation.
+Compute multiple distance matrices, with \fIn-1\fR bootstrapped from the first. See the paper Klötzl & Haubold (2016) for a detailed explanation.
+.TP
+\fB--file-of-filenames\fR <FILE>
+Usually, \fBandi\fR is called with the filenames as commandline arguments. With this option the filenames may also be read from a file itself, with one name per line. Use a single dash (\fB'-'\fR) to read from stdin.
.TP
\fB\-j\fR, \fB\-\-join\fR
Use this mode if each of your \fIFASTA\fR files represents one assembly with numerous contigs. \fBandi\fR will then treat all of the contained sequences per file as a single genome. In this mode at least one filename must be provided via command line arguments. For the output the filename is used to identify each sequence.
@@ -24,13 +27,16 @@ In multithreaded mode, \fBandi\fR requires memory linear to the amount of thread
Different models of nucleotide evolution are supported. By default the Jukes-Cantor correction is used.
.TP
\fB\-p\fR <FLOAT>
-Significance of an anchor pair; default: 0.05.
+Significance of an anchor; default: 0.025.
.TP
\fB\-t\fR, \fB\-\-threads\fR <INT>
The number of threads to be used; by default, all available processors are used.
.br
Multithreading is only available if \fBandi\fR was compiled with OpenMP support.
.TP
+\fB\-\-truncate-names\fR
+By default \fBandi\fR outputs the full names of sequences, optionally padded with spaces, if they are shorter than ten characters. Names longer than ten characters may lead to problems with downstream tools. With this switch names will be truncated.
+.TP
\fB\-v\fR, \fB\-\-verbose\fR
Prints additional information. Apply multiple times for extra verboseness.
.TP
@@ -53,6 +59,8 @@ The full license text is available at <http://gnu.org/licenses/gpl.html>.
2) Algorithms: Ohlebusch, E. (2013). Bioinformatics Algorithms. Sequence Analysis, Genome Rearrangements, and Phylogenetic Reconstruction. pp 118f.
.br
3) SA construction: Mori, Y. (2005). Short description of improved two\-stage suffix sorting algorithm. http://homepage3.nifty.com/wpage/software/itssort.txt
+.br
+4) Bootstrapping: Klötzl, F. and Haubold, B. (2016). Support Values for Genome Phylogenies
.SH BUGS
.SS Reporting Bugs
Please report bugs to <kloetzl at evolbio.mpg.de> or at <https://github.com/EvolBioInf/andi>.
diff --git a/docs/manual/andi-manual.tex b/docs/manual/andi-manual.tex
index 3f34333..1a03835 100644
--- a/docs/manual/andi-manual.tex
+++ b/docs/manual/andi-manual.tex
@@ -71,7 +71,7 @@
breaklines=true,
basicstyle=\small\ttfamily,
morekeywords={make, tar, git, sudo, andi, time, man, head, cut, fneighbor,
- fretree, figtree, brew, aura, autoreconf},
+ fretree, figtree, brew, aura, autoreconf, ls},
% literate={~} {$\sim$}{1}
}
@@ -115,7 +115,7 @@ The easiest way to install \andi is via a package manager. This also handles all
\noindent OS X with homebrew:
\begin{lstlisting}
-~ % brew install science/andi
+~ % brew install homebrew/science/andi
\end{lstlisting}
\noindent ArchLinux:
@@ -133,16 +133,16 @@ Download the latest \href{https://github.com/EvolBioInf/andi/releases}{release}
Once you have downloaded the package, unzip it and change into the newly created directory.
\begin{lstlisting}
-~ % tar -xzvf andi-0.9.tar.gz
-~ % cd andi-0.9
+~ % tar -xzvf andi-0.11.tar.gz
+~ % cd andi-0.11
\end{lstlisting}
\noindent Now build and install \andi.
\begin{lstlisting}
-~/andi-0.9 % ./configure
-~/andi-0.9 % make
-~/andi-0.9 % sudo make install
+~/andi-0.11 % ./configure
+~/andi-0.11 % make
+~/andi-0.11 % sudo make install
\end{lstlisting}
\noindent This installs \andi for all users on your system. If you do not have root privileges, you will find a working copy of \andi in the \lstinline$src$ subdirectory. For the rest of this documentation, I will assume, that \andi is in your \textdollar\lstinline!PATH!.
@@ -150,21 +150,21 @@ Once you have downloaded the package, unzip it and change into the newly created
Now \andi should be ready for use. Try invoking the help.
\begin{lstlisting}
+~/andi-0.11 % andi --help
Usage: andi [-jlv] [-b INT] [-p FLOAT] [-m MODEL] [-t INT] FILES...
FILES... can be any sequence of FASTA files. If no files are supplied, stdin is used instead.
Options:
- -b, --bootstrap <INT>
- Print additional bootstrap matrices
- -j, --join Treat all sequences from one file as a single genome
- -l, --low-memory Use less memory at the cost of speed
- -m, --model <Raw|JC|Kimura>
- Pick an evolutionary model; default: JC
- -p <FLOAT> Significance of an anchor pair; default: 0.05
- -v, --verbose Prints additional information
- -t, --threads <INT>
- The number of threads to be used; by default, all available processors are used
- -h, --help Display this help and exit
- --version Output version information and acknowledgments
+ -b, --bootstrap <INT> Print additional bootstrap matrices
+ --file-of-filenames <FILE> Read additional filenames from FILE; one per line
+ -j, --join Treat all sequences from one file as a single genome
+ -l, --low-memory Use less memory at the cost of speed
+ -m, --model <Raw|JC|Kimura> Pick an evolutionary model; default: JC
+ -p <FLOAT> Significance of an anchor; default: 0.025
+ -t, --threads <INT> Set the number of threads; by default, all available processors are used
+ --truncate-names Truncate names to ten characters
+ -v, --verbose Prints additional information
+ -h, --help Display this help and exit
+ --version Output version information and acknowledgments
\end{lstlisting}
\noindent \andi also comes with a man page, which can be accessed via \lstinline$man andi$. % But once you are done with this documentation, you will require it scarcely.
@@ -228,8 +228,30 @@ If instead of distinct sequences, a \algo{Fasta} file contains contigs belonging
When the \algo{join} mode is active, the file names are used to label the individual sequences. Thus, in \algo{join} mode, each genome has to be in its own file, and furthermore, at least one filename has to be given via the command line.
+If not enough file names are provided, \andi will try to read sequences from the standard input stream. This behaviour can be explicitly triggered by passing a single dash (\lstinline$-$) as a file name, which is useful in pipelines.
+
If \andi seems to take unusually long, or requires huge amounts of memory, then you might have forgotten the \algo{join} switch. This makes \andi compare each contig instead of each genome, resulting in many more comparisons! To make \andi output the number of genome it about to compare, use the \lstinline$--verbose$ switch.
+Starting with version 0.11 \andi supports an extra way of input. Instead of passing file names directly to \andi via the commandline arguments, the files may also be read from a file itself. Using this new \lstinline$--file-of-filenames$ can work around limitations imposed be the shell.
+
+The following three snippets have the same functionality.
+
+\begin{lstlisting}
+~ % andi --join *.fasta
+[Output]
+\end{lstlisting}
+
+\begin{lstlisting}
+~ % ls *.fasta > filenames.txt
+~ % andi --join --file-of-filenames filenames.txt
+[Output]
+\end{lstlisting}
+
+\begin{lstlisting}
+~ % ls *.fasta | andi --join --file-of-filenames -
+[Output]
+\end{lstlisting}
+
\section{Output}
The output of \andi is written to \lstinline$stdout$. This makes it easy to use on the command line and within shell scripts. As seen before, the matrix, computed by \algo{andi}, is given in \algo{Phylip} format \cite{phylip}.
@@ -276,11 +298,15 @@ S1 0.0000 0.1071
S2 0.1071 0.0000
\end{lstlisting}
+The original \algo{phylip} only supports distance matrices with names no longer than ten characters. However, this sometimes leads to problems with long accession numbers. Starting with version 0.11 \andi print the full name of a sequence, even if it is longer than ten characters. If your downstream tools have trouble with this, use \lstinline$--truncate-names$ to reimpose the limit.
+
+Also new in version 0.11 is the \lstinline$--file-of-filenames$ option. See Section~\ref{sec:join} for details.
+
\section{Example: \algo{eco29}}
Here follows a real-world example of how to use \algo{andi}. It makes heavy use of the commandline and tools like \algo{Phylip}. If you prefer \algo{R}, check out this excellent \href{http://holtlab.net/2015/05/08/r-code-to-infer-tree-from-andi-output/}{blog post} by Kathryn Holt.
-As a data set we use \algo{eco29}; 29 genomes of \textit{E. Coli} and \textit{Shigella}. You can download the data here: {\small{\url{http://guanine.evolbio.mpg.de/andi/eco29.fasta.gz}}}. The genomes have an average length of 4.9~million nucleotides amounting to a total \SI{138}{\mega\byte}.
+As a data set we use \algo{eco29}; 29 genomes of \textit{E. Coli} and \textit{Shigella}. You can download the data from here: {\small{\url{http://guanine.evolbio.mpg.de/andi/eco29.fasta.gz}}}. The genomes have an average length of 4.9~million nucleotides amounting to a total \SI{138}{\mega\byte}.
\algo{eco29} comes a single \algo{fasta} file, where each sequence is a genome. To calculate their pairwise distances, enter
@@ -388,7 +414,7 @@ Some command line parameters of \andi require arguments. If these are not of the
\section{Output-related Warnings}
-As the input sequences get more evolutionary divergent, \andi finds less anchors. With less anchors, less nucleotides are cosidered homologous between two sequences. If no anchors are found comparison fails and \lstinline!nan! is printed instead. See out paper and especiallt Figure~2 for details.
+As the input sequences get more evolutionary divergent, \andi finds less anchors. With less anchors, less nucleotides are considered homologous between two sequences. If no anchors are found, comparison fails and \lstinline!nan! is printed instead. See our paper and especially Figure~2 for details.
\subsection*{NaN}
@@ -396,8 +422,11 @@ No anchors were found. Your sequences are very divergent ($d>0.5$) or sprout a l
\subsection*{Little Homology}
-Very few anchors were found and thus only a tiny part of the sequences is considered homologous. Expect that the given distance is errorneous.
+Very few anchors were found and thus only a tiny part of the sequences is considered homologous. Expect that the given distance is erroneous.
+
+\subsection*{Too long name}
+If you added the \lstinline$--truncate-names$ switch and an input name is longer than ten characters, you will receive this warning.
\chapter{DevOps} %%%%%
@@ -448,7 +477,6 @@ These minor issues are known. I intend to fix them, when I have time.
\begin{enumerate}
\item This code will not work under Windows. At two places Unix-only code is used: filepath-separators are assumed to be \lstinline$/$ and file-descriptors are used for I/O.
- \item Only the first 10 characters are printed in the output matrix. This can make long gi numbers indistinguishable. In version 0.10 we started printing a warning for this.
\item Unit tests for the bootstrapped matrices are missing.
\item Cached intervals are sometimes not “as deep as they could be”. If that got fixed \lstinline$get_match_cache$ could bail out on \lstinline$ij.lcp < CACHE_LENGTH$. However the \lstinline$esa_init_cache$ code is the most fragile part and should be handled with care.
\end{enumerate}
@@ -456,7 +484,7 @@ These minor issues are known. I intend to fix them, when I have time.
\section{Creating a Release}
-A release should be a stable version of \andi with significant improvements over the last version. For bug fixes, dotdot release may be used.
+A release should be a stable version of \andi with significant improvements over the last version. dotdot releases should be avoided.
%\subsection{Preparing a new Release}
diff --git a/libs/pfasta.c b/libs/pfasta.c
index 8a629fc..d4a19af 100644
--- a/libs/pfasta.c
+++ b/libs/pfasta.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2015, Fabian Klötzl <fabian-pfasta at kloetzl.info>
+ * Copyright (c) 2015-2016, Fabian Klötzl <fabian-pfasta at kloetzl.info>
*
* Permission to use, copy, modify, and/or distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
@@ -78,7 +78,7 @@
#define PF_FAIL_STR(...) \
do { \
pf->errno__ = 0; \
- (void) snprintf(pf->errstr_buf, PF_ERROR_STRING_LENGTH, __VA_ARGS__); \
+ (void)snprintf(pf->errstr_buf, PF_ERROR_STRING_LENGTH, __VA_ARGS__); \
pf->errstr = pf->errstr_buf; \
return_code = -1; \
goto cleanup; \
@@ -454,7 +454,10 @@ static inline void dynstr_free(dynstr *ds) {
*/
static inline char *dynstr_move(dynstr *ds) {
- char *out = ds->str;
+ char *out = reallocarray(ds->str, ds->count + 1, 1);
+ if (!out) {
+ out = ds->str;
+ }
out[ds->count] = '\0';
*ds = (dynstr){NULL, 0, 0};
return out;
diff --git a/scripts/_andi b/scripts/_andi
new file mode 100755
index 0000000..712fab7
--- /dev/null
+++ b/scripts/_andi
@@ -0,0 +1,40 @@
+#compdef andi
+
+# This file allows zsh to complete arguments for andi. As the syntax is
+# totally non-obvious, I'll explain the basics here. For details see
+# http://zsh.sourceforge.net/Doc/Release/Completion-System.html
+# Each line consists of three parts: (A){B}[C]
+# The B part performs brace expansion as on the commandline. Thus each
+# line with braces gets translated into multiple arguments! Also the
+# B part lists the relevant argument for which we are trying to set
+# the completion rules. The A part simply states that B shall not be
+# completed if A is already present. i.e. Most flags only make sense once,
+# with the exception of -v. The string C is simply the message that is
+# displayed to the user.
+
+local info="-h --help --version"
+local ret=1
+local -a args
+
+args+=(
+ "($info -b --bootstrap)"{-b+,--bootstrap=}'[Print additional bootstrap matrices]:int:'
+ "($info)*--file-of-filenames=[Read additional filenames from file; one per line]:file:_files"
+ "($info -j --join)"{-j,--join}'[Treat all sequences from one file as a single genome]'
+ "($info -l --low-memory)"{-l,--low-memory}'[Use less memory at the cost of speed]'
+ "($info -m --model)"{-m+,--model=}'[Pick an evolutionary model]:model:((
+ Raw\:Uncorrected\ distances
+ JC\:Jukes\-Cantor\ corrected
+ Kimura\:Kimura\-two\-parameter
+ ))'
+ "($info)-p+[Significance of an anchor; default\: 0.025]:float:"
+ "($info -t --threads)"{-t+,--threads=}'[The number of threads to be used; by default, all available processors are used]:num_threads:'
+ "($info)--truncate-names[Print only the first ten characters of each name]"
+ "($info)*"{-v,--verbose}'[Prints additional information]'
+ '(- *)'{-h,--help}'[Display help and exit]'
+ '(- *)--version[Output version information and acknowledgments]'
+ '*:file:_files'
+)
+
+_arguments -w -s -S $args[@] && ret=0
+
+return ret
diff --git a/src/andi.c b/src/andi.c
index ce7ad93..7ddc4e9 100644
--- a/src/andi.c
+++ b/src/andi.c
@@ -22,19 +22,20 @@
*
*/
+#include "global.h"
+#include "io.h"
+#include "process.h"
+#include "sequence.h"
#include <assert.h>
#include <errno.h>
#include <getopt.h>
+#include <gsl/gsl_rng.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
-#include <gsl/gsl_rng.h>
-#include "global.h"
-#include "process.h"
-#include "io.h"
-#include "sequence.h"
+#include <unistd.h>
#ifdef _OPENMP
#include <omp.h>
@@ -44,11 +45,12 @@
int FLAGS = 0;
int THREADS = 1;
long unsigned int BOOTSTRAP = 0;
-double RANDOM_ANCHOR_PROP = 0.05;
+double RANDOM_ANCHOR_PROP = 0.025;
gsl_rng *RNG = NULL;
int MODEL = M_JC;
+int EXIT_CODE = EXIT_SUCCESS;
-void usage(void);
+void usage(int);
void version(void);
/**
@@ -60,39 +62,52 @@ void version(void);
* and errors.
*/
int main(int argc, char *argv[]) {
- int c;
- int version_flag = 0;
-
- struct option long_options[] = {{"version", no_argument, &version_flag, 1},
- {"help", no_argument, NULL, 'h'},
- {"verbose", no_argument, NULL, 'v'},
- {"join", no_argument, NULL, 'j'},
- {"low-memory", no_argument, NULL, 'l'},
- {"threads", required_argument, NULL, 't'},
- {"bootstrap", required_argument, NULL, 'b'},
- {"model", required_argument, NULL, 'm'},
- {0, 0, 0, 0}};
+ struct option long_options[] = {
+ {"version", no_argument, NULL, 0},
+ {"truncate-names", no_argument, NULL, 0},
+ {"file-of-filenames", required_argument, NULL, 0},
+ {"help", no_argument, NULL, 'h'},
+ {"verbose", no_argument, NULL, 'v'},
+ {"join", no_argument, NULL, 'j'},
+ {"low-memory", no_argument, NULL, 'l'},
+ {"threads", required_argument, NULL, 't'},
+ {"bootstrap", required_argument, NULL, 'b'},
+ {"model", required_argument, NULL, 'm'},
+ {0, 0, 0, 0}};
#ifdef _OPENMP
// Use all available processors by default.
THREADS = omp_get_num_procs();
#endif
+ struct string_vector file_names;
+ string_vector_init(&file_names);
// parse arguments
while (1) {
int option_index = 0;
-
- c = getopt_long(argc, argv, "jvht:p:m:b:l", long_options,
- &option_index);
+ int c = getopt_long(argc, argv, "jvht:p:m:b:l", long_options,
+ &option_index);
if (c == -1) {
break;
}
switch (c) {
- case 0: break;
- case 'h': usage(); break;
+ case 0: {
+ const char *option_str = long_options[option_index].name;
+ if (strcasecmp(option_str, "version") == 0) {
+ version();
+ }
+ if (strcasecmp(option_str, "truncate-names") == 0) {
+ FLAGS |= F_TRUNCATE_NAMES;
+ }
+ if (strcasecmp(option_str, "file-of-filenames") == 0) {
+ read_into_string_vector(optarg, &file_names);
+ }
+ break;
+ }
+ case 'h': usage(EXIT_SUCCESS); break;
case 'v':
FLAGS |= FLAGS & F_VERBOSE ? F_EXTRA_VERBOSE : F_VERBOSE;
break;
@@ -182,39 +197,39 @@ int main(int argc, char *argv[]) {
break;
}
case '?': /* intentional fall-through */
- default: usage(); break;
+ default: usage(EXIT_FAILURE); break;
}
}
- if (version_flag) {
- version();
- }
-
argc -= optind;
argv += optind;
+ // copy command line arguments into vector
+ // std::copy, anyone?
+ for (size_t i = 0; i < (unsigned int)argc; i++) {
+ string_vector_push_back(&file_names, argv[i]);
+ }
+
// at least one file name must be given
- if (FLAGS & F_JOIN && argc == 0) {
+ if (FLAGS & F_JOIN && string_vector_size(&file_names) == 0) {
errx(1, "In join mode at least one filename needs to be supplied.");
}
- dsa_t dsa;
- dsa_init(&dsa);
-
- const char *file_name;
-
- // parse all files
- int minfiles = FLAGS & F_JOIN ? 2 : 1;
- for (;; minfiles--) {
- if (!*argv) {
- if (minfiles <= 0) break;
-
- // if no files are supplied, read from stdin
- file_name = "-";
- } else {
- file_name = *argv++;
+ size_t minfiles = FLAGS & F_JOIN ? 2 : 1;
+ if (string_vector_size(&file_names) < minfiles) {
+ // not enough files passed via arguments; read from stdin.
+ string_vector_push_back(&file_names, "-");
+ // explain to the user, why nothing is happening.
+ if (isatty(STDIN_FILENO)) {
+ warnx("Not enough file names given; expecting input via stdin.");
}
+ }
+ // parse fasta files
+ dsa_t dsa;
+ dsa_init(&dsa);
+ for (size_t i = 0; i < string_vector_size(&file_names); i++) {
+ char *file_name = string_vector_at(&file_names, i);
if (FLAGS & F_JOIN) {
read_fasta_join(file_name, &dsa);
} else {
@@ -222,6 +237,8 @@ int main(int argc, char *argv[]) {
}
}
+ string_vector_free(&file_names);
+
size_t n = dsa_size(&dsa);
if (n < 2) {
@@ -242,6 +259,7 @@ int main(int argc, char *argv[]) {
}
// seed the random number generator with the current time
+ // TODO: enable seeding for reproducibility
gsl_rng_set(RNG, time(NULL));
// Warn about non ACGT residues.
@@ -253,7 +271,7 @@ int main(int argc, char *argv[]) {
// validate sequence correctness
const seq_t *seq = dsa_data(&dsa);
for (size_t i = 0; i < n; ++i, seq++) {
- if (strlen(seq->name) > 10) {
+ if ((FLAGS & F_TRUNCATE_NAMES) && strlen(seq->name) > 10) {
warnx("The sequence name '%s' is longer than ten characters. It "
"will be truncated in the output to '%.10s'.",
seq->name, seq->name);
@@ -280,53 +298,58 @@ int main(int argc, char *argv[]) {
"alignment instead.");
}
+ // side channel
+ EXIT_CODE = EXIT_SUCCESS;
+
// compute distance matrix
calculate_distances(dsa_data(&dsa), n);
dsa_free(&dsa);
gsl_rng_free(RNG);
- return 0;
+ return EXIT_CODE;
}
/**
- * Prints the usage to stdout and then exits successfully.
+ * @brief Prints the usage and then exits.
*/
-void usage(void) {
+void usage(int status) {
const char str[] = {
"Usage: andi [-jlv] [-b INT] [-p FLOAT] [-m MODEL] [-t INT] FILES...\n"
"\tFILES... can be any sequence of FASTA files. If no files are "
"supplied, stdin is used instead.\n"
"Options:\n"
- " -b, --bootstrap <INT> \n"
- " Print additional bootstrap matrices\n"
- " -j, --join Treat all sequences from one file as a single "
+ " -b, --bootstrap <INT> Print additional bootstrap matrices\n"
+ " --file-of-filenames <FILE> Read additional filenames from "
+ "FILE; one per line\n"
+ " -j, --join Treat all sequences from one file as a single "
"genome\n"
- " -l, --low-memory Use less memory at the cost of speed\n"
- " -m, --model <Raw|JC|Kimura>\n"
- " Pick an evolutionary model; default: JC\n"
- " -p <FLOAT> Significance of an anchor pair; default: 0.05\n"
- " -v, --verbose Prints additional information\n"
+ " -l, --low-memory Use less memory at the cost of speed\n"
+ " -m, --model <Raw|JC|Kimura> Pick an evolutionary model; default: "
+ "JC\n"
+ " -p <FLOAT> Significance of an anchor; default: 0.025\n"
#ifdef _OPENMP
- " -t, --threads <INT> \n"
- " The number of threads to be used; by default, all "
+ " -t, --threads <INT> Set the number of threads; by default, all "
"available processors are used\n"
#endif
- " -h, --help Display this help and exit\n"
- " --version Output version information and acknowledgments\n"};
-
- printf("%s", str);
- exit(EXIT_SUCCESS);
+ " --truncate-names Truncate names to ten characters\n"
+ " -v, --verbose Prints additional information\n"
+ " -h, --help Display this help and exit\n"
+ " --version Output version information and "
+ "acknowledgments\n"};
+
+ fprintf(status == EXIT_SUCCESS ? stdout : stderr, "%s", str);
+ exit(status);
}
/**
- * This function just prints the version string and then aborts
+ * @brief This function just prints the version string and then aborts
* the program. It conforms to the [GNU Coding
* Standard](http://www.gnu.org/prep/standards/html_node/_002d_002dversion.html#g_t_002d_002dversion).
*/
void version(void) {
const char str[] = {
"andi " VERSION "\n"
- "Copyright (C) 2014 - 2016 Fabian Klötzl\n"
+ "Copyright (C) 2014 - 2017 Fabian Klötzl\n"
"License GPLv3+: GNU GPL version 3 or later "
"<http://gnu.org/licenses/gpl.html>\n"
"This is free software: you are free to change and redistribute it.\n"
diff --git a/src/dist_hack.h b/src/dist_hack.h
index dc3668d..9886372 100644
--- a/src/dist_hack.h
+++ b/src/dist_hack.h
@@ -16,13 +16,13 @@
#endif
/** @brief This function calls dist_andi for pairs of subjects and queries, and
- *thereby fills the distance matrix.
+ * thereby fills the distance matrix.
*
* This function is actually two functions. It is one template that gets
- *compiled into two functions via
- * preprocessor hacks. The reason is DRY (Do not Repeat Yourselves).
- * The two functions only differ by their name and pragmas; i.e. They run in
- *different parallel modes.
+ * compiled into two functions via preprocessor hacks. The reason is DRY (Do not
+ * Repeat Yourselves).
+ * The two functions only differ by their name and pragmas; i.e. They run in
+ * different parallel modes.
* `distMatrix` is faster than `distMatrixLM` but needs more memory.
*
* @param sequences - The sequences to compare
@@ -53,17 +53,17 @@ void NAME(struct model *M, const seq_t *sequences, size_t n) {
continue;
}
- // TODO: Provide a nicer progress indicator.
- if (FLAGS & F_EXTRA_VERBOSE) {
-#pragma omp critical
- { fprintf(stderr, "comparing %zu and %zu\n", i, j); }
- }
-
size_t ql = sequences[j].len;
M(i, j) = dist_anchor(&E, sequences[j].S, ql, subject.gc);
}
+ // TODO: Provide a nicer progress indicator.
+ if (FLAGS & F_EXTRA_VERBOSE) {
+#pragma omp critical
+ fprintf(stderr, "Subject %s done.\n", sequences[i].name);
+ }
+
esa_free(&E);
seq_subject_free(&subject);
}
diff --git a/src/esa.c b/src/esa.c
index a6b4128..98edfab 100644
--- a/src/esa.c
+++ b/src/esa.c
@@ -13,11 +13,11 @@
* ESA. If we simply store the interval for "AA" in the cache, once and use it
* for each query we are significantly faster (up to 7 times).
*/
-#include <stdlib.h>
-#include <string.h>
-#include <assert.h>
#include "esa.h"
#include "global.h"
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
static void esa_init_cache_dfs(esa_s *, char *str, size_t pos, lcp_inter_t in);
static void esa_init_cache_fill(esa_s *, char *str, size_t pos, lcp_inter_t in);
@@ -311,7 +311,7 @@ int esa_init_CLD(esa_s *C) {
saidx_t *CLD = C->CLD = malloc((C->len + 1) * sizeof(*CLD));
CHECK_MALLOC(CLD);
- saidx_t *LCP = C->LCP;
+ const saidx_t *LCP = C->LCP;
typedef struct pair_s { saidx_t idx, lcp; } pair_t;
@@ -365,7 +365,7 @@ int esa_init_CLD(esa_s *C) {
*/
int esa_init_LCP(esa_s *C) {
const char *S = C->S;
- saidx_t *SA = C->SA;
+ const saidx_t *SA = C->SA;
saidx_t len = C->len;
// Trivial safety checks
diff --git a/src/esa.h b/src/esa.h
index df6dbaa..9ba72e3 100644
--- a/src/esa.h
+++ b/src/esa.h
@@ -6,9 +6,9 @@
#ifndef _ESA_H_
#define _ESA_H_
-#include <sys/types.h>
-#include "sequence.h"
#include "config.h"
+#include "sequence.h"
+#include <sys/types.h>
#ifdef HAVE_LIBDIVSUFSORT
#include <divsufsort.h>
diff --git a/src/global.h b/src/global.h
index ea375f6..eb5f806 100644
--- a/src/global.h
+++ b/src/global.h
@@ -9,8 +9,8 @@
#define _GLOBAL_H_
#include <gsl/gsl_rng.h>
-#include <err.h>
#include "config.h"
+#include <err.h>
/**
* The *global* variable ::FLAGS is used to set different options
@@ -38,7 +38,7 @@ extern double RANDOM_ANCHOR_PROP;
extern long unsigned int BOOTSTRAP;
/**
- * A globel random number generator. Has to be seedable.
+ * A global random number generator. Has to be seedable.
*/
extern gsl_rng *RNG;
@@ -50,11 +50,17 @@ extern int MODEL;
enum { M_RAW, M_JC, M_KIMURA };
/**
+ * Global exit code. Should be non-zero on error.
+ */
+extern int EXIT_CODE;
+
+/**
* This enum contains the available flags. Please note that all
* available options are a power of 2.
*/
enum {
F_NONE = 0,
+ F_TRUNCATE_NAMES = 1,
F_VERBOSE = 2,
F_EXTRA_VERBOSE = 4,
F_NON_ACGT = 8,
diff --git a/src/io.c b/src/io.c
index ee81831..715c3d5 100644
--- a/src/io.c
+++ b/src/io.c
@@ -4,19 +4,144 @@
*/
#define _GNU_SOURCE
#include <fcntl.h>
-#include <math.h>
#include <limits.h>
+#include <math.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
-#include <pfasta.h>
+#include <compat-stdlib.h>
#include <compat-string.h>
+#include <pfasta.h>
#include "global.h"
#include "io.h"
/**
+ * @brief Access an element.
+ * @param sv - The base vector.
+ * @param index - The element index to access.
+ * @returns the string at position `index`.
+ */
+char *string_vector_at(struct string_vector *sv, size_t index) {
+ return sv->data[index];
+}
+
+/**
+ * @brief Access the underlying buffer.
+ * @param sv - The base vector.
+ * @returns the underlying buffer.
+ */
+char **string_vector_data(struct string_vector *sv) {
+ return sv->data;
+}
+
+/**
+ * @brief Free all data.
+ * @param sv - The base vector.
+ */
+void string_vector_free(struct string_vector *sv) {
+ size_t i = 0;
+ for (; i < sv->size; i++) {
+ free(sv->data[i]);
+ }
+ free(sv->data);
+}
+
+/**
+ * @brief Initialise the vector.
+ * @param sv - The base vector.
+ */
+void string_vector_init(struct string_vector *sv) {
+ sv->data = malloc(sizeof(*sv->data) * 4);
+ CHECK_MALLOC(sv->data);
+
+ sv->capacity = 4;
+ sv->size = 0;
+}
+
+/**
+ * @brief Adds a copy to the end of the vector.
+ * @param sv - The base vector.
+ * @param str - The new string to add.
+ */
+void string_vector_push_back(struct string_vector *sv, const char *str) {
+ string_vector_emplace_back(sv, strdup(str));
+}
+
+/**
+ * @brief Add a file name to the end of the vector, directly.
+ * @param sv - The base vector.
+ * @param str - The string to emplace.
+ */
+void string_vector_emplace_back(struct string_vector *sv, char *str) {
+ if (sv->size < sv->capacity) {
+ sv->data[sv->size++] = str;
+ } else {
+ char **ptr = reallocarray(sv->data, sv->capacity / 2, 3 * sizeof(*ptr));
+ CHECK_MALLOC(ptr);
+ sv->data = ptr;
+ sv->capacity = (sv->capacity / 2) * 3;
+ sv->data[sv->size++] = str;
+ }
+}
+
+/**
+ * @brief Return the number of elements.
+ * @param sv - The base vector.
+ * @returns the size of the vector.
+ */
+size_t string_vector_size(const struct string_vector *sv) {
+ return sv->size;
+}
+
+/**
+ * @brief Read a *fof* and add it's contents into a vector.
+ * @param file_name - The file of file names aka. fof.
+ * @param sv - The vector to add file names to.
+ */
+void read_into_string_vector(const char *file_name, struct string_vector *sv) {
+ FILE *file = strcmp(file_name, "-") ? fopen(file_name, "r") : stdin;
+ if (!file) {
+ err(errno, "%s", file_name);
+ }
+
+ while (1) {
+ char *str = NULL;
+ size_t buffer_size = 0;
+ ssize_t check = getline(&str, &buffer_size, file);
+
+ // EOF is set only *after* getline tried to read past it.
+ if (check == -1 && feof(file) != 0) {
+ free(str);
+ break; // EOF
+ }
+
+ if (check == -1) {
+ err(errno, "%s", file_name);
+ }
+
+ char *nl = strchr(str, '\n');
+ if (nl) {
+ *nl = '\0'; // remove newline character
+ }
+
+ // ignore empty lines
+ if (strlen(str) == 0) {
+ free(str);
+ continue;
+ }
+
+ string_vector_emplace_back(sv, str);
+ }
+
+ int check = fclose(file);
+ if (check != 0) {
+ err(errno, "%s", file_name);
+ }
+}
+
+/**
* @brief Joins all sequences from a file into a single long sequence.
*
* Apart from reading all sequences from a file, this function also
@@ -154,6 +279,7 @@ void print_distances(const struct model *D, const seq_t *sequences, size_t n,
double coverage = model_coverage(&datum);
if (isnan(dist) && warnings) {
+ EXIT_CODE = EXIT_FAILURE;
const char str[] = {
"For the two sequences '%s' and '%s' the distance "
"computation failed and is reported as nan. "
@@ -173,9 +299,10 @@ void print_distances(const struct model *D, const seq_t *sequences, size_t n,
printf("%zu\n", n);
for (i = 0; i < n; i++) {
- // Print exactly ten characters of the name. Pad with spaces if
- // necessary.
- printf("%-10.10s", sequences[i].name);
+ // Print ten characters of the name. Pad with spaces, if
+ // necessary. Truncate to exactly ten characters if requested by user.
+ printf(FLAGS & F_TRUNCATE_NAMES ? "%-10.10s" : "%-10s",
+ sequences[i].name);
for (j = 0; j < n; j++) {
// use scientific notation for small numbers
diff --git a/src/io.h b/src/io.h
index e8290f7..5009a88 100644
--- a/src/io.h
+++ b/src/io.h
@@ -5,11 +5,11 @@
#ifndef _IO_H_
#define _IO_H_
+#include "model.h"
+#include "sequence.h"
#include <err.h>
#include <errno.h>
#include <stdio.h>
-#include "sequence.h"
-#include "model.h"
/**
* This is a neat hack for dealing with matrices.
@@ -23,4 +23,22 @@ void read_fasta_join(const char *, dsa_t *dsa);
void print_distances(const struct model *, const seq_t *, size_t, int);
void print_coverages(const struct model *, size_t);
+/**
+ * @brief A dynamically growing structure for file_names.
+ */
+struct string_vector {
+ char **data;
+ size_t capacity, size;
+};
+
+char *string_vector_at(struct string_vector *, size_t);
+char **string_vector_data(struct string_vector *);
+void string_vector_free(struct string_vector *);
+void string_vector_init(struct string_vector *);
+void string_vector_push_back(struct string_vector *, const char *);
+void string_vector_emplace_back(struct string_vector *, char *);
+size_t string_vector_size(const struct string_vector *);
+
+void read_into_string_vector(const char *, struct string_vector *);
+
#endif // _IO_H_
diff --git a/src/model.c b/src/model.c
index 3577bb1..b5d72dd 100644
--- a/src/model.c
+++ b/src/model.c
@@ -3,12 +3,12 @@
* estimation of evolutionary distances thereof.
*/
+#include "model.h"
+#include "global.h"
+#include <gsl/gsl_randist.h>
#include <inttypes.h>
#include <math.h>
#include <stdio.h>
-#include <gsl/gsl_randist.h>
-#include "global.h"
-#include "model.h"
/**
* @brief Sum some mutation count specified by `summands`. Intended to be used
@@ -124,12 +124,13 @@ double estimate_KIMURA(const model *MM) {
return dist <= 0.0 ? 0.0 : dist;
}
-/* @brief Bootstrap a mutation matrix.
+/** @brief Bootstrap a mutation matrix.
*
* The classical bootstrapping process, as described by Felsenstein, resamples
* all nucleotides of a MSA. As andi only computes a pairwise alignment, this
* process boils down to a simple multinomial distribution. We just have to
- * resample the elements of the mutation matrix. (Paper in review.)
+ * resample the elements of the mutation matrix. See Klötzl & Haubold (2016)
+ * for details. http://www.mdpi.com/2075-1729/6/1/11/htm
*
* @param MM - The original mutation matrix.
* @returns A bootstrapped mutation matrix.
diff --git a/src/process.c b/src/process.c
index 83d5097..a2e8b28 100644
--- a/src/process.c
+++ b/src/process.c
@@ -4,21 +4,18 @@
* @file
* @brief This file contains various distance methods.
*/
+#include "process.h"
+#include "esa.h"
+#include "global.h"
+#include "io.h"
+#include "model.h"
+#include "sequence.h"
#include <assert.h>
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
-#include "esa.h"
-#include "global.h"
-#include "io.h"
-#include "model.h"
-#include "process.h"
-#include "sequence.h"
-
-#include <gsl/gsl_rng.h>
-#include <gsl/gsl_randist.h>
#ifdef _OPENMP
#include <omp.h>
@@ -39,13 +36,13 @@ int calculate_bootstrap(const struct model *M, const seq_t *sequences,
* @param l - The length of the subject.
* @returns The minimum length of an anchor.
*/
-size_t minAnchorLength(double p, double g, size_t l) {
+size_t min_anchor_length(double p, double g, size_t l) {
size_t x = 1;
double prop = 0.0;
- while (prop < 1 - p) {
+ // Find smallest x with P(X > x) < p
+ for (; prop < 1 - p; x++) {
prop = shuprop(x, g / 2, l);
- x++;
}
return x;
@@ -54,10 +51,10 @@ size_t minAnchorLength(double p, double g, size_t l) {
/**
* @brief Calculates the binomial coefficient of n and k.
*
- * We could (and probalby should) use gsl_sf_lnchoose(xx,kk) for this.
+ * We could (and probably should) use gsl_sf_lnchoose(xx,kk) for this.
*
* @param n - The n part of the binomial coefficient.
- * @param k - analog.
+ * @param k - analogue.
* @returns (n choose k)
*/
size_t binomial_coefficient(size_t n, size_t k) {
@@ -92,7 +89,7 @@ size_t binomial_coefficient(size_t n, size_t k) {
* set. See Haubold et al. (2009).
*
* @param x - The maximum length of a shustring.
- * @param g - The the half of the relative amount of GC in the DNA.
+ * @param p - The half of the relative amount of GC in the DNA.
* @param l - The length of the subject.
* @returns The probability of a certain shustring length.
*/
@@ -127,11 +124,12 @@ double shuprop(size_t x, double p, size_t l) {
* substrings that exist in both sequences. Then it manually checks for
* mutations between those anchors.
*
- * @return An estimate for the number of mutations within homologous regions.
* @param C - The enhanced suffix array of the subject.
* @param query - The actual query string.
* @param query_length - The length of the query string. Needed for speed
* reasons.
+ * @param gc - The relative GC content of the subject.
+ * @returns A matrix with estimates of base substitutions.
*/
model dist_anchor(const esa_s *C, const char *query, size_t query_length,
double gc) {
@@ -152,51 +150,24 @@ model dist_anchor(const esa_s *C, const char *query, size_t query_length,
size_t num_right_anchors = 0;
-#ifdef DEBUG
- size_t num_matches = 0;
- size_t num_anchors = 0;
- size_t num_anchors_in_rc = 0;
- size_t num_right_anchors_in_rc = 0;
- size_t length_anchors = 0;
- double off_num = 0.0;
- double off_dem = 0.0;
-#endif
-
- size_t threshold =
- minAnchorLength(1 - sqrt(1 - RANDOM_ANCHOR_PROP), gc, C->len);
+ size_t threshold = min_anchor_length(RANDOM_ANCHOR_PROP, gc, C->len);
// Iterate over the complete query.
while (this_pos_Q < query_length) {
inter =
get_match_cached(C, query + this_pos_Q, query_length - this_pos_Q);
-#ifdef DEBUG
- num_matches++;
-#endif
-
this_length = inter.l <= 0 ? 0 : inter.l;
if (inter.i == inter.j && this_length >= threshold) {
// We have reached a new anchor.
this_pos_S = C->SA[inter.i];
-#ifdef DEBUG
- num_anchors++;
- length_anchors += this_length;
- if (this_pos_S < (size_t)(C->len / 2)) {
- num_anchors_in_rc++;
- }
-#endif
-
// Check if this can be a right anchor to the last one.
if (this_pos_S > last_pos_S &&
this_pos_Q - last_pos_Q == this_pos_S - last_pos_S) {
num_right_anchors++;
-#ifdef DEBUG
- if (this_pos_S < (size_t)(C->len / 2)) {
- num_right_anchors_in_rc++;
- }
-#endif
+
// classify nucleotides in the qanchor
model_count_equal(&ret, query + last_pos_Q, last_length);
@@ -206,19 +177,11 @@ model dist_anchor(const esa_s *C, const char *query, size_t query_length,
this_pos_Q - last_pos_Q - last_length);
last_was_right_anchor = 1;
} else {
-#ifdef DEBUG
- double off = fabs((double)(this_pos_Q - last_pos_Q) -
- (double)(this_pos_S - last_pos_S));
- if (off < 100) {
- off_num += off;
- off_dem++;
- }
-#endif
if (last_was_right_anchor) {
// If the last was a right anchor, but with the current one,
// we cannot extend, then add its length.
model_count_equal(&ret, query + last_pos_Q, last_length);
- } else if ((last_length / 2) >= threshold) {
+ } else if (last_length >= threshold * 2) {
// The last anchor wasn't neither a left or right anchor.
// But, it was as long as an anchor pair. So still count it.
model_count_equal(&ret, query + last_pos_Q, last_length);
@@ -237,27 +200,6 @@ model dist_anchor(const esa_s *C, const char *query, size_t query_length,
this_pos_Q += this_length + 1;
}
-#ifdef DEBUG
- if (FLAGS & F_EXTRA_VERBOSE) {
- const char str[] = {"- threshold: %ld\n"
- "- matches: %lu\n"
- "- anchors: %lu (rc: %lu)\n"
- "- right anchors: %lu (rc: %lu)\n"
- "- avg length: %lf\n"
- "- off: %f (skipped: %.0f)\n"
- "\n"};
-
-#pragma omp critical
- {
- fprintf(stderr, str, threshold, num_matches, num_anchors,
- num_anchors_in_rc, num_right_anchors,
- num_right_anchors_in_rc,
- (double)length_anchors / num_anchors, off_num / off_dem,
- off_dem);
- }
- }
-#endif
-
// Very special case: The sequences are identical
if (last_length >= query_length) {
model_count(&ret, C->S + last_pos_S, query, query_length);
@@ -265,31 +207,22 @@ model dist_anchor(const esa_s *C, const char *query, size_t query_length,
}
// We might miss a few nucleotides if the last anchor was also a right
- // anchor.
+ // anchor. The logic is the same as a few lines above.
if (last_was_right_anchor) {
model_count(&ret, C->S + last_pos_S, query + last_pos_Q, last_length);
+ } else if (last_length >= threshold * 2) {
+ model_count_equal(&ret, query + last_pos_Q, last_length);
}
return ret;
}
-/**
- * @brief Computes the distance matrix.
- *
- * The distMatrix() populates the D matrix with computed distances.
- * @param sequences An array of pointers to the sequences.
- * @param n The number of sequences.
+/*
+ * Include distMatrix and distMatrixLM.
*/
#define FAST
#include "dist_hack.h"
-/**
- * @brief Computes the distance matrix.
- *
- * The distMatrixLM() populates the D matrix with computed distances.
- * @param sequences An array of pointers to the sequences.
- * @param n The number of sequences.
- */
#undef FAST
#include "dist_hack.h"
@@ -343,7 +276,7 @@ void calculate_distances(seq_t *sequences, size_t n) {
/** Yet another hack. */
#define B(X, Y) (B[(X)*n + (Y)])
-/** @brief Computes a bootstrap from _pairwise_ aligments.
+/** @brief Computes a bootstrap from _pairwise_ alignments.
*
* Doing bootstrapping for alignments with only two sequences is easy. It boils
* down to a simple multi-nomial process over the substitution matrix.
diff --git a/src/sequence.c b/src/sequence.c
index 422fbee..c18e5a9 100644
--- a/src/sequence.c
+++ b/src/sequence.c
@@ -9,9 +9,9 @@
#include <stdlib.h>
#include <string.h>
-#include <compat-stdlib.h>
-#include "sequence.h"
#include "global.h"
+#include "sequence.h"
+#include <compat-stdlib.h>
void normalize(seq_t *S);
@@ -54,7 +54,7 @@ void dsa_free(dsa_t *A) {
}
/** Returns the number of sequences stored within an array. */
-size_t dsa_size(dsa_t *A) {
+size_t dsa_size(const dsa_t *A) {
return A->size;
}
@@ -154,7 +154,9 @@ char *revcomp(const char *str, size_t len) {
case 'T': d = 'A'; break;
case 'G': d = 'C'; break;
case 'C': d = 'G'; break;
- case '!': d = ';'; break; // rosebud
+ case '!':
+ d = ';';
+ break; // rosebud
default: continue;
}
diff --git a/src/sequence.h b/src/sequence.h
index ffdc3aa..f02f8a0 100644
--- a/src/sequence.h
+++ b/src/sequence.h
@@ -48,9 +48,8 @@ void seq_subject_free(seq_subject *S);
int seq_init(seq_t *S, const char *seq, const char *name);
/**
- * A dynamically growing structure for sequences.
+ * @brief A dynamically growing structure for sequences.
*/
-
typedef struct dsa_s {
seq_t *data;
size_t capacity, size;
@@ -59,7 +58,7 @@ typedef struct dsa_s {
int dsa_init(dsa_t *A);
void dsa_push(dsa_t *A, seq_t S);
void dsa_free(dsa_t *A);
-size_t dsa_size(dsa_t *A);
+size_t dsa_size(const dsa_t *A);
seq_t *dsa_data(dsa_t *A);
seq_t dsa_join(dsa_t *dsa);
diff --git a/test/test_extra.sh b/test/test_extra.sh
index 17266f4..dbb23ea 100755
--- a/test/test_extra.sh
+++ b/test/test_extra.sh
@@ -12,5 +12,15 @@
./src/andi test_extra.fasta --low-memory > extra_low_memory.out
diff extra.out extra_low_memory.out || exit 1
-rm -f test_extra.fasta extra.out extra_low_memory.out
+# Test file of filenames
+./test/test_fasta -l 10000 > test_extra.fasta
+echo "$PWD/test_extra.fasta" > fof.txt
+./src/andi test_extra.fasta > extra.out
+./src/andi --file-of-filenames fof.txt > fof.out
+cat fof.txt | ./src/andi --file-of-filenames - > fof2.out
+diff extra.out fof.out || exit 1
+diff extra.out fof2.out || exit 1
+
+
+rm -f test_extra.fasta extra.out extra_low_memory.out fof.out fof2.out fof.txt
diff --git a/test/test_fasta.cxx b/test/test_fasta.cxx
index d0f4f20..04a92de 100644
--- a/test/test_fasta.cxx
+++ b/test/test_fasta.cxx
@@ -120,7 +120,7 @@ void print_seq( unsigned base_seed, unsigned mut_seed, int length, int line_leng
void usage(){
const static char *str = {
- "test_rand [-l length] [-d dist]\n"
+ "usage: test_fasta [-l length] [-d dist...] [-L line length] [-s seed] [-r raw]\n"
};
cerr << str;
}
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/andi.git
More information about the debian-med-commit
mailing list